In [6]:
from pathlib import Path

import pandas as pd

In [10]:
def clean_movies_data(movies):
    """Clean the raw movies table and return id, title, year and genre flags.

    The input frame is expected to have exactly 24 positional columns: movie
    id, title, release date, video release date, IMDb URL, followed by 19
    genre indicator columns. The frame is NOT modified; all work happens on a
    copy, so the function can safely be called more than once on the same
    raw frame.

    Parameters
    ----------
    movies : pandas.DataFrame
        Raw movies table with 24 columns (column labels are ignored and
        replaced positionally).

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id``, ``movie_title`` (with the ``(YYYY)`` marker
        stripped), ``release_year`` (numeric, NaN when the title carries no
        four-digit year in parentheses) and the 19 genre columns.

    Raises
    ------
    ValueError
        Raised by pandas if ``movies`` does not have 24 columns.
    """
    genre_columns = [
        "unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    ]

    # Copy first: renaming columns and adding ``release_year`` on the caller's
    # frame would corrupt the raw data and make a second call fail with a
    # column-count mismatch.
    movies = movies.copy()
    movies.columns = [
        "movie_id",
        "movie_title",
        "release_date",
        "video_release_date",
        "imdb_url",
    ] + genre_columns

    # Extract year from title and clean. ``expand=False`` yields a Series
    # (first match of the single capture group) instead of a one-column frame.
    year_text = movies["movie_title"].str.extract(r"\((\d{4})\)", expand=False)
    movies["release_year"] = pd.to_numeric(year_text, errors="coerce")
    movies["movie_title"] = movies["movie_title"].str.replace(
        r"\s*\(\d{4}\)\s*", "", regex=True
    )

    # Remove unnecessary columns
    final_columns = ["movie_id", "movie_title", "release_year"] + genre_columns
    result = movies[final_columns]

    # Basic checks
    print(f"Movies: {len(result)}")
    print(f"Duplicate movie_ids: {result['movie_id'].duplicated().sum()}")
    print(f"Invalid release years: {result['release_year'].isna().sum()}")

    return result


In [3]:
def clean_ratings_data(df, valid_movie_ids, valid_user_ids):
    """Drop ratings that are out of range or reference unknown users/movies.

    The input frame is expected to have exactly four positional columns,
    which are labelled ``user_id``, ``item_id``, ``rating`` and
    ``timestamp``. The caller's frame is NOT modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ratings table with four columns.
    valid_movie_ids : collection
        Movie ids that ``item_id`` is allowed to take.
    valid_user_ids : collection
        User ids that ``user_id`` is allowed to take.

    Returns
    -------
    pandas.DataFrame
        Rows whose rating lies in [1, 5] (inclusive) and whose user and movie
        ids are both in the supplied collections. Original row order and
        index labels are preserved.
    """
    # Copy so relabelling the columns does not leak back to the caller.
    df = df.copy()
    df.columns = ['user_id', 'item_id', 'rating', 'timestamp']

    original_count = len(df)

    # Evaluate each rule once against the full table; the same masks drive
    # both the diagnostic report and the final filter.
    rating_ok = df['rating'].between(1, 5)
    user_ok = df['user_id'].isin(valid_user_ids)
    movie_ok = df['item_id'].isin(valid_movie_ids)

    invalid_ratings = df[~rating_ok]
    invalid_users = df[~user_ok]
    invalid_movies = df[~movie_ok]

    print(f"Invalid ratings (out of range): {len(invalid_ratings)}")
    if len(invalid_ratings) > 0:
        print("Ratings out of range:")
        print(invalid_ratings)

    print(f"Invalid user_ids: {len(invalid_users)}")
    if len(invalid_users) > 0:
        print("Ratings with invalid user_ids:")
        print(invalid_users)

    print(f"Invalid movie_ids: {len(invalid_movies)}")
    if len(invalid_movies) > 0:
        print("Ratings with invalid movie_ids:")
        print(invalid_movies)

    # Basic validation: a row survives only if it passes all three rules.
    df = df[rating_ok & user_ok & movie_ok]

    # A row failing several rules is counted once here, so this total can be
    # smaller than the sum of the three per-rule counts printed above.
    removed_count = original_count - len(df)
    print(f"Ratings: {len(df)}")
    print(f"Invalid ratings removed: {removed_count}")

    return df

In [4]:
def clean_users_data(df, valid_occupations):
    """Validate the users table and encode gender as a 0/1 indicator.

    The input frame is expected to have exactly five positional columns,
    which are labelled ``user_id``, ``age``, ``gender``, ``occupation`` and
    ``zip_code``. The caller's frame is NOT modified.

    Filters are applied in order: age within [5, 100] inclusive, gender in
    {'M', 'F'}, occupation in ``valid_occupations``. Each "removed" figure in
    the printed report counts rows dropped by that step from what the
    previous steps left.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw users table with five columns.
    valid_occupations : collection
        Occupation labels that are accepted.

    Returns
    -------
    pandas.DataFrame
        Surviving users, with ``gender`` mapped to 1 for 'M' and 0 for 'F'.
    """
    # Copy so the column relabelling and the gender re-encoding below never
    # touch the caller's frame (and never write into a filtered view).
    df = df.copy()
    df.columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']

    # Kept separately from the running count so the report shows the true
    # number of input rows rather than the count after earlier filters.
    initial_count = len(df)

    # Basic validation
    df = df[df['age'].between(5, 100)]
    age_removed = initial_count - len(df)
    count_after_age = len(df)

    df = df[df['gender'].isin(['M', 'F'])].copy()
    gender_removed = count_after_age - len(df)

    # Dummy variables for gender
    df['gender'] = df['gender'].map({'M': 1, 'F': 0})

    # Validate occupations against the supplied list
    occupation_ok = df['occupation'].isin(valid_occupations)
    invalid_occupations = df[~occupation_ok]
    occupation_removed = len(invalid_occupations)

    if occupation_removed > 0:
        print(f"Invalid occupations found: {occupation_removed}")
        print(f"Invalid occupation types: {invalid_occupations['occupation'].unique().tolist()}")
        # Keep only valid occupations
        df = df[occupation_ok]

    print("Users cleaning report:")
    print(f"  Original users: {initial_count}")
    print(f"  Removed due to age: {age_removed}")
    print(f"  Removed due to gender: {gender_removed}")
    print(f"  Removed due to occupation: {occupation_removed}")
    print(f"  Final users: {len(df)}")

    return df

In [5]:
# Input/output locations, relative to the notebook's working directory.
DATA_DIR = Path("../data")
RESULTS_DIR = Path("../results")

# Fail early with the resolved absolute path: a wrong working directory is
# otherwise reported only as an opaque relative-path error from read_csv.
if not DATA_DIR.is_dir():
    raise FileNotFoundError(
        f"Data directory not found: {DATA_DIR.resolve()} "
        "(run the notebook from its own folder or adjust DATA_DIR)"
    )

# Load all data
movies_df = pd.read_csv(DATA_DIR / "u.item", sep="|", encoding="latin-1", header=None)
ratings_df = pd.read_csv(DATA_DIR / "u.data", sep="\t", header=None)
users_df = pd.read_csv(DATA_DIR / "u.user", sep="|", header=None)
occupations = pd.read_csv(DATA_DIR / "u.occupation", header=None)[0].tolist()

# Clean data
movies_clean = clean_movies_data(movies_df)

# CHECK which movie has invalid release year
invalid_year_movies = movies_clean[movies_clean["release_year"].isna()]
print("Movies with invalid release years:")
print(invalid_year_movies[["movie_id", "movie_title", "release_year"]])

# Also check the original data for this movie (only the first offender is
# shown; the first column is addressed by position so the lookup does not
# depend on how the raw frame's columns are labelled).
if len(invalid_year_movies) > 0:
    invalid_movie_id = invalid_year_movies["movie_id"].iloc[0]
    original_movie_row = movies_df[movies_df.iloc[:, 0] == invalid_movie_id]
    print(f"\nOriginal data for movie_id {invalid_movie_id}:")
    print(original_movie_row.iloc[0].tolist()[:5])  # Show first 5 columns

# NOW REMOVE movies with invalid years
movies_clean = movies_clean[movies_clean["release_year"].notna()]
print(f"\nAfter removing invalid years: {len(movies_clean)} movies")

# Continue with other cleaning. Ratings are validated against the ids that
# survived the movie and user cleaning steps above.
users_clean = clean_users_data(users_df, occupations)
ratings_clean = clean_ratings_data(
    ratings_df, set(movies_clean["movie_id"]), set(users_clean["user_id"])
)

# Save as Parquet (create the output folder first so a fresh checkout works)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
movies_clean.to_parquet(RESULTS_DIR / "movies.parquet", index=False)
users_clean.to_parquet(RESULTS_DIR / "users.parquet", index=False)
ratings_clean.to_parquet(RESULTS_DIR / "ratings.parquet", index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/u.item'