In [None]:
import pandas as pd
import os

def clean_ratings(
    *,
    ratings_path: str = "raw_data/title.ratings.tsv",
    films_path: str = "datasets/films.csv",
    output_dir: str = "data/cleaned",
    min_votes_exclusive: int = 10,
) -> None:
    """Clean the raw ratings table and write two CSV files.

    The tab-separated ratings file is loaded, filtered, and its columns are
    renamed to ``imdb_id``, ``avg_rating`` and ``vote_count``. Two files are
    then written into ``output_dir``:

    * ``ratings_clean.csv`` - every row that passed the filter.
    * ``filtered_ratings.csv`` - the subset of those rows whose ``imdb_id``
      also appears in the ``imdb_id`` column of the films CSV.

    A one-line summary with the record count is printed after each file is
    written. Calling the function with no arguments uses the default paths
    and threshold.

    Args:
        ratings_path: Tab-separated file with ``tconst``, ``averageRating``
            and ``numVotes`` columns.
        films_path: CSV file that must contain an ``imdb_id`` column.
        output_dir: Directory receiving both output files; it is created
            (including parents) if it does not exist.
        min_votes_exclusive: Rows are kept only when ``numVotes`` is
            strictly greater than this value.
    """
    # Load raw data with explicit dtypes so ids stay strings and counts stay ints.
    ratings = pd.read_csv(
        ratings_path,
        sep='\t',
        dtype={'tconst': str, 'averageRating': float, 'numVotes': int},
    )

    # Keep ratings inside the inclusive range [1, 10] that also have strictly
    # more votes than the threshold.
    rating_in_range = ratings['averageRating'].between(1, 10)
    enough_votes = ratings['numVotes'] > min_votes_exclusive
    ratings = ratings[rating_in_range & enough_votes]

    # Standardize column names.
    ratings = ratings.rename(columns={
        'tconst': 'imdb_id',
        'averageRating': 'avg_rating',
        'numVotes': 'vote_count',
    })

    # Select and order columns.
    ratings = ratings[['imdb_id', 'avg_rating', 'vote_count']]

    # Ensure the output directory exists before writing anything into it.
    os.makedirs(output_dir, exist_ok=True)

    # Save cleaned data.
    ratings.to_csv(os.path.join(output_dir, "ratings_clean.csv"), index=False)
    print(f" Saved cleaned ratings ({len(ratings)} records)")

    # Load films dataset; only its imdb_id column is consulted below.
    films = pd.read_csv(films_path)

    # Restrict the cleaned ratings to titles present in the films dataset.
    filtered_ratings = ratings[ratings['imdb_id'].isin(films['imdb_id'])]

    # Save filtered data.
    filtered_ratings.to_csv(
        os.path.join(output_dir, "filtered_ratings.csv"), index=False
    )
    print(f" Saved filtered ratings ({len(filtered_ratings)} records)")

# Execute the ratings cleaning step when this cell runs.
clean_ratings()


 Saved cleaned ratings (1209144 records)
 Saved filtered ratings (4862 records)
