# Data Cleaning of Ratings Dataset

In [1]:
import pandas as pd

### Loading in Dataset

In [2]:
# Read the raw ratings file into a DataFrame
df_ratings = pd.read_csv(filepath_or_buffer="assets/ratings.csv")

# Preview the first five rows to check the columns that were loaded
df_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


---
# Cleaning Data

### Functions for cleaning the data:

In [3]:
def drop_users(df, min_ratings, max_ratings):
    """
    Keep only the rows of users whose number of ratings lies within
    [min_ratings, max_ratings]; both bounds are inclusive.

    Users with fewer than min_ratings ratings are removed (too little
    information to be relevant), as are users with more than max_ratings
    ratings (outliers, with a high likelihood of being spam bots).

    Returns the filtered DataFrame; the input DataFrame is not modified.
    """
    ratings_per_user = df['userId'].value_counts()

    # a user is kept only if its count satisfies both thresholds at once
    within_bounds = (ratings_per_user >= min_ratings) & (ratings_per_user <= max_ratings)
    kept_user_ids = ratings_per_user[within_bounds].index

    return df[df["userId"].isin(kept_user_ids)]

In [4]:
def drop_movies(df, min_ratings):
    """
    Keep only the rows of movies that have at least min_ratings ratings.

    Movies rated fewer than min_ratings times are removed.
    Returns the filtered DataFrame; the input DataFrame is not modified.
    """
    ratings_per_movie = df['movieId'].value_counts()

    # ids of the movies that reach the threshold (inclusive)
    popular_movie_ids = ratings_per_movie.index[(ratings_per_movie >= min_ratings).to_numpy()]

    return df[df["movieId"].isin(popular_movie_ids)]

In [5]:
def drop_low_average(df, min_average):
    """
    Keep only the rows of users whose mean rating is at least min_average.

    Users with a mean rating below min_average are removed entirely.
    Returns the filtered DataFrame; the input DataFrame is not modified.
    """
    # mean rating per user, indexed by userId
    mean_by_user = df.groupby('userId')['rating'].mean()

    # userIds whose mean reaches the threshold (inclusive)
    kept_user_ids = mean_by_user.loc[mean_by_user >= min_average].index

    return df[df['userId'].isin(kept_user_ids)]

In [6]:
def drop_ratings(df, min_rating):
    """
    Keep only the individual ratings that are at least min_rating.

    Rows with a rating below min_rating are removed.
    Returns the filtered DataFrame; the input DataFrame is not modified.
    """
    # rows whose rating reaches the threshold (inclusive)
    high_enough = df['rating'].ge(min_rating)

    return df[high_enough]

### Using functions to remove data:

In [7]:
# The recommendation system never uses the timestamp, so remove that column
df_filtered = df_ratings.drop(columns=["timestamp"])

Filter out users with fewer than N ratings  
Reasoning: these users likely add little to overall performance  

Filter out users with more than N ratings  
Reasoning: accounts with this many ratings are likely to be bots  

In [8]:
# keep users that have between 2 and 2000 ratings (inclusive)
df_filtered = drop_users(df_filtered, min_ratings=2, max_ratings=2000)

Filter out movies with fewer than N ratings  
Reasoning: these are extremely niche movies (mostly old ones that almost nobody watches); removing them reduces the size of the matrix without losing valuable data

In [9]:
# keep movies that have at least 5 ratings
df_filtered = drop_movies(df_filtered, min_ratings=5)

Filter out users with an average rating below N  
Reasoning: users who give mostly low ratings add little to the recommendations, and this likely removes many trolls

In [10]:
# keep users whose mean rating is at least 2
df_filtered = drop_low_average(df_filtered, min_average=2)

Filter out ratings below N  
Reasoning: high ratings are more relevant for giving accurate suggestions

In [11]:
# keep individual ratings of at least 3.5
df_filtered = drop_ratings(df_filtered, min_rating=3.5)

# Overview of Data Removed

In [12]:
# Rows: size of the dataset before and after filtering
original = df_ratings.shape[0]
new = df_filtered.shape[0]

print(f"Length of dataset reduced from {original:,} to {new:,} (a reduction of {original - new:,} rows or {(original - new) / original * 100:.2f}%)\n")

# Users: number of distinct userIds before and after filtering
# (dropna=False counts exactly the values that unique() would return)
original = df_ratings["userId"].nunique(dropna=False)
new = df_filtered["userId"].nunique(dropna=False)

print(f"Amount users reduced from {original:,} to {new:,} (a reduction of {original - new:,} users or {(original - new) / original * 100:.2f}%)\n")

# Movies: number of distinct movieIds before and after filtering
original = df_ratings["movieId"].nunique(dropna=False)
new = df_filtered["movieId"].nunique(dropna=False)

print(f"Amount movies reduced from {original:,} to {new:,} (a reduction of {original - new:,} movies or {(original - new) / original * 100:.2f}%)\n")

Length of dataset reduced from 27,753,444 to 16,626,813 (a reduction of 11,126,631 rows or 40.09%)

Amount users reduced from 283,228 to 273,120 (a reduction of 10,108 users or 3.57%)

Amount movies reduced from 53,889 to 25,674 (a reduction of 28,215 movies or 52.36%)



# Save Cleaned Data
Store the cleaned data as a .csv file so that other files can import it when they need to run on a smaller dataset

In [13]:
# index=False: after filtering, the row index is just leftover positional
# labels. Writing it would add an extra unnamed leading column to the file,
# which pandas names "Unnamed: 0" when the file is read back
# (NOTE(review): the "Unnamed: 0" column in ratings.csv looks like the same
# artifact from an earlier export — confirm).
df_filtered.to_csv("assets/ratings_clean.csv", index=False)