# Preprocessing notebook for the MovieLens movies and ratings CSV files

In [1]:
# Environment Setup
import pandas as pd
from pathlib import Path

In [3]:
# Load Raw Data
# Reads the original MovieLens 'small' CSV files exactly as shipped, so the
# cleaning steps in the following cells can be followed from start to finish.
movie_path = '../data/raw/movies.csv'
rating_path = '../data/raw/ratings.csv'

# Both files are plain CSVs read with the default parser settings.
movie_df, rating_df = (pd.read_csv(csv_path) for csv_path in (movie_path, rating_path))

# Row counts plus a short preview as a quick sanity check of the load.
print(f"Loaded {len(movie_df)} movies and {len(rating_df)} ratings.")
display(movie_df.head())

Loaded 9742 movies and 100836 ratings.


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Data Cleaning Pipeline
# Removes rows with missing values and exact duplicate rows from both frames,
# then converts the pipe-separated 'genres' string into a Python list, which
# the One-Hot Encoding step iterates over.
#
# The cell is safe to re-run: once 'genres' holds lists, a plain
# drop_duplicates() raises "TypeError: unhashable type: 'list'" and
# .str.split() would turn the lists into NaN, so both steps are list-aware.


def _split_genres(value):
    """Split a pipe-separated genre string; non-string values pass through unchanged."""
    return value.split('|') if isinstance(value, str) else value


def _drop_duplicate_rows(frame):
    """Drop exact duplicate rows (keeping the first), even if 'genres' already holds lists."""
    # Duplicates are detected on a temporary copy in which already-split
    # genre lists are joined back into their hashable pipe-separated form.
    hashable = frame.assign(
        genres=frame['genres'].map(lambda g: '|'.join(g) if isinstance(g, list) else g)
    )
    return frame.loc[~hashable.duplicated()]


# Non-inplace chain: each step returns a new frame, and assign() avoids
# writing a column into a filtered view.
movie_df = (
    movie_df
    .dropna()
    .pipe(_drop_duplicate_rows)
    .assign(genres=lambda frame: frame['genres'].map(_split_genres))
)
rating_df = rating_df.dropna().drop_duplicates()

# Back the "verified" message below with actual checks.
assert not movie_df.isna().any().any(), "movie_df still contains missing values"
assert not rating_df.isna().any().any(), "rating_df still contains missing values"

print("Cleaning complete. Data integrity verified.")
display(movie_df.head())

Cleaning complete. Data integrity verified.


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [5]:
# One-Hot Encoding (Feature Engineering)
# Cosine similarity needs numeric input, so every genre becomes its own
# binary column: 1 when the movie carries that genre, 0 when it does not.

# Alphabetically ordered vocabulary of every genre found in any movie's list
all_genres = sorted(set().union(*movie_df['genres']))

# One indicator column per genre, added in vocabulary order
for genre in all_genres:
    movie_df[genre] = movie_df['genres'].map(lambda tags: int(genre in tags))

print(f"Feature Engineering complete. Added {len(all_genres)} binary genre columns.")
display(movie_df.head())

Feature Engineering complete. Added 20 binary genre columns.


Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Export Processed Data
# Writes the cleaned ratings and the cleaned + encoded movies to CSV for the
# FastAPI backend and the recommender logic.
# NOTE(review): 'genres' holds Python lists at this point, so to_csv stores
# their string representation; confirm the consumers parse that format.
output_dir = Path('../data/cleaned')
output_dir.mkdir(parents=True, exist_ok=True)  # create the folder (and parents) if missing

# index=False keeps the DataFrame index out of the exported files
for file_name, frame in (('movies.csv', movie_df), ('ratings.csv', rating_df)):
    frame.to_csv(output_dir / file_name, index=False)

# NOTE(review): the message shows '/data/cleaned/' while files go to output_dir ('../data/cleaned').
print("Processed datasets exported successfully to /data/cleaned/")

Processed datasets exported successfully to /data/cleaned/
