# Preprocessing file for film and ratings csv

### Data cleaning (remove NaN an duplicates)

In [11]:
import pandas as pd

In [17]:
# File paths
movie_path = 'raw/movies.csv'
rating_path = 'raw/ratings.csv'

# Load CSV files into DataFrames
movie_df = pd.read_csv(movie_path)
rating_df = pd.read_csv(rating_path)

# Display the first few rows of each DataFrame
print("Movies DataFrame:")
print(movie_df)
print("\nRatings DataFrame:")
print(rating_df)

Movies DataFrame:
       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
87580   292731           The Monroy Affaire (2022)   
87581   292737          Shelter in Solitude (2023)   
87582   292753                         Orca (2023)   
87583   292755              The Angry Breed (1968)   
87584   292757           Race to the Summit (2023)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...            

In [19]:
# Clean data
movie_df.dropna(inplace=True)
movie_df.drop_duplicates(inplace=True)

rating_df.dropna(inplace=True) 
rating_df.drop_duplicates(inplace=True)

movie_df['genres'] = movie_df['genres'].str.split('|').tolist()

print("\nCleaned Movies DataFrame:")
print(movie_df)
print("\nCleaned Ratings DataFrame:")
print(rating_df)


Cleaned Movies DataFrame:
       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
87580   292731           The Monroy Affaire (2022)   
87581   292737          Shelter in Solitude (2023)   
87582   292753                         Orca (2023)   
87583   292755              The Angry Breed (1968)   
87584   292757           Race to the Summit (2023)   

                                                  genres  
0      [Adventure, Animation, Children, Comedy, Fantasy]  
1                         [Adventure, Children, Fantasy]  
2                                      [Comedy, Romance]  
3                               [Comedy, Drama, Romance]  
4                            

### Transform into similarity matrix to permit content-based recommendation

In [24]:
# One-hot encode genres
all_genres = set(genre for sublist in movie_df['genres'] for genre in sublist)
for genre in all_genres:
    movie_df[genre] = movie_df['genres'].apply(lambda x: int(genre in x))

print("\nOne-Hot Encoded Movies DataFrame:")
print(movie_df)


One-Hot Encoded Movies DataFrame:
       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
87580   292731           The Monroy Affaire (2022)   
87581   292737          Shelter in Solitude (2023)   
87582   292753                         Orca (2023)   
87583   292755              The Angry Breed (1968)   
87584   292757           Race to the Summit (2023)   

                                                  genres  Fantasy  War  \
0      [Adventure, Animation, Children, Comedy, Fantasy]        1    0   
1                         [Adventure, Children, Fantasy]        1    0   
2                                      [Comedy, Romance]        0    0   
3                   

In [26]:
# Save cleaned data
movie_df.to_csv('cleaned/movies_cleaned.csv', index=False)
rating_df.to_csv('cleaned/ratings_cleaned.csv', index=False)