# Preprocessing file for film and ratings csv

### Data cleaning (remove NaN an duplicates)

In [27]:
import pandas as pd

In [None]:
# File paths
movie_path = '..data/raw/movies.csv'
rating_path = '..data/raw/ratings.csv'

# Load CSV files into DataFrames
movie_df = pd.read_csv(movie_path)
rating_df = pd.read_csv(rating_path)

# Display the first few rows of each DataFrame
print("Movies DataFrame:")
print(movie_df)
print("\nRatings DataFrame:")
print(rating_df)

Movies DataFrame:
      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4

In [29]:
# Clean data
movie_df.dropna(inplace=True)
movie_df.drop_duplicates(inplace=True)

rating_df.dropna(inplace=True) 
rating_df.drop_duplicates(inplace=True)

movie_df['genres'] = movie_df['genres'].str.split('|').tolist()

print("\nCleaned Movies DataFrame:")
print(movie_df)
print("\nCleaned Ratings DataFrame:")
print(rating_df)


Cleaned Movies DataFrame:
      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                                 genres  
0     [Adventure, Animation, Children, Comedy, Fantasy]  
1                        [Adventure, Children, Fantasy]  
2                                     [Comedy, Romance]  
3                   

### Transform into one-hot encoding to permit content-based recommendation

In [30]:
# One-hot encode genres
all_genres = set(genre for sublist in movie_df['genres'] for genre in sublist)
for genre in all_genres:
    movie_df[genre] = movie_df['genres'].apply(lambda x: int(genre in x))

print("\nOne-Hot Encoded Movies DataFrame:")
print(movie_df)


One-Hot Encoded Movies DataFrame:
      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                                 genres  Fantasy  War  Action  \
0     [Adventure, Animation, Children, Comedy, Fantasy]        1    0       0   
1                        [Adventure, Children, Fantasy]        1    0       0   
2

In [31]:
# Save cleaned data
movie_df.to_csv('cleaned/movies_cleaned.csv', index=False)
rating_df.to_csv('cleaned/ratings_cleaned.csv', index=False)