## Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [16]:
# Load the three MovieLens tables. ISO-8859-1 is used instead of the default
# encoding to avoid a UnicodeDecodeError while reading the files.
def _read_movielens(path, columns):
    """Read one header-less, '::'-delimited MovieLens .dat file into a DataFrame.

    The two-character separator is why the (slower) python parser engine is
    requested explicitly.
    """
    return pd.read_csv(
        path,
        sep='::',
        header=None,
        names=columns,
        engine='python',
        encoding='ISO-8859-1',
    )


users = _read_movielens('Movielens/users.dat', ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
movies = _read_movielens('Movielens/movies.dat', ['MovieID', 'Title', 'Genres'])
ratings = _read_movielens('Movielens/ratings.dat', ['UserID', 'MovieID', 'Rating', 'Timestamp'])


In [17]:
# Preview the first five rows of each table. A single print call with a
# newline separator emits the same text as printing each caption and frame
# one after another.
print(
    "Users DataFrame:",
    users.head(),
    "\nMovies DataFrame:",
    movies.head(),
    "\nRatings DataFrame:",
    ratings.head(),
    sep="\n",
)


Users DataFrame:
   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455

Movies DataFrame:
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy

Ratings DataFrame:
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291

In [18]:
# Attach user attributes to every rating (joined on 'UserID'), then attach
# the movie title and genres (joined on 'MovieID'). Both joins use the
# default inner-join behaviour.
merged_data = (
    ratings
    .merge(users, on='UserID')
    .merge(movies, on='MovieID')
)


In [19]:
# Initialize LabelEncoders (fresh instances every time this cell runs)
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
genre_encoder = LabelEncoder()

# Encode 'UserID' and 'MovieID' as integer labels in the new 'User' and
# 'Movie' columns; the original ID columns are left untouched.
merged_data['User'] = user_encoder.fit_transform(merged_data['UserID'])
merged_data['Movie'] = movie_encoder.fit_transform(merged_data['MovieID'])

# 'Gender' is encoded in place, replacing the original column.
# NOTE(review): on a second run of this cell the encoder is fitted on the
# already-encoded integers, so gender_encoder.classes_ no longer holds the
# original labels - restart and run all if the inverse mapping is needed.
merged_data['Gender'] = gender_encoder.fit_transform(merged_data['Gender'])

# For 'Genres', split the '|'-delimited string into a list of genre names.
# Only string values are split: once this cell has run, the column already
# holds lists, and calling .str.split on lists yields NaN, which used to make
# a re-run of this cell fail while collecting the genre names below.
genre_lists = merged_data['Genres'].map(
    lambda value: value.split('|') if isinstance(value, str) else value
)
merged_data['Genres'] = genre_lists

# Collect every distinct genre name and fit the encoder on them; the fitted
# classes_ give a stable (sorted) order for the indicator columns.
all_genres = set(genre_lists.explode())
genre_encoder.fit(list(all_genres))

# Create a binary matrix for genres: one 0/1 column per genre name
for genre in genre_encoder.classes_:
    merged_data[genre] = genre_lists.map(lambda names, genre=genre: 1 if genre in names else 0)


In [20]:
# Show the first five rows of the merged, encoded frame as the cell's output.
merged_data.head(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1193,5,978300760,0,1,10,48067,One Flew Over the Cuckoo's Nest (1975),[Drama],...,0,0,0,0,0,0,0,0,0,0
1,1,661,3,978302109,0,1,10,48067,James and the Giant Peach (1996),"[Animation, Children's, Musical]",...,0,0,0,1,0,0,0,0,0,0
2,1,914,3,978301968,0,1,10,48067,My Fair Lady (1964),"[Musical, Romance]",...,0,0,0,1,0,1,0,0,0,0
3,1,3408,4,978300275,0,1,10,48067,Erin Brockovich (2000),[Drama],...,0,0,0,0,0,0,0,0,0,0
4,1,2355,5,978824291,0,1,10,48067,"Bug's Life, A (1998)","[Animation, Children's, Comedy]",...,0,0,0,0,0,0,0,0,0,0
