### 1.Data Understanding

In [44]:
#import libraries
import pandas as pd

In [45]:
# load data
#movies data
movies_df=pd.read_csv(r"C:\Users\David\Documents\PHASE 4 PROJECT\movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [46]:
#ratings data
ratings_df=pd.read_csv(r"C:\Users\David\Documents\PHASE 4 PROJECT\ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [47]:
#links data
links_df=pd.read_csv(r"C:\Users\David\Documents\PHASE 4 PROJECT\links.csv")
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


### 2. Data preprocessing 

In [48]:
#Merging DataFrames to create a comprehensive dataset
# Merge ratings with movies to get movie titles
merged_df = pd.merge(ratings_df, movies_df, on='movieId')

# merge links
merged_df = pd.merge(merged_df, links_df, on='movieId')

# Preview the merged DataFrame
print(merged_df.head())


   userId  movieId  rating   timestamp             title  \
0       1        1     4.0   964982703  Toy Story (1995)   
1       5        1     4.0   847434962  Toy Story (1995)   
2       7        1     4.5  1106635946  Toy Story (1995)   
3      15        1     2.5  1510577970  Toy Story (1995)   
4      17        1     4.5  1305696483  Toy Story (1995)   

                                        genres  imdbId  tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  114709   862.0  
1  Adventure|Animation|Children|Comedy|Fantasy  114709   862.0  
2  Adventure|Animation|Children|Comedy|Fantasy  114709   862.0  
3  Adventure|Animation|Children|Comedy|Fantasy  114709   862.0  
4  Adventure|Animation|Children|Comedy|Fantasy  114709   862.0  


In [49]:
# Handling missing values.
merged_df.isnull().sum()

userId        0
movieId       0
rating        0
timestamp     0
title         0
genres        0
imdbId        0
tmdbId       13
dtype: int64

There are 13 missing values in tmdbId column. The other  columns have no missing values. I will use forwad fill to handle the missing values.

In [50]:
## forward fill
merged_df['tmdbId'] = merged_df['tmdbId'].fillna(method='ffill')
print(merged_df.columns)


Index(['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres', 'imdbId',
       'tmdbId'],
      dtype='object')


In [51]:
#checking for data types
merged_df.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
title         object
genres        object
imdbId         int64
tmdbId       float64
dtype: object

title and genres are categorical.

In [52]:
#Dealing with Categorical Variables
#One-Hot Encoding for Genres:
# One-hot encoding for genres
genres_dummies = merged_df['genres'].str.get_dummies(sep='|')
#TF-IDF for Titles: to convert titles into numeric
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
title_matrix = tfidf.fit_transform(merged_df['title'])


In [53]:
#Spliting the Data
from sklearn.model_selection import train_test_split


train_df, test_df = train_test_split(merged_df, test_size=0.2, random_state=42)


In [54]:
#Feature Engineering
#I will use content based filtering
#Convert sparse matrix to DataFrame
title_df = pd.DataFrame(title_matrix.toarray(), columns=tfidf.get_feature_names_out())

#Concatenate with merged_df
merged_df = pd.concat([merged_df.drop('title', axis=1), title_df], axis=1)

print(merged_df)

        userId  movieId  rating   timestamp  \
0            1        1     4.0   964982703   
1            5        1     4.0   847434962   
2            7        1     4.5  1106635946   
3           15        1     2.5  1510577970   
4           17        1     4.5  1305696483   
...        ...      ...     ...         ...   
100831     610   160341     2.5  1479545749   
100832     610   160527     4.5  1479544998   
100833     610   160836     3.0  1493844794   
100834     610   163937     3.5  1493848789   
100835     610   163981     3.5  1493850155   

                                             genres   imdbId    tmdbId   00  \
0       Adventure|Animation|Children|Comedy|Fantasy   114709     862.0  0.0   
1       Adventure|Animation|Children|Comedy|Fantasy   114709     862.0  0.0   
2       Adventure|Animation|Children|Comedy|Fantasy   114709     862.0  0.0   
3       Adventure|Animation|Children|Comedy|Fantasy   114709     862.0  0.0   
4       Adventure|Animation|Children|Com

### 3. Choosing a Recommendation Algorithm

I will work with content based filtering

In [55]:
# Combine the TF-IDF features from titles and one-hot encoded genres.
from scipy.sparse import hstack

# Combine title and genres
combined_features = hstack([title_matrix, genres_dummies])
