In [1]:
# import necessary libraries 
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# Load datasets
movies = pd.read_csv("movies.csv") 
links = pd.read_csv("links.csv")
tags = pd.read_csv("tags.csv")  
ratings = pd.read_csv("ratings.csv") 





In [5]:
print(ratings.info())
print(movies.info())
print(links.info())
print(tags.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 no

In [6]:
print (ratings.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [11]:
print (links.head())

   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [12]:
print(tags.head())

   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200


In [13]:
print (movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [None]:
# Merge ratings with movie titles and genres
ratings_movies = pd.merge(ratings, movies, on="movieId", how="left")
print(ratings_movies.head())

   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [10]:
# Merge tags into the ratings_movies dataset
ratings_movies_tags = pd.merge(ratings_movies, tags[['movieId','tag']], on="movieId", how="left")
print(ratings_movies_tags.head())

   userId  movieId  rating  timestamp                    title  \
0       1        1     4.0  964982703         Toy Story (1995)   
1       1        1     4.0  964982703         Toy Story (1995)   
2       1        1     4.0  964982703         Toy Story (1995)   
3       1        3     4.0  964981247  Grumpier Old Men (1995)   
4       1        3     4.0  964981247  Grumpier Old Men (1995)   

                                        genres    tag  
0  Adventure|Animation|Children|Comedy|Fantasy  pixar  
1  Adventure|Animation|Children|Comedy|Fantasy  pixar  
2  Adventure|Animation|Children|Comedy|Fantasy    fun  
3                               Comedy|Romance  moldy  
4                               Comedy|Romance    old  


In [14]:
#merge links into the ratings_movies_tags dataset
ratings_movies_tags_links = pd.merge(ratings_movies_tags, links, on="movieId", how="left")
print(ratings_movies_tags_links.head())

   userId  movieId  rating  timestamp                    title  \
0       1        1     4.0  964982703         Toy Story (1995)   
1       1        1     4.0  964982703         Toy Story (1995)   
2       1        1     4.0  964982703         Toy Story (1995)   
3       1        3     4.0  964981247  Grumpier Old Men (1995)   
4       1        3     4.0  964981247  Grumpier Old Men (1995)   

                                        genres    tag  imdbId   tmdbId  
0  Adventure|Animation|Children|Comedy|Fantasy  pixar  114709    862.0  
1  Adventure|Animation|Children|Comedy|Fantasy  pixar  114709    862.0  
2  Adventure|Animation|Children|Comedy|Fantasy    fun  114709    862.0  
3                               Comedy|Romance  moldy  113228  15602.0  
4                               Comedy|Romance    old  113228  15602.0  


In [15]:
 print(ratings_movies_tags_links.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285762 entries, 0 to 285761
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     285762 non-null  int64  
 1   movieId    285762 non-null  int64  
 2   rating     285762 non-null  float64
 3   timestamp  285762 non-null  int64  
 4   title      285762 non-null  object 
 5   genres     285762 non-null  object 
 6   tag        233213 non-null  object 
 7   imdbId     285762 non-null  int64  
 8   tmdbId     285749 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 19.6+ MB
None


In [20]:
#check for missing values
print(ratings_movies_tags_links.isnull().sum())



userId           0
movieId          0
rating           0
timestamp        0
title            0
genres           0
tag          52549
imdbId           0
tmdbId          13
dtype: int64


In [23]:
#fill missing tags with 'No Tag'
ratings_movies_tags_links['tag'] = ratings_movies_tags_links['tag'].fillna('No Tag')



In [30]:
#drop the missing tmdbId rows
ratings_movies_tags_links = ratings_movies_tags_links.dropna(subset=['tmdbId'])
ratings_movies_tags_links['tmdbId'] = ratings_movies_tags_links['tmdbId'].fillna(-1)


In [31]:
#check for missing values
print(ratings_movies_tags_links.isnull().sum())

userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
tag          0
imdbId       0
tmdbId       0
dtype: int64


In [21]:
#checck for duplicate rows
print(ratings_movies_tags_links.duplicated().sum())
ratings_movies_tags_links = ratings_movies_tags_links.drop_duplicates()
print(ratings_movies_tags_links.duplicated().sum())

13807
0


In [33]:
print("Duplicates before cleaning:", ratings_movies_tags_links.duplicated().sum())

Duplicates before cleaning: 0


In [39]:
# Split genres only once (when they are strings)
if isinstance(ratings_movies_tags_links['genres'].iloc[0], str):
    ratings_movies_tags_links['genres'] = ratings_movies_tags_links['genres'].apply(lambda x: x.split('|'))

# Confirm the transformation
print(ratings_movies_tags_links[['title','genres','tag']].head())

                     title                                             genres  \
0         Toy Story (1995)  [Adventure, Animation, Children, Comedy, Fantasy]   
2         Toy Story (1995)  [Adventure, Animation, Children, Comedy, Fantasy]   
3  Grumpier Old Men (1995)                                  [Comedy, Romance]   
4  Grumpier Old Men (1995)                                  [Comedy, Romance]   
5              Heat (1995)                          [Action, Crime, Thriller]   

      tag  
0   pixar  
2     fun  
3   moldy  
4     old  
5  No Tag  


In [40]:
ratings_movies_tags_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 271942 entries, 0 to 285761
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     271942 non-null  int64  
 1   movieId    271942 non-null  int64  
 2   rating     271942 non-null  float64
 3   timestamp  271942 non-null  int64  
 4   title      271942 non-null  object 
 5   genres     271942 non-null  object 
 6   tag        271942 non-null  object 
 7   imdbId     271942 non-null  int64  
 8   tmdbId     271942 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 20.7+ MB


In [45]:
# Create a userâ€“item matrix with users as rows and movies as columns
user_item_matrix = ratings_movies_tags_links.pivot_table(
    index='userId',
    columns='movieId',
    values='rating'
)

# Quick check
print(user_item_matrix.shape)
print(user_item_matrix.head())


(610, 9716)
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     NaN     4.0     NaN     NaN     4.0     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN  ...     NaN     NaN     NaN     NaN  

In [50]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between movies
item_similarity = cosine_similarity(user_item_matrix.T.fillna(0))

# Convert to DataFrame for readability
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns
)

# Function to recommend similar movies
def recommend_similar_movies(movie_id, top_n=5):
    similar_scores = item_similarity_df[movie_id].sort_values(ascending=False)
    similar_scores = similar_scores.drop(movie_id) 
    return similar_scores.head(top_n)

print(recommend_similar_movies(1, top_n=5))

movieId
3114    0.572601
480     0.565637
780     0.564262
260     0.557388
356     0.547096
Name: 1, dtype: float64


In [51]:
# Load movies metadata
movies = pd.read_csv("movies.csv")

# Merge recommendations with titles
def recommend_similar_movies(movie_id, top_n=5):
    similar_scores = item_similarity_df[movie_id].sort_values(ascending=False)
    similar_scores = similar_scores.drop(movie_id).head(top_n)
    recommendations = pd.DataFrame({
        "movieId": similar_scores.index,
        "similarity": similar_scores.values
    })
    return recommendations.merge(movies, on="movieId")
print(recommend_similar_movies(1, top_n=5))

   movieId  similarity                                      title  \
0     3114    0.572601                         Toy Story 2 (1999)   
1      480    0.565637                       Jurassic Park (1993)   
2      780    0.564262       Independence Day (a.k.a. ID4) (1996)   
3      260    0.557388  Star Wars: Episode IV - A New Hope (1977)   
4      356    0.547096                        Forrest Gump (1994)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1             Action|Adventure|Sci-Fi|Thriller  
2             Action|Adventure|Sci-Fi|Thriller  
3                      Action|Adventure|Sci-Fi  
4                     Comedy|Drama|Romance|War  


In [54]:
def recommend_for_user(user_id, top_n=5):
    # Get movies rated by the user
    user_ratings = user_item_matrix.loc[user_id]
    liked_movies = user_ratings[user_ratings > 4].index 
    
    # Collect recommendations
    all_recommendations = pd.Series(dtype=float)
    for movie in liked_movies:
        recs = recommend_similar_movies(movie, top_n=top_n)
        all_recommendations = all_recommendations.append(recs)
    
    # Aggregate scores and sort
    all_recommendations = all_recommendations.groupby(all_recommendations.index).mean()
    all_recommendations = all_recommendations.drop(liked_movies, errors="ignore")
    
    return all_recommendations.sort_values(ascending=False).head(top_n)



In [56]:
def recommend_similar_movies(movie_id, top_n=5, genre_filter=None):
    similar_scores = item_similarity_df[movie_id].sort_values(ascending=False)
    similar_scores = similar_scores.drop(movie_id)
    
    recommendations = pd.DataFrame({
        "movieId": similar_scores.index,
        "similarity": similar_scores.values
    }).merge(movies, on="movieId")
    
    # Apply genre filter if provided
    if genre_filter:
        recommendations = recommendations[
            recommendations["genres"].str.contains(genre_filter, case=False)
        ]
    
    return recommendations.head(top_n)
print(recommend_similar_movies(1, top_n=5, genre_filter="Comedy"))


    movieId  similarity                      title  \
0      3114    0.572601         Toy Story 2 (1999)   
4       356    0.547096        Forrest Gump (1994)   
8      1265    0.534169       Groundhog Day (1993)   
9      1270    0.530381  Back to the Future (1985)   
10     4306    0.527977               Shrek (2001)   

                                               genres  
0         Adventure|Animation|Children|Comedy|Fantasy  
4                            Comedy|Drama|Romance|War  
8                              Comedy|Fantasy|Romance  
9                             Adventure|Comedy|Sci-Fi  
10  Adventure|Animation|Children|Comedy|Fantasy|Ro...  
