In [36]:
# The necessary imports
import pandas as pd
import numpy as np

# Imports for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Content-Based Filtering Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

# Collaborative Filtering Preprocessing
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

# For evaluation metrics
from collections import defaultdict

In [37]:
tags = pd.read_csv('Datasets/Movielens_data/tags.csv')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [38]:
movies = pd.read_csv('Datasets/Movielens_data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [39]:
ratings = pd.read_csv('Datasets/Movielens_data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [40]:
raw_data = pd.read_csv('Datasets/merged_movie_data.csv')
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102695 entries, 0 to 102694
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   movieId           102695 non-null  int64  
 1   title             102695 non-null  object 
 2   genres            102695 non-null  object 
 3   imdbId            102695 non-null  int64  
 4   tmdbId            102682 non-null  float64
 5   userId            102677 non-null  float64
 6   rating            102677 non-null  float64
 7   timestamp_rating  102677 non-null  float64
 8   tag               3476 non-null    object 
 9   timestamp_tag     3476 non-null    float64
dtypes: float64(5), int64(2), object(3)
memory usage: 7.8+ MB


In [41]:
raw_data.duplicated().sum()

0

In [42]:
raw_data.isnull().sum()

movieId                 0
title                   0
genres                  0
imdbId                  0
tmdbId                 13
userId                 18
rating                 18
timestamp_rating       18
tag                 99219
timestamp_tag       99219
dtype: int64

In [43]:
# Filling missing tags with a placeholder 
raw_data['tag'] = raw_data['tag'].fillna('no_tag')
raw_data['timestamp_tag'] = raw_data['timestamp_tag'].fillna(0)  # 0 will indicate no tag

# Dropping rows where there is no rating or userId 
raw_data.dropna(subset=['userId', 'rating'], inplace=True)

# Dropping the tmdbId missing values. They are only 13
raw_data.dropna(subset=['tmdbId'], inplace=True)

raw_data.isnull().sum()

movieId             0
title               0
genres              0
imdbId              0
tmdbId              0
userId              0
rating              0
timestamp_rating    0
tag                 0
timestamp_tag       0
dtype: int64

In [44]:
# Remove leading/trailing spaces and make titles title-case
raw_data['title'] = raw_data['title'].str.strip().str.title()
raw_data.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,userId,rating,timestamp_rating,tag,timestamp_tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1.0,4.0,964982700.0,no_tag,0.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,5.0,4.0,847435000.0,no_tag,0.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,7.0,4.5,1106636000.0,no_tag,0.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,15.0,2.5,1510578000.0,no_tag,0.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,17.0,4.5,1305696000.0,no_tag,0.0


In [45]:
# Creating dummy columns from the 'genres' string
genre_dummies = raw_data['genres'].str.get_dummies(sep='|')

# Joining the new dummy columns
data = pd.concat([raw_data, genre_dummies], axis=1)
data.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,userId,rating,timestamp_rating,tag,timestamp_tag,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,1.0,4.0,964982700.0,no_tag,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,5.0,4.0,847435000.0,no_tag,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,7.0,4.5,1106636000.0,no_tag,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,15.0,2.5,1510578000.0,no_tag,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,17.0,4.5,1305696000.0,no_tag,0.0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Creating a unique list of movies for content-based filtering
movies_unique = data.groupby('title').agg({
    'movieId': 'first',
    'genres': 'first', 
    'tag': lambda x: ' '.join(set(x.replace('no_tag', ''))).strip()
    }).reset_index()

# Creating a combined string for the vectorizer
movies_unique['combined_features'] = (
    movies_unique['genres'].str.replace('|', ' ') + " " + 
    movies_unique['tag']
)

# Previewing the data 
movies_unique[['title', 'combined_features']].head()

Unnamed: 0,title,combined_features
0,'71 (2014),Action Drama Thriller War
1,"'Burbs, The (1989)",Comedy
2,'Hellboy': The Seeds Of Creation (2004),Action Adventure Comedy Documentary Fantasy
3,'Night Mother (1986),Drama
4,'Round Midnight (1986),Drama Musical


In [47]:
# Initializing the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2)

# Replacing 'no_tag' with an empty string 
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_unique['combined_features'])
print(f"Matrix Shape: {tfidf_matrix.shape}")

# Calculating the Cosine Similarity Matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

Matrix Shape: (9713, 711)


In [48]:
# Ensuring movies_unique index is reset so it aligns perfectly with the matrix
movies_unique = movies_unique.reset_index(drop=True)
indices = pd.Series(movies_unique.index, index=movies_unique['title']).drop_duplicates()

def get_content_recommendations(title, num_recommendations=10):
    # Checking if the movie exists in our index
    if title not in indices:
        return f"Movie '{title}' not found in the database."
    
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top N most similar movies
    sim_scores = sim_scores[1:num_recommendations+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top N most similar movies
    return movies_unique['title'].iloc[movie_indices]

In [49]:
# Getting a sample recommendation
print(get_content_recommendations('Toy Story (1995)'))

1446                                 Bug'S Life, A (1998)
3668                     Guardians Of The Galaxy 2 (2017)
242        Adventures Of Rocky And Bullwinkle, The (2000)
551                                           Antz (1998)
638     Asterix And The Vikings (Astérix Et Les Viking...
2723                     Emperor'S New Groove, The (2000)
5727                                         Moana (2016)
5773                                Monsters, Inc. (2001)
7650                               Shrek The Third (2007)
8277                       Tale Of Despereaux, The (2008)
Name: title, dtype: object


In [50]:
# Defining the Reader and load into Surprise format
reader = Reader(rating_scale=(1, 5)) 
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split into Train and Test samples
trainset, testset = train_test_split(data, test_size=0.20)

# Initialize the SVD Model
algorithm = SVD()
algorithm.fit(trainset)
predictions = algorithm.test(testset)

print(f"Model Accuracy (RMSE): {accuracy.rmse(predictions)}")

RMSE: 0.8711
Model Accuracy (RMSE): 0.8710917589235663


In [51]:
# Predict a rating for a specific User and Movie; rating for User 15 on Movie 55
pred = algorithm.predict(15, 55) 
print(f"Predicted Rating for User {pred.uid} on Movie {pred.iid}: {pred.est:.2f}")

Predicted Rating for User 15 on Movie 55: 3.45


In [None]:
# Defining the parameter grid
param_grid = {
    'n_factors': [50, 100, 150], # The number of latent factors (dimensionality of user/item vectors)
    'n_epochs': [20, 30], # The number of iterations of the SGD procedure
    'lr_all': [0.002, 0.005, 0.01], # The learning rate for all parameters
    'reg_all': [0.02, 0.1, 0.4]  # The regularization term for all parameters to prevent overfitting
}

# Initializing GridSearchCV
gridSearch = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

# Fitting the grid search 
gridSearch.fit(data)
print(f"Best RMSE score: {gridSearch.best_score['rmse']}")
print(f"Best parameters: {gridSearch.best_params['rmse']}")

# Using the best parameters to build the final model
best_algo = gridSearch.best_estimator['rmse']
trainset = data.build_full_trainset()
best_algo.fit(trainset)

# Test a prediction with the optimized model
uid = 1  
iid = 55 
pred = best_algo.predict(uid, iid)
print(f"Optimized Predicted Rating for User {uid} on Movie {iid}: {pred.est:.2f}")