In [1]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import *
from surprise import SVDpp, Dataset, Reader

warnings.filterwarnings('ignore')

In [2]:
movies = pd.read_csv('../b. Datasets/MovieLens Dataset/movies.csv')
ratings = pd.read_csv('../b. Datasets/MovieLens Dataset/ratings.csv')

In [3]:
movies.shape, ratings.shape

((9742, 3), (100836, 4))

In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
display(movies.head(2))
display(movies.tail(2))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


Unnamed: 0,movieId,title,genres
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation
9741,193609,Andrew Dice Clay: Dice Rules (1991),Comedy


In [7]:
display(ratings.head(2))
display(ratings.tail(2))

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


Unnamed: 0,userId,movieId,rating,timestamp
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [8]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [9]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### Content-Based Filtering Component

In [10]:
tfidf_vectoriser = TfidfVectorizer(stop_words='english')
movies['genres'] = movies['genres'].str.replace('|', " ")
genre_matrix = tfidf_vectoriser.fit_transform(movies['genres'])

In [11]:
cosine_sim = linear_kernel(genre_matrix, genre_matrix)

### Collaborative Filtering Component

In [12]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [13]:
svdpp_model = SVDpp(n_factors=100, n_epochs=20, lr_all=0.005)
trainset = data.build_full_trainset()
svdpp_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2901c1abad0>

### Hybridisation Techniques
#### Weighted Hybrid Approach

In [14]:
def hybrid_recommendation(user_id, movie_id, content_weight=0.5, collab_weight=0.5):
    # Content-based score
    index = movies.index[movies['movieId'] == movie_id].tolist()[0]
    content_scores = list(enumerate(cosine_sim[index]))
    
    # Collaborative score
    collab_pred = svdpp_model.predict(user_id, movie_id).est
    
    # Combine scores
    hybrid_scores = []
    for i, score in content_scores:
        hybrid = (content_weight * score) + (collab_weight * collab_pred)
        hybrid_scores.append((i, hybrid))
    
    return sorted(hybrid_scores, key=lambda x: x[1], reverse=True)[:10]

In [15]:
user_id = 610
movie_id = 168252
recommendations = hybrid_recommendation(user_id, movie_id)
print(f"Recommended movies:")
movies.iloc[[i[0] for i in recommendations]]['title']

Recommended movies:


507                     Terminator 2: Judgment Day (1991)
595                                      Barb Wire (1996)
1056               Star Trek V: The Final Frontier (1989)
1261                             Starship Troopers (1997)
1767    King Kong vs. Godzilla (Kingukongu tai Gojira)...
1906                Beneath the Planet of the Apes (1970)
1907             Battle for the Planet of the Apes (1973)
1908            Conquest of the Planet of the Apes (1972)
1909            Escape from the Planet of the Apes (1971)
1921                                Wing Commander (1999)
Name: title, dtype: object