In [3]:
import pandas as pd
import numpy as np

<h3>Ratings Data</h3>

In [4]:
# Load the ratings table. A forward slash is used so the relative path resolves
# on Windows, Linux and macOS alike (a backslash separator only works on Windows).
ratings = pd.read_csv('data/ratings.csv')

In [5]:
# Preview the first rows to check the columns that were loaded.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0


In [6]:
# Number of missing values in each column.
ratings.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [7]:
# Number of rows that are exact duplicates of an earlier row (all columns equal).
ratings.duplicated().sum()

0

In [8]:
# Summary statistics for the numeric columns.
# NOTE(review): userId and movieId look like identifiers, so presumably only the
# `rating` statistics are meaningful here.
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,1048575.0,1048575.0,1048575.0
mean,5378.787,16158.89,3.537035
std,3064.842,31267.38,1.045203
min,1.0,1.0,0.5
25%,2728.0,1080.0,3.0
50%,5383.0,2642.0,3.5
75%,8060.0,6643.0,4.0
max,10656.0,176271.0,5.0


In [9]:
# Column dtypes, non-null counts and memory usage.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype  
---  ------   --------------    -----  
 0   userId   1048575 non-null  int64  
 1   movieId  1048575 non-null  int64  
 2   rating   1048575 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 24.0 MB


<h3>Movies Data</h3>

In [10]:
# Load the movie catalogue and discard the 'Unnamed: 0' column.
# A forward slash keeps the relative path portable across operating systems
# (a backslash separator only works on Windows).
movies = pd.read_csv('data/movies_dataset.csv').drop(columns='Unnamed: 0')

In [11]:
# Preview the first rows of the movie catalogue.
movies.head()

Unnamed: 0,id,title,genres,status,overview,popularity,original_language,vote_average,vote_count
0,862,Toy Story,"Animation,Comedy,Family",Released,"Led by Woody, Andy's toys live happily in his ...",21.946943,en,7.7,5415
1,8844,Jumanji,"Adventure,Fantasy,Family",Released,When siblings Judy and Peter discover an encha...,17.015539,en,6.9,2413
2,15602,Grumpier Old Men,"Romance,Comedy",Released,A family wedding reignites the ancient feud be...,11.7129,en,6.5,92
3,31357,Waiting to Exhale,"Comedy,Drama,Romance",Released,"Cheated on, mistreated and stepped on, the wom...",3.859495,en,6.1,34
4,11862,Father of the Bride Part II,Comedy,Released,Just when George Banks has recovered from his ...,8.387519,en,5.7,173


In [12]:
# Keep only the columns used below. The explicit copy makes the result an
# independent frame, so later in-place operations on `movies` cannot trigger
# SettingWithCopyWarning.
movies = movies[['id', 'title', 'genres']].copy()

In [13]:
# Show five random rows; the fixed seed keeps the displayed sample identical
# across "Restart & Run All" executions.
movies.sample(5, random_state=42)

Unnamed: 0,id,title,genres
28561,99545,Night Monster,"Drama,Horror,Family"
31314,139349,The Premonition,"Thriller,Science Fiction,Horror"
9586,89065,Beautiful City,Drama
24996,117942,Girls Gone Dead,"Horror,Comedy"
29921,79698,The Lovers,"Action,Adventure,Science Fiction,Romance"


In [14]:
# Number of rows that exactly repeat an earlier row (all columns equal).
movies.duplicated().sum()

13

In [15]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# The result is re-bound rather than mutated in place: `movies` was produced by
# selecting columns from another frame, and in-place mutation of such a frame
# can raise SettingWithCopyWarning. The original row labels are kept as-is.
movies = movies.drop_duplicates(keep='first')

In [16]:
# Re-count exact duplicate rows after de-duplication.
movies.duplicated().sum()

0

In [17]:
# Number of missing values in each column.
movies.isna().sum()

id        0
title     0
genres    0
dtype: int64

In [18]:
# Summary statistics for the numeric columns (only `id` is numeric here).
movies.describe()

Unnamed: 0,id
count,42993.0
mean,104599.8
std,112354.6
min,2.0
25%,25037.0
50%,55735.0
75%,143883.0
max,2012929.0


In [19]:
# Column dtypes, non-null counts and memory usage.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42993 entries, 0 to 43005
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      42993 non-null  int64 
 1   title   42993 non-null  object
 2   genres  42993 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [20]:
# Count the distinct users and distinct movies that appear in the ratings table.
n_users, n_movies = (ratings[col].nunique() for col in ('userId', 'movieId'))
print(f"Users: {n_users} | Movies: {n_movies}")

Users: 10656 | Movies: 20499


In [21]:
# Reshape the long ratings table into a wide user x movie table:
# one row per userId, one column per movieId, cell = rating.
# (user, movie) pairs with no rating are filled with 0.
Ratings = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [22]:
# NumPy array of the wide table; rows follow Ratings.index (userId) and
# columns follow Ratings.columns (movieId).
R = Ratings.to_numpy()
R

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 3., 0., ..., 0., 0., 0.]])

In [23]:
# Row-wise mean: one value per user, averaged over every movie column.
# NOTE(review): R is zero-filled for unrated movies, so these zeros are part of
# the average — confirm that this is the intended centring.
user_ratings_mean = R.mean(axis=1)

In [24]:
# Centre each user's row by subtracting that user's mean; the mean vector is
# turned into a column so it broadcasts across all movie columns.
Ratings_demeaned = R - user_ratings_mean[:, np.newaxis]

In [25]:
# Inspect the centred matrix.
Ratings_demeaned

array([[-5.63442119e-03, -5.63442119e-03, -5.63442119e-03, ...,
        -5.63442119e-03, -5.63442119e-03, -5.63442119e-03],
       [-3.56114932e-03, -3.56114932e-03, -3.56114932e-03, ...,
        -3.56114932e-03, -3.56114932e-03, -3.56114932e-03],
       [-1.51226889e-03, -1.51226889e-03, -1.51226889e-03, ...,
        -1.51226889e-03, -1.51226889e-03, -1.51226889e-03],
       ...,
       [-1.46348602e-04, -1.46348602e-04, -1.46348602e-04, ...,
        -1.46348602e-04, -1.46348602e-04, -1.46348602e-04],
       [-1.17469145e-01, -1.17469145e-01, -1.17469145e-01, ...,
        -1.17469145e-01, -1.17469145e-01, -1.17469145e-01],
       [ 4.95755891e+00,  2.95755891e+00, -4.24410947e-02, ...,
        -4.24410947e-02, -4.24410947e-02, -4.24410947e-02]])

<h2>SVD</h2>

In [26]:
from scipy.sparse.linalg import svds

In [27]:
# Smaller of the two matrix dimensions; it bounds the number of singular
# values that can be requested from svds.
min(Ratings_demeaned.shape)

10656

In [28]:
# Truncated SVD of the centred matrix, keeping k = 100 singular values/vectors.
# NOTE(review): presumably the default solver starts from a random vector, so
# results may differ slightly between runs — consider pinning v0 / the seed.
U, S, V_T = svds(Ratings_demeaned, k = 100)

In [29]:
# Number of singular values returned.
S.size

100

In [30]:
# Expand the 1-D vector of singular values into a diagonal matrix so it can be
# used in the product U @ S @ V_T.
# The guard makes the cell safe to re-run: calling np.diag on the already
# diagonal 2-D matrix would extract its diagonal and turn S back into a vector.
if S.ndim == 1:
    S = np.diag(S)

In [31]:
# Report the shapes of the three SVD factors.
print(f"U: {U.shape}\nS: {S.shape}\nV_T: {V_T.shape}")

U: (10656, 100)
S: (100, 100)
V_T: (100, 20499)


In [32]:
# View the per-user means as a column vector (one row per user).
user_ratings_mean.reshape(-1,1)

array([[0.00563442],
       [0.00356115],
       [0.00151227],
       ...,
       [0.00014635],
       [0.11746914],
       [0.04244109]])

In [33]:
# Rebuild the low-rank approximation of the centred matrix from its factors,
# then add each user's mean back to every entry of that user's row.
user_predict_ratings = (U @ S) @ V_T + user_ratings_mean[:, np.newaxis]

In [34]:
# Inspect the reconstructed (predicted) rating matrix.
user_predict_ratings

array([[ 1.75105847e-01,  1.99183327e-01, -4.38495281e-02, ...,
        -5.79402677e-03, -1.57851614e-03, -2.99743558e-03],
       [ 6.06318036e-01,  1.01559191e-01,  7.60303464e-01, ...,
         4.29589455e-05, -1.59294042e-04,  1.10782709e-03],
       [ 1.87215703e-01,  1.75021358e-01, -5.91507574e-02, ...,
        -1.15538814e-03,  4.68378510e-04, -1.96378293e-03],
       ...,
       [-2.86752476e-02,  3.90189449e-02, -6.76071005e-03, ...,
        -3.18722336e-04, -1.51743145e-04, -1.52776563e-04],
       [-2.98209258e-01,  1.77635203e+00,  7.09042449e-01, ...,
         1.42486967e-02,  1.55673134e-02,  1.82626838e-02],
       [ 5.52136385e+00,  1.44076244e+00,  5.95933022e-01, ...,
        -7.98897216e-03, -5.01743681e-04, -4.64693518e-03]])

In [35]:
# Wrap the predictions in a DataFrame whose columns are the movieIds of the
# wide ratings table.
# NOTE(review): no index is passed, so rows are labelled 0..n-1 by position
# rather than by userId; later cells look users up by row position.
preds = pd.DataFrame(user_predict_ratings, columns = Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,175773,175775,175777,175779,175945,175975,176165,176211,176219,176271
0,0.175106,0.199183,-0.04385,-0.006798,0.14244,-0.416807,0.008762,0.032692,0.070656,-0.068476,...,0.00267,0.00267,0.00267,0.00267,-0.001404,0.001001,-0.007033,-0.005794,-0.001579,-0.002997
1,0.606318,0.101559,0.760303,0.02495,0.773324,0.663787,0.93173,-0.012692,0.260241,0.190174,...,0.001542,0.001542,0.001542,0.001542,-0.002429,0.00062,-0.00061,4.3e-05,-0.000159,0.001108
2,0.187216,0.175021,-0.059151,0.009359,-0.073509,-0.152125,-0.126669,0.016371,-0.036067,-0.01774,...,0.003072,0.003072,0.003072,0.003072,0.002721,8.3e-05,-5.6e-05,-0.001155,0.000468,-0.001964
3,0.575216,0.084471,0.070319,-0.040922,0.114621,0.532867,0.015265,0.004625,0.01724,0.069533,...,0.001383,0.001383,0.001383,0.001383,-0.006248,0.002694,0.00915,-0.003028,-9.5e-05,-0.007285
4,0.682836,-0.037107,0.151477,0.022419,0.113821,-0.157964,0.178911,-0.005377,0.046891,-0.010297,...,0.000961,0.000961,0.000961,0.000961,0.004524,0.001788,0.001166,0.004354,0.002887,-0.000581


<h2>Filter and get recommendations for a user</h2>

In [36]:
# Predicted ratings for the first row of `preds` (row position 0), sorted in
# descending order so the highest predicted ratings come first.
user_predictions = preds.iloc[0].sort_values(ascending=False)
user_predictions

movieId
858      5.112751
1221     3.675026
2959     3.510113
58559    2.795143
79132    1.818393
           ...   
6       -0.416807
608     -0.428457
380     -0.431674
4306    -0.445058
34      -0.524257
Name: 0, Length: 20499, dtype: float64

In [37]:
# All ratings submitted by the user whose userId equals 1.
user_rating = ratings.loc[ratings['userId'].eq(1)]
user_rating

Unnamed: 0,userId,movieId,rating
0,1,110,1.0
1,1,147,4.5
2,1,858,5.0
3,1,1221,5.0
4,1,1246,5.0
5,1,1968,4.0
6,1,2762,4.5
7,1,2918,5.0
8,1,2959,4.0
9,1,4226,4.0


In [38]:
# Attach title and genres to the user's ratings (inner join on movieId == id),
# ordered from the highest rating to the lowest.
# NOTE(review): the joined titles look unrelated to the rated movieIds —
# presumably ratings.movieId and movies.id come from different id schemes;
# confirm, and map through an id-link table first if so.
data = user_rating.merge(movies, how='inner', left_on = 'movieId', right_on = 'id').sort_values(['rating'], ascending=False)
data

Unnamed: 0,userId,movieId,rating,id,title,genres
2,1,858,5.0,858,Sleepless in Seattle,"Comedy,Drama,Romance"
3,1,1246,5.0,1246,Rocky Balboa,Drama
10,1,96821,5.0,96821,Caesar Must Die,"Drama,Documentary"
1,1,147,4.5,147,The 400 Blows,Drama
5,1,2762,4.5,2762,Young and Innocent,"Drama,Crime"
4,1,1968,4.0,1968,Fools Rush In,"Drama,Comedy,Romance"
6,1,2959,4.0,2959,License to Wed,Comedy
7,1,4226,4.0,4226,Shriek If You Know What I Did Last Friday the ...,Comedy
9,1,58559,4.0,58559,Confession of a Child of the Century,Drama
8,1,54503,3.5,54503,The Mystery of Chess Boxing,"Action,Foreign"


In [39]:
# Movies the user has not rated yet: catalogue rows whose id is absent from `data`.
movies[~movies['id'].isin(data['id'])]

Unnamed: 0,id,title,genres
0,862,Toy Story,"Animation,Comedy,Family"
1,8844,Jumanji,"Adventure,Fantasy,Family"
2,15602,Grumpier Old Men,"Romance,Comedy"
3,31357,Waiting to Exhale,"Comedy,Drama,Romance"
4,11862,Father of the Bride Part II,Comedy
...,...,...,...
43001,222848,Caged Heat 3000,Science Fiction
43002,30840,Robin Hood,"Drama,Action,Romance"
43003,439050,Subdue,"Drama,Family"
43004,111109,Century of Birthing,Drama


In [40]:
# Rank the movies the user has not rated by their predicted rating and show the
# top 20. The trailing column slice drops the last column ('Predictions').
(
    movies[~movies['id'].isin(data['id'])]
    .merge(
        user_predictions.to_frame().reset_index(),
        how='left',
        left_on='id',
        right_on='movieId',
    )
    .rename(columns={0: 'Predictions'})
    .sort_values('Predictions', ascending=False)
    .iloc[:20, :-1]
)

Unnamed: 0,id,title,genres,movieId
17992,8368,The Garden of Eden,Drama,8368.0
41061,48780,Boat,Foreign,48780.0
16602,54001,The Traveler,Drama,54001.0
27078,44191,Loose Screws,Comedy,44191.0
5419,40815,On Guard,"Drama,Adventure",40815.0
2978,1213,The Talented Mr. Ripley,"Thriller,Crime,Drama",1213.0
29165,74458,Mere Brother Ki Dulhan,"Drama,Comedy,Romance",74458.0
9339,4896,Muxmäuschenstill,"Comedy,Drama",4896.0
5386,1682,Mothra vs. Godzilla,"Fantasy,Science Fiction,Action,Adventure",1682.0
41785,68954,Longitude,"TV Movie,Drama,History",68954.0


In [41]:
def recommend_movies(userID, movies, original_ratings, num_recommendations=10, predictions=None):
    """Return the top movies a user has not rated, ranked by predicted rating.

    Parameters
    ----------
    userID : int
        1-based user identifier. The user's predictions are read from row
        position ``userID - 1`` of ``predictions``, i.e. the rows are assumed
        to be ordered by consecutive user ids starting at 1.
    movies : pandas.DataFrame
        Movie catalogue; must contain an ``id`` column.
    original_ratings : pandas.DataFrame
        Ratings table; must contain ``userId`` and ``movieId`` columns.
    num_recommendations : int, default 10
        Maximum number of rows to return.
    predictions : pandas.DataFrame, optional
        Predicted ratings, one row per user and one column per movie id.
        When omitted, the notebook-level ``preds`` frame is looked up at call
        time, so a rebuilt ``preds`` is picked up without redefining the
        function.

    Returns
    -------
    pandas.DataFrame
        Up to ``num_recommendations`` rows holding the catalogue columns plus
        the matched ``movieId`` (NaN when a movie has no prediction), ordered
        from the highest predicted rating to the lowest. The prediction column
        itself is not part of the result.

    Raises
    ------
    IndexError
        If ``userID`` does not correspond to a row of ``predictions``.
    """
    if predictions is None:
        predictions = preds

    idx = userID - 1
    # Reject out-of-range ids explicitly: a negative position would otherwise
    # wrap around and silently return a different user's predictions.
    if not 0 <= idx < len(predictions):
        raise IndexError(
            f"userID {userID} is out of range for {len(predictions)} prediction rows"
        )

    # Name the score column and the key column explicitly. Relying on the row
    # label as the column name only worked for the user stored at label 0.
    user_predictions = (
        predictions.iloc[idx]
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Ids of the movies this user has already rated.
    rated_ids = original_ratings.loc[original_ratings['userId'] == userID, 'movieId']
    unseen = movies[~movies['id'].isin(rated_ids)]

    # Left join keeps catalogue movies without a prediction; their NaN score
    # sorts to the end.
    ranked = (
        unseen
        .merge(user_predictions, how='left', left_on='id', right_on='movieId')
        .sort_values('Predictions', ascending=False)
    )

    recommendations = ranked.drop(columns='Predictions').iloc[:num_recommendations]

    return recommendations

In [42]:
# Top-10 recommendations for the user with id 1, using the SVD predictions.
recommend = recommend_movies(
    userID=1,
    movies=movies,
    original_ratings=ratings,
    num_recommendations=10,
    predictions=preds,
)

In [43]:
# Display the recommendations returned for the user.
recommend

Unnamed: 0,id,title,genres,movieId
17992,8368,The Garden of Eden,Drama,8368.0
41061,48780,Boat,Foreign,48780.0
16602,54001,The Traveler,Drama,54001.0
27078,44191,Loose Screws,Comedy,44191.0
5419,40815,On Guard,"Drama,Adventure",40815.0
2978,1213,The Talented Mr. Ripley,"Thriller,Crime,Drama",1213.0
29165,74458,Mere Brother Ki Dulhan,"Drama,Comedy,Romance",74458.0
9339,4896,Muxmäuschenstill,"Comedy,Drama",4896.0
5386,1682,Mothra vs. Godzilla,"Fantasy,Science Fiction,Action,Adventure",1682.0
41785,68954,Longitude,"TV Movie,Drama,History",68954.0
