In [6]:
import pandas as pd
import numpy as np
import os
# The dataset (the 'ml-20m' folder) has to be downloaded from Kaggle beforehand;
# point `data_path` at the directory where the CSV files were stored.
data_path = 'ml-20m/'
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

movies_csv = os.path.join(data_path, movies_filename)
ratings_csv = os.path.join(data_path, ratings_filename)

# Movies: read only the id and title columns, with compact dtypes.
df_movies = pd.read_csv(
    movies_csv,
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'},
)

# Ratings: read only the user id, movie id and rating columns, with compact dtypes.
df_ratings = pd.read_csv(
    ratings_csv,
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

In [7]:
# Keep only the first 2,000,000 rating rows, then reshape them into a
# user x movie table (rows: userId, columns: movieId, cells: rating).
# (user, movie) combinations without a rating are filled with 0.
df_ratings = df_ratings.iloc[:2000000]
df_movie_features = (
    df_ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [8]:
# Preview the first rows of the user x movie rating table built above.
df_movie_features.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,130073,130075,130219,130462,130490,130496,130512,130642,130644,130768
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:

# Dense NumPy view of the user x movie table (one row per row of df_movie_features).
R = df_movie_features.to_numpy()
# Per-row mean taken over every column, i.e. including the zero-filled cells.
user_ratings_mean = R.mean(axis=1)
# Centre each row on its own mean; the added axis makes the 1-D means
# broadcast across the columns.
R_demeaned = R - user_ratings_mean[:, np.newaxis]

In [18]:
from scipy.sparse.linalg import svds
# Truncated SVD of the mean-centred rating matrix, keeping k = 50 singular
# values/vectors. `sigma` is returned as a 1-D array of singular values here;
# it is expanded into a diagonal matrix in a later cell.
U, sigma, Vt = svds(R_demeaned, k = 50)

In [19]:
# Expand the 1-D vector of singular values into a diagonal matrix so it can be
# used in the U . sigma . Vt product below.
# The guard keeps this cell idempotent: np.diag applied to a 2-D array extracts
# its diagonal, so re-running an unguarded `sigma = np.diag(sigma)` would turn
# the matrix back into a vector and break the reconstruction.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

In [20]:
# Rebuild the low-rank approximation U . sigma . Vt and add each row's mean
# back (the means were subtracted before the decomposition).
all_user_predicted_ratings = U.dot(sigma).dot(Vt) + user_ratings_mean[:, np.newaxis]

In [21]:
# Wrap the predicted ratings in a DataFrame whose columns are the movieIds of
# the pivot table. No index is passed, so rows get the default 0..n-1 index:
# row i corresponds to the i-th row of df_movie_features, not to userId i.
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = df_movie_features.columns)
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,130073,130075,130219,130462,130490,130496,130512,130642,130644,130768
0,-1.032346,0.816793,0.065381,-0.06929,-0.089822,0.488403,-0.417955,0.045865,-0.139047,-0.341765,...,0.000437,-0.000912,-0.005743,-0.006717,-0.004173,-0.002835,-0.009607,-0.006602,-0.001092,-0.002192
1,0.87841,0.018586,0.350085,0.069684,0.219138,0.3793,0.40433,0.008961,0.102663,-0.336461,...,0.001779,-0.000374,-0.001408,0.00251,0.002424,-0.000837,-0.004802,0.001165,-0.000476,0.000287
2,2.004481,0.853215,-0.124904,0.033099,-0.197898,0.725822,-0.070369,-0.04364,-0.004807,0.117874,...,0.00396,-0.004785,-0.001086,0.001811,0.005158,0.001003,0.019518,-0.001381,-0.001391,0.000514
3,-0.730042,0.575878,0.316379,-0.04377,0.175547,0.804852,0.024102,0.05393,0.182835,1.027774,...,0.00122,0.001608,0.002619,0.001079,0.003182,0.001052,0.001691,0.001608,0.002383,0.001787
4,1.487785,1.324297,1.370937,0.112847,1.282806,0.726788,1.453506,0.217864,0.285127,1.278291,...,-0.000932,-0.001433,-5e-05,0.002574,-0.002194,-0.00184,-0.003786,0.001491,0.0016,0.001771


In [24]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5, verbose=True):
    """Return a user's rated movies and the top predicted movies they have not rated.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predicted ratings, one row per user and one column per movieId.
        Rows are looked up by position: the row for ``userID`` is row
        ``userID - 1``, whatever labels the index carries.
    userID : int
        1-based user id, also used to filter ``original_ratings_df.userId``.
    movies_df : pandas.DataFrame
        Movie metadata; must contain a ``movieId`` column.
    original_ratings_df : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``rating`` columns.
    num_recommendations : int, default 5
        Number of recommended movies to return.
    verbose : bool, default True
        When True, print the user's rated movies (the historical behaviour).

    Returns
    -------
    (user_full, recommendations) : tuple of pandas.DataFrame
        ``user_full`` holds the user's ratings merged with the movie metadata,
        sorted by rating (highest first). ``recommendations`` holds the
        ``movies_df`` columns of the not-yet-rated movies with the highest
        predicted rating, best first.

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``preds_df``.
    """
    # UserID starts at 1, rows start at 0.
    user_row_number = userID - 1
    # Reject ids below 1 explicitly: a negative position would otherwise wrap
    # around and silently return another user's predictions.
    if not 0 <= user_row_number < len(preds_df):
        raise IndexError(
            "userID {} is out of range for predictions with {} rows".format(userID, len(preds_df))
        )

    # Name the prediction column and the movie-id column explicitly instead of
    # relying on the row label being equal to user_row_number and on the column
    # index being named 'movieId'. No sort is needed here: the left merge below
    # follows the order of the movie frame and the result is sorted afterwards.
    user_predictions = (
        preds_df.iloc[user_row_number]
        .rename('Predictions')
        .rename_axis('movieId')
        .reset_index()
    )

    # Get the user's data and merge in the movie information.
    user_data = original_ratings_df[original_ratings_df.userId == userID]
    user_full = (
        user_data
        .merge(movies_df, how='left', on='movieId')
        .sort_values(['rating'], ascending=False)
    )
    if verbose:
        print(user_full)

    # Movies the user has not rated yet, ranked by predicted rating. Movies
    # without a prediction get NaN and are sorted to the end. The trailing
    # 'Predictions' column is dropped from the returned frame.
    not_yet_rated = movies_df[~movies_df['movieId'].isin(user_full['movieId'])]
    recommendations = (
        not_yet_rated
        .merge(user_predictions, how='left', on='movieId')
        .sort_values('Predictions', ascending=False)
        .iloc[:num_recommendations, :-1]
    )

    return user_full, recommendations


In [25]:
already_rated, predictions = recommend_movies(preds_df, 330, df_movies, df_ratings, 10)
# Rated movies and 10 recommendations for userId 330.

     userId  movieId  rating                                   title
11      330       17     5.0            Sense and Sensibility (1995)
28      330       47     5.0             Seven (a.k.a. Se7en) (1995)
30      330       50     5.0              Usual Suspects, The (1995)
200     330      588     5.0                          Aladdin (1992)
154     330      349     5.0         Clear and Present Danger (1994)
..      ...      ...     ...                                     ...
135     330      305     1.0    Ready to Wear (Pret-A-Porter) (1994)
206     330      611     1.0            Hellraiser: Bloodline (1996)
98      330      234     1.0                     Exit to Eden (1994)
95      330      231     1.0  Dumb & Dumber (Dumb and Dumber) (1994)
129     330      291     1.0                    Poison Ivy II (1996)

[220 rows x 4 columns]


In [26]:
# First 10 rows of the rated movies returned by the last recommend_movies call (sorted by rating, highest first).
already_rated.head(10)

Unnamed: 0,userId,movieId,rating,title
11,330,17,5.0,Sense and Sensibility (1995)
28,330,47,5.0,Seven (a.k.a. Se7en) (1995)
30,330,50,5.0,"Usual Suspects, The (1995)"
200,330,588,5.0,Aladdin (1992)
154,330,349,5.0,Clear and Present Danger (1994)
190,330,509,5.0,"Piano, The (1993)"
44,330,110,5.0,Braveheart (1995)
131,330,296,5.0,Pulp Fiction (1994)
0,330,1,4.0,Toy Story (1995)
183,330,457,4.0,"Fugitive, The (1993)"


In [27]:
predictions  # Recommended movies for userId 330

Unnamed: 0,movieId,title
159,293,Léon: The Professional (a.k.a. The Professiona...
178,329,Star Trek: Generations (1994)
397,608,Fargo (1996)
187,342,Muriel's Wedding (1994)
304,497,Much Ado About Nothing (1993)
99,163,Desperado (1995)
200,368,Maverick (1994)
280,468,Englishman Who Went Up a Hill But Came Down a ...
130,223,Clerks (1994)
1,5,Father of the Bride Part II (1995)


In [28]:
already_rated, predictions = recommend_movies(preds_df, 9, df_movies, df_ratings, 10)
# Rated movies and 10 recommendations for userId 9.

    userId  movieId  rating                                         title
5        9     1997     5.0                          Exorcist, The (1973)
28       9     4148     5.0                               Hannibal (2001)
19       9     3798     5.0                      What Lies Beneath (2000)
1        9      858     5.0                         Godfather, The (1972)
16       9     2959     5.0                             Fight Club (1999)
0        9      356     4.0                           Forrest Gump (1994)
15       9     2841     4.0                         Stir of Echoes (1999)
29       9     4369     4.0              Fast and the Furious, The (2001)
4        9     1923     4.0           There's Something About Mary (1998)
25       9     4022     4.0                              Cast Away (2000)
10       9     2706     4.0                           American Pie (1999)
32       9     4509     3.0                    Great Outdoors, The (1988)
23       9     3994     3.0           

In [29]:
predictions  # Recommended movies for userId 9

Unnamed: 0,movieId,title
1192,1221,"Godfather: Part II, The (1974)"
3469,3578,Gladiator (2000)
2662,2762,"Sixth Sense, The (1999)"
2756,2858,American Beauty (1999)
586,593,"Silence of the Lambs, The (1991)"
2604,2700,"South Park: Bigger, Longer and Uncut (1999)"
475,480,Jurassic Park (1993)
1227,1258,"Shining, The (1980)"
2976,3081,Sleepy Hollow (1999)
1185,1213,Goodfellas (1990)
