In [1]:
import numpy as np
import pandas as pd

### Creating a small data for exploration

In [2]:
#Created data has userId, movieId,
R = {"userId":['u1','u2','u3','u1'], "movieId":['m1', 'm2', 'm3','m3'], "rating":[4.0, 3.5, 5.0, 2.5]}
ratings = pd.DataFrame(R)
print(ratings)

  userId movieId  rating
0     u1      m1     4.0
1     u2      m2     3.5
2     u3      m3     5.0
3     u1      m3     2.5


In [3]:
#pivot table to create a matrix
rating_mat = ratings.pivot( index='userId', columns='movieId', values = 'rating').fillna(0)
rating_mat

movieId,m1,m2,m3
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
u1,4.0,0.0,2.5
u2,0.0,3.5,0.0
u3,0.0,0.0,5.0


### calculating svd using numpy

In [4]:
from numpy.linalg import svd
u, s, v=svd(rating_mat, full_matrices=False)

In [5]:
u

array([[ 0.66733032,  0.        , -0.74476187],
       [ 0.        ,  1.        ,  0.        ],
       [ 0.74476187,  0.        ,  0.66733032]])

In [6]:
s=np.diag(s)

In [7]:
v

array([[ 0.44365377,  0.        ,  0.89619826],
       [-0.        ,  1.        ,  0.        ],
       [-0.89619826,  0.        ,  0.44365377]])

In [8]:
np.dot(np.dot(u, s), v)

array([[4.0000000e+00, 0.0000000e+00, 2.5000000e+00],
       [0.0000000e+00, 3.5000000e+00, 0.0000000e+00],
       [4.4408921e-16, 0.0000000e+00, 5.0000000e+00]])

### calculating svd using scipy

In [9]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(rating_mat, k=2)

In [10]:
U

array([[ 0.00000000e+00,  6.67330323e-01],
       [-1.00000000e+00,  4.69551527e-17],
       [ 0.00000000e+00,  7.44761868e-01]])

In [11]:
sigma = np.diag(sigma)
sigma

array([[3.5       , 0.        ],
       [0.        , 6.01667662]])

In [12]:
Vt

array([[ 0.00000000e+00, -1.00000000e+00,  0.00000000e+00],
       [ 4.43653774e-01,  2.73145866e-17,  8.96198264e-01]])

### Creating recommender system

In [13]:
movies_df = pd.read_csv("ml-latest-small/movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [15]:
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()
print("No of users in dataset: {0}".format(len(users)))
print("No of movies in dataset: {0}".format(len(movies)))

No of users in dataset: 610
No of movies in dataset: 9724


In [16]:
#creating a pivoted dataframe
rating_mat = ratings.pivot( index='userId', columns='movieId', values = 'rating').fillna(0)

#creating matrix
rating_np = rating_mat.to_numpy()

#normalising the data by each user
user_rating_mean =np.mean(rating_np, axis=1)
rating_demeaned = rating_np - user_rating_mean.reshape(-1, 1)

In [17]:
rating_mat

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(rating_mat, k=50)
sigma = np.diag(sigma)

In [19]:
all_users_predicted_ratings =np.dot(np.dot(U, sigma), Vt) + user_rating_mean.reshape(-1, 1)
preds_df = pd.DataFrame(all_users_predicted_ratings, columns=rating_mat.columns)

In [20]:
preds_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.286047,0.497849,0.942361,0.021810,-0.442104,2.625837,-0.783056,0.078954,0.301145,1.710933,...,0.079191,0.082760,0.075622,0.075622,0.079191,0.075622,0.079191,0.079191,0.079191,0.045187
1,0.221584,0.016596,0.042517,0.029027,0.195539,-0.048885,0.095081,0.035572,0.059875,-0.140193,...,0.030670,0.027971,0.033369,0.033369,0.030670,0.033369,0.030670,0.030670,0.030670,0.043741
2,0.023164,0.044495,0.060294,0.009970,0.004192,0.124442,0.002309,0.010508,0.014516,-0.051514,...,0.008157,0.008388,0.007927,0.007927,0.008157,0.007927,0.008157,0.008157,0.008157,0.009240
3,2.091051,-0.315902,-0.211406,0.172844,0.202291,0.338745,0.551647,0.114945,0.090273,0.056997,...,0.080946,0.080665,0.081227,0.081227,0.080946,0.081227,0.080946,0.080946,0.080946,0.057517
4,1.353168,0.789408,0.081032,0.130334,0.291448,0.600934,0.267502,0.147988,-0.069856,1.051815,...,0.012047,0.012676,0.011417,0.011417,0.012047,0.011417,0.012047,0.012047,0.012047,0.010356
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,2.898614,0.351018,0.432383,0.498759,0.241598,-0.233902,2.773381,0.413764,0.336119,0.237603,...,0.383429,0.388564,0.378294,0.378294,0.383429,0.378294,0.383429,0.383429,0.383429,0.361757
606,2.890988,1.444786,0.399888,0.079857,-0.207771,1.580116,-0.009736,0.055171,0.141993,1.459949,...,0.076911,0.076325,0.077497,0.077497,0.076911,0.077497,0.076911,0.076911,0.076911,0.040069
607,2.577478,2.970281,2.532039,0.288024,0.420071,3.983925,0.239405,0.490332,0.397484,2.715355,...,0.255326,0.257114,0.253538,0.253538,0.255326,0.253538,0.255326,0.255326,0.255326,0.360595
608,0.795626,0.542586,0.110418,0.038038,0.102850,0.229013,0.093009,0.073424,-0.053675,1.261902,...,0.013428,0.013287,0.013569,0.013569,0.013428,0.013569,0.013428,0.013428,0.013428,0.006950


In [21]:
def recommend_movies(prediction_df, userID, movies_df, original_ratings_df, num_recommendation=5):
    '''recommend n movies.'''
    
    
    user_row_number = int(userID)-1
    
    sorted_user_predictions = prediction_df.iloc[user_row_number].sort_values(ascending=False)

    #get the user's data and merge with movies data
    user_data = original_ratings_df[original_ratings_df.userId == (userID)]
    user_full = (user_data.merge(movies_df, how='left', left_on = 'movieId', right_on = 'movieId').sort_values(['rating'], ascending=False))
    
    print("User {0} has already rated {1} movies.".format(userID, user_full.shape[0]))
    print("Recommending the highest {0} predicted ratings not already rated.".format(num_recommendation))
    
    #recommend the highest predicted rating movies that the hasn't seen yet.
    recommendations = (movies_df[~movies_df['movieId'].isin(user_full['movieId'])].merge(pd.DataFrame(sorted_user_predictions).reset_index(), how='left', left_on='movieId', right_on='movieId').rename(columns={user_row_number:'predictions'}).sort_values('predictions', ascending=False).iloc[:num_recommendation, :-1])
    
    return recommendations


In [22]:
recommend_movies(preds_df, 1, movies_df, ratings, 11)

User 1 has already rated 232 movies.
Recommending the highest 11 predicted ratings not already rated.


Unnamed: 0,movieId,title,genres
736,1036,Die Hard (1988),Action|Crime|Thriller
844,1221,"Godfather: Part II, The (1974)",Crime|Drama
974,1387,Jaws (1975),Action|Horror
1328,1968,"Breakfast Club, The (1985)",Comedy|Drama
615,858,"Godfather, The (1972)",Crime|Drama
874,1259,Stand by Me (1986),Adventure|Drama
1927,2804,"Christmas Story, A (1983)",Children|Comedy
1416,2080,Lady and the Tramp (1955),Animation|Children|Comedy|Romance
2765,4011,Snatch (2000),Comedy|Crime|Thriller
1417,2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance
