In [None]:
'''
Logic:
1. Make a user/movie matrix, keeping the ratings as the values.

2. De-mean the data, i.e. normalize the rating values by subtracting each user's mean.

3. Import svds and choose k, the number of singular values (latent features) to keep,
                                      where k -> top features that best describe the users.
                                      [0 < k < min(n_users, n_movies)]

   SVD is an algorithm that decomposes a matrix R into the best lower rank (i.e. smaller/simpler) parts of the original 
   matrix R. Mathematically, it decomposes R into two unitary matrices and a diagonal matrix:
     
         R = UΣV^T
         where R is user ratings matrix, U is the user “features” matrix, Σ is the diagonal matrix of singular values 
         (essentially weights), and V^T is the movie “features” matrix.



4. Multiply the factors back together (U·Σ·V^T) and add the user means back to get the predicted ratings.

5. return the movies with the highest predicted rating that the specified user has not already rated.

'''

In [1]:
import os

import numpy as np
import pandas as pd

In [34]:
# Directory holding the MovieLens 100k files. Defaults to the original local
# path; set the MOVIELENS_DIR environment variable to point somewhere else
# instead of editing the notebook.
DATA_DIR = os.environ.get("MOVIELENS_DIR", "C:/Users/DHRUBAJIT/Desktop/Datasets/movielens/ml-100k")

# u.data is tab-separated; only its first three columns are read.
r_cols = ['user_id','movie_id','rating']
ratings = pd.read_csv(os.path.join(DATA_DIR, "u.data"), names=r_cols, usecols=range(3), sep='\t')

# u.item is pipe-separated and latin-1 encoded; only its first two columns are read.
m_cols = ['movie_id','title']
movies = pd.read_csv(os.path.join(DATA_DIR, "u.item"), names=m_cols, sep='|', usecols=range(2), encoding='latin-1')

In [11]:
# Peek at the first five rows of the movie lookup table.
movies.head(n=5)

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [12]:
# Peek at the first five rating rows.
# NOTE(review): the saved output of this cell already contains a `title` column,
# so it appears to have been executed after the merge cell -- confirm by
# restarting the kernel and running all cells in order.
ratings.head(n=5)

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [3]:
# Attach the movie title to every rating row. No `on=` is passed, so pandas
# joins on the columns the two frames have in common.
ratings = movies.merge(ratings)
ratings.head(n=5)

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [4]:
# User x movie matrix of ratings; movies a user has not rated are filled with 0.
df = (
    ratings
    .pivot_table(values='rating', index='user_id', columns='movie_id')
    .fillna(0)
)
df.head(n=5)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# De-mean the data (normalize by each user's mean) and convert it from a
# DataFrame to a NumPy array.
#
# `DataFrame.as_matrix()` was removed in pandas 1.0; `.values` yields the same
# ndarray and is available on both old and current pandas releases.
df_new = df.values

# One mean per user (row), taken across every movie column of the matrix.
df_new_mean = np.mean(df_new, axis=1)

# reshape(-1, 1) turns the means into a column so they broadcast across each row.
df_demeaned = df_new - df_new_mean.reshape(-1,1)
df_demeaned

array([[ 4.41617122,  2.41617122,  3.41617122, ..., -0.58382878,
        -0.58382878, -0.58382878],
       [ 3.86325803, -0.13674197, -0.13674197, ..., -0.13674197,
        -0.13674197, -0.13674197],
       [-0.08977408, -0.08977408, -0.08977408, ..., -0.08977408,
        -0.08977408, -0.08977408],
       ..., 
       [ 4.9470868 , -0.0529132 , -0.0529132 , ..., -0.0529132 ,
        -0.0529132 , -0.0529132 ],
       [-0.20035672, -0.20035672, -0.20035672, ..., -0.20035672,
        -0.20035672, -0.20035672],
       [-0.34066587,  4.65933413, -0.34066587, ..., -0.34066587,
        -0.34066587, -0.34066587]])

In [6]:
from scipy.sparse.linalg import svds

# Truncated SVD of the de-meaned matrix, keeping 50 singular values.
# svds returns (u, s, vt): `sigma` is a 1-D array of singular values and `V`
# holds the already-transposed right factor, so it is used below without `.T`.
U, sigma, V = svds(df_demeaned, k=50)

In [7]:
# svds returns the singular values as a 1-D array instead of a diagonal matrix,
# so convert them to diagonal-matrix form.
#
# The ndim guard keeps this cell idempotent: calling np.diag on a matrix that is
# already 2-D extracts its diagonal back into a 1-D array, which would happen if
# the cell were simply run a second time.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

In [9]:
# Rebuild the rating matrix from the three factors, then add each user's mean
# back so the values return to the original rating scale.
all_users_pred = U.dot(sigma).dot(V) + df_new_mean.reshape(-1, 1)

# Columns carry the movie ids; rows keep the default positional index.
preds_df = pd.DataFrame(data=all_users_pred, columns=df.columns)
preds_df.head(n=5)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,6.488436,2.959503,1.634987,3.024467,1.656526,1.659506,3.630469,0.240669,1.791518,3.347816,...,0.011976,-0.092017,-0.074553,-0.060985,0.009427,-0.035641,-0.039227,-0.037434,-0.025552,0.023513
1,2.347262,0.129689,-0.098917,0.328828,0.159517,0.481361,0.213002,0.097908,1.8921,0.671,...,0.003943,-0.026939,-0.03546,-0.029883,-0.027153,-0.015244,-0.008277,-0.01176,0.011639,-0.046924
2,0.291905,-0.26383,-0.151454,-0.179289,0.013462,-0.088309,-0.057624,0.568764,-0.018506,0.280742,...,-0.028964,-0.031622,0.045513,0.026089,-0.021705,0.002282,0.032363,0.017322,-0.006644,-0.00948
3,0.36641,-0.443535,0.041151,-0.007616,0.055373,-0.080352,0.299015,-0.010882,-0.160888,-0.118834,...,0.020069,0.015981,-0.000182,0.005593,0.026634,0.023562,0.036405,0.029984,0.015612,-0.008713
4,4.263488,1.937122,0.052529,1.04935,0.652765,0.002836,1.730461,0.870584,0.341027,0.569055,...,0.019973,-0.053521,-0.017242,-0.007137,-0.038987,0.010338,0.004869,0.007603,-0.020575,0.00333


In [35]:
def recommend_movies(preds_df,userid,moviesdf,originalratings, num=10):
    """Recommend the highest-predicted movies a user has not rated yet.

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predicted ratings, one row per user and one column per movie id.
        Rows are addressed by position: user ``userid`` is row ``userid - 1``.
    userid : int
        1-based user id.
    moviesdf : pandas.DataFrame
        Movie lookup table; must contain a ``movie_id`` column.
    originalratings : pandas.DataFrame
        Observed ratings with ``user_id``, ``movie_id`` and ``rating`` columns.
        It may already carry columns from ``moviesdf`` (e.g. ``title``).
    num : int, default 10
        Number of recommendations to return.

    Returns
    -------
    (user_data_full, recommendations) : tuple of pandas.DataFrame
        ``user_data_full`` holds the user's existing ratings joined with the
        movie details, sorted by rating (highest first). ``recommendations``
        holds up to ``num`` rows of ``moviesdf`` for unrated movies, ordered by
        predicted rating (highest first).

    Raises
    ------
    IndexError
        If ``userid`` does not map to a row of ``preds_df``.
    """
    # Positional lookup: reject ids that fall outside the matrix. Without this
    # check userid=0 (or a negative id) would wrap around through iloc and
    # silently return another user's predictions.
    user_row = userid - 1
    if not 0 <= user_row < len(preds_df):
        raise IndexError(
            "userid {0} is out of range for {1} users".format(userid, len(preds_df))
        )

    # Name the series explicitly so the merged column is always 'Predictions',
    # whatever index labels preds_df happens to use.
    user_pred = preds_df.iloc[user_row].rename('Predictions')

    # Movies the user has already rated, joined with their details. Only bring
    # in movie columns that the ratings frame does not already have; otherwise
    # the merge would emit duplicated '<col>_x' / '<col>_y' columns.
    user_data = originalratings[originalratings.user_id == userid]
    movie_cols = [
        col for col in moviesdf.columns
        if col == 'movie_id' or col not in user_data.columns
    ]
    user_data_full = (
        user_data
        .merge(moviesdf[movie_cols], how='left', on='movie_id')
        .sort_values(['rating'], ascending=False)
    )

    print("User {0} has already rated {1} movies:".format(userid,user_data_full.shape[0]))
    print("Recommending top {0} movies that user has not rated :".format(num))

    # Score every movie the user has not rated yet, rank by predicted rating and
    # return only the movie columns (the score column is dropped).
    unseen = moviesdf[~moviesdf['movie_id'].isin(user_data['movie_id'])]
    scores = user_pred.rename_axis('movie_id').reset_index()
    recommendations = (
        unseen
        .merge(scores, how='left', on='movie_id')
        .sort_values('Predictions', ascending=False)
        .iloc[:num]
        .drop(columns='Predictions')
    )

    return user_data_full, recommendations

In [36]:
# Recommendations for user 837: the call returns the user's existing ratings
# and the top-10 list of movies they have not rated.
already_seen, recom = recommend_movies(
    preds_df,
    userid=837,
    moviesdf=movies,
    originalratings=ratings,
    num=10,
)

User 837 has already rated 46 movies:
Recommending top 10 movies that user has not rated :


In [37]:
# Movies the user has already rated (first value returned by recommend_movies),
# sorted by rating with the highest first.
already_seen

Unnamed: 0,user_id,movie_id,rating,title
44,837,740,5,Jane Eyre (1996)
37,837,1009,5,Stealing Beauty (1996)
9,837,283,5,Emma (1996)
36,837,125,5,Phenomenon (1996)
11,837,289,5,Evita (1996)
24,837,151,5,Willy Wonka and the Chocolate Factory (1971)
23,837,258,4,Contact (1997)
14,837,20,4,Angels and Insects (1995)
22,837,294,4,Liar Liar (1997)
27,837,328,4,Conspiracy Theory (1997)


In [38]:
# Top 10 recommended movies (second value returned by recommend_movies).
recom

Unnamed: 0,movie_id,title
11,14,"Postino, Il (1994)"
0,1,Toy Story (1995)
107,116,Cold Comfort Farm (1995)
116,126,"Spitfire Grill, The (1996)"
42,50,Star Wars (1977)
238,255,My Best Friend's Wedding (1997)
440,471,Courage Under Fire (1996)
92,100,Fargo (1996)
258,282,"Time to Kill, A (1996)"
704,742,Ransom (1996)
