In [1]:
import pandas as pd
import numpy as np
import logging
import sys
from time import time
import pickle
import re
from pandas.io.json import json_normalize
import json

from sklearn.utils.extmath import randomized_svd

from scipy import sparse
from scipy.sparse.linalg import svds

from collections import defaultdict
from collections import Counter
import math

In [2]:
from surprise import SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split
from surprise import Dataset
from surprise import Reader
from surprise.prediction_algorithms.knns import KNNWithZScore, KNNBaseline
from surprise.prediction_algorithms.matrix_factorization import NMF

In [3]:
def convert_ids(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('int64')

def convert_to_float(ids_in_csv):
    return pd.to_numeric(ids_in_csv, errors='coerce').astype('float64')

def to_json(csv_entry):
    return json.loads(re.sub('\'', '"', csv_entry))

In [5]:
#movies_df has all the metadata of the movies and ratings_df has ratings given by user to movies

movies_df = pd.read_csv('./data/movies_metadata.csv'
                        , converters={'id': lambda x: convert_ids(x), 'imdb_id': lambda x: convert_ids(x)}
                       ,usecols=['id', 'original_title', 'budget', 'genres','spoken_languages', 'title'])
movies_df.drop_duplicates(subset ="id", 
                     keep = 'first', inplace = True)
ratings_df = pd.read_csv('./data/ratings_small.csv')

In [6]:
###May need Fuzzy matching, but for now:
# Do not know if this is actually required
movies_df = movies_df[movies_df.spoken_languages == """[{'iso_639_1': 'en', 'name': 'English'}]"""]

In [7]:
ratings_df.nunique()
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [11]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

In [None]:
algo = SVD(verbose=True)
algo.fit(trainset)

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, n_jobs=-1, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


In [9]:
#for testing we separate the first user as new user and perform the new method of folding in
new_ratings_df = ratings_df[ratings_df['userId'] == 1]
new_ratings_df['userId'] = new_ratings_df['userId'].apply(lambda x : 672)
ratings_df = pd.concat([ratings_df, new_ratings_df], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
users = algo.pu
items = algo.qi
user_bias = algo.bu
item_bias = algo.bi
(trainset.ur)[671] = (trainset.ur)[0]
trainset._raw2inner_id_users[672] = len(trainset._raw2inner_id_users)

NameError: name 'algo' is not defined

In [87]:
def create_new_user(ruid,items,item_bias, n_epochs) :
    
    init_mean=0
    init_std_dev=.1
    lr_all=.005
    reg_all=.02
    n_factors = 100
    global_mean = trainset.global_mean
    bu = 0
    bi = item_bias
    pu = np.random.mtrand._rand.normal(init_mean, init_std_dev,
                    (n_factors))
    qi = items

    for current_epoch in range(n_epochs):
        
        #print("Processing epoch {}".format(current_epoch))
        u = trainset.to_inner_uid(ruid)
        for i, r in trainset.ur[trainset.to_inner_uid(ruid)]:

            # compute current error
            dot = 0  # <q_i, p_u>
            for f in range(n_factors):
                dot += qi[i, f] * pu[f]
            err = r - (global_mean + bu + bi[i] + dot)

           
            bu += lr_all * (err - reg_all * bu)
            #bi[i] += lr_bi * (err - reg_bi * bi[i])

            # update factors
            for f in range(n_factors):
                puf = pu[f]
                qif = qi[i, f]
                pu[f] += lr_all * (err * qif - reg_all * puf)
                #qi[i, f] += lr_qi * (err * puf - reg_qi * qif)

    return bu, pu

In [101]:
bias, puser = create_new_user(672,items,item_bias, 20)

In [102]:
#RMSE value for the new user
s = 0
count = 0
for i, r in trainset.ur[trainset.to_inner_uid(672)]:

    # compute current error
    dot = 0  # <q_i, p_u>
    #for f in range(100):
    dot += items[i].dot(puser)
    err = r - (trainset.global_mean + bias + item_bias[i] + dot)
    s += err*err
    count += 1
rmse = (s/count)**0.5
rmse

0.6702519497962203

In [103]:
#RMSE value for the first user predicted by the Surprise package
s = 0
count = 0
for i, r in trainset.ur[trainset.to_inner_uid(1)]:

    # compute current error
    dot = 0  # <q_i, p_u>
    #for f in range(100):
    dot += items[i].dot(users[0])
    err = r - (trainset.global_mean + user_bias[0] + item_bias[i] + dot)
    s += err*err
    count += 1
rmse = (s/count)**0.5
rmse

0.6749813518191998

In [104]:
#Comparing the 2 users 
# 1. predicted by us
# 2. predicted by Surprise package
sample = users[0]
s = 0 
for i in items :
    diff = (i.dot(sample) + user_bias[0]) - (i.dot(puser) + bias)
    s += diff*diff

rmse = (s/len(items))**0.5
rmse

0.12772042694191502

In [99]:
def estimate(users, items, user_bias, item_bias, u, i):
    '''
    gives the estimated ratings for user u 
    for the i movie
    '''
    u = trainset.to_inner_uid(u)
    i = trainset.to_inner_iid(i)
    est = trainset.global_mean
    est += user_bias[u]
    est += item_bias[i]
    est += np.dot(items[i], users[u])
    return est

In [105]:
for i in range(10) :
    dot = 0
    dot += items[trainset.to_inner_iid(i+1)].dot(puser)
    pred = trainset.global_mean + bias + item_bias[trainset.to_inner_iid(i+1)] + dot
    print(estimate(users, items, user_bias, item_bias, 1, i+1), pred)

2.9750578193593684 2.8755019498155256
2.472489202302553 2.432414511006186
2.2955376068848983 2.6089077503127363
1.9374403693587554 2.0732469625934957
2.5126905672950173 2.299663539364208
3.1074249307293758 3.0754382927691566
2.566600302266024 2.4449387264671394
2.6448106113355956 2.705356842972059
2.2556052110640956 2.2843204329086246
2.5589962256065197 2.548677017108624


In [34]:
predictions = algo.test(testset)


NameError: name 'predicitons' is not defined

In [36]:
predictions[:5]

[Prediction(uid=1, iid=10, r_ui=3.543608255669773, est=2.583825142683057, details={'was_impossible': False}),
 Prediction(uid=1, iid=17, r_ui=3.543608255669773, est=3.3001508729281652, details={'was_impossible': False}),
 Prediction(uid=1, iid=39, r_ui=3.543608255669773, est=2.6397472673265545, details={'was_impossible': False}),
 Prediction(uid=1, iid=47, r_ui=3.543608255669773, est=3.124481506629175, details={'was_impossible': False}),
 Prediction(uid=1, iid=50, r_ui=3.543608255669773, est=3.5812670200280556, details={'was_impossible': False})]

# Performing the normal sparse SVD and folding in as discussed in the research paper

In [26]:
# The required matrix but in df form. index is userid and columns are the movieids
# to use it as a matrix use utility_df.values

utility_df = ratings_df.pivot(
    index='userId',
    columns='movieId',
    values='rating'
).fillna(0)#.T.fillna(utility_df.mean(axis=1)).T

# R = utility_df.values
# user_ratings_mean = np.mean(R, axis = 1)
# R_demeaned = R - user_ratings_mean.reshape(-1, 1)
# utility_df = pd.DataFrame(R_demeaned, columns = utility_df.columns)
# utility_df = utility_df.fillna(0)

In [44]:
utility_df.values[1][39]

0.0

In [47]:
utility_mat[1][17]

-0.05936992174582546

In [311]:
utility_df = utility_df[:-1]
utility_df.shape

(670, 9066)

In [217]:
#userid - 1 will be the first index and movieid -1 will be the second index
new_user = (utility_df.values)[670]

In [323]:
R = sparse.csc_matrix(utility_df.values,dtype=float)

In [327]:
#perform SVD
U, Sigma, VT = randomized_svd(R,n_components=300,n_iter=5,random_state=None)
#U, Sigma, VT = svds(R, k=50)

In [328]:
#u,s,vt = svds(R_demeaned, k = 50)

#all_user_predicted_ratings = u.dot(np.diag(s).dot(vt))

In [329]:
all_user_predicted_ratings = U.dot(np.diag(Sigma).dot(VT))

In [330]:
all_user_predicted_ratings[320][0]

3.0236492268034794

In [331]:
#convert the matrix to df same as our utility_df but this one is preds_df
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = utility_df.columns)
preds_df.shape

(671, 9066)

In [332]:
preds_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
0,-0.059185,-0.014793,0.079709,0.01521,-0.006548,0.083757,-0.005198,-0.034477,0.084171,0.068655,...,-0.002105,0.000191,0.014952,3.3e-05,4.9e-05,0.012873,0.022428,0.018057,0.010834,-0.00421
1,0.088541,0.533357,-0.221907,0.157336,0.071208,-0.393328,-0.123973,-0.022313,-0.09315,2.919186,...,-0.002066,0.001901,-0.005194,0.003754,0.005632,-0.012864,-0.007791,-0.024718,-0.014831,-0.004133
2,0.149916,0.142651,-0.28649,-0.041239,-0.02095,-0.064682,-0.180242,0.151253,0.180552,-0.044916,...,-0.005369,-0.001261,-0.006647,0.004244,0.006366,0.031325,-0.009971,0.026513,0.015908,-0.010737
3,-0.156537,0.089854,-0.155541,-0.086598,-0.173313,0.05153,-0.053802,-0.012273,-0.153094,4.409769,...,-0.000659,-0.0008,0.004366,-0.00024,-0.00036,-0.007977,0.00655,0.007178,0.004307,-0.001318
4,-0.241869,0.718017,3.468791,0.001517,-0.288426,-0.436644,0.101569,-0.047121,-0.208962,-0.157794,...,0.002972,-0.002046,0.010542,0.000938,0.001408,0.002627,0.015813,0.061654,0.036992,0.005943


# Completed the Collaborative Filtering part now lets Recommend

In [206]:
def get_viewed_movies(userId) :
    ''' 
    aggregate all movies that the user watched 
    input : userid
    output : a list of movie ids that the user has already rated
    '''
    
    users_viewed_movies = ratings_df[ratings_df['userId'] == userId].sort_values(['rating'], ascending=False)
    
    
    #print(str(userId) +  " has viewed following films")
    
    #r = movies_df.merge(users_viewed_movies, left_on='id', right_on='movieId').sort_values(['rating'],ascending=False)
    
#     for row in users_viewed_movies.itertuples():
#         rating = row[1]
#         print(str(row[3]) + " " + str(row[2]) + '\t' + str(rating))
#    print(r.head())
    return users_viewed_movies['movieId'].tolist()

In [167]:
p = get_viewed_movies(321)
print(232 in p)
#ratings_df[ratings_df['userId'] == 321]

True


In [207]:
def recommend_movies(preds_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    
    # Get and sort the user's predictions
    
    # UserID starts at 1, not 0
    user_row_number = userID - 1 
    
    sorted_user_predictions = preds_df.iloc[user_row_number].sort_values(ascending=False)
    print(sorted_user_predictions.head())
    
    # Get the user's data and merge in the movie information.
    viewed_movieids = get_viewed_movies(userID)
               
    all_movieids = sorted_user_predictions.index.values.tolist()
    predicted_movieids = [value for value in all_movieids if value not in viewed_movieids]
    #print(type(predicted_movieids))
    recommendations = movies_df[movies_df['id'].isin(predicted_movieids)]
    recommendations = recommendations.merge(sorted_user_predictions, left_on='id', right_index=True).sort_values([userID-1],ascending=False)
    return predicted_movieids,recommendations

In [333]:
pm , predictions = recommend_movies(preds_df, 321, movies_df,ratings_df, 10)
print(319 in pm)
predictions.head()

movieId
21      5.338008
296     5.147531
551     4.864173
913     4.705847
2396    4.704830
Name: 320, dtype: float64
False


Unnamed: 0,budget,genres,id,original_title,spoken_languages,title,movieId,320
3823,55000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",1597,Meet the Parents,"[{'iso_639_1': 'en', 'name': 'English'}]",Meet the Parents,1597,0.794845
7823,0,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",3019,Dr. Jekyll and Mr. Hyde,"[{'iso_639_1': 'en', 'name': 'English'}]",Dr. Jekyll and Mr. Hyde,3019,0.78803
1578,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1959,Swept from the Sea,"[{'iso_639_1': 'en', 'name': 'English'}]",Swept from the Sea,1959,0.779274
3245,777000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",838,American Graffiti,"[{'iso_639_1': 'en', 'name': 'English'}]",American Graffiti,838,0.755256
6141,0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",6016,The Good Thief,"[{'iso_639_1': 'en', 'name': 'English'}]",The Good Thief,6016,0.741755


In [116]:
new_user = (utility_df.values)[670]
new_user.shape

(9066,)

In [128]:
def add_new_user_to_collab(new_user,U,Sigma,VT,all_user_predicted_ratings) :
    
    #sparse matrix -> U.Sigma.VT
    #q = sparse_user. V. Sigma^-1
    q = new_user.dot(VT.transpose().dot(np.linalg.inv(np.diag(Sigma))))
    
    U = np.append(U,[q],axis=0)
    
    new = q.dot(np.diag(Sigma).dot(VT))
    all_user_predicted_ratings = np.append(all_user_predicted_ratings,[new],axis=0) 
    preds_df = pd.DataFrame(all_user_predicted_ratings, columns = utility_df.columns)
    print(preds_df.shape)
    return U, all_user_predicted_ratings, preds_df

In [225]:
U, all_user_predicted_ratings, preds_df = add_new_user_to_collab(new_user,U,Sigma,VT,all_user_predicted_ratings)

(671, 9066)


In [229]:
p = get_viewed_movies(671)
print(1 in pm)
print(1 in p)
preds_df[671][]

False
True


-0.15975394960516634

In [177]:

pm , predictions = recommend_movies(preds_df, 671, movies_df,ratings_df, 10)
print(1136 in pm)
predictions.head()

movieId
1136    5.159708
551     5.125028
5952    5.070815
2291    5.067528
589     5.054715
Name: 670, dtype: float64
False


Unnamed: 0,budget,genres,id,original_title,spoken_languages,title,movieId,670
10785,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",1544,Imagine Me & You,"[{'iso_639_1': 'en', 'name': 'English'}]",Imagine Me & You,1544,0.276112
17,4000000,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",5,Four Rooms,"[{'iso_639_1': 'en', 'name': 'English'}]",Four Rooms,5,0.195217
1578,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",1959,Swept from the Sea,"[{'iso_639_1': 'en', 'name': 'English'}]",Swept from the Sea,1959,0.193697
385,45000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",315,"Faster, Pussycat! Kill! Kill!","[{'iso_639_1': 'en', 'name': 'English'}]","Faster, Pussycat! Kill! Kill!",315,0.186071
1167,32350000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",1892,Return of the Jedi,"[{'iso_639_1': 'en', 'name': 'English'}]",Return of the Jedi,1892,0.184441


In [173]:
#TODO : 
#add new ratings to rating_df
#implement the new item version
#create python scripts