In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy

import movieLens_util
from numpy.linalg import svd

In [2]:
## Load the MovieLens 100k data through the project helper; the four returned objects are
## unpacked in this order. df_data carries the user_id / item_id / rating columns used below.
(
    df_data,
    df_movie_meta,
    df_catalog,
    df_user_meta,
) = movieLens_util.load_movieLens_dataset('ml-100k')

In [3]:
## Load the ratings DataFrame into a surprise Dataset object.
## A Reader is needed so surprise knows the rating scale (1 to 5).
reader = Reader(rating_scale=(1, 5))
data_input = Dataset.load_from_df(df_data[['user_id', 'item_id', 'rating']], reader)

## Fix the seed: SVD initialises its factors randomly, so without random_state every
## Restart & Run All produces different predictions (and different results further down).
algo = SVD(random_state=42)

trainset = data_input.build_full_trainset()
testset = trainset.build_testset()  ## testset holds the same (user, item, rating) triples as trainset

algo.fit(trainset)
predictions = algo.test(testset)  # predicted rating for every already-rated (user, item) pair

## Collect the predictions into a long-format DataFrame: one row per (user, movie) pair
model_pred_rating_matrix = pd.DataFrame(
    [(pred.uid, pred.iid, pred.est) for pred in predictions],
    columns=['user_id', 'movie_id', 'svd_rating'],
)


In [4]:
## one row per predicted (user, movie) pair; columns are user_id, movie_id, svd_rating
model_pred_rating_matrix.shape

(100000, 3)

In [5]:
## Use the pairs that have no rating in the trainset as the test set, so the model
## produces an estimate for each of them.
anti_testset = trainset.build_anti_testset()
anti_predictions = algo.test(anti_testset)

## Same long format as the rated-pair predictions: user_id, movie_id, svd_rating
model_pred_unrating_matrix = pd.DataFrame(
    data=[[pred.uid, pred.iid, pred.est] for pred in anti_predictions],
    columns=['user_id', 'movie_id', 'svd_rating'],
)


In [6]:
## all (user, movie) pairs: 943 users * 1682 movies = 1586126; minus the 100000 rated pairs = 1486126 unrated
model_pred_unrating_matrix.shape

(1486126, 3)

In [7]:
## Stack the predictions for rated and unrated pairs so every (user, movie) pair has a value
full_rating_matrix = pd.concat(
    objs=[model_pred_rating_matrix, model_pred_unrating_matrix],
    ignore_index=True,
)
full_rating_matrix.shape

(1586126, 3)

In [9]:
## Reshape the long-format predictions into a user x movie table of svd_rating values
pivot_rating_matrix = full_rating_matrix.pivot_table(
    values='svd_rating',
    index='user_id',
    columns='movie_id',
)
pivot_rating_matrix.shape

(943, 1682)

### With the dense predicted rating matrix in hand, we can factorize it directly with SVD instead of training with SGD

In [117]:
## Factorize the dense prediction matrix with a reduced (full_matrices=False) SVD.
## np.linalg.svd is the same function the notebook's first cell imports as `svd`.
dense_rating_matrix = pivot_rating_matrix.to_numpy()

U, S, Vt = np.linalg.svd(dense_rating_matrix, full_matrices=False)

Sigma = np.diag(S)  # singular values laid out as a square diagonal matrix
print(U.shape, S.shape, Vt.shape)

(943, 943) (943,) (943, 1682)


#### The decomposition above yields 943 singular values (U is 943*943), but in practice we keep only the part belonging to the top-K largest singular values to reduce dimensionality

## 1. Folding-in technique for cold start
1. Get the new user's rating vector $q$
2. Get the trained SVD matrices: $U$, $S$, $V$
3. Project $q$ into the user latent-factor space: $q' = q\,V\,S^{-1}$
4. Fold $q'$ into the user latent-factor matrix $U$

### First, select the top-K latent factors. This part could also be trained with SGD, and K can be tuned to find the best number of top-K singular vectors

In [316]:
## keep only the first 100 columns of U (one column per retained latent factor)
U_topK = U[:, :100]
U_topK.shape

(943, 100)

In [312]:
## leading 100 x 100 block of the diagonal singular-value matrix
S_topK = Sigma[:100, :100]
S_topK.shape

(100, 100)

In [322]:
## keep only the first 100 rows of Vt (one row per retained latent factor)
Vt_topK = Vt[:100]
Vt_topK.shape

(100, 1682)

In [323]:
## Folding-in: project the new user's rating vector q into the truncated latent space,
## q' = q . V_k . S_k^-1
##
## Build q in this cell so it does not depend on a cell further down the notebook
## (on a fresh kernel, new_user_rating_matrix would otherwise be undefined here).
## User 166 plays the role of the new user; unrated movies stay 0, and item_id is
## treated as 1-based, hence the -1 to get the column position.
new_user_ratings = df_data[df_data.user_id == 166]
new_user_rating_matrix = np.zeros(Vt_topK.shape[1])
new_user_rating_matrix[new_user_ratings.item_id.to_numpy() - 1] = new_user_ratings.rating.to_numpy()

inverse_S_topK = np.linalg.inv(S_topK)

## reshape(1, -1) turns q into a single-row matrix without hard-coding the movie count
project_newUser = np.dot(np.dot(new_user_rating_matrix.reshape(1, -1), Vt_topK.transpose()), inverse_S_topK)

In [327]:
from sklearn.metrics.pairwise import cosine_similarity

## Cosine similarity between the folded-in user and EVERY existing user in latent space.
## One vectorised call replaces the per-row loop, which stopped at range(942) and therefore
## skipped the last of the 943 users; it also removes the hard-coded latent width of 100.
## cosine_similarity returns shape (1, n_users); ravel() flattens it so that
## entry i is the similarity to row i of U_topK.
cosine_similarity_list = cosine_similarity(project_newUser, U_topK).ravel()

In [334]:
## Exclude row 165 (user_id 166, the user being folded in) before picking the nearest neighbour.
## Mask with -inf rather than 0: cosine similarity can be negative, so 0 is not guaranteed
## to lose the argmax.
cosine_similarity_list[165] = -np.inf
np.argmax(cosine_similarity_list)

723

## 2. Another solution, based on a paper: impute the new user's missing ratings with zeros (an approach taken from another paper)

In [247]:
## Cold-start problem for a new user:
## take user 166's rows from the ratings table and treat them as the new user's ratings
df_data_new_user166 = df_data.loc[df_data['user_id'] == 166]

In [248]:
## Dense rating vector for the new user, one slot per movie (1682); unrated movies stay 0.
## item_id is treated as 1-based, hence the -1 to get the slot position.
## A single vectorised assignment replaces the per-item loop, which re-filtered the frame
## for every item and relied on implicit length-1 Series -> scalar conversion
## (deprecated in recent NumPy / pandas releases).
new_user_rating_matrix = np.zeros((1682))
new_user_rating_matrix[df_data_new_user166.item_id.to_numpy() - 1] = df_data_new_user166.rating.to_numpy()

In [249]:
## row 165 of the dense prediction matrix, kept under the name user166_matrix
user166_matrix = dense_rating_matrix[165]

In [250]:
## Project the new user's ratings onto the right singular vectors (q . V), append that
## projection as an extra row below Sigma, then decompose the augmented matrix again.
new_user_projection = new_user_rating_matrix @ Vt.T
Sigma_with_newUser = np.vstack([Sigma, new_user_projection])
Uf, Sf, Vtf = svd(Sigma_with_newUser, full_matrices=False)
Sigma_f = np.diag(Sf)  # singular values of the augmented matrix as a diagonal matrix

In [251]:
## Build the block matrix [[U, 0], [0, 1]]: U gains one zero column and one extra row whose
## only non-zero entry is a 1 in the bottom-right corner (the slot for the appended user).
## Sizes are derived from U instead of the hard-coded 943 / 944, and the temporary
## `zeros` name (previously reused for two different shapes) is no longer needed.
New_user_matrix = np.zeros((U.shape[0] + 1, U.shape[1] + 1))
New_user_matrix[:-1, :-1] = U
New_user_matrix[-1, -1] = 1

In [252]:
## Reconstruct the rating matrix with the appended user included:
## [[U, 0], [0, 1]] . Uf . Sigma_f . (Vtf . Vt)
## Products are evaluated left to right, with (Vtf @ Vt) formed first as its own factor.
New_prediction_rating_matrix = New_user_matrix @ Uf @ Sigma_f @ (Vtf @ Vt)

In [254]:
## Row index 943 is the appended user's row. Shift it by the trainset's global mean rating,
## then clamp every value into the 1-5 rating range (above 5 -> 5, at or below 1 -> 1).
## NOTE(review): the decomposed matrix holds raw predicted ratings rather than mean-centred
## ones -- confirm that adding global_mean here is intended.
user943_matrix = np.clip(
    New_prediction_rating_matrix[943, :] + algo.trainset.global_mean,
    1,
    5,
)
    