In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [2]:
# Surprise building blocks: Reader, Dataset, and the candidate algorithms (NormalPredictor, BaselineOnly, KNNBasic; NMF still undecided)
from surprise import Reader, Dataset, SVD, SVDpp, NormalPredictor, BaselineOnly, KNNBasic, NMF, accuracy
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import cross_validate, KFold ,GridSearchCV , RandomizedSearchCV
from surprise.similarities import cosine
#import surprise.accuracy

from sklearn.model_selection import train_test_split, LeaveOneOut
from surprise.model_selection import train_test_split as tts
from scipy.spatial.distance import cosine, correlation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the pre-processed ratings (with `timestamp` parsed as dates) and the movie
# metadata; in both files the first CSV column is used as the index.
ratings = pd.read_csv(
    '../../Data/ratings_processed.csv',
    index_col=[0],
    parse_dates=['timestamp'],
)
movies = pd.read_csv(
    '../../Data/movies_processed.csv',
    index_col=[0],
)

In [4]:
# Build the three-column frame (user id, item id, rating) in one step, using the
# column names expected by the rest of the notebook.
surprisedf = pd.DataFrame(
    {
        'userID': ratings.userId,
        'itemID': ratings.movieId,
        'ratings': ratings.rating,
    }
)

In [5]:
# Reader for the original rating scale: lowest rating 0.5, highest rating 5.
reader = Reader(rating_scale=(0.5, 5))

In [6]:
# Select the three columns explicitly (user, item, rating -- in that order) so the
# cell keeps working when re-run after extra columns such as `new_ratings` have
# been added to `surprisedf` further down the notebook.
data = Dataset.load_from_df(surprisedf[['userID', 'itemID', 'ratings']], reader)

In [7]:
# Hold out 25% of the ratings for testing. The fixed seed keeps the split -- and
# therefore every model fitted and pickled from it -- reproducible across
# kernel restarts.
trainset, testset = tts(data, test_size=.25, random_state=42)

In [8]:
# Similarity configuration for the KNN model: cosine similarity.
# Add 'user_based': False to compute similarities between items instead.
sim_options = dict(name='cosine')

In [9]:
# Neighbourhood-based model using the similarity options configured in `sim_options`.
algo = KNNBasic(sim_options=sim_options)
# Cross-validation is left disabled: it takes a long time and performs poorly.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [10]:
# Reference point: 5-fold cross-validation of NormalPredictor, reporting RMSE and MAE.
algo = NormalPredictor()
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4230  1.4311  1.4339  1.4186  1.4273  1.4268  0.0055  
MAE (testset)     1.1367  1.1403  1.1451  1.1313  1.1423  1.1392  0.0048  
Fit time          0.09    0.11    0.11    0.11    0.10    0.10    0.01    
Test time         0.08    0.14    0.09    0.08    0.08    0.10    0.02    


{'test_rmse': array([1.42302908, 1.43109997, 1.43390731, 1.41856638, 1.42733541]),
 'test_mae': array([1.1366545 , 1.14032841, 1.14510894, 1.13133635, 1.14232809]),
 'fit_time': (0.08576202392578125,
  0.10672831535339355,
  0.11394286155700684,
  0.11170768737792969,
  0.10471677780151367),
 'test_time': (0.08377623558044434,
  0.13962197303771973,
  0.08875799179077148,
  0.08477067947387695,
  0.0797872543334961)}

In [11]:
# 5-fold cross-validation of SVD (default hyper-parameters) on the 0.5-5 ratings.
algo = SVD()
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8720  0.8742  0.8743  0.8757  0.8726  0.8738  0.0013  
MAE (testset)     0.6706  0.6734  0.6729  0.6732  0.6696  0.6719  0.0015  
Fit time          3.04    3.05    3.08    3.06    3.03    3.05    0.02    
Test time         0.10    0.15    0.09    0.09    0.15    0.11    0.03    


{'test_rmse': array([0.87200795, 0.87419464, 0.874274  , 0.87569695, 0.8725771 ]),
 'test_mae': array([0.67057526, 0.67337724, 0.67289296, 0.6732282 , 0.66964759]),
 'fit_time': (3.0359489917755127,
  3.0528907775878906,
  3.075406789779663,
  3.0648272037506104,
  3.032898187637329),
 'test_time': (0.0967409610748291,
  0.14759588241577148,
  0.08976030349731445,
  0.08876276016235352,
  0.1466081142425537)}

In [12]:
# Refit the SVD model on the training split so it can be scored on the held-out
# test split and saved afterwards.
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x284e690baf0>

In [13]:
# Score the held-out ratings. Keep the full list in a variable and display only
# a sample instead of dumping every prediction into the cell output.
predictions = algo.test(testset)
predictions[:10]

[Prediction(uid=189, iid=33794, r_ui=3.5, est=3.761714065221536, details={'was_impossible': False}),
 Prediction(uid=346, iid=44555, r_ui=4.0, est=3.8460082842609338, details={'was_impossible': False}),
 Prediction(uid=346, iid=2712, r_ui=4.0, est=3.790050918842525, details={'was_impossible': False}),
 Prediction(uid=330, iid=377, r_ui=0.5, est=3.5814599970628396, details={'was_impossible': False}),
 Prediction(uid=480, iid=1889, r_ui=3.5, est=3.3712769539657663, details={'was_impossible': False}),
 Prediction(uid=280, iid=3176, r_ui=4.5, est=4.084753777696627, details={'was_impossible': False}),
 Prediction(uid=282, iid=6059, r_ui=4.5, est=3.815028627757928, details={'was_impossible': False}),
 Prediction(uid=148, iid=5816, r_ui=4.0, est=3.805755774734982, details={'was_impossible': False}),
 Prediction(uid=137, iid=954, r_ui=4.5, est=4.058539523807049, details={'was_impossible': False}),
 Prediction(uid=448, iid=1378, r_ui=3.0, est=3.0391841801517785, details={'was_impossible': False

In [14]:
import pickle

In [15]:
# Create the output directory for the pickled models. Unlike the shell command,
# this is platform-independent and does not complain when the directory already
# exists on a re-run.
os.makedirs('models', exist_ok=True)

In [16]:
# Persist the fitted SVD model (0.5-5 rating scale). The context manager makes
# sure the file handle is flushed and closed once the dump finishes.
filename = './models/SVD_one_to_five.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(algo, model_file)

In [17]:
# 5-fold cross-validation of SVD++ on the 0.5-5 ratings.
# Slow: the recorded run took roughly six minutes of fit time per fold.
algo = SVDpp()
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8587  0.8721  0.8532  0.8605  0.8554  0.8600  0.0066  
MAE (testset)     0.6587  0.6675  0.6544  0.6599  0.6577  0.6596  0.0043  
Fit time          361.58  367.45  365.91  366.88  364.02  365.17  2.14    
Test time         5.85    5.95    5.98    6.37    5.87    6.00    0.19    


{'test_rmse': array([0.85865036, 0.87214402, 0.85323078, 0.86049696, 0.85539988]),
 'test_mae': array([0.65873515, 0.66745097, 0.65435092, 0.65992307, 0.6577393 ]),
 'fit_time': (361.58263301849365,
  367.45033025741577,
  365.90826749801636,
  366.88000106811523,
  364.0191912651062),
 'test_time': (5.845977783203125,
  5.9535276889801025,
  5.977588415145874,
  6.369399785995483,
  5.869545936584473)}

In [18]:
# Fresh SVD++ model trained on the training split; fit() hands back the fitted
# algorithm, which is displayed as the cell output.
algo = SVDpp().fit(trainset)
algo

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x284ea310a60>

In [19]:
# Persist the fitted SVD++ model (0.5-5 rating scale). The context manager makes
# sure the file handle is flushed and closed once the dump finishes.
filename = './models/SVD++_one_to_five.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(algo, model_file)

In [20]:
# Built-in 'ml-100k' dataset, used below as a comparison run for SVD.
data2 = Dataset.load_builtin(name='ml-100k')

In [21]:
# Same SVD cross-validation as before, this time on the built-in dataset.
algo = SVD()
cross_validate(
    algo,
    data2,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9408  0.9340  0.9318  0.9382  0.9360  0.9362  0.0031  
MAE (testset)     0.7414  0.7352  0.7365  0.7396  0.7367  0.7379  0.0023  
Fit time          3.07    3.04    3.03    3.03    3.03    3.04    0.02    
Test time         0.10    0.15    0.09    0.15    0.09    0.12    0.03    


{'test_rmse': array([0.94077916, 0.93404583, 0.93180305, 0.93823023, 0.93602084]),
 'test_mae': array([0.74138   , 0.73518533, 0.73649819, 0.73955489, 0.73672066]),
 'fit_time': (3.072866916656494,
  3.03745436668396,
  3.0253612995147705,
  3.031636953353882,
  3.0300066471099854),
 'test_time': (0.09574389457702637,
  0.15462040901184082,
  0.09029889106750488,
  0.15059757232666016,
  0.0907590389251709)}

In [22]:
# Collapse the ratings into three classes:
#   rating < 2.5 -> -1,  rating > 3 -> 1,  everything else (2.5 and 3) -> 0.
# The two conditions cannot both hold, so their order does not matter.
rating_values = surprisedf.ratings
surprisedf['new_ratings'] = np.select(
    [(rating_values < 2.5).to_numpy(), (rating_values > 3).to_numpy()],
    [-1, 1],
    default=0,
)

In [23]:
# Rebuild the reader and dataset for the collapsed -1..1 scale.
# Note: this rebinds `reader` and `data` used by the earlier 0.5-5 cells.
reader = Reader(rating_scale=(-1, 1))
sentiment_columns = ['userID', 'itemID', 'new_ratings']
data = Dataset.load_from_df(surprisedf[sentiment_columns], reader)

In [24]:
# Hold out 25% of the -1..1 ratings for testing. The fixed seed keeps the split --
# and the models fitted and pickled from it -- reproducible across kernel restarts.
trainset, testset = tts(data, test_size=.25, random_state=42)

In [25]:
# 5-fold cross-validation of SVD on the collapsed -1..1 ratings.
algo = SVD()
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6208  0.6157  0.6238  0.6208  0.6240  0.6210  0.0030  
MAE (testset)     0.4859  0.4830  0.4861  0.4881  0.4882  0.4863  0.0019  
Fit time          3.07    3.10    3.30    3.24    3.23    3.19    0.09    
Test time         0.17    0.10    0.19    0.10    0.17    0.14    0.04    


{'test_rmse': array([0.62076334, 0.61567141, 0.62382609, 0.62078324, 0.62403475]),
 'test_mae': array([0.4858949 , 0.48304322, 0.48612535, 0.48806769, 0.48819361]),
 'fit_time': (3.0718274116516113,
  3.0998008251190186,
  3.304046869277954,
  3.2361834049224854,
  3.2301199436187744),
 'test_time': (0.16555452346801758,
  0.0997316837310791,
  0.18653130531311035,
  0.0967416763305664,
  0.17257452011108398)}

In [26]:
# Fit on the training split and score the held-out ratings.
# The ultimate problem is that these are rating predictions (continuous
# estimates), so a hit rate is not directly available from them.
algo.fit(trainset)
predictions = algo.test(testset)
# Show a sample only, rather than dumping every prediction into the cell output.
predictions[:10]

[Prediction(uid=216, iid=2109, r_ui=-1.0, est=0.43978039232989097, details={'was_impossible': False}),
 Prediction(uid=191, iid=246, r_ui=1.0, est=0.6632919256965114, details={'was_impossible': False}),
 Prediction(uid=462, iid=8464, r_ui=0.0, est=0.603586511567877, details={'was_impossible': False}),
 Prediction(uid=380, iid=69306, r_ui=0.0, est=0.25233657759687456, details={'was_impossible': False}),
 Prediction(uid=318, iid=27773, r_ui=1.0, est=1, details={'was_impossible': False}),
 Prediction(uid=238, iid=4946, r_ui=1.0, est=0.5375598890904844, details={'was_impossible': False}),
 Prediction(uid=198, iid=5607, r_ui=1.0, est=0.2474911972881284, details={'was_impossible': False}),
 Prediction(uid=469, iid=2662, r_ui=0.0, est=0.39518967854186476, details={'was_impossible': False}),
 Prediction(uid=414, iid=421, r_ui=0.0, est=0.20714397159132522, details={'was_impossible': False}),
 Prediction(uid=66, iid=5673, r_ui=1.0, est=0.8028007618523914, details={'was_impossible': False}),
 Pre

In [27]:
# Persist the fitted SVD model (-1..1 rating scale). The context manager makes
# sure the file handle is flushed and closed once the dump finishes.
filename = './models/SVD_negone_to_one.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(algo, model_file)

In [28]:
# Fresh SVD++ model trained on the -1..1 training split; fit() hands back the
# fitted algorithm, which is displayed as the cell output.
algo = SVDpp().fit(trainset)
algo

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x284e675be80>

In [29]:
# Persist the fitted SVD++ model (-1..1 rating scale). The context manager makes
# sure the file handle is flushed and closed once the dump finishes.
filename = './models/SVD++_negone_to_one.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(algo, model_file)

In [30]:
# Spot check: estimated score of user 80 for item 648.
algo.predict(uid=80, iid=648)

Prediction(uid=80, iid=648, r_ui=None, est=0.9025041322655403, details={'was_impossible': False})

In [31]:
# Spot check: estimated score of user 80 for item 42011.
algo.predict(uid=80, iid=42011)

Prediction(uid=80, iid=42011, r_ui=None, est=0.42198964347545403, details={'was_impossible': False})

In [32]:
# Spot check: estimated score of user 80 for item 140956, kept in `test` and displayed.
test = algo.predict(uid=80, iid=140956)
test

Prediction(uid=80, iid=140956, r_ui=None, est=0.859111884667473, details={'was_impossible': False})

In [33]:
# The fitted algorithm's predict() can be used to score every movie for every user. Then take all
# predictions equal to 1 and find a way to rank them, i.e. treat this stage as the query
# (candidate-generation) step and add a ranking model as the next stage.

In [34]:
# Score every distinct item in the dataset for user 80 with the fitted model.
results = [
    algo.predict(80, item_id)
    for item_id in surprisedf.itemID.unique()
]

In [35]:
# Turn the list of predictions into a DataFrame (one row per prediction) and
# preview the first rows.
results = pd.DataFrame(data=results)
results.head(n=5)

Unnamed: 0,uid,iid,r_ui,est,details
0,80,1,,1.0,{'was_impossible': False}
1,80,3,,0.81357,{'was_impossible': False}
2,80,6,,1.0,{'was_impossible': False}
3,80,47,,1.0,{'was_impossible': False}
4,80,50,,1.0,{'was_impossible': False}


In [36]:
# Keep only the items whose estimate equals 1 (the top of the -1..1 scale) and
# attach the movie titles by joining the item id to `movieId`.
top_scored = results.loc[results['est'].eq(1)]
title_lookup = movies[['movieId', 'title']]
top_scored.merge(title_lookup, left_on='iid', right_on='movieId')

Unnamed: 0,uid,iid,r_ui,est,details,movieId,title
0,80,1,,1.0,{'was_impossible': False},1,Toy Story (1995)
1,80,6,,1.0,{'was_impossible': False},6,Heat (1995)
2,80,47,,1.0,{'was_impossible': False},47,Seven (a.k.a. Se7en) (1995)
3,80,50,,1.0,{'was_impossible': False},50,"Usual Suspects, The (1995)"
4,80,110,,1.0,{'was_impossible': False},110,Braveheart (1995)
...,...,...,...,...,...,...,...
1564,80,95519,,1.0,{'was_impossible': False},95519,Dragon Ball Z: Bojack Unbound (Doragon bôru Z ...
1565,80,128736,,1.0,{'was_impossible': False},128736,I'm Here (2010)
1566,80,137859,,1.0,{'was_impossible': False},137859,Dragon Ball Z Gaiden: The Plot to Destroy the ...
1567,80,181315,,1.0,{'was_impossible': False},181315,Phantom Thread (2017)


In [37]:
# Actual ratings recorded for user 1.
# NOTE(review): the predictions above were generated for user 80 -- confirm that
# user 1 is the intended user for this comparison.
ratings.loc[ratings['userId'].eq(1)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30
1,1,3,4.0,2000-07-30
2,1,6,4.0,2000-07-30
3,1,47,5.0,2000-07-30
4,1,50,5.0,2000-07-30
...,...,...,...,...
227,1,3744,4.0,2000-07-30
228,1,3793,5.0,2000-07-30
229,1,3809,4.0,2000-07-30
230,1,4006,4.0,2000-07-30
