In [5]:
import pandas as pd
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import train_test_split,GridSearchCV,cross_validate
movie = pd.read_csv("movies.csv")
rating = pd.read_csv("ratings.csv")
df = pd.merge(movie, rating,how="left",on="movieId")
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,944919400.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,858275500.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,833981900.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,943497900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,1230859000.0


In [6]:
movie_ids = [130219,356,4422,541]
movies= ["The Dark Knight (2011)",
         "Cries and Whispers (Viskningar och rop) (1972)",
         "Forrest Gump (1994)",
         "Blade Runner (1982)"]

sample_df = df.loc[df["movieId"].isin(movie_ids)]
sample_df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,8.408789e+08
2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,1.011208e+09
2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,8.339823e+08
2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,9.940192e+08
2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,9.434971e+08
...,...,...,...,...,...,...
14742596,4422,Cries and Whispers (Viskningar och rop) (1972),Drama,137665.0,5.0,1.015723e+09
14742597,4422,Cries and Whispers (Viskningar och rop) (1972),Drama,137851.0,4.5,1.346153e+09
14742598,4422,Cries and Whispers (Viskningar och rop) (1972),Drama,137904.0,3.5,1.169100e+09
14742599,4422,Cries and Whispers (Viskningar och rop) (1972),Drama,138325.0,5.0,1.282957e+09


In [7]:
user_movie_df = sample_df.pivot_table(index="userId",columns="movieId",values="rating")
user_movie_df

movieId,356,541,4422,130219
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,,4.0,,
2.0,,5.0,,
3.0,,5.0,,
4.0,4.0,,,
7.0,4.0,,,
...,...,...,...,...
138474.0,5.0,,,
138483.0,4.0,4.0,,
138484.0,5.0,,,
138486.0,5.0,,,


In [8]:
reader = Reader(rating_scale=(1,5))

In [9]:
data = Dataset.load_from_df(sample_df[["userId","movieId","rating"]],reader)
data

<surprise.dataset.DatasetAutoFolds at 0x163a39be950>

In [36]:
train, test = train_test_split(data,test_size=.25)
svd_model = SVD(n_factors=60)
svd_model.fit(train)
predictions = svd_model.test(test)
predictions

[Prediction(uid=67044.0, iid=356, r_ui=3.5, est=4.119394343001923, details={'was_impossible': False}),
 Prediction(uid=96584.0, iid=356, r_ui=4.0, est=4.119394343001923, details={'was_impossible': False}),
 Prediction(uid=119031.0, iid=541, r_ui=4.5, est=4.176049061237379, details={'was_impossible': False}),
 Prediction(uid=99263.0, iid=356, r_ui=4.0, est=4.119394343001923, details={'was_impossible': False}),
 Prediction(uid=78361.0, iid=541, r_ui=4.0, est=3.9200087161947867, details={'was_impossible': False}),
 Prediction(uid=52730.0, iid=356, r_ui=4.0, est=4.119394343001923, details={'was_impossible': False}),
 Prediction(uid=137206.0, iid=541, r_ui=4.0, est=4.176049061237379, details={'was_impossible': False}),
 Prediction(uid=56788.0, iid=356, r_ui=4.0, est=4.119394343001923, details={'was_impossible': False}),
 Prediction(uid=107766.0, iid=541, r_ui=4.5, est=4.0579011011661645, details={'was_impossible': False}),
 Prediction(uid=124862.0, iid=541, r_ui=5.0, est=4.176049061237379, 

In [37]:
accuracy.rmse(predictions)

RMSE: 0.9380


0.93799535220171

In [38]:
svd_model.predict(uid=40101.0,iid=356,verbose=True,r_ui=2)

user: 40101.0    item: 356        r_ui = 2.00   est = 3.92   {'was_impossible': False}


Prediction(uid=40101.0, iid=356, r_ui=2, est=3.922437925389811, details={'was_impossible': False})

In [39]:
svd_model.predict(uid=97368.0,iid=356,verbose=True)

user: 97368.0    item: 356        r_ui = None   est = 4.16   {'was_impossible': False}


Prediction(uid=97368.0, iid=356, r_ui=None, est=4.161252101433966, details={'was_impossible': False})

In [55]:
param_grid = {"n_epochs":[5,10,20],
              "lr_all":[0.002,0.005,0.007],
              "n_factors":[20,40,70,100]}
gs = GridSearchCV(SVD,param_grid,measures=["rmse","mae"],cv=5,n_jobs=-1,joblib_verbose=True)
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   20.4s finished


In [44]:
gs.best_params

{'rmse': {'n_epochs': 5, 'lr_all': 0.005, 'n_factors': 20},
 'mae': {'n_epochs': 5, 'lr_all': 0.007, 'n_factors': 20}}

In [45]:
gs.best_score

{'rmse': 0.9302532978927136, 'mae': 0.713294478912878}

In [46]:
gs.best_params["rmse"]

{'n_epochs': 5, 'lr_all': 0.005, 'n_factors': 20}

In [47]:
dir(SVD())

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'biased',
 'bsl_options',
 'compute_baselines',
 'compute_similarities',
 'default_prediction',
 'estimate',
 'fit',
 'get_neighbors',
 'init_mean',
 'init_std_dev',
 'lr_bi',
 'lr_bu',
 'lr_pu',
 'lr_qi',
 'n_epochs',
 'n_factors',
 'predict',
 'random_state',
 'reg_bi',
 'reg_bu',
 'reg_pu',
 'reg_qi',
 'sgd',
 'sim_options',
 'test',
 'verbose']

In [48]:
SVD().n_factors

100

In [56]:
svd_model = SVD(**gs.best_params["rmse"]) 
data = data.build_full_trainset()
svd_model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x163c4bb7610>

In [58]:
pred=svd_model.predict(uid=40101.0,iid=356,verbose=True,r_ui=2)
pred

user: 40101.0    item: 356        r_ui = 2.00   est = 3.90   {'was_impossible': False}


Prediction(uid=40101.0, iid=356, r_ui=2, est=3.8954258430566813, details={'was_impossible': False})

plus

In [61]:
pred_df = pd.DataFrame([{'uid': pred.uid, 
                         'iid': pred.iid, 
                         'r_ui': pred.r_ui, 
                         'est': pred.est, 
                         }])
pred_df

Unnamed: 0,uid,iid,r_ui,est
0,40101.0,356,2,3.895426


In [66]:
preds = []

# Her bir kullanıcı ve öğe çifti için tahmin yap
for uid, iid in zip(sample_df['userId'], sample_df['movieId']):
    pred = svd_model.predict(uid, iid)
    preds.append(pd.DataFrame({'uid': [pred.uid], 
                               'iid': [pred.iid], 
                               'r_ui': [pred.r_ui], 
                               'est': [pred.est], 
                               }))

# Tüm tahminleri tek bir DataFrame'e birleştir
pred_df = pd.concat(preds, ignore_index=True)

In [70]:
preds

[   uid  iid  r_ui      est
 0  4.0  356  None  4.04935,
    uid  iid  r_ui       est
 0  7.0  356  None  4.047771,
    uid  iid  r_ui       est
 0  8.0  356  None  4.156136,
    uid  iid  r_ui      est
 0  9.0  356  None  4.07154,
     uid  iid  r_ui       est
 0  10.0  356  None  4.014795,
     uid  iid  r_ui       est
 0  11.0  356  None  4.104422,
     uid  iid  r_ui       est
 0  12.0  356  None  4.074326,
     uid  iid  r_ui       est
 0  13.0  356  None  4.086807,
     uid  iid  r_ui      est
 0  14.0  356  None  4.05212,
     uid  iid  r_ui       est
 0  15.0  356  None  4.127052,
     uid  iid  r_ui       est
 0  16.0  356  None  4.062426,
     uid  iid  r_ui       est
 0  18.0  356  None  4.058937,
     uid  iid  r_ui       est
 0  23.0  356  None  3.965526,
     uid  iid  r_ui       est
 0  24.0  356  None  4.057546,
     uid  iid  r_ui       est
 0  25.0  356  None  4.049665,
     uid  iid  r_ui      est
 0  26.0  356  None  4.08215,
     uid  iid  r_ui       est
 0  28.0  

In [67]:
pred_df

Unnamed: 0,uid,iid,r_ui,est
0,4.0,356,,4.049350
1,7.0,356,,4.047771
2,8.0,356,,4.156136
3,9.0,356,,4.071540
4,10.0,356,,4.014795
...,...,...,...,...
97338,137665.0,4422,,4.114758
97339,137851.0,4422,,4.104296
97340,137904.0,4422,,3.933818
97341,138325.0,4422,,4.148356


In [72]:
pred_df.groupby("uid").agg({"est":"mean"})

Unnamed: 0_level_0,est
uid,Unnamed: 1_level_1
1.0,4.162488
2.0,4.252716
3.0,4.237015
4.0,4.049350
7.0,4.047771
...,...
138474.0,4.120755
138483.0,4.114165
138484.0,4.116710
138486.0,4.066799


In [74]:
svd_model.test(test)

[Prediction(uid=67044.0, iid=356, r_ui=3.5, est=3.934124404165667, details={'was_impossible': False}),
 Prediction(uid=96584.0, iid=356, r_ui=4.0, est=4.050297814783518, details={'was_impossible': False}),
 Prediction(uid=119031.0, iid=541, r_ui=4.5, est=4.216525232797461, details={'was_impossible': False}),
 Prediction(uid=99263.0, iid=356, r_ui=4.0, est=4.075630346297981, details={'was_impossible': False}),
 Prediction(uid=78361.0, iid=541, r_ui=4.0, est=4.167374101631312, details={'was_impossible': False}),
 Prediction(uid=52730.0, iid=356, r_ui=4.0, est=4.064229787736486, details={'was_impossible': False}),
 Prediction(uid=137206.0, iid=541, r_ui=4.0, est=4.1912999138563745, details={'was_impossible': False}),
 Prediction(uid=56788.0, iid=356, r_ui=4.0, est=4.108679283095635, details={'was_impossible': False}),
 Prediction(uid=107766.0, iid=541, r_ui=4.5, est=4.270797995836794, details={'was_impossible': False}),
 Prediction(uid=124862.0, iid=541, r_ui=5.0, est=4.250901475201132, d