In [1]:
import numpy as np
import pandas as pd

raw = [['1', '101', 5.0], ['1', '102', 3.0], ['1', '103', 2.5],
       ['2', '101', 2.0], ['2', '102', 2.5], ['2', '103', 5.0],
       ['2', '104', 2.0], ['3', '101', 2.0], ['3', '104', 4.0], 
       ['3', '105', 4.5], ['3', '107', 5.0], ['4', '101', 5.0], 
       ['4', '103', 3.0], ['4', '104', 4.5], ['4', '106', 4.0],
       ['5', '101', 4.0], ['5', '102', 3.0], ['5', '103', 2.0],
       ['5', '104', 4.0], ['5', '105', 3.5], ['5', '106', 4.0]]

df = pd.DataFrame(raw, columns=['CID', 'MID', 'SCORE'])
df.head()

Unnamed: 0,CID,MID,SCORE
0,1,101,5.0
1,1,102,3.0
2,1,103,2.5
3,2,101,2.0
4,2,102,2.5


In [3]:
from surprise import Dataset
from surprise import KNNBasic,SlopeOne,SVD,SVDpp
from surprise import Reader
from surprise.model_selection import train_test_split,cross_validate

reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(df,reader)
trainset,testset = train_test_split(data,test_size=0.2)
knn =  KNNBasic(k=3,sim_options={'user_based':True})
knn.fit(trainset).test(testset)

Computing the msd similarity matrix...
Done computing similarity matrix.


[Prediction(uid='4', iid='106', r_ui=4.0, est=4.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='3', iid='101', r_ui=2.0, est=4.239999999999999, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='1', iid='102', r_ui=3.0, est=2.905882352941177, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='5', iid='103', r_ui=2.0, est=3.369565217391304, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='4', iid='103', r_ui=3.0, est=2.7597402597402594, details={'actual_k': 2, 'was_impossible': False})]

In [4]:
knn.predict(uid='1',iid='102')

Prediction(uid='1', iid='102', r_ui=None, est=2.905882352941177, details={'actual_k': 2, 'was_impossible': False})

In [5]:
cross_validate(knn,data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.3936984 , 0.6142118 , 1.24398243, 1.39459231, 1.28354918]),
 'test_mae': array([1.08639545, 0.54647667, 0.97708561, 0.99454813, 0.96925134]),
 'fit_time': (0.0,
  0.003998279571533203,
  0.00099945068359375,
  0.0010001659393310547,
  0.0010004043579101562),
 'test_time': (0.0010006427764892578, 0.0, 0.0, 0.0009989738464355469, 0.0)}

使用GridSearchCV——调参

In [7]:
from surprise.model_selection import GridSearchCV

file_path = ('C:/Users/acerpc/Desktop/data_folder/ml-100k/u.data')
reader = Reader(line_format='user item rating timestamp',sep='\t')
ml = Dataset.load_from_file(file_path,reader)

param_grid = {'k':[10,20],'reg_all':[0.4,0.6]}
gcv = GridSearchCV(KNNBasic,param_grid,measures=['rmse', 'mae'],cv=3)
gcv.fit(ml)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [8]:
print(gcv.best_params)
print(gcv.best_score)

{'rmse': {'k': 20, 'reg_all': 0.4}, 'mae': {'k': 20, 'reg_all': 0.4}}
{'rmse': 0.9862906196306217, 'mae': 0.778162145503173}


SVD

In [9]:
#svd = SVD(n_factors=10)
param_grid = {'n_factors':[20,50],'reg_all':[0.05,0.1,0.5]}
gcv = GridSearchCV(SVD,param_grid,measures=['rmse', 'mae'],cv=3)
gcv.fit(ml)

In [10]:
print(gcv.best_params)
print(gcv.best_score)

{'rmse': {'n_factors': 50, 'reg_all': 0.05}, 'mae': {'n_factors': 50, 'reg_all': 0.05}}
{'rmse': 0.9410984667901771, 'mae': 0.7439456524312185}


In [17]:
algo = gcv.best_estimator['rmse']
algo.fit(ml.build_full_trainset())

results_df = pd.DataFrame.from_dict(gcv.cv_results)

In [18]:
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_reg_all
0,0.937225,0.948026,0.94643,0.943893,0.00476,2,0.742169,0.749547,0.748662,0.746793,0.00329,2,3.076569,0.067018,0.494179,0.064912,"{'n_factors': 20, 'reg_all': 0.05}",20,0.05
1,0.938627,0.948618,0.948981,0.945409,0.004798,4,0.744736,0.751442,0.752556,0.749578,0.003454,4,3.005667,0.056965,0.548901,0.084472,"{'n_factors': 20, 'reg_all': 0.1}",20,0.1
2,0.958195,0.967371,0.968052,0.96454,0.004495,5,0.768384,0.77641,0.777872,0.774222,0.004171,5,3.077429,0.021039,0.487329,0.071462,"{'n_factors': 20, 'reg_all': 0.5}",20,0.5
3,0.932725,0.945459,0.945112,0.941098,0.005923,1,0.738234,0.746406,0.747197,0.743946,0.004052,1,4.765633,0.20096,0.476592,0.071544,"{'n_factors': 50, 'reg_all': 0.05}",50,0.05
4,0.937793,0.9482,0.948382,0.944792,0.004949,3,0.744114,0.750929,0.751964,0.749002,0.003482,3,4.934842,0.333392,0.537098,0.050241,"{'n_factors': 50, 'reg_all': 0.1}",50,0.1
5,0.958197,0.967396,0.968079,0.964557,0.004506,6,0.768407,0.776437,0.777898,0.774247,0.004173,6,4.727065,0.051366,0.474288,0.063808,"{'n_factors': 50, 'reg_all': 0.5}",50,0.5


In [20]:
res = algo.predict('2','111')

In [22]:
list(res)

['2', '111', None, 3.5514888174266455, {'was_impossible': False}]