* ### Automatic cross_validation

In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

In [2]:
data = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /home/db/.surprise_data/ml-100k


In [4]:
algo=SVD()
# Run 5-fold cross-validation 
cross_validate(algo,data,measures=['RMSE','MAE'],cv=5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9393  0.9317  0.9431  0.9350  0.9387  0.9376  0.0039  
MAE (testset)     0.7388  0.7357  0.7446  0.7372  0.7405  0.7394  0.0031  
Fit time          4.34    4.34    4.34    4.34    4.35    4.34    0.00    
Test time         0.21    0.19    0.14    0.17    0.19    0.18    0.02    


{'test_rmse': array([0.93926609, 0.93167591, 0.9431158 , 0.93503944, 0.9386839 ]),
 'test_mae': array([0.73882238, 0.73572078, 0.744572  , 0.73715702, 0.74051729]),
 'fit_time': (4.343350887298584,
  4.343995571136475,
  4.339187145233154,
  4.337637662887573,
  4.350447654724121),
 'test_time': (0.2081305980682373,
  0.19013023376464844,
  0.14000606536865234,
  0.17091965675354004,
  0.19012928009033203)}

* ### Train-test split and the fit() method

In [7]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [8]:
data = Dataset.load_builtin('ml-100k')

In [12]:
trainset, testset=train_test_split(data,test_size=.25)
algo=SVD()
algo.fit(trainset)
predictions=algo.test(testset)
accuracy.rmse(predictions)

RMSE: 0.9377


0.937719591326765

* ### Train on a whole trainset and the predict() method


In [13]:
from surprise import KNNBasic
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
algo=KNNBasic()
algo.fit(trainset)

#prediction
uid=str(196)
iid=str(302)

pred=algo.predict(uid,iid,r_ui=4,verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
user: 196        item: 302        r_ui = 4.00   est = 4.07   {'actual_k': 40, 'was_impossible': False}


* ### Use a custom dataset


In [18]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import os

In [26]:
file_path=os.path.expanduser('/home/db/.surprise_data/ml-100k/ml-100k/u.data')
file_path

'/home/db/.surprise_data/ml-100k/ml-100k/u.data'

In [27]:
reader=Reader(line_format='user item rating timestamp',sep='\t',rating_scale=(1,5))
data=Dataset.load_from_file(file_path,reader=reader)

In [28]:
algo=BaselineOnly()
cross_validate(algo,data,verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9468  0.9404  0.9446  0.9472  0.9389  0.9436  0.0034  
MAE (testset)     0.7484  0.7462  0.7482  0.7510  0.7463  0.7480  0.0018  
Fit time          0.11    0.12    0.11    0.11    0.11    0.11    0.00    
Test time         0.10    0.11    0.17    0.16    0.10    0.13    0.03    


{'test_rmse': array([0.94675549, 0.9403513 , 0.94460222, 0.94722602, 0.93888452]),
 'test_mae': array([0.74842747, 0.74618857, 0.74815628, 0.75103017, 0.74634   ]),
 'fit_time': (0.11129379272460938,
  0.11593890190124512,
  0.10991406440734863,
  0.10744047164916992,
  0.10954022407531738),
 'test_time': (0.09913206100463867,
  0.10679364204406738,
  0.17378520965576172,
  0.16010570526123047,
  0.10203862190246582)}

* ### load from dataframe

In [31]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise.model_selection import cross_validate
import pandas as pd

In [32]:
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

In [35]:
reader=Reader(rating_scale=(1,5))
data=Dataset.load_from_df(df[['userID','itemID','rating']],reader=reader)
cross_validate(NormalPredictor(),data,cv=3,verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.4145  1.4673  1.0170  1.2996  0.2010  
MAE (testset)     1.0214  1.4622  1.0170  1.1669  0.2088  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.41453811, 1.467296  , 1.01704987]),
 'test_mae': array([1.02142488, 1.46218097, 1.01704987]),
 'fit_time': (0.0001456737518310547,
  0.00010561943054199219,
  9.584426879882812e-05),
 'test_time': (0.0005013942718505859,
  9.894371032714844e-05,
  5.0067901611328125e-05)}

In [36]:
df

Unnamed: 0,itemID,userID,rating
0,1,9,3
1,1,32,2
2,1,2,4
3,2,45,3
4,2,user_foo,1


* ### Use cross-validation 

In [37]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold

In [38]:
data=Dataset.load_builtin('ml-100k')

In [41]:
kf=KFold(n_splits=5)
algo=SVD()
for trainset,testset in kf.split(data):
    algo.fit(trainset)
    prediction=algo.test(testset)
    
    accuracy.rmse(predictions,verbose=True)

RMSE: 0.9377
RMSE: 0.9377
RMSE: 0.9377
RMSE: 0.9377
RMSE: 0.9377
