In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

In [3]:
import os
# Directory holding the MovieLens "latest-small" CSV files, relative to the notebook.
asset_base = "../asset/ml-latest-small/"
# Show the directory contents to confirm which files are available to load.
os.listdir(path=asset_base)

['links.csv', 'tags.csv', 'ratings.csv', 'README.txt', 'movies.csv']

In [4]:
# Read the four CSV tables from asset_base in one pass; the unpacking order
# matches the file-name order below.
movies, ratings, links, tags = (
    pd.read_csv(asset_base + csv_name)
    for csv_name in ("movies.csv", "ratings.csv", "links.csv", "tags.csv")
)

In [5]:
# Peek at the last five rows of the ratings table.
ratings.tail(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [6]:
# Keep every column of `ratings` except the last one (timestamp, per the tail() output).
df = ratings.loc[:, ratings.columns[:-1]]

In [7]:
# Pivot the long ratings table into a wide one: one row per userId, one column
# per movieId; (user, movie) pairs with no rating become NaN.
df_table = df.set_index(keys=["userId", "movieId"]).unstack(level="movieId")

In [9]:
import surprise

In [11]:
# Load the 'ml-100k' dataset bundled with Surprise.
# NOTE(review): the CSV tables read from asset_base earlier are not what is fed
# to the models below -- confirm that using the built-in dataset is intentional.
data = surprise.Dataset.load_builtin(name='ml-100k')

In [12]:
from surprise.model_selection import KFold

# Settings handed to the baseline estimator (ALS method).
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)

algo = surprise.BaselineOnly(bsl_options=bsl_options)

# Seed the global NumPy RNG before any fold is drawn.
np.random.seed(0)
cv = KFold(n_splits=3)
# One RMSE slot per fold.
acc = np.zeros(cv.n_splits)

for fold_idx, (trainset, testset) in enumerate(cv.split(data)):
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc[fold_idx] = surprise.accuracy.rmse(predictions, verbose=True)
# Average RMSE across the folds (displayed as the cell result).
np.mean(acc)

Estimating biases using als...
RMSE: 0.9453
Estimating biases using als...
RMSE: 0.9377
Estimating biases using als...
RMSE: 0.9500


0.9443304984013942

In [14]:
from surprise.model_selection import cross_validate

In [15]:
%%time
algo = surprise.SVD(n_factors=20)
print(cross_validate(algo, data))

{'test_rmse': array([0.93455846, 0.94132777, 0.92695815, 0.9348729 , 0.93820775]), 'test_mae': array([0.73946363, 0.74515879, 0.72980675, 0.73676238, 0.73888612]), 'fit_time': (1.8517980575561523, 1.823612928390503, 1.7706680297851562, 1.7799780368804932, 1.774785041809082), 'test_time': (0.20868301391601562, 0.11798906326293945, 0.11498117446899414, 0.19322490692138672, 0.11872386932373047)}
CPU times: user 10.4 s, sys: 28.9 ms, total: 10.5 s
Wall time: 10.5 s


In [16]:
from surprise.model_selection import train_test_split

In [17]:
from surprise import accuracy

In [18]:
# Hold out 15% of the ratings for evaluation.
# random_state is pinned so the split no longer depends on how much of the
# global NumPy RNG stream earlier cells have consumed: re-running this cell on
# its own (or running cells out of order) now yields the same train/test sets.
trainset, testset = train_test_split(data, test_size=.15, random_state=0)

In [20]:
# Flip this single flag to switch between user-based (True) and item-based
# (False) collaborative filtering; the printed labels below follow it, so the
# reported RMSE values are never mislabelled.
user_based = True
model_label = "User-based" if user_based else "Item-based"

algo = surprise.KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': user_based})
algo.fit(trainset)

# We can now query for specific predictions.
uid = str(196)  # raw user id
iid = str(302)  # raw item id

# Get a prediction for a specific user and item (r_ui is the known true rating,
# passed only so it is shown alongside the estimate).
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

# Run the trained model against the held-out test set.
test_pred = algo.test(testset)

# Report RMSE on the test set.
print(model_label + " Model : Test Set")
accuracy.rmse(test_pred, verbose=True)

# Also evaluate on the training set to see how far the in-sample error sits
# below the held-out error.
print(model_label + " Model : Training Set")
train_pred = algo.test(trainset.build_testset())
accuracy.rmse(train_pred)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
user: 196        item: 302        r_ui = 4.00   est = 4.18   {'actual_k': 50, 'was_impossible': False}
User-based Model : Test Set
RMSE: 0.9278
User-based Model : Training Set
RMSE: 0.4595


0.4595192769045692