In [1]:
import random

import pandas as pd
from surprise import Dataset, Reader
from surprise import evaluate, print_perf
from surprise import prediction_algorithms as pa

In [2]:
# Load the ratings table once; the later cells read its userId, movieId,
# rating and timestamp columns.
data = pd.read_csv('./movielens_small/ratings.csv')

# Total number of ratings, used to turn a train fraction into a row count.
number_of_rows = data.shape[0]

In [3]:
# 5-fold cross-validation of a basic k-NN recommender (k=10) on all ratings.
# `evaluate` prints RMSE and MAE for every fold plus their means and returns
# the collected measures, kept in `perf`.

# The fold assignment is a random shuffle; seed the generator so the reported
# numbers can be reproduced on a fresh kernel (Restart & Run All).
SEED = 42
random.seed(SEED)

# NOTE(review): confirm that every rating in ratings.csv lies inside this
# scale (e.g. that the file has no half-star ratings below 1).
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)
train_data.split(n_folds=5)

algo = pa.knns.KNNBasic(k=10)

perf = evaluate(algo, train_data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9717
MAE:  0.7433
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9595
MAE:  0.7357
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9713
MAE:  0.7408
------------
Fold 4
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9625
MAE:  0.7367
------------
Fold 5
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9691
MAE:  0.7429
------------
------------
Mean RMSE: 0.9668
Mean MAE : 0.7399
------------
------------


In [15]:
# Chronological hold-out evaluation: for each train fraction, fit KNNBasic
# (k=10) on the earliest ratings and score it on the remaining, later ones.
train_fractions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9]
rating_columns = ['userId', 'movieId', 'rating']

# Loop-invariant setup, done once instead of on every iteration. The sorted
# frame gets its own name so the global `data` is not silently re-ordered for
# other cells; mergesort is stable, so ratings that share a timestamp keep
# their original relative order.
ratings_by_time = data.sort_values('timestamp', kind='mergesort')

# NOTE(review): confirm that every rating in ratings.csv lies inside this
# scale (e.g. that the file has no half-star ratings below 1).
reader = Reader(rating_scale=(1, 5))

for x in train_fractions:
    split = int(x * len(ratings_by_time))
    train_df = ratings_by_time.iloc[:split]
    test_data = ratings_by_time.iloc[split:]

    # Nothing to fit or nothing to score: skip instead of dividing by zero.
    if train_df.empty or test_data.empty:
        print("skipping train fraction", x, ": empty train or test split")
        continue

    train_data = Dataset.load_from_df(train_df[rating_columns], reader).build_full_trainset()

    algo = pa.knns.KNNBasic(k=10)
    # NOTE(review): `train` matches the Surprise release that still ships
    # `evaluate`; newer releases appear to name this method `fit` - confirm
    # against the installed version.
    algo.train(train_data)

    # Signed prediction error for every held-out rating. itertuples avoids
    # building a Series per row and keeps the ids as integers.
    errors = [
        algo.predict(user_id, movie_id).est - rating
        for user_id, movie_id, rating in test_data[rating_columns].itertuples(index=False)
    ]

    rmse = (sum(err * err for err in errors) / len(errors)) ** 0.5
    mae = sum(abs(err) for err in errors) / len(errors)

    # Label each result with its fraction so the output is self-describing.
    print("train fraction : = ", x)
    print("rmse : = ", rmse)
    print("mae : = ", mae)

Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.07101198352
mae : =  0.851547586044
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.07653509197
mae : =  0.845802206656
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.06700145423
mae : =  0.83260725111
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.04385117435
mae : =  0.812186292471
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.04681014379
mae : =  0.815498098866
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.05644160379
mae : =  0.824657058312
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.07779533901
mae : =  0.844681589602
Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.0719785683
mae : =  0.84331336382
Computing the msd similarity matrix

In [13]:
# Per-user chronological hold-out: for every user, the earliest `split` share
# of their ratings goes to training and the rest to testing; KNNBasic (k=10)
# is fit on the training part and scored with RMSE / MAE on the test part.
split = 0.8
rating_columns = ['userId', 'movieId', 'rating']

# Sorted copy under its own name so the global `data` is not re-ordered for
# other cells.
ratings_by_user = data.sort_values(['userId', 'timestamp'], ascending=[True, True])
users = ratings_by_user['userId'].unique()

# sort=False keeps groups in order of first appearance, which on the sorted
# frame is the same order as `users`.
# Assumes userId has no missing values - TODO confirm.
per_user = ratings_by_user.groupby('userId', sort=False)
user_set_sizes = per_user.size().tolist()

# Vectorised split, replacing the per-user mask plus DataFrame.append loop
# (quadratic, and `append` no longer exists in pandas 2.x): a rating is a
# training row when its 0-based position within its user's history is below
# int(split * number_of_ratings_of_that_user).
position_in_user = per_user.cumcount()
ratings_of_user = per_user['userId'].transform('count')
train_cutoff = (split * ratings_of_user).astype(int)
is_train = position_in_user < train_cutoff

train_df = ratings_by_user[is_train]
test_data = ratings_by_user[~is_train]

# NOTE(review): confirm that every rating in ratings.csv lies inside this
# scale (e.g. that the file has no half-star ratings below 1).
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_df[rating_columns], reader).build_full_trainset()

algo = pa.knns.KNNBasic(k=10)
# NOTE(review): `train` matches the Surprise release that still ships
# `evaluate`; newer releases appear to name this method `fit` - confirm
# against the installed version.
algo.train(train_data)

# Signed prediction error for every held-out rating. itertuples avoids
# building a Series per row and keeps the ids as integers.
errors = [
    algo.predict(user_id, movie_id).est - rating
    for user_id, movie_id, rating in test_data[rating_columns].itertuples(index=False)
]

rmse = (sum(err * err for err in errors) / len(errors)) ** 0.5
mae = sum(abs(err) for err in errors) / len(errors)
print("rmse : = ", rmse)
print("mae : = ", mae)

Computing the msd similarity matrix...
Done computing similarity matrix.
rmse : =  1.0073038407
mae : =  0.770365747354
