In [49]:
import numpy as np
import pandas as pd
import random
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import BaselineOnly
from surprise import CoClustering
from surprise.model_selection import KFold
from surprise import accuracy
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, SlopeOne
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from joblib import Parallel, delayed
import multiprocessing
import time

Load the CSV file into a pandas DataFrame in the column layout Surprise expects: (userID, itemID, rating)

In [2]:
def get_users(line):
    """Return the integer user id encoded in an ``r<user>_c<item>`` identifier.

    The identifier is split on ``"_"`` into exactly two parts (anything else
    raises ``ValueError``); every ``"r"`` is stripped from the first part and
    the remainder is parsed as an ``int``.
    """
    user_token, _item_token = line.split("_")
    return int(user_token.replace("r", ""))

In [3]:
def get_items(line):
    """Return the integer item id encoded in an ``r<user>_c<item>`` identifier.

    The identifier is split on ``"_"`` into exactly two parts (anything else
    raises ``ValueError``); every ``"c"`` is stripped from the second part and
    the remainder is parsed as an ``int``.
    """
    _user_token, item_token = line.split("_")
    return int(item_token.replace("c", ""))

In [4]:
# Read the raw training ratings: an "Id" column of the form r<user>_c<item>
# and a "Prediction" column holding the rating.
data = pd.read_csv('data_train.csv')

In [5]:
# Peek at the first rows of the raw frame to confirm the Id / Prediction layout.
data.head()

Unnamed: 0,Id,Prediction
0,r44_c1,4
1,r61_c1,3
2,r67_c1,4
3,r72_c1,3
4,r86_c1,5


In [6]:
# Split each "r<user>_c<item>" id into numeric columns and keep only the
# (userID, itemID, rating) triple, in that order. Selecting the three columns
# at the end also discards the original 'Id' column.
data = (
    data
    .assign(
        userID=data['Id'].apply(get_users),
        itemID=data['Id'].apply(get_items),
    )
    .rename(columns={'Prediction': 'rating'})
    [['userID', 'itemID', 'rating']]
)

In [7]:
# Check the reshaped frame: it should now expose userID, itemID and rating.
data.head()

Unnamed: 0,userID,itemID,rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


Load dataframe into surprise

In [8]:
# Ratings live on a 1-5 scale; the frame is handed over as-is, so its columns
# must already be ordered (userID, itemID, rating).
reader = Reader(rating_scale=(1, 5))
surp = Dataset.load_from_df(df=data, reader=reader)

Example grid search with the SVD algorithm

In [9]:
# Tiny demonstration grid: 1 vs 2 SVD training epochs, scored with RMSE and
# MAE on 2 cross-validation folds.
param_grid = dict(n_epochs=[1, 2])
gs = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse', 'mae'], cv=2)
gs.fit(surp)

In [10]:
# Tabulate the grid-search results: one row per parameter combination.
results_df = pd.DataFrame(gs.cv_results)

In [11]:
# Best mean RMSE across the grid, followed by the parameters that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

1.0160496276828161
{'n_epochs': 2}


k-fold fits on dataset

In [24]:
# KNNBasic with its default settings; note that the next cell rebinds `algo`
# to a KNNBasic configured with explicit baseline and similarity options.
algo = KNNBasic()

In [25]:
# KNNBasic driven by a pearson_baseline similarity, with the baselines
# estimated by ALS over 20 epochs.
bsl_options = dict(method='als', n_epochs=20)
sim_options = dict(name='pearson_baseline')
algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)

In [26]:
def kfoldfitting(algo, trainset, testset):
    """Fit ``algo`` on one fold and print its RMSE on the held-out part.

    Args:
        algo: object exposing ``fit(trainset)`` and ``test(testset)``.
        trainset: training portion of the fold, passed to ``algo.fit``.
        testset: held-out portion of the fold, passed to ``algo.test``.

    Returns:
        Always ``0``; the RMSE itself is only reported through
        ``accuracy.rmse(..., verbose=True)``.
    """
    algo.fit(trainset)
    fold_predictions = algo.test(testset)
    accuracy.rmse(fold_predictions, verbose=True)
    return 0

In [27]:
# One worker process per cross-validation fold, each running kfoldfitting on
# the current `algo`. The processes are only created here, not started.
kf = KFold(n_splits=3)
processes = [
    multiprocessing.Process(target=kfoldfitting, args=(algo, fold_train, fold_test))
    for fold_train, fold_test in kf.split(surp)
]

SVD

In [12]:
# SVD, one process per fold.
# The shared `processes` list is bound to whatever `algo` was when it was
# built (a KNNBasic when the notebook is run top to bottom), and a
# multiprocessing.Process can be started only once. Build a dedicated batch
# for SVD here instead of reusing that list.
svd_processes = [
    multiprocessing.Process(target=kfoldfitting, args=(SVD(), fold_train, fold_test))
    for fold_train, fold_test in kf.split(surp)
]
for p in svd_processes:
    p.start()
# Block until every fold has reported its RMSE so no worker outlives the cell.
for p in svd_processes:
    p.join()

RMSE: 1.0323
RMSE: 1.0338
RMSE: 1.0320


In [15]:
# KNNBasic with default options, one process per fold.
# Process objects are single-use: calling start() again on an already started
# process raises an error, so a fresh batch is built for this run rather than
# re-starting the shared `processes` list.
knn_processes = [
    multiprocessing.Process(target=kfoldfitting, args=(KNNBasic(), fold_train, fold_test))
    for fold_train, fold_test in kf.split(surp)
]
for p in knn_processes:
    p.start()
# Block until every fold has reported its RMSE so no worker outlives the cell.
for p in knn_processes:
    p.join()

Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
RMSE: 1.0301
RMSE: 1.0331
RMSE: 1.0319


In [28]:
# KNNBasic with ALS baselines and pearson_baseline similarity (the configured
# `algo`), one process per fold.
# Process objects are single-use: calling start() again on an already started
# process raises an error, so a fresh batch is built for this run rather than
# re-starting the shared `processes` list.
knn_pearson_processes = [
    multiprocessing.Process(target=kfoldfitting, args=(algo, fold_train, fold_test))
    for fold_train, fold_test in kf.split(surp)
]
for p in knn_pearson_processes:
    p.start()
# Block until every fold has reported its RMSE so no worker outlives the cell.
for p in knn_pearson_processes:
    p.join()

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
RMSE: 1.0356
RMSE: 1.0339
RMSE: 1.0362


In [25]:

# 3-fold cross-validation of SVD with its default hyper-parameters.
kf = KFold(n_splits=3)
algo = SVD()

for trainset, testset in kf.split(surp):
    # fit() returns the fitted estimator, so training and scoring chain together
    predictions = algo.fit(trainset).test(testset)
    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.0321
RMSE: 1.0328
RMSE: 1.0312


In [26]:
# 3-fold cross-validation of BaselineOnly, with baselines estimated by SGD at
# a learning rate of 5e-5.
kf = KFold(n_splits=3)
bsl_options = dict(method='sgd', learning_rate=.00005)
algo = BaselineOnly(bsl_options=bsl_options)

for trainset, testset in kf.split(surp):
    # fit() returns the fitted estimator, so training and scoring chain together
    predictions = algo.fit(trainset).test(testset)
    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

Estimating biases using sgd...
RMSE: 1.0447
Estimating biases using sgd...
RMSE: 1.0437
Estimating biases using sgd...
RMSE: 1.0453


In [27]:
# 3-fold cross-validation of KNNBaseline with its default options.
kf = KFold(n_splits=3)
algo = KNNBaseline()

for trainset, testset in kf.split(surp):
    # fit() returns the fitted estimator, so training and scoring chain together
    predictions = algo.fit(trainset).test(testset)
    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0121
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0128
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0111


In [None]:
# 3-fold cross-validation of KNNBasic with its default options.
kf = KFold(n_splits=3)

# The class is `KNNBasic`; the previous spelling `KNNBasicc` is not defined
# anywhere and raises NameError on a fresh kernel.
algo = KNNBasic()

for trainset, testset in kf.split(surp):

    # train on the fold's training part, predict its held-out part
    algo.fit(trainset)
    predictions = algo.test(testset)

    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0310
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0318


In [29]:
# 3-fold cross-validation of KNNWithMeans with its default options.
kf = KFold(n_splits=3)
algo = KNNWithMeans()

for trainset, testset in kf.split(surp):
    # fit() returns the fitted estimator, so training and scoring chain together
    predictions = algo.fit(trainset).test(testset)
    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0250
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0249
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0265


In [30]:
# 3-fold cross-validation of KNNWithZScore with its default options.
kf = KFold(n_splits=3)
algo = KNNWithZScore()

for trainset, testset in kf.split(surp):
    # fit() returns the fitted estimator, so training and scoring chain together
    predictions = algo.fit(trainset).test(testset)
    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0263
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0254
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0271


In [40]:
# 3-fold cross-validation of SlopeOne.
kf = KFold(n_splits=3)
algo = SlopeOne()

for trainset, testset in kf.split(surp):
    # fit() returns the fitted estimator, so training and scoring chain together
    predictions = algo.fit(trainset).test(testset)
    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.0013
RMSE: 1.0017
RMSE: 1.0014


In [50]:
# 3-fold cross-validation of CoClustering with its default options.
kf = KFold(n_splits=3)
algo = CoClustering()

for trainset, testset in kf.split(surp):
    # fit() returns the fitted estimator, so training and scoring chain together
    predictions = algo.fit(trainset).test(testset)
    # report this fold's root mean squared error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.0141
RMSE: 1.0135
RMSE: 1.0134


In [51]:
# Grid over the `n_cltr_u` setting of CoClustering, scored with RMSE and MAE
# on 3 cross-validation folds.
param_grid = dict(n_cltr_u=[4, 5, 6, 10])
gs = GridSearchCV(CoClustering, param_grid=param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(surp)

In [52]:
# Tabulate the grid-search results: one row per parameter combination.
results_df = pd.DataFrame(gs.cv_results)

In [53]:
# Best mean RMSE across the grid, followed by the parameters that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

1.014580842421983
{'n_cltr_u': 4}


In [56]:
# Raw grid-search results: per-split and mean/std RMSE and MAE, ranks and fit
# times, one array entry per parameter combination.
gs.cv_results

{'split0_test_rmse': array([1.01561956, 1.01602756, 1.0169597 , 1.01816032]),
 'split1_test_rmse': array([1.01346801, 1.01427019, 1.01477627, 1.01641257]),
 'split2_test_rmse': array([1.01465496, 1.01506113, 1.01645298, 1.01728355]),
 'mean_test_rmse': array([1.01458084, 1.01511963, 1.01606298, 1.01728548]),
 'std_test_rmse': array([0.00087993, 0.00071864, 0.00093307, 0.00071352]),
 'rank_test_rmse': array([1, 2, 3, 4]),
 'split0_test_mae': array([0.81656638, 0.81712222, 0.81775682, 0.81859485]),
 'split1_test_mae': array([0.81529681, 0.81574445, 0.81617181, 0.81719389]),
 'split2_test_mae': array([0.81584451, 0.81611471, 0.81707586, 0.81778652]),
 'mean_test_mae': array([0.81590256, 0.81632712, 0.8170015 , 0.81785842]),
 'std_test_mae': array([0.00051992, 0.00058218, 0.00064921, 0.00057419]),
 'rank_test_mae': array([1, 2, 3, 4]),
 'mean_fit_time': array([18.34776433, 18.88828731, 18.93605463, 20.37665057]),
 'std_fit_time': array([0.151754  , 0.5202598 , 0.06941659, 0.04220255]),
 'm

In [41]:
# Trainset used for the final model fits below, built from `surp` with
# build_full_trainset() rather than from a cross-validation split.
trainset = surp.build_full_trainset()

In [42]:
# Fit SlopeOne on the full trainset; fit() returns the fitted estimator, and
# the trailing expression keeps it displayed as the cell's output.
algo = SlopeOne().fit(trainset)
algo

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x7f3f07bbbb70>

In [13]:
# BaselineOnly (SGD baselines, learning rate 5e-5) fitted on the full trainset.
# It is bound to its own name on purpose: the prediction cell further down
# reads `algo`, and the submission is written to 'subSlopeOne.csv'. Rebinding
# `algo` here would silently swap the SlopeOne model for this baseline when
# the notebook is run top to bottom.
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
baseline_algo = BaselineOnly(bsl_options=bsl_options)
baseline_algo.fit(trainset)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7f3b3c574400>

Load test file and predict ratings

In [43]:
# Load the sample submission; its "Id" column lists the r<user>_c<item> pairs
# to predict and "Prediction" holds placeholder ratings.
test = pd.read_csv('examples_sample_submission.csv')

In [44]:
# Decode each "r<user>_c<item>" id into numeric userID / itemID columns.
test = test.assign(
    userID=test['Id'].apply(get_users),
    itemID=test['Id'].apply(get_items),
)

In [45]:
# Confirm the decoded userID / itemID columns sit next to the original Id.
test.head()

Unnamed: 0,Id,Prediction,userID,itemID
0,r37_c1,3,37,1
1,r73_c1,3,73,1
2,r156_c1,3,156,1
3,r160_c1,3,160,1
4,r248_c1,3,248,1


In [46]:
# Predict every (userID, itemID) pair with the fitted `algo` and round each
# estimate to an integer rating; the result is aligned on `test`'s index.
preds = pd.Series(
    [
        round(algo.predict(user_id, item_id).est)
        for user_id, item_id in zip(test['userID'], test['itemID'])
    ],
    index=test.index,
)
test['Prediction'] = preds

In [47]:
# Remove the helper id columns so only the submission columns remain.
test = test.drop(columns=['userID', 'itemID'])

Create submission file

In [48]:
# Write the submission frame to disk without the DataFrame index column.
test.to_csv('subSlopeOne.csv', index=False)