In [28]:
import numpy as np
import pandas as pd
import random
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

Load the CSV file into a pandas DataFrame with the column layout Surprise expects (userID, itemID, rating)

In [29]:
def get_users(line):
    """Return the user (row) number encoded in an id such as ``"r44_c1"``.

    The id is split on ``"_"`` and must yield exactly two parts. Every
    ``"r"`` is removed from the first part, which is then converted with
    ``int``.
    """
    user_part, _ = line.split("_")
    return int(user_part.replace("r", ""))

In [30]:
def get_items(line):
    """Return the item (column) number encoded in an id such as ``"r44_c1"``.

    The id is split on ``"_"`` and must yield exactly two parts. Every
    ``"c"`` is removed from the second part, which is then converted with
    ``int``.
    """
    _, item_part = line.split("_")
    return int(item_part.replace("c", ""))

In [36]:
# Read the training ratings; the preview below shows an `Id` column
# (e.g. "r44_c1") and a `Prediction` column holding the rating.
data = pd.read_csv('data_train.csv')

In [32]:
# Preview the first rows to check the raw `Id` / `Prediction` layout.
data.head()

Unnamed: 0,Id,Prediction
0,r44_c1,4
1,r61_c1,3
2,r67_c1,4
3,r72_c1,3
4,r86_c1,5


In [37]:
# Parse the "r<user>_c<item>" ids into numeric columns, rename the target
# column to `rating`, and keep only (userID, itemID, rating) in that order.
data = (
    data
    .assign(
        userID=data['Id'].apply(get_users),
        itemID=data['Id'].apply(get_items),
    )
    .rename(columns={'Prediction': 'rating'})
    .loc[:, ['userID', 'itemID', 'rating']]
)

In [38]:
# Preview again to confirm the frame now holds userID, itemID and rating.
data.head()

Unnamed: 0,userID,itemID,rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


Load the DataFrame into a Surprise dataset

In [39]:
# Declare the rating scale as 1 to 5.
reader = Reader(rating_scale=(1, 5))
# Wrap the (userID, itemID, rating) frame as a Surprise dataset.
surp = Dataset.load_from_df(data, reader)

Example grid search with the SVD algorithm

In [40]:
# Cross-validation folds and SVD's initial factors are drawn at random, so
# seed both global generators first; otherwise the scores printed below
# change on every run of the notebook.
random.seed(0)
np.random.seed(0)

# Deliberately tiny example grid: only the number of training epochs varies.
param_grid = {'n_epochs': [1, 2]}
# 2-fold cross-validated search over the grid, scored with RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=2)
gs.fit(surp)

In [41]:
# Tabulate the grid-search results, one column per `cv_results` key.
results_df = pd.DataFrame(gs.cv_results)

In [42]:
# Report the best RMSE, then the parameter combination that achieved it,
# each on its own line.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

1.015813004134662
{'n_epochs': 2}


Example basic fit on full dataset

In [43]:
# Build a trainset from the full dataset in `surp` (no hold-out split).
trainset = surp.build_full_trainset()

In [44]:
# Baseline options: estimate the biases with SGD at a learning rate of 5e-5.
bsl_options = dict(method='sgd', learning_rate=5e-5)
algo = BaselineOnly(bsl_options=bsl_options)
# Fit on the full trainset; as the last expression, the fitted model is displayed.
algo.fit(trainset)

Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x12cf98cc0>

Load test file and predict ratings

In [45]:
# Read the sample submission: it supplies the `Id`s to predict, and its
# `Prediction` column is overwritten further down.
test = pd.read_csv('examples_sample_submission.csv')

In [46]:
# Add numeric userID / itemID columns parsed from the "r<user>_c<item>" ids.
test = test.assign(
    userID=test['Id'].apply(get_users),
    itemID=test['Id'].apply(get_items),
)

In [47]:
# Preview the test frame with the parsed userID / itemID columns added.
test.head()

Unnamed: 0,Id,Prediction,userID,itemID
0,r37_c1,3,37,1
1,r73_c1,3,73,1
2,r156_c1,3,156,1
3,r160_c1,3,160,1
4,r248_c1,3,248,1


In [48]:
# Predict one rating per (user, item) pair. Iterating the two id columns
# directly avoids building a Series object for every row, which is what
# `DataFrame.apply(axis=1)` does.
preds = pd.Series(
    [
        # `round` turns the estimated rating into the nearest integer.
        round(algo.predict(user_id, item_id).est)
        for user_id, item_id in zip(test['userID'], test['itemID'])
    ],
    index=test.index,  # keep the same row alignment as `test`
)
# Overwrite the `Prediction` column with the model's predictions.
test['Prediction'] = preds

In [49]:
# Remove the helper id columns, leaving only the columns read from the CSV.
test = test.drop(columns=['userID', 'itemID'])

Create submission file

In [101]:
# Write the predictions to disk without the DataFrame index.
# NOTE(review): the output name has no `.csv` extension — confirm this is intended.
test.to_csv('subVince', index=False)