In [1]:
import numpy as np
import pandas as pd

from developed_methods import *

In [10]:
dtrain = pd.read_csv("data/train.csv")
dtest = pd.read_csv("data/test.csv")

## mapping 
from sklearn import preprocessing
le_user = preprocessing.LabelEncoder()
le_user.fit(np.append(dtrain['user_id'], dtest["user_id"]))
dtrain['user_id'] = le_user.transform(dtrain["user_id"])
dtest["user_id"] = le_user.transform(dtest["user_id"])

le_item = preprocessing.LabelEncoder()
le_item.fit(np.append(dtrain['item_id'], dtest["item_id"]))
dtrain["item_id"] = le_item.transform(dtrain["item_id"])
dtest["item_id"] = le_item.transform(dtest["item_id"])

In [11]:
# split the train data into train and test dataset
dtest_real = dtest.copy() # create a copy for the real test dataset

from sklearn.model_selection import train_test_split
dtrain, dtest = train_test_split(dtrain, test_size=0.2, random_state=42)

# train_pair, train_rating
train_pair = dtrain[['user_id', 'item_id']].values
train_rating = dtrain['rating'].values

## save real ratings for test set for evaluation.
test_rating = np.array(dtest['rating'])
test_pair = dtest[['user_id', 'item_id']].values

## remove the ratings in the test set to simulate prediction
dtest = dtest.drop(columns='rating')

n_user, n_item = max(train_pair[:,0].max(), test_pair[:,0].max())+1, max(train_pair[:,1].max(), test_pair[:,1].max())+1

In [12]:
## Step 2: Convert the data to be type that `suprise` could understand
# via `reader` in `surprise`
from surprise import Reader
from surprise import Dataset
# rating_scale is required
rating_min = min(train_rating)
rating_max = max(train_rating)
reader = Reader(rating_scale=(rating_min, rating_max))
## this auto-folds dataset
surp_train_AF = Dataset.load_from_df(dtrain[['user_id', 'item_id', 'rating']], reader)
## this is trainset dataset
surp_train = surp_train_AF.build_full_trainset()

In [8]:
## `surprise` has an issue to cold-start problem when make prediction
# define a prediction function to exclude the cold-start users/items
def surp_pred(dtest, dtrain, method, cold_start=True):
	gbl_mean = dtrain['rating'].mean()
	user_lst, item_lst = list(set(dtrain['user_id'])), list(set(dtrain['item_id']))
	surp_pred_lst = []
	for _, row in dtest.iterrows():
		user_id, item_id = row['user_id'], row['item_id']
		if cold_start:
			rating_tmp = method.estimate(user_id, item_id)
		else:
			if ((row['user_id'] in user_lst) and (row['item_id'] in item_lst)):
				rating_tmp = method.estimate(user_id, item_id)
			else:
				rating_tmp = gbl_mean
		surp_pred_lst.append(rating_tmp)
	surp_pred_lst = np.array(surp_pred_lst)
	return surp_pred_lst

In [13]:
## Step 3: determine the method you want to use
# algo 1: baseline
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise.model_selection import cross_validate
algo1 = BaselineOnly()
# fit model
algo1.fit(surp_train)
pred_baseline = surp_pred(dtest, dtrain, algo1)
print('RSME for surprise-baseline: %.3f' %rmse(test_rating, pred_baseline))

Estimating biases using als...
RSME for surprise-baseline: 3.575


In [14]:
# algo 2: SVD model
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise.model_selection import GridSearchCV
## GS based on CV
param_grid = {'reg_all': 10**np.arange(-4, -2, .5), 'n_factors': [2, 3, 5, 10]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surp_train_AF)
# cv result 
results_df = pd.DataFrame.from_dict(gs.cv_results)
print(results_df)
# best RMSE score
print('best RMSE score: %.3f' %gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print('best hyperparam: %s' %gs.best_params['rmse'])
# We can now use the algorithm that yields the best rmse:
algo2_best = gs.best_estimator['rmse']
algo2_best.fit(surp_train)
# make prediction
pred_svd = surp_pred(dtest, dtrain, algo2_best)
print('RSME for surprise-svd-best: %.3f' %rmse(test_rating, pred_svd))

    split0_test_rmse  split1_test_rmse  split2_test_rmse  mean_test_rmse  \
0           1.663445          1.681649          1.649549        1.664881   
1           1.663532          1.681998          1.650212        1.665247   
2           1.664459          1.682798          1.652870        1.666709   
3           1.669867          1.689716          1.656599        1.672060   
4           1.663600          1.681994          1.650850        1.665481   
5           1.664451          1.682732          1.650495        1.665893   
6           1.666235          1.682637          1.653086        1.667319   
7           1.671872          1.687321          1.657810        1.672334   
8           1.664358          1.683135          1.651464        1.666319   
9           1.663942          1.683778          1.651022        1.666247   
10          1.664755          1.684311          1.652819        1.667295   
11          1.668539          1.687528          1.657363        1.671143   
12          