In [1]:
import numpy as np
import pandas as pd
import random
from surprise import Dataset
from surprise import Reader
from surprise import SVD, SVDpp, NMF
from surprise import BaselineOnly
from surprise import CoClustering
from surprise.model_selection import KFold
from surprise import accuracy
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, SlopeOne
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [2]:
#functions
def get_users(line):
    row, col = line.split("_")
    row = row.replace("r", "")
    return int(row)
def get_items(line):
    row, col = line.split("_")
    col = col.replace("c", "")
    return int(col)
def adapt_prediction_in_matrix(predictions):
    X = []
    for pred in predictions:
        Y = []
        pred.sort()
        for pre in pred:
            Y.append(pre.est)
        X.append(Y)
    return X

In [3]:
data = pd.read_csv('data_train.csv')
data['userID'] = data['Id'].apply(get_users)
data['itemID'] = data['Id'].apply(get_items)
data = data.drop('Id', axis=1)
data = data.rename(columns={'Prediction':'rating'})[['userID','itemID','rating']]
data.head()

Unnamed: 0,userID,itemID,rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [4]:
reader = Reader(rating_scale=(1, 5))
surp = Dataset.load_from_df(data, reader)

In [5]:
#defines options for the different algorithms used
bsl_options = {'method': 'als',
               'n_epochs': 20,
               }
sim_options = {'name': 'pearson_baseline'}

In [6]:
algos = [ SVD(),BaselineOnly()]

In [16]:
#defines the list of algos to be blended
algos = [BaselineOnly(),
         SlopeOne(),
         KNNBaseline(k=60,sim_options=sim_options, bsl_options=bsl_options),
        ]


In [21]:
def blendingCrossValidation(algos, dataset, kfold = 4, verbose = True):
    """performs a kfold validation on the dataset for a blending of all the algos
    1 fold is used for training the algos
    1 fold is used for training the blending
    remaining folds are used for computing the RMSE"""
    for k in range(kfold):
        if (verbose) : print('starts fold number ', k+1)
        #splits the data into a train_set and a validation set
        train_set, validation_set = train_test_split(dataset,test_size = 3./kfold)
        #splits the validation set into kfold-1 validation sets
        validation_set= np.array(validation_set,dtype=np.int)
        validation_sets = np.reshape(validation_set,[kfold-1, int(np.shape(validation_set)[0]/3),3] )
        
        if (verbose) : print('fits the algos on the train_set')
        for algo in algos:
            algo.fit(train_set)
        
        if (verbose) : print('test the algos on the test set for blending')
        test_set = validation_sets[0]
        predictions = []
        for algo in algos:
            prediction = algo.test(test_set)
            predictions.append(prediction)
            accuracy.rmse(prediction, verbose=verbose)

        if (verbose) : print('calculates the blending coefficients')

        X = adapt_prediction_in_matrix(predictions)
        
        #adapts the train set to a tuple to let it be sorted for the linear model calculation
        test_set = [tuple(row)for row in test_set]
        test_set.sort()
        test_set = np.array(test_set)

        reg = linear_model.LinearRegression().fit(np.transpose(X), test_set[:,2])

        if (verbose): print('blending coefficients : ', reg.coef_)
        coefs = reg.coef_
        
        #tests the algos on each validation set and prints out the rmse
        for val in validation_sets:
            pred = reg.predict([[algo.predict(userID, itemID).est for algo in algos] for userID,itemID in val[:,[0,1]] ])
            if(verbose) : print('RMSE of blending is : ', np.sqrt(mean_squared_error(val[:,2],pred)))
        

In [15]:
blendingCrossValidation(algos, surp)

starts fold number  1
fits the algos on the train_set
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
test the algos on the test set for blending
RMSE: 1.0061
RMSE: 1.0150
RMSE: 1.0147
RMSE: 1.0324
RMSE: 1.0370
RMSE: 1.0368
RMSE: 1.0343
calculates the blending coefficients
blending coefficients :  [ 0.69069791  0.07345918  0.18909439  0.0085653  -0.07854477  0.11419837
  0.00485125]
RMSE of blending is :  1.0046939446190746
RMSE of blending is :  1.0065228971080875
RMSE of blending is :  1.0053997002990074
starts fold number  2
fits the algos on the train_set
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
test the algos on the test set for blend