In [46]:
import numpy as np
import pandas as pd
import random
from surprise import Dataset
from surprise import Reader
from surprise import SVD, SVDpp
from surprise import BaselineOnly
from surprise import CoClustering
from surprise.model_selection import KFold
from surprise import accuracy
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, SlopeOne
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from sklearn import linear_model

In [17]:
#functions
def get_users(line):
    row, col = line.split("_")
    row = row.replace("r", "")
    return int(row)
def get_items(line):
    row, col = line.split("_")
    col = col.replace("c", "")
    return int(col)
def adapt_prediction_in_matrix(predictions):
    X = []
    for pred in predictions:
        Y = []
        pred.sort()
        for pre in pred:
            Y.append(pre.est)
        X.append(Y)
    return X

In [3]:
data = pd.read_csv('data_train.csv')
data['userID'] = data['Id'].apply(get_users)
data['itemID'] = data['Id'].apply(get_items)
data = data.drop('Id', axis=1)
data = data.rename(columns={'Prediction':'rating'})[['userID','itemID','rating']]
data.head()

Unnamed: 0,userID,itemID,rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [4]:
reader = Reader(rating_scale=(1, 5))
surp = Dataset.load_from_df(data, reader)

In [48]:
#defines options for the different algorithms used
bsl_options = {'method': 'als',
               'n_epochs': 20,
               }
sim_options = {'name': 'pearson_baseline'}

In [49]:
#defines the list of algos to be blended
algos = [SlopeOne(),
         KNNBaseline(k=60,sim_options=sim_options, bsl_options=bsl_options), 
         CoClustering(),
         SVDpp()
        ]


In [50]:
def getBlendingCoefficients(algos):
    """Creates a k-fold validation on the data 
    with all the specified algorithms 
    and then tests is at every-fold time 
    to have different coefficient for blending
    input : algos  : to be tested
    outputs : vector of the coefficient of the best parameters"""
    
    print('splits the data set') 
    train_set, test_set = train_test_split(surp)
    
    print('fits the algos on the train_set')
    for algo in algos:
        algo.fit(train_set)
    
    print('test the alogs on the test set')
    predictions = []
    for algo in algos:
        prediction = algo.test(test_set)
        predictions.append(prediction)
        accuracy.rmse(prediction, verbose=True)
        
    print('calculates the blending coefficients')
    
    X = adapt_prediction_in_matrix(predictions)
    
    test_set.sort()
    test_set = np.array(test_set)
    reg = linear_model.LinearRegression().fit(np.transpose(X), test_set[:,2])
    
    return reg.coef_

In [51]:
coefs = getBlendingCoefficients(algos)

splits the data set
fits the algos on the train_set
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
test the alogs on the test set
RMSE: 1.0006
RMSE: 0.9968
RMSE: 1.0120
RMSE: 1.0173
calculates the blending coefficients


In [52]:
coefs

array([ 0.23670371,  0.50315915, -0.07752119,  0.3256102 ])

In [53]:
#loads the data set

In [54]:
#notice that you need to fit the algos on the whole train set this time to have the final result
train_set = surp.build_full_trainset()

In [55]:
test = pd.read_csv('examples_sample_submission.csv')

In [56]:
test['userID'] = test['Id'].apply(get_users)
test['itemID'] = test['Id'].apply(get_items)

In [57]:
#fits the algos on the full data set
for algo in algos:
    algo.fit(train_set)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [58]:
#predicts with the effect of each coefficient defined
preds = test.apply(lambda row: round( np.sum([coefs[i] * algos[i].predict(row.userID, row.itemID).est for i in range(len(coefs))] ) ), axis=1)
test['Prediction'] = preds
test = test.drop(['userID','itemID'], axis=1)

In [59]:
test.to_csv('blend4BasicModels.csv', index=False)