# Ensemble

In [2]:
import pandas as pd
import surprise
import numpy as np
import matplotlib.pyplot as plt
from time import time
from collections import defaultdict
import statistics
from scipy.stats import percentileofscore
import math
from scipy.stats import uniform

from fastai.collab import * 
from fastai.tabular import *
import seaborn as sns

from sklearn import preprocessing as pre
from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import os

## Utils

In [3]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

## Read and clean dataset

In [5]:
df.head()

Unnamed: 0,"2765830;""(Aalborg) Andersen","Lucas "";6.74000000;1"
0,"3948313;""(Aalborg) Andersen","Lucas "";2.73000000;1"
1,"10051046;""(Aalborg) Andersen","Lucas "";4.69000000;1"
2,"884724;""(Aalborg) Kusk K "";5.12000000;1",
3,"3891694;""(Aalborg) Kusk K "";3.24000000;1",
4,"824001;""(Aalborg) Prica","Tim "";2.45000000;1"


In [4]:
df=pd.read_csv(r'D:\Projects\ISAI\Data\SubEventsCF\results.csv', skiprows=0)
df.drop_duplicates(inplace=True)
print('We have',df.shape[0], 'ratings')
print('The number of unique users we have is:', len(df["IDUtente"].unique()))
print('The number of unique teams we have is:', len(df["Team"].unique()))
print('The median user rated %d teams.'%df["IDUtente"].value_counts().median())
print('The max rating is: %d'%df["Importo"].max(),'the min rating is: %d'%df["Importo"].min())
df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


We have 13395033 ratings


KeyError: 'IDUtente'

In [81]:
dftot=pd.read_csv(r'D:\Projects\ISAI\Data\SubEventsCF\itemCFdata.csv', skiprows=0)
dfnum=pd.read_csv(r'D:\Projects\ISAI\Data\SubEventsCF\numerogiocateCF.csv', skiprows=0)

##Check for teams that have been bet on less times than arbitrary value
#min_inst = 1
#team_count = dftot.value_counts('Team') < min_inst
##Get list of teams to drop
#teams_to_drop = team_count.where(team_count==True).dropna().index
##Drop from dataframe
#dftot = dftot[~dftot['Team'].isin(teams_to_drop)]

start_time = time()

alpha = 0.5
beta = 0.5

#Calculate, for each user, the percentile rank for each amount relative to the list of all the amounts wagered
for user in set(dftot['IDUtente']):
    importi = dftot.loc[dftot['IDUtente'] == user, 'Importo']
    numero = dfnum.loc[dfnum['IDUtente'] == user, 'NumeroGiocate']

    dftot.loc[dftot['IDUtente'] == user, 'Importo'] = [(alpha*(percentileofscore(importi, a, 'rank')/20) + beta*(percentileofscore(numero, b, 'rank')/20)) for a,b in zip(importi,numero)]
    

total_time = time()-start_time
print('Time elapsed in percentile calculation: ', total_time, 's')
print('Average time per user: ', total_time/len(set(df['IDUtente'])), 's')

Time elapsed in percentile calculation:  32.278406381607056 s
Average time per user:  0.00016813508967963712 s


In [83]:
print('After pruning:')
print('We have',dftot.shape[0], 'ratings')
print('The number of unique users we have is:', len(dftot["IDUtente"].unique()))
print('The number of unique teams we have is:', len(dftot["Team"].unique()))
print('The median user rated %d teams.'%dftot["IDUtente"].value_counts().median())
print('The max rating is: %d'%dftot["Importo"].max(),'the min rating is: %d'%dftot["Importo"].min())
dftot.head()

After pruning:
We have 185273 ratings
The number of unique users we have is: 6706
The number of unique teams we have is: 6085
The median user rated 12 teams.
The max rating is: 5 the min rating is: 0


Unnamed: 0,IDUtente,Team,Importo
0,1773,(Mercedes) L.Hamilton,1.427833
1,21249,(Mercedes) V.Bottas,5.0
2,536,(Racing Point) S.Perez,3.558052
3,15746,07 Vestur,1.225394
4,20820,07 Vestur,2.217742


In [84]:
matrix = df.pivot_table(index='IDUtente', columns='Team', values='Importo')
matrix

Team,(Mercedes) L.Hamilton,(Mercedes) V.Bottas,(Racing Point) S.Perez,07 Vestur,1 Dezembro,1. FFC Frankfurt Women,1860 Munich,1877 Alemdag,1922 Konyaspor,4 de Julho,...,Zlatibor,Znamya Noginsk,Zob Ahan,Zoe Hammond,Zonguldak Komurspor,Zoo Kericho FC,Zrinjski Mostar,Zvezda Ryazan,Zvezda St. Petersburg,Zweigen Kanazawa
IDUtente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
388.0,,,,,,,,,,,...,,,,,,,,,,
391.0,,,,,,,,,,,...,,,,,,,,,,
392.0,,,,,,,,,,,...,,,,,,,,,,
393.0,,,,,,,,,,,...,,,,,,,,,,
394.0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21376.0,,,,,,,,,,,...,,,,,,,,,,
21377.0,,,,,,,,,,,...,,,,,,,,,,
21378.0,,,,,,,,,,,...,,,,,,,,,,
21380.0,,,,,,,,,,,...,,,,,,,,,,


In [85]:
df = dftot

#swapping columns
raw=df[['IDUtente','Team','Importo']] 
raw.columns = ['n_users','n_items','rating']

rawTrain,rawholdout = train_test_split(raw, test_size=0.25)
# when importing from a DF, you only need to specify the scale of the ratings.
reader = surprise.Reader(rating_scale=(0,5))
#into surprise:
data = surprise.Dataset.load_from_df(rawTrain,reader)
holdout = surprise.Dataset.load_from_df(rawholdout,reader)

## In Pseudo Code, our Algorithm is as follows:
We split the dataset into 10 folds, where we train on 9 of the folds and test on the remaining one, which randomly alternates..
We run several recommender systems on the dataset, and optimize the recommender systems on the 75% system.
intialize a weighted variable alpha to be 1/q, where q is the number of recommender systems we use.
let the rated matrix equal alpha * sum(predicted Ratings Matrices) and compare that with the real rating.
Using Gradient Descent, optimize the alpha term over parameter space to be able to optimize to give the most weight to the model which can represent the best prediction.
### First, lets pick some algorithms to include into our ensemble. We'll choose four.
1. Collaborative Filtering
2. Matrix Factorization
3. Collaborative filtering with co-clustering
4. Collaborative Filtering based on the popular Slope One Algorithm

In [86]:
# split data into folds. 
kSplit = surprise.model_selection.split.KFold(n_splits=10, shuffle=True) 
#initialize error vectors
rmseKNN = []
rmseSVD = []
rmseCo = []
rmseSlope = []

## Collaborative Filtering classic KNN
Number one on our list: Collaborative filtering is a recommender system that recommends based off of similiarity between items. The big idea is that items that are similiar should be similiarly liked by the same user. For example, if you liked Alien, and you really liked Predator, there's a good chance you'll enjoy Alien Versus Predator. We're just doing the same thing with books here. If you'd like to read more, read up here: http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf

In [87]:
sim_options = sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
collabKNN = surprise.KNNBasic(k=30, sim_options=sim_options)
for trainset, testset in kSplit.split(data): #data leakage due to pre-processing before splitting?
    collabKNN.fit(trainset)
    predictionsKNN = collabKNN.test(testset)
    rmseKNN.append(surprise.accuracy.rmse(predictionsKNN,verbose=True))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0408
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0473
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0496
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0412
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0464
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0419
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0369
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0455
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0519
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0313


In [88]:
time_start = time()
predictionsKNN = []

for team in set(df['Team']):
    predictionsKNN.append(collabKNN.predict(21385, team))

top_n = get_top_n(predictionsKNN, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

Time elapsed:  0.24300003051757812 s


defaultdict(list,
            {21385: [('Virtus Bologna', 3.630952380952381),
              ('VfL Bad Schwartau', 3.630952380952381),
              ('Shawinigan Cataractes', 3.630952380952381),
              ("St Patrick's Athletic FC", 3.630952380952381),
              ('PEPO Lappeenranta', 3.630952380952381),
              ('Kristianstads DFF (Women)', 3.630952380952381),
              ('SC Freiburg Women', 3.630952380952381),
              ('Gatineau Olympiques', 3.630952380952381),
              ('Petaling Jaya City FC', 3.630952380952381),
              ('Spartak Noginsk Women', 3.630952380952381)]})

## Singular Vector Decomposition
This algorithm was created by Simon Funk during the Netflix Prize, and it is called FunkSVD. The big idea behind this algorithm is you try to estimate the best latent factors for the ratings. So, if you have a 100k users and 10k books, you factor the 100k x 10k matrix into the number of factors. In turn, you would be making two 100k x 30 and 30 x 10k matrices. You multiply them together to get the predicted rating. This lets us optimize on the latent factors between users, such as users that are similiar together because they all rated action films, and latent factors between items, like book series like Goosebumps and Steven King. We multiply each of these to get the predicted rating.

If you'd like to read more, look it up here: https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf

In [90]:
funkSVD = surprise.prediction_algorithms.matrix_factorization.SVD(n_epochs=30, n_factors=100)

for trainset, testset in kSplit.split(data): #iterate through the folds.
    funkSVD.fit(trainset) 
    predictionsSVD = funkSVD.test(testset)   
    rmseSVD.append(surprise.accuracy.rmse(predictionsSVD, verbose=True)) #get root means squared error

RMSE: 0.9808
RMSE: 0.9872
RMSE: 0.9866
RMSE: 0.9876
RMSE: 0.9877
RMSE: 0.9785
RMSE: 0.9869
RMSE: 0.9866
RMSE: 0.9856
RMSE: 0.9912


In [91]:
time_start = time()
predictionsSVD = []

for team in set(df['Team']):
    predictionsSVD.append(funkSVD.predict(21385, team))

top_n = get_top_n(predictionsSVD, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

Time elapsed:  0.08399677276611328 s


defaultdict(list,
            {21385: [('Kuban Holding', 3.5385588951585705),
              ('Brisbane Roar', 3.504588551750979),
              ('PFC Kuban', 3.446240657728788),
              ('Indiana Pacers', 3.424433123406577),
              ('Deportivo Pereira', 3.3924991126353987),
              ('Boston Celtics', 3.392130081057103),
              ('The Strongest', 3.377591093752082),
              ('Fiorentina', 3.3578938732120793),
              ('Detroit Pistons', 3.34038150527184),
              ('AFK Csikszereda Miercurea Ciuc', 3.336929752513204)]})

### Co-clustering collaborative filtering.
Co-clustering is where you cluster users and items together, using clustering techniques. You identify three clusters. You'll have to sum three things to get a predicted rating:
1. You find the cluster for the specified rating of user u and item i, and identify the mean of that cluster. So you find the mean of cluster u_i.
2. find the mean of the cluster of item i and subtract that from the average rating of that item.
3. find the mean of cluster of user u and substract that from the average rating of that user. 

If you want to learn more about Co-Clustering, read more here: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf

In [92]:
coClus = surprise.prediction_algorithms.co_clustering.CoClustering(n_cltr_u=4, n_cltr_i=4, n_epochs=100) 
for trainset, testset in kSplit.split(data): #iterate through the folds.
    coClus.fit(trainset)
    predictionsCoClus = coClus.test(testset)
    rmseCo.append(surprise.accuracy.rmse(predictionsCoClus,verbose=True))#get root means squared error

RMSE: 1.1653
RMSE: 1.1509


KeyboardInterrupt: 

In [None]:
time_start = time()
predictionscoClus = []

for team in set(df['Team']):
    predictionscoClus.append(coClus.predict(21385, team))

top_n = get_top_n(predictionscoClus, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

### Slope One Collaborative Filtering Algorithm
This algorithm computes the slope of each of the relevant items rated by a user, finds the difference, then computes the prediction. Its a blunt instrument, but its a good heuristic that might improve our ensemble method. You can read more here: https://arxiv.org/abs/cs/0702144

In [93]:
slopeOne = surprise.prediction_algorithms.slope_one.SlopeOne()
for trainset, testset in kSplit.split(data): #iterate through the folds.
    slopeOne.fit(trainset)
    predictionsSlope = slopeOne.test(testset)
    rmseSlope.append(surprise.accuracy.rmse(predictionsSlope,verbose=True))#get root means squared error

RMSE: 1.0680
RMSE: 1.0687
RMSE: 1.0730
RMSE: 1.0693
RMSE: 1.0732
RMSE: 1.0794
RMSE: 1.0741
RMSE: 1.0787
RMSE: 1.0700
RMSE: 1.0770


In [95]:
time_start = time()
predictionsslope = []

for team in set(df['Team']):
    predictionsslope.append(slopeOne.predict(21385, team))

top_n = get_top_n(predictionsslope, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

Time elapsed:  0.2909975051879883 s


defaultdict(list,
            {21385: [('CD Huarte', 5),
              ('Beti Kozkor KE', 5),
              ('Petaling Jaya City FC', 5),
              ('WBC Wels', 5),
              ('Oviedo Baloncesto', 5),
              ('Klosterneuburg Dukes', 5),
              ('Joker Swiecie Women', 5),
              ('CA Talleres Remedios de Escalada', 5),
              ('Hyogo Storks', 5),
              ('Alians Lypova Dolyna', 4.913888888888888)]})

## Deep factorization

In [96]:
data = CollabDataLoaders.from_df(df, seed=42, valid_pct=0.2, user_name='IDUtente', item_name='Team', rating_name='Importo')
learn = collab_learner(data, y_range=(0,5.5), n_factors=60)
learn.fit_one_cycle(n_epoch=10, lr_max=5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.973273,0.960936,00:35
1,0.891658,0.905405,00:40
2,0.775183,0.891358,00:42
3,0.726702,0.886046,00:44
4,0.642304,0.875212,00:43
5,0.533611,0.871006,00:41
6,0.433984,0.867051,00:40
7,0.333522,0.867322,00:37
8,0.241643,0.8698,00:39
9,0.207458,0.87052,00:36


In [99]:
learn.show_results()

Unnamed: 0,IDUtente,Team,Importo,Importo_pred
0,860.0,3857.0,3.75,2.961366
1,3919.0,921.0,1.964286,3.164419
2,868.0,2682.0,2.8,2.567116
3,3865.0,2980.0,1.3,2.158129
4,317.0,2494.0,2.672575,2.963706
5,1739.0,5492.0,2.75,2.847874
6,5683.0,4534.0,1.860987,4.486747
7,5453.0,5116.0,3.421053,3.186068
8,2518.0,1506.0,3.092105,2.60853


### Ensemble algorithm

It looks like our KNN is outperforming the rest. Lets try to hybridize the models so we can get the best parts of every model. To do this, we're going to use Suprise to make a new algorithm, and make it out-perform the rest.

Now we'll make a class in Surprise and inherit it from Algobase.

In [42]:
class HybridFacto(surprise.AlgoBase):
    def __init__(self, epochs, learning_rate):
        self.alpha = np.array([0.25]*4)
        self.epochs = epochs
        self.learning_rate = learning_rate
    
    def fit(self, holdout):
        holdout=holdout.build_full_trainset().build_testset()
        
        for epoch in range(self.epochs): 
            print('Starting epoch: ', epoch)
                
            predictions = np.array([collabKNN.test(holdout), funkSVD.test(holdout), coClus.test(holdout), slopeOne.test(holdout)])
                        
            maeGradient = [surprise.accuracy.mae([pred for pred in prediction]) for prediction in predictions] 
            
            newalpha = self.alpha - np.transpose([self.learning_rate * mae for mae in maeGradient])
            
            #convergence check:
            alpha_diff = [x-y for x,y in zip(newalpha, self.alpha)]
            alpha_abs_mean = abs(np.mean(alpha_diff))
             
            print('alpha_abs_mean: ', alpha_abs_mean)
            print('====================================')
            
            if alpha_abs_mean < 0.001:
                break
                    
            self.alpha = newalpha
            
    def estimate(self,u,i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        algoResults = np.array([collabKNN.predict(u,i),funkSVD.predict(u,i),coClus.predict(u,i),slopeOne.predict(u,i)])
        return np.sum(np.dot(self.alpha,algoResults))

In [55]:

#Round predicted ratings


class HybridFacto(surprise.AlgoBase):
    def __init__(self, epochs, learning_rate):
        self.alpha = np.array([0.25]*4)
        self.epochs = epochs
        self.learning_rate = learning_rate
    
    def fit(self, holdout):
        holdout=holdout.build_full_trainset().build_testset()
        
        for epoch in range(self.epochs): 
            print('Starting epoch: ', epoch)
                
            predictions = np.array([collabKNN.test(holdout), funkSVD.test(holdout), coClus.test(holdout), slopeOne.test(holdout)])
                 
            print(predictions[0][0])
                
            maeGradient = [surprise.accuracy.rmse([pred for pred in prediction]) for prediction in predictions] 
            
            
            newalpha = self.alpha - np.transpose([self.learning_rate * mae for mae in maeGradient])
            
            #convergence check:
            alpha_diff = [x-y for x,y in zip(newalpha, self.alpha)]
            alpha_abs_mean = abs(np.mean(alpha_diff))
             
            print('alpha_abs_mean: ', alpha_abs_mean)
            print('====================================')
            
            if alpha_abs_mean < 0.001:
                break
                    
            self.alpha = newalpha
            
    def estimate(self,u,i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        algoResults = np.array([collabKNN.predict(u,i), funkSVD.predict(u,i), coClus.predict(u,i), slopeOne.predict(u,i)])
        return np.sum(np.dot(self.alpha,algoResults))

In [56]:
holdout = surprise.Dataset.load_from_df(rawholdout, reader)
hybrid = HybridFacto(epochs=2, learning_rate=0.005)
hybrid.fit(holdout)

rmseHyb = []
for trainset, testset in kSplit.split(data): #iterate through the folds.
    predhybrid = hybrid.test(testset)
    rmseHyb.append(surprise.accuracy.rmse(predhybrid))

Starting epoch:  0
[11673 'CS Maritimo Funchal U23' 4.173640167364017 2.1523709902370993
 {'actual_k': 30, 'was_impossible': False}]
RMSE: 1.4761
RMSE: 1.4267
RMSE: 1.6040
RMSE: 1.5075
alpha_abs_mean:  0.007517887943545264
Starting epoch:  1
[11673 'CS Maritimo Funchal U23' 4.173640167364017 2.1523709902370993
 {'actual_k': 30, 'was_impossible': False}]
RMSE: 1.4761
RMSE: 1.4267
RMSE: 1.6040
RMSE: 1.5075
alpha_abs_mean:  0.007517887943545264


AttributeError: 'HybridFacto' object has no attribute 'trainset'