# Ensemble

In [10]:
import pandas as pd
import surprise as sur
import numpy as np
import matplotlib.pyplot as plt
from time import time
from collections import defaultdict
import statistics
from scipy.stats import percentileofscore
import math

from sklearn import preprocessing as pre
from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import os

## Read and clean dataset

In [175]:
df=pd.read_csv(r'D:\Projects\ISAI\Data\SubEventsCF\itemCFdata.csv', skiprows=0)
df.drop_duplicates(inplace=True)
print('We have',df.shape[0], 'ratings')
print('The number of unique users we have is:', len(df["IDUtente"].unique()))
print('The number of unique teams we have is:', len(df["Team"].unique()))
print('The median user rated %d teams.'%df["IDUtente"].value_counts().median())
print('The max rating is: %d'%df["Importo"].max(),'the min rating is: %d'%df["Importo"].min())
df.head()

We have 185273 ratings
The number of unique users we have is: 6706
The number of unique teams we have is: 6085
The median user rated 12 teams.
The max rating is: 216300 the min rating is: 0


Unnamed: 0,IDUtente,Team,Importo
0,1773,(Mercedes) L.Hamilton,105.894
1,21249,(Mercedes) V.Bottas,50.0
2,536,(Racing Point) S.Perez,57.69
3,15746,07 Vestur,5.1579
4,20820,07 Vestur,7.1


In [176]:
#Check for teams that have been bet on less times than arbitrary value
min_inst = 50
team_count = df.value_counts('Team') < min_inst
#Get list of teams to drop
teams_to_drop = team_count.where(team_count==True).dropna().index
#Drop from dataframe
df = df[~df['Team'].isin(teams_to_drop)]

start_time = time()
#Calculate, for each user, the percentile rank for each amount relative to the list of all the amounts wagered
for user in set(df['IDUtente']):
    importi = df.loc[df['IDUtente'] == user, 'Importo']
    df.loc[df['IDUtente'] == user, 'Importo'] = [percentileofscore(importi, a, 'rank')/10 for a in importi]

total_time = time()-start_time
print('Time elapsed in percentile calculation: ', total_time, 's')
print('Average time per user: ', total_time/len(set(df['IDUtente'])), 's')

Time elapsed in percentile calculation:  18.359578847885132 s
Average time per user:  0.0028935506458447804 s


In [167]:
print('The max rating is: %d'%df["Importo"].max(),'the min rating is: %d'%df["Importo"].min())
df.head()

The max rating is: 10 the min rating is: 0


Unnamed: 0,IDUtente,Team,Importo
157,619,Aalesunds,3.61039
158,1467,Aalesunds,5.862069
159,1737,Aalesunds,6.413043
160,1760,Aalesunds,8.513011
161,6736,Aalesunds,0.657895


In [168]:
matrix = df.pivot_table(index='IDUtente', columns='Team', values='Importo')
matrix

Team,AC Ajaccio,AC Horsens,AC Milan,AC Monza Brianza 1912,ACS Sepsi OSK Sfantul Gheorghe,ACS UTA Batrana Doamna,ACS Viitorul Pandurii Targu Jiu,ADO Den Haag,AE Paphos,AEK Athens,...,Wydad Casablanca,Yeni Malatyaspor,Yeovil Town,Yokohama F. Marinos,Yokohama FC,Young Boys,Zamalek,Zenit St Petersburg,Zob Ahan,Zoo Kericho FC
IDUtente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
388,,,,,,,,,,,...,,,,,,,,,,
391,,,,,,,,,,,...,,,,,,,,,,
392,,,,,,,,,,,...,,,,,,,,,,
393,,,9.426752,,,,,,,,...,,,,,,1.210191,,6.178344,,
394,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21376,,,,,,,,,,,...,,,,,,,,,,
21377,,,,,,,,,,,...,,,,,,,,,,
21378,,,,,,,,,,,...,,,,,,,,,,
21380,,,,,,,,,,,...,,,,,,,,,,


In [177]:
#swapping columns
raw=df[['IDUtente','Team','Importo']] 
raw.columns = ['n_users','n_items','rating']

rawTrain,rawholdout = train_test_split(raw, test_size=0.25 )
# when importing from a DF, you only need to specify the scale of the ratings.
reader = surprise.Reader(rating_scale=(1,10)) 
#into surprise:
data = surprise.Dataset.load_from_df(rawTrain,reader)
holdout = surprise.Dataset.load_from_df(rawholdout,reader)

## In Pseudo Code, our Algorithm is as follows:
We split the dataset into 10 folds, where we train on 9 of the folds and test on the remaining one, which randomly alternates..
We run several recommender systems on the dataset, and optimize the recommender systems on the 75% system.
intialize a weighted variable alpha to be 1/q, where q is the number of recommender systems we use.
let the rated matrix equal alpha * sum(predicted Ratings Matrices) and compare that with the real rating.
Using Gradient Descent, optimize the alpha term over parameter space to be able to optimize to give the most weight to the model which can represent the best prediction.
### First, lets pick some algorithms to include into our ensemble. We'll choose four.
1. Collaborative Filtering
2. Matrix Factorization
3. Collaborative filtering with co-clustering
4. Collaborative Filtering based on the popular Slope One Algorithm

## Collaborative Filtering
Number one on our list: Collaborative filtering is a recommender system that recommends based off of similiarity between items. The big idea is that items that are similiar should be similiarly liked by the same user. For example, if you liked Alien, and you really liked Predator, there's a good chance you'll enjoy Alien Versus Predator. We're just doing the same thing with books here. If you'd like to read more, read up here: http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf

In [178]:
kSplit = surprise.model_selection.split.KFold(n_splits=10, shuffle=True) # split data into folds. 

In [179]:
sim_options = sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
collabKNN = surprise.KNNBasic(k=40,sim_options=sim_options) #try removing sim_options. You'll find memory errors. 
rmseKNN = []
rmseSVD = []
rmseCo = []
rmseSlope = []
for trainset, testset in kSplit.split(data): #iterate through the folds.
    collabKNN.fit(trainset)
    predictionsKNN = collabKNN.test(testset)
    rmseKNN.append(surprise.accuracy.rmse(predictionsKNN,verbose=True)) #get root means squared error

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9262
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9483
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9379
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9528
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9398
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9348
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9317
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9657
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9426
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 2.9320


### Matrix Factorization Algorithm.
This algorithm was created by Simon Funk during the Netflix Prize, and it is called FunkSVD. The big idea behind this algorithm is you try to estimate the best latent factors for the ratings. So, if you have a 100k users and 10k books, you factor the 100k x 10k matrix into the number of factors. In turn, you would be making two 100k x 30 and 30 x 10k matrices. You multiply them together to get the predicted rating. This lets us optimize on the latent factors between users, such as users that are similiar together because they all rated action films, and latent factors between items, like book series like Goosebumps and Steven King. We multiply each of these to get the predicted rating.

If you'd like to read more, look it up here: https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf

In [185]:
funkSVD = surprise.prediction_algorithms.matrix_factorization.SVD(n_factors=30,n_epochs=1000,biased=True)

In [186]:
for trainset, testset in kSplit.split(data): #iterate through the folds.
    funkSVD.fit(trainset)
    predictionsSVD = funkSVD.test(testset)
    rmseSVD.append(surprise.accuracy.rmse(predictionsSVD,verbose=True)) #get root means squared error

RMSE: 3.3935
RMSE: 3.4097
RMSE: 3.4318
RMSE: 3.3554
RMSE: 3.4059
RMSE: 3.4020
RMSE: 3.4234
RMSE: 3.3965
RMSE: 3.3965
RMSE: 3.3913


### Co-clustering collaborative filtering.
Co-clustering is where you cluster users and items together, using clustering techniques. You identify three clusters. You'll have to sum three things to get a predicted rating:
1. You find the cluster for the specified rating of user u and item i, and identify the mean of that cluster. So you find the mean of cluster u_i.
2. find the mean of the cluster of item i and subtract that from the average rating of that item.
3. find the mean of cluster of user u and substract that from the average rating of that user. 

If you want to learn more about Co-Clustering, read more here: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf

In [21]:
coClus = surprise.prediction_algorithms.co_clustering.CoClustering(n_cltr_u=4,n_cltr_i=4,n_epochs=25) 
for trainset, testset in kSplit.split(data): #iterate through the folds.
    coClus.fit(trainset)
    predictionsCoClus = coClus.test(testset)
    rmseCo.append(surprise.accuracy.rmse(predictionsCoClus,verbose=True))#get root means squared error

RMSE: 3.0327
RMSE: 3.0197
RMSE: 3.0165
RMSE: 2.9973
RMSE: 2.9936
RMSE: 3.0360
RMSE: 3.0344
RMSE: 3.0084
RMSE: 3.0320
RMSE: 3.0260


### Slope One Collaborative Filtering Algorithm
This algorithm computes the slope of each of the relevant items rated by a user, finds the difference, then computes the prediction. Its a blunt instrument, but its a good heuristic that might improve our ensemble method. You can read more here: https://arxiv.org/abs/cs/0702144

In [22]:
slopeOne = surprise.prediction_algorithms.slope_one.SlopeOne()
for trainset, testset in kSplit.split(data): #iterate through the folds.
    slopeOne.fit(trainset)
    predictionsSlope = slopeOne.test(testset)
    rmseSlope.append(surprise.accuracy.rmse(predictionsSlope,verbose=True))#get root means squared error

RMSE: 2.9139
RMSE: 2.9314
RMSE: 2.9270
RMSE: 2.9443
RMSE: 2.9130
RMSE: 2.9303
RMSE: 2.9270
RMSE: 2.9349
RMSE: 2.9471
RMSE: 2.9342


### Ensemble algorithm

It looks like our KNN is outperforming the rest. Lets try to hybridize the models so we can get the best parts of every model. To do this, we're going to use Suprise to make a new algorithm, and make it out-perform the rest.

Now we'll make a class in Surprise and inherit it from Algobase.

In [159]:
class HybridFacto(surprise.AlgoBase):
    def __init__(self, epochs, learning_rate):
        self.alpha = np.array([0.25]*4)
        self.epochs = epochs
        self.learning_rate = learning_rate
    
    def fit(self, holdout):
        holdout=holdout.build_full_trainset().build_testset()
        
        for epoch in range(self.epochs): 
            print('Starting epoch: ', epoch)
                
            predictions = np.array([collabKNN.test(holdout),funkSVD.test(holdout),coClus.test(holdout),slopeOne.test(holdout)])
                        
            maeGradient = [surprise.accuracy.mae([pred for pred in prediction]) for prediction in predictions] 
            
            newalpha = self.alpha - np.transpose([self.learning_rate * mae for mae in maeGradient])
            #convergence check:
            alpha_diff = [x-y for x,y in zip(newalpha, self.alpha)]
            alpha_abs_mean = abs(np.mean(alpha_diff))
             
            print('alpha_abs_mean: ', alpha_abs_mean)
            print('====================================')
            
            if alpha_abs_mean < 0.001:
                break
            self.alpha = newalpha
            print("Old alpha: ", self.alpha)
            print("New alpha: ", newalpha)
            
    def estimate(self,u,i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        algoResults = np.array([collabKNN.predict(u,i),funkSVD.predict(u,i),coClus.predict(u,i),slopeOne.predict(u,i)])
        return np.sum(np.dot(self.alpha,algoResults))

In [163]:
class HybridFacto(surprise.AlgoBase):
    def __init__(self, epochs, learning_rate):
        self.alpha = np.array([0.25]*4)
        self.epochs = epochs
        self.learning_rate = learning_rate
    
    def fit(self, holdout):
        holdout=holdout.build_full_trainset().build_testset()
        
        for epoch in range(self.epochs): 
            print('Starting epoch: ', epoch)
                
            predictions = np.array([collabKNN.test(holdout),funkSVD.test(holdout),coClus.test(holdout),slopeOne.test(holdout)])
                
            print(predictions.shape)
            print(predictions)

            maeGradient = [surprise.accuracy.mae(prediction) for prediction in predictions] 
            
            
            newalpha = self.alpha - np.transpose([self.learning_rate * mae for mae in maeGradient])
            #convergence check:
            alpha_diff = [x-y for x,y in zip(newalpha, self.alpha)]
            alpha_abs_mean = abs(np.mean(alpha_diff))
             
            print('alpha_abs_mean: ', alpha_abs_mean)
            print('====================================')
            
            if alpha_abs_mean < 0.001:
                break
            self.alpha = newalpha
            print("Old alpha: ", self.alpha)
            print("New alpha: ", newalpha)
            
    def estimate(self,u,i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        algoResults = np.array([collabKNN.predict(u,i),funkSVD.predict(u,i),coClus.predict(u,i),slopeOne.predict(u,i)])
        return np.sum(np.dot(self.alpha,algoResults))

In [164]:
holdout = surprise.Dataset.load_from_df(rawholdout, reader)
hybrid = HybridFacto(epochs=100, learning_rate=0.05)
hybrid.fit(holdout)

rmseHyb = []
for trainset, testset in kSplit.split(data): #iterate through the folds.
    predhybrid = hybrid.test(testset)
    rmseHyb.append(surprise.accuracy.rmse(predhybrid))

Starting epoch:  0
(4, 33639, 5)
[[[4972 'Besiktas JK' 3.0327868852459017 5
   {'actual_k': 40, 'was_impossible': False}]
  [4972 'Portsmouth' 5.0 5 {'actual_k': 40, 'was_impossible': False}]
  [4972 'Bournemouth' 8.606557377049182 5
   {'actual_k': 40, 'was_impossible': False}]
  ...
  [12459 'NEC Nijmegen' 7.8125 5
   {'actual_k': 12, 'was_impossible': False}]
  [11519 'Gaziantep BB SK' 3.75 5
   {'actual_k': 2, 'was_impossible': False}]
  [11371 'Coritiba' 6.25 5 {'actual_k': 4, 'was_impossible': False}]]

 [[4972 'Besiktas JK' 3.0327868852459017 5 {'was_impossible': False}]
  [4972 'Portsmouth' 5.0 5 {'was_impossible': False}]
  [4972 'Bournemouth' 8.606557377049182 5 {'was_impossible': False}]
  ...
  [12459 'NEC Nijmegen' 7.8125 5 {'was_impossible': False}]
  [11519 'Gaziantep BB SK' 3.75 5 {'was_impossible': False}]
  [11371 'Coritiba' 6.25 5 {'was_impossible': False}]]

 [[4972 'Besiktas JK' 3.0327868852459017 3.8756225584591606
   {'was_impossible': False}]
  [4972 'Portsmouth

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

array([2.30934057, 2.19299398, 2.33908956, 2.28509024])

In [41]:
holdout.build_full_trainset().build_testset()

[(4972, 'Besiktas JK', 3.0327868852459017),
 (4972, 'Portsmouth', 5.0),
 (4972, 'Bournemouth', 8.606557377049182),
 (4972, 'Amstetten', 3.6065573770491803),
 (4972, 'Neuchatel Xamax', 2.7049180327868854),
 (4972, 'Leeds United', 2.2131147540983607),
 (4972, 'Jeonbuk Motors', 8.278688524590164),
 (4972, 'FC Heidenheim', 5.327868852459017),
 (4972, 'Grasshoppers Zurich', 5.655737704918033),
 (4972, 'Montpellier HSC', 2.2131147540983607),
 (4972, 'Chelsea', 2.2131147540983607),
 (4972, 'Greuther Furth', 5.327868852459017),
 (4972, 'Hamburger SV', 4.672131147540983),
 (10967, 'Al-Nasr (UAE)', 4.375),
 (10967, 'Al Wahda (UAE)', 4.375),
 (7428, 'Werder Bremen', 2.3214285714285716),
 (7428, 'Sturm Graz', 1.25),
 (7428, 'Young Boys', 4.107142857142857),
 (7428, 'Viktoria Plzen', 4.821428571428571),
 (7428, 'Ceske Budejovice', 4.821428571428571),
 (7428, 'Red Bull Salzburg', 1.25),
 (7428, 'Castellon', 9.464285714285714),
 (7428, 'Standard Liege', 3.3928571428571432),
 (7428, 'Sevilla', 6.60714