# Collaborative Filtering Latest Version

In [24]:
import pandas as pd
import surprise
import numpy as np
import matplotlib.pyplot as plt
from time import time
from collections import defaultdict
import statistics
from scipy.stats import percentileofscore
import math
from scipy.stats import uniform

from fastai.collab import * 
from fastai.tabular import *
import seaborn as sns

from sklearn import preprocessing as pre
from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import os
from tqdm.notebook import tqdm

## Utils

In [4]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

## Read and clean dataset

In [46]:
df=pd.read_csv(r'C:\Users\m.medioli\Desktop\resultssoccer.csv', skiprows=0, sep=';', names=["IDUtente", "Team", "Importo", "NumeroGiocate"])
df.drop_duplicates(inplace=True)

In [52]:
user_to_keep = list(set(df["IDUtente"]))[:20000]
#print(user_to_keep)
df_list = []
for user in tqdm(user_to_keep):
    df_list.append(df.loc[df['IDUtente'] == user])
df_new = pd.concat(df_list)

  0%|          | 0/20000 [00:00<?, ?it/s]

         IDUtente           Team  Importo  NumeroGiocate
84732     3145728         Aarhus     8.47              1
1503966   3145728     Bodo/Glimt     8.40              1
1775552   3145728        Brondby    11.77              1
2424157   3145728     Copenhagen     8.32              1
2556753   3145728  Crvena Zvezda    11.54              1
We have 10150515 ratings
The number of unique users we have is: 141109
The number of unique teams we have is: 803


In [84]:
print(df_new.columns)
print(df_new.head())
print('We have',df_new.shape[0], 'ratings')
print('The number of unique users we have is:', len(df_new["IDUtente"].unique()))
print('The number of unique teams we have is:', len(df_new["Team"].unique()))
print('The median user rated %d teams.'%df_new["IDUtente"].value_counts().median())
print('The max rating is: %d'%df_new["Importo"].max(),'the min rating is: %d'%df_new["Importo"].min())
df = df_new
df.head()

#dftot=pd.read_csv(r'D:\Projects\ISAI\Data\SubEventsCF\itemCFdata.csv', skiprows=0)
#dfnum=pd.read_csv(r'D:\Projects\ISAI\Data\SubEventsCF\numerogiocateCF.csv', skiprows=0)

##Check for teams that have been bet on less times than arbitrary value
#min_inst = 1
#team_count = dftot.value_counts('Team') < min_inst
##Get list of teams to drop
#teams_to_drop = team_count.where(team_count==True).dropna().index
##Drop from dataframe
#dftot = dftot[~dftot['Team'].isin(teams_to_drop)]

start_time = time()

alpha = 0.5
beta = 0.5

result = {}
#Calculate, for each user, the percentile rank for each amount relative to the list of all the amounts wagered
for user in tqdm(set(df['IDUtente'])):
    importi = df.loc[df['IDUtente'] == user, 'Importo']
    numero = df.loc[df['IDUtente'] == user, 'NumeroGiocate']

    df.loc[df['IDUtente'] == user, 'Rating'] = [(alpha*(percentileofscore(importi, a, 'rank')/20) + beta*(percentileofscore(numero, b, 'rank')/20)) for a,b in zip(importi,numero)]
    

total_time = time()-start_time
print('Time elapsed in percentile calculation: ', total_time, 's')
print('Average time per user: ', total_time/len(set(df['IDUtente'])), 's')

Index(['IDUtente', 'Team', 'Importo', 'NumeroGiocate', 'Rating'], dtype='object')
         IDUtente           Team  Importo  NumeroGiocate  Rating
84732     3145728         Aarhus     8.47              1     2.0
1503966   3145728     Bodo/Glimt     8.40              1     2.0
1775552   3145728        Brondby    11.77              1     3.0
2424157   3145728     Copenhagen     8.32              1     2.0
2556753   3145728  Crvena Zvezda    11.54              1     3.0
We have 1468660 ratings
The number of unique users we have is: 20000
The number of unique teams we have is: 801
The median user rated 43 teams.
The max rating is: 2300000 the min rating is: 0


  0%|          | 0/20000 [00:00<?, ?it/s]

Time elapsed in percentile calculation:  322.8234975337982 s
Average time per user:  0.01614117487668991 s


In [85]:
df.to_csv(r'C:\Users\m.medioli\Desktop\resultssoccer_with_ratings.csv')

In [228]:
df = pd.read_csv(r'C:\Users\m.medioli\Desktop\resultssoccer_with_ratings.csv')
df["Rating"]=round(df["Rating"] * 2) / 2 

In [229]:
matrix = df.pivot_table(index='IDUtente', columns='Team', values='Rating')
matrix

Team,07 Vestur Sorvagur,12 de Octubre de Itaugua,1928 Bucaspor,AB Copenhagen,AB Taarnby,ACS Sepsi Osk Sfantu Gheorghe,AL Ahli Saudi,AL Ansar,AL Batin,AL Ettifaq,...,Zaglebie Sosnowiec,Zamalek SC,Zanaco,Zeta Golubovci,Zhejiang Greentown,Zlin,Zob Ahan Isfahan,Zorka-BDU,Zugdidi,Zweigen Kanazawa
IDUtente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12676,,,3.5,,,1.5,0.5,3.5,,,...,,5.0,0.5,,,,,,0.5,1.5
12679,4.0,,3.0,3.0,3.5,,,2.0,,,...,,4.5,0.5,,,,2.0,3.0,,
12684,1.0,,1.5,,,,,4.0,,,...,,5.0,0.5,,,,3.5,1.0,,2.5
12694,0.5,,4.0,1.0,2.0,0.5,1.0,3.5,1.0,0.5,...,,5.0,1.0,0.5,,2.0,3.5,3.0,,
12699,,,2.5,0.5,2.0,,,,,,...,,3.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11625895,,,,,,,,,,,...,,,,,,,,,,
11675251,,,,,,,,,,,...,,2.0,,,,,,,,
11677337,,,,,,,,,,,...,,,,,,,,,,
11677671,,,,,,,,,,,...,,,,,1.5,,,,,


In [230]:
#swapping columns
raw=df[['IDUtente','Team','Rating']] 
raw.columns = ['n_users','n_items','rating']

rawTrain,rawholdout = train_test_split(raw, test_size=0.25)
# when importing from a DF, you only need to specify the scale of the ratings.
reader = surprise.Reader(rating_scale=(0,5))
#into surprise:
data = surprise.Dataset.load_from_df(rawTrain,reader)
holdout = surprise.Dataset.load_from_df(rawholdout,reader)

## In Pseudo Code, our Algorithm is as follows:
We split the dataset into 10 folds, where we train on 9 of the folds and test on the remaining one, which randomly alternates..
We run several recommender systems on the dataset, and optimize the recommender systems on the 75% system.
intialize a weighted variable alpha to be 1/q, where q is the number of recommender systems we use.
let the rated matrix equal alpha * sum(predicted Ratings Matrices) and compare that with the real rating.
Using Gradient Descent, optimize the alpha term over parameter space to be able to optimize to give the most weight to the model which can represent the best prediction.
### First, lets pick some algorithms to include into our ensemble. We'll choose four.
1. Collaborative Filtering
2. Matrix Factorization
3. Collaborative filtering with co-clustering
4. Collaborative Filtering based on the popular Slope One Algorithm

In [231]:
# split data into folds. 
kSplit = surprise.model_selection.split.KFold(n_splits=10, shuffle=True) 
#initialize error vectors
rmseKNN = []
rmseSVD = []
rmseCo = []
rmseSlope = []

## Collaborative Filtering classic KNN
Number one on our list: Collaborative filtering is a recommender system that recommends based off of similiarity between items. The big idea is that items that are similiar should be similiarly liked by the same user. For example, if you liked Alien, and you really liked Predator, there's a good chance you'll enjoy Alien Versus Predator. We're just doing the same thing with books here. If you'd like to read more, read up here: http://courses.ischool.berkeley.edu/i290-dm/s11/SECURE/a1-koren.pdf

In [92]:
sim_options = sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
collabKNN = surprise.KNNBasic(k=30, sim_options=sim_options)
for trainset, testset in kSplit.split(data): #data leakage due to pre-processing before splitting?
    collabKNN.fit(trainset)
    predictionsKNN = collabKNN.test(testset)
    rmseKNN.append(surprise.accuracy.rmse(predictionsKNN,verbose=True))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3598
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3651
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3630
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3608
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3633
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3579
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3574
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.3602
Computing the cosine similarity matrix...
Done computing similarity matrix.


KeyboardInterrupt: 

In [None]:
time_start = time()
predictionsKNN = []

for team in set(df['Team']):
    predictionsKNN.append(collabKNN.predict(21385, team))

top_n = get_top_n(predictionsKNN, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

## Singular Vector Decomposition
This algorithm was created by Simon Funk during the Netflix Prize, and it is called FunkSVD. The big idea behind this algorithm is you try to estimate the best latent factors for the ratings. So, if you have a 100k users and 10k books, you factor the 100k x 10k matrix into the number of factors. In turn, you would be making two 100k x 30 and 30 x 10k matrices. You multiply them together to get the predicted rating. This lets us optimize on the latent factors between users, such as users that are similiar together because they all rated action films, and latent factors between items, like book series like Goosebumps and Steven King. We multiply each of these to get the predicted rating.

If you'd like to read more, look it up here: https://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf

In [232]:
funkSVD = surprise.prediction_algorithms.matrix_factorization.SVD(n_epochs=30, n_factors=60)

for trainset, testset in kSplit.split(data): #iterate through the folds.
    funkSVD.fit(trainset) 
    predictionsSVD = funkSVD.test(testset)   
    rmseSVD.append(surprise.accuracy.rmse(predictionsSVD, verbose=True)) #get root means squared error

RMSE: 0.8235
RMSE: 0.8170
RMSE: 0.8254
RMSE: 0.8258
RMSE: 0.8207
RMSE: 0.8228
RMSE: 0.8225
RMSE: 0.8233
RMSE: 0.8194
RMSE: 0.8228


In [233]:
time_start = time()
predictionsSVD = []

for team in set(df['Team']):
    predictionsSVD.append(funkSVD.predict(3145728, team))

top_n = get_top_n(predictionsSVD, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

Time elapsed:  0.15302610397338867 s


defaultdict(list,
            {3145728: [('Manchester City', 3.307493257711414),
              ('Chelsea', 3.2293403568558805),
              ('Molde', 3.1718806020811057),
              ('Rosenborg BK', 3.1190600745468826),
              ('Las Palmas', 3.05434907095665),
              ('Albacete', 3.0399837635779847),
              ('Manchester Utd', 3.0194061703235384),
              ('Villarreal', 2.999943988404988),
              ('Rangers', 2.948045248029164),
              ('Partizan', 2.9400096588928277)]})

### Co-clustering collaborative filtering.
Co-clustering is where you cluster users and items together, using clustering techniques. You identify three clusters. You'll have to sum three things to get a predicted rating:
1. You find the cluster for the specified rating of user u and item i, and identify the mean of that cluster. So you find the mean of cluster u_i.
2. find the mean of the cluster of item i and subtract that from the average rating of that item.
3. find the mean of cluster of user u and substract that from the average rating of that user. 

If you want to learn more about Co-Clustering, read more here: https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf

In [211]:
coClus = surprise.prediction_algorithms.co_clustering.CoClustering(n_cltr_u=4, n_cltr_i=4, n_epochs=100) 
for trainset, testset in kSplit.split(data): #iterate through the folds.
    coClus.fit(trainset)
    predictionsCoClus = coClus.test(testset)
    rmseCo.append(surprise.accuracy.rmse(predictionsCoClus,verbose=True))#get root means squared error

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  coClus.fit(trainset)


RMSE: 1.0886
RMSE: 1.0899
RMSE: 1.0918
RMSE: 1.0996
RMSE: 1.0885
RMSE: 1.0843
RMSE: 1.0961
RMSE: 1.0951
RMSE: 1.0987
RMSE: 1.0850


In [103]:
time_start = time()
predictionscoClus = []

for team in set(df['Team']):
    predictionscoClus.append(coClus.predict(3145728, team))

top_n = get_top_n(predictionscoClus, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

Time elapsed:  0.10967063903808594 s


defaultdict(list,
            {3145728: [('Chelsea', 3.6514754527398496),
              ('Manchester City', 3.6392024414667015),
              ('Kawasaki Frontale', 3.3179366395122116),
              ('Villarreal', 3.2044865951160504),
              ('Ludogorets 1945 Razgrad', 3.2035993638846136),
              ('FK Haugesund', 3.2013608433749936),
              ('Brentford', 3.197546593125961),
              ('Manchester Utd', 3.1917244146388852),
              ('Liaoning Shenyang Urban FC', 3.1846390495875116),
              ("Newell's Old Boys", 3.184063344348594)]})

### Slope One Collaborative Filtering Algorithm
This algorithm computes the slope of each of the relevant items rated by a user, finds the difference, then computes the prediction. Its a blunt instrument, but its a good heuristic that might improve our ensemble method. You can read more here: https://arxiv.org/abs/cs/0702144

In [212]:
slopeOne = surprise.prediction_algorithms.slope_one.SlopeOne()
for trainset, testset in kSplit.split(data): #iterate through the folds.
    slopeOne.fit(trainset)
    predictionsSlope = slopeOne.test(testset)
    rmseSlope.append(surprise.accuracy.rmse(predictionsSlope,verbose=True))#get root means squared error

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  slopeOne.fit(trainset)


RMSE: 1.1148
RMSE: 1.1147
RMSE: 1.1186
RMSE: 1.1194
RMSE: 1.1172
RMSE: 1.1176
RMSE: 1.1182
RMSE: 1.1102
RMSE: 1.1178
RMSE: 1.1164


In [104]:
time_start = time()
predictionsslope = []

for team in set(df['Team']):
    predictionsslope.append(slopeOne.predict(3145728, team))

    
top_n = get_top_n(predictionsslope, n=10)
print('Time elapsed: ', time() - time_start,'s')
top_n

Time elapsed:  0.10912704467773438 s


defaultdict(list,
            {3145728: [('Malaga', 3.4473335223387),
              ('Leganes', 3.4472449399923484),
              ('Bodo/Glimt', 3.411814865633628),
              ('Zamalek SC', 3.400504198482074),
              ('Al Masry Club', 3.365550670209643),
              ('Molde', 3.296393859911216),
              ('Almeria', 3.2676310392488013),
              ('Alcorcon', 3.24795395323925),
              ('Rosenborg BK', 3.243939587772694),
              ('SK Brann', 3.238746082503729)]})

## Deep factorization

In [105]:
data = CollabDataLoaders.from_df(df, seed=42, valid_pct=0.2, user_name='IDUtente', item_name='Team', rating_name='Rating')
learn = collab_learner(data, y_range=(0,5.5), n_factors=60)
learn.fit_one_cycle(n_epoch=10, lr_max=5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,1.103976,1.083666,04:40
1,1.108586,1.094663,05:13
2,1.110226,1.108664,05:28
3,1.092068,1.092242,05:27
4,1.086868,1.058208,05:22
5,0.991498,1.015893,05:15
6,0.971052,0.960583,05:23
7,0.913638,0.902221,05:05
8,0.828764,0.862029,04:52
9,0.757077,0.854476,04:46


In [106]:
learn.show_results()

Unnamed: 0,IDUtente,Team,Rating,Rating_pred
0,15851.0,424.0,1.5,1.601537
1,14890.0,749.0,3.0,2.837229
2,4532.0,64.0,1.5,1.907474
3,11073.0,52.0,3.0,3.243945
4,9208.0,82.0,4.0,2.592829
5,11146.0,460.0,4.0,2.131294
6,6451.0,680.0,4.5,3.48432
7,5537.0,542.0,3.5,1.929368
8,13277.0,497.0,2.0,2.581906


In [225]:
predictionsKNN = []
predictionsSVD = []
predictionsClus = []
predictionsslope = []

user_id = 4718603


for team in set(df['Team']):
    predictionsKNN.append(collabKNN.predict(user_id, team))

top_n = get_top_n(predictionsKNN, n=210)
print("KNN", top_n, "\n")
knn_df = pd.DataFrame(top_n[user_id], columns =['Team', 'Rating'])
print("\n===============================================================\n")

for team in set(df['Team']):
    predictionsSVD.append(funkSVD.predict(user_id, team))

top_n = get_top_n(predictionsSVD, n=210)
print("SVD", top_n, "\n")
svd_df = pd.DataFrame(top_n[user_id], columns =['Team', 'Rating'])
print("\n===============================================================\n")

for team in set(df['Team']):
    predictionscoClus.append(coClus.predict(user_id, team))

top_n = get_top_n(predictionscoClus, n=210)
print("CoClus", top_n, "\n")
coclus_df = pd.DataFrame(top_n[user_id], columns =['Team', 'Rating'])
print("\n===============================================================\n")

for team in set(df['Team']):
    predictionsslope.append(slopeOne.predict(user_id, team))
    
top_n = get_top_n(predictionsslope, n=210)
print("SlopeOne", top_n, "\n")
slope_df = pd.DataFrame(top_n[user_id], columns =['Team', 'Rating'])
print("\n===============================================================\n")

df_user = df[df["IDUtente"]==user_id]
print(df_user)
data = CollabDataLoaders.from_df(df, seed=42, valid_pct=0.2, user_name='IDUtente', item_name='Team', rating_name='Rating')
top_n = learn.get_preds(dl=data.test_dl(df[df["IDUtente"]==user_id]))
results = pd.DataFrame(top_n[0].numpy())
results = round(results * 2) / 2
df_user.insert(5, "Preds", np.array(results))
deep_df = df_user.sort_values(by='Preds', ascending=False)[["Team", "Rating", "Preds"]]
print("\n===============================================================\n")


print(knn_df[:20], "\n")
print(svd_df[:20], "\n")
print(slope_df[:20], "\n")
print(deep_df[:20], "\n") #slope_df, coclus_df, 
top_real = df_user.sort_values(by='Rating', ascending=False)[["Team", "Rating", "Preds"]]
print(top_real[:10])

KNN defaultdict(<class 'list'>, {4718603: [('Shaanxi Changan an Athletic', 4.186440205372335), ('Enugu Rangers', 4.156199414428075), ('Al Masry Club', 4.139328645082893), ('Zamalek SC', 4.1392323741911525), ('Akwa Utd', 4.139153421611319), ('Kilmarnock', 4.135800448829812), ('Brondby', 4.134936408228968), ('BFC Daugavpils', 4.118039617733069), ('Ermis Aradippou FC', 4.1178174893741115), ('Vikingur Gota', 4.11695801213459), ('Tanda', 4.112005376681386), ('Orebro', 4.104192669059536), ('Malmo FF', 4.104182282120609), ('Dundee', 4.103042605283178), ('Riga', 4.101733403642529), ('Apoel Nicosia', 4.101698165599266), ('Bobruichanka', 4.101123609540482), ('Keflavik IF', 4.086849407268486), ('Enosis Neon Paralimni FC', 4.085001751926629), ('Midtjylland', 4.084027229191426), ('SOA', 4.078823077515158), ('Sunderland U23', 4.072878913067888), ('Valur Reykjavik', 4.070914316275358), ('Breidablik Kopavogur', 4.069666296544566), ('IA Akranes', 4.069474599241073), ('Ismaily', 4.068267333391867), ('Zo



SlopeOne defaultdict(<class 'list'>, {4718603: [('Malaga', 4.23658761479624), ('Leganes', 4.230261805078307), ('Bodo/Glimt', 4.186796165194297), ('Zamalek SC', 4.16004084628039), ('Al Masry Club', 4.154445514425198), ('Molde', 4.087612403694409), ('Malmo FF', 4.0448044959799265), ('UD Logrones', 4.038882203722306), ('SK Brann', 4.038242962991968), ('Orebro', 4.034838286930954), ('Almeria', 4.029901286232689), ('Girona', 4.023881867113746), ('Rosenborg BK', 4.019723908718882), ('Alcorcon', 4.010753621294617), ('Kilmarnock', 3.9019812119559045), ('Nordsjaelland', 3.9001652413947987), ('Brondby', 3.8889441938007634), ('Dundee', 3.8780874651279365), ('Midtjylland', 3.8518375667364273), ('Aarhus', 3.8474150129307327), ('Keflavik IF', 3.744899244280123), ('Valur Reykjavik', 3.7367500047521776), ('Randers', 3.725236796926234), ('Copenhagen', 3.722995751472663), ('Rayo Vallecano', 3.6808578256242868), ('CD Castellon', 3.677370191577078), ('Sligo Rovers', 3.6692879299767576), ('Shamrock Rover



                           Team    Rating
0   Shaanxi Changan an Athletic  4.186440
1                 Enugu Rangers  4.156199
2                 Al Masry Club  4.139329
3                    Zamalek SC  4.139232
4                      Akwa Utd  4.139153
5                    Kilmarnock  4.135800
6                       Brondby  4.134936
7                BFC Daugavpils  4.118040
8            Ermis Aradippou FC  4.117817
9                 Vikingur Gota  4.116958
10                        Tanda  4.112005
11                       Orebro  4.104193
12                     Malmo FF  4.104182
13                       Dundee  4.103043
14                         Riga  4.101733
15                Apoel Nicosia  4.101698
16                 Bobruichanka  4.101124
17                  Keflavik IF  4.086849
18     Enosis Neon Paralimni FC  4.085002
19                  Midtjylland  4.084027 

                  Team    Rating
0                 Lugo  4.483062
1               Malaga  4.418946
2              

### Ensemble algorithm

It looks like our KNN is outperforming the rest. Lets try to hybridize the models so we can get the best parts of every model. To do this, we're going to use Suprise to make a new algorithm, and make it out-perform the rest.

Now we'll make a class in Surprise and inherit it from Algobase.

In [None]:
class HybridFacto(surprise.AlgoBase):
    def __init__(self, epochs, learning_rate):
        self.alpha = np.array([0.25]*4)
        self.epochs = epochs
        self.learning_rate = learning_rate
    
    def fit(self, holdout):
        holdout=holdout.build_full_trainset().build_testset()
        
        for epoch in range(self.epochs): 
            print('Starting epoch: ', epoch)
                
            predictions = np.array([collabKNN.test(holdout), funkSVD.test(holdout), coClus.test(holdout), slopeOne.test(holdout)])
                        
            maeGradient = [surprise.accuracy.mae([pred for pred in prediction]) for prediction in predictions] 
            
            newalpha = self.alpha - np.transpose([self.learning_rate * mae for mae in maeGradient])
            
            #convergence check:
            alpha_diff = [x-y for x,y in zip(newalpha, self.alpha)]
            alpha_abs_mean = abs(np.mean(alpha_diff))
             
            print('alpha_abs_mean: ', alpha_abs_mean)
            print('====================================')
            
            if alpha_abs_mean < 0.001:
                break
                    
            self.alpha = newalpha
            
    def estimate(self,u,i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        algoResults = np.array([collabKNN.predict(u,i),funkSVD.predict(u,i),coClus.predict(u,i),slopeOne.predict(u,i)])
        return np.sum(np.dot(self.alpha,algoResults))

In [None]:

#Round predicted ratings


class HybridFacto(surprise.AlgoBase):
    def __init__(self, epochs, learning_rate):
        self.alpha = np.array([0.25]*4)
        self.epochs = epochs
        self.learning_rate = learning_rate
    
    def fit(self, holdout):
        holdout=holdout.build_full_trainset().build_testset()
        
        for epoch in range(self.epochs): 
            print('Starting epoch: ', epoch)
                
            predictions = np.array([collabKNN.test(holdout), funkSVD.test(holdout), coClus.test(holdout), slopeOne.test(holdout)])
                 
            print(predictions[0][0])
                
            maeGradient = [surprise.accuracy.rmse([pred for pred in prediction]) for prediction in predictions] 
            
            
            newalpha = self.alpha - np.transpose([self.learning_rate * mae for mae in maeGradient])
            
            #convergence check:
            alpha_diff = [x-y for x,y in zip(newalpha, self.alpha)]
            alpha_abs_mean = abs(np.mean(alpha_diff))
             
            print('alpha_abs_mean: ', alpha_abs_mean)
            print('====================================')
            
            if alpha_abs_mean < 0.001:
                break
                    
            self.alpha = newalpha
            
    def estimate(self,u,i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        algoResults = np.array([collabKNN.predict(u,i), funkSVD.predict(u,i), coClus.predict(u,i), slopeOne.predict(u,i)])
        return np.sum(np.dot(self.alpha,algoResults))

In [None]:
holdout = surprise.Dataset.load_from_df(rawholdout, reader)
hybrid = HybridFacto(epochs=2, learning_rate=0.005)
hybrid.fit(holdout)

rmseHyb = []
for trainset, testset in kSplit.split(data): #iterate through the folds.
    predhybrid = hybrid.test(testset)
    rmseHyb.append(surprise.accuracy.rmse(predhybrid))