In [86]:
import pandas as pd
import numpy as np
from implicit.evaluation import train_test_split, ranking_metrics_at_k
from implicit.datasets.movielens import get_movielens
import implicit
from itertools import product

class CrossValidation:
    
    def __init__(self, user_item, k):
        self.user_item = user_item
        self.k = k

    def mpr_per_user(self, model, train, test, num_recs, user):
        recommended_items = model.recommend(user_items=train[user], userid=user, filter_already_liked_items=True, N = num_recs)[0]
        test_items = test[user].nonzero()[1]
        test_items_in_list = test_items[np.isin(test_items, recommended_items)]
        if len(test_items_in_list) == 0:
            return 0.5
        recommended_indices = recommended_items.argsort()
        hit_indices = recommended_indices[np.searchsorted(recommended_items[recommended_indices], test_items_in_list)]
        #return (np.sum(hit_indices) / num_recs) / len(hit_indices)
        return np.mean(hit_indices / num_recs)
   
    def calc_mpr(self, model, train, test):
        mprs = []
        for u in range(self.user_item.shape[0]) :
            mpr = self.mpr_per_user(model, train, test, self.user_item.shape[1], u)
            mprs.append(mpr)
        return {'mpr' : np.mean(mprs)} 
   
    def evaluate_model(self, model, train, test, k):
        metrics = ranking_metrics_at_k(model, train, test, K=k, show_progress=False)
        mpr = self.calc_mpr(model, train, test)
        metrics.update(mpr)
        return pd.DataFrame(metrics, index=['metrics@'+str(k)])  
   
    def split_k_fold(self) :
        split_matrix = self.user_item
        return_dict = {}
        return_dict_train = {}
        for i in range(self.k-1):
            train_temp, test_temp = train_test_split(split_matrix, train_percentage=((self.k-(i+1))/(self.k-i)))
            return_dict[str(i)] = test_temp
            if i == 0:
                return_dict_train[str(i)] = train_temp
                rest = test_temp
            else:
                return_dict_train[str(i)] = (train_temp + rest)
                rest = (rest + test_temp)
            if i == (self.k-2):
                return_dict[str(i+1)] = train_temp
                return_dict_train[str(i+1)] = rest
            split_matrix = train_temp
        return (return_dict, return_dict_train)

    # WICHTIG: hier test, train sind dicts. Output von split_k_fold()
    def k_fold_eval(self, test, train, model, return_type) :
        for i in range(len(test)) :
            model = model
            test_temp = test[str(i)]
            train_temp = train[str(i)]
            #print(test_temp.nnz)
            #print(train_temp.nnz)
            model.fit(train_temp, show_progress=False)
            m = self.evaluate_model(model, train_temp, test_temp, 10)
            if i == 0:
                df = m
            else :
                df = pd.concat((df, m), axis=0)
        if return_type == 'full':
            return df
        if return_type == 'mean':
            return df.mean().to_frame().T

    def hyperp_tuning(self, test, train, param_space, eval):
        keys, values = zip(*param_space.items())
        result = [dict(zip(keys, p)) for p in product(*values)]
        first_iter = True
        for r in result:
            model = implicit.als.AlternatingLeastSquares(factors=r['factors'], regularization=r['regularization'], alpha=r['alpha'])
            if eval == 'cv':
                res = self.k_fold_eval(test, train, model, return_type='mean')
            if eval == 'split':
                res = self.evaluate_model(model, train, test, 10)

            if first_iter == True:
                metrics_frame = res
                first_iter = False
            else:
                metrics_frame = pd.concat((metrics_frame, res), axis=0)
        param_df = pd.DataFrame(result)
        ret = pd.concat((param_df.reset_index(drop=True), metrics_frame.reset_index(drop=True)), axis=1)
        return ret

In [66]:
from implicit.datasets.movielens import get_movielens
movies1, ratings1 = get_movielens("1M")
ratings_t = ratings1.T

In [87]:
cv = CrossValidation(ratings_t, 5)

In [88]:
test, train = cv.split_k_fold()

In [89]:
space = {'factors' : [20, 30], 'regularization' : [0.001, 0.01], 'alpha' : [0.8, 5]}
hyper_res = cv.hyperp_tuning(test, train, space, 'cv')

100%|██████████| 6037/6037 [00:00<00:00, 9401.27it/s] 
100%|██████████| 6037/6037 [00:00<00:00, 10282.13it/s]
100%|██████████| 6035/6035 [00:00<00:00, 10402.82it/s]
100%|██████████| 6035/6035 [00:00<00:00, 10641.33it/s]
100%|██████████| 6036/6036 [00:00<00:00, 10834.17it/s]
100%|██████████| 6037/6037 [00:00<00:00, 10777.92it/s]
100%|██████████| 6037/6037 [00:00<00:00, 10777.96it/s]
100%|██████████| 6035/6035 [00:00<00:00, 10832.35it/s]
100%|██████████| 6035/6035 [00:00<00:00, 10755.15it/s]
100%|██████████| 6036/6036 [00:00<00:00, 10737.78it/s]
100%|██████████| 6037/6037 [00:00<00:00, 10777.91it/s]
100%|██████████| 6037/6037 [00:00<00:00, 10607.43it/s]
100%|██████████| 6035/6035 [00:00<00:00, 10716.93it/s]
100%|██████████| 6035/6035 [00:00<00:00, 10678.99it/s]
100%|██████████| 6036/6036 [00:00<00:00, 10718.73it/s]
100%|██████████| 6037/6037 [00:00<00:00, 9172.69it/s] 
100%|██████████| 6037/6037 [00:00<00:00, 10682.54it/s]
100%|██████████| 6035/6035 [00:00<00:00, 10678.99it/s]
100%|█████

In [90]:
hyper_res

Unnamed: 0,factors,regularization,alpha,precision,map,ndcg,auc,mpr
0,20,0.001,0.8,0.38753,0.257579,0.387762,0.576139,0.077049
1,20,0.001,5.0,0.327916,0.193418,0.324089,0.569156,0.068544
2,20,0.01,0.8,0.3893,0.259558,0.389741,0.576749,0.077075
3,20,0.01,5.0,0.328874,0.19421,0.325073,0.569222,0.068503
4,30,0.001,0.8,0.404384,0.270659,0.405397,0.582143,0.075534
5,30,0.001,5.0,0.335961,0.198239,0.33266,0.573199,0.067554
6,30,0.01,0.8,0.403874,0.270369,0.404911,0.581911,0.075732
7,30,0.01,5.0,0.335527,0.197614,0.332048,0.573091,0.067703


In [84]:
pd.concat((param_df.reset_index(drop=True), hyper_res.reset_index(drop=True)), axis=1)

Unnamed: 0,factors,regularization,alpha,precision,map,ndcg,auc,mpr
0,20,0.001,0.8,0.388039,0.258531,0.388646,0.576439,0.077249
1,20,0.001,5.0,0.328631,0.194272,0.324672,0.569043,0.068649
2,20,0.01,0.8,0.388812,0.258967,0.389065,0.576649,0.076921
3,20,0.01,5.0,0.329897,0.195374,0.32597,0.569327,0.068556
4,30,0.001,0.8,0.404479,0.270703,0.404937,0.582027,0.075523
5,30,0.001,5.0,0.335868,0.19839,0.332643,0.573312,0.067652
6,30,0.01,0.8,0.403522,0.269893,0.403833,0.581823,0.075245
7,30,0.01,5.0,0.336921,0.199476,0.333837,0.573495,0.067508


In [36]:
import pandas as pd
test_df = pd.DataFrame(space)

In [46]:
test_df.mean().to_frame().T

Unnamed: 0,factors,regularization,alpha
0,25.0,0.0055,2.9


In [53]:
init_frame = pd.DataFrame(columns=keys)
init_frame

Unnamed: 0,factors,regularization,alpha


In [54]:
test_added = pd.concat((init_frame, test_df.mean().to_frame().T), axis=0)
test_added

Unnamed: 0,factors,regularization,alpha
0,25.0,0.0055,2.9


In [85]:
param_df = pd.DataFrame(result)
param_df

Unnamed: 0,factors,regularization,alpha
0,20,0.001,0.8
1,20,0.001,5.0
2,20,0.01,0.8
3,20,0.01,5.0
4,30,0.001,0.8
5,30,0.001,5.0
6,30,0.01,0.8
7,30,0.01,5.0


In [None]:
def hyperp_tuning(test, train, param_space, eval):
    keys, values = zip(*param_space.items())
    result = [dict(zip(keys, p)) for p in product(*values)]
    first_iter = True
    for r in result:
        model = implicit.als.AlternatingLeastSquares(factors=r['factors'], regularization=r['regularization'], alpha=r['alpha'])
        if eval == 'cv':
            res = k_fold_eval(test, train, model, return_type='mean')
        if eval == 'split':
            res = evaluate_model(model, train, test, 10)

        if first_iter == True:
            metrics_frame = res
            first_iter = False
        else:
            metrics_frame = pd.concat((metrics_frame, res), axis=0)
    return metrics_frame
        

In [35]:
space = {'factors' : [20, 30], 'regularization' : [0.001, 0.01], 'alpha' : [0.8, 5]}

In [10]:
for f in space['factors']:
    for r in space['reg']:
        for a in space['alpha']:
            print(str(f) + str(r) + str(a))

200.0010.8
200.0015
200.010.8
200.015
300.0010.8
300.0015
300.010.8
300.015


In [25]:
from itertools import product
keys, values = zip(*space.items())
result = [dict(zip(keys, p)) for p in product(*values)]

In [32]:
for r in result:
    model = implicit.als.AlternatingLeastSquares(factors=r['factors'], regularization=r['regularization'], alpha=r['alpha'], )

{'factors': 20, 'regularization': 0.001, 'alpha': 0.8}
{'factors': 20, 'regularization': 0.001, 'alpha': 5}
{'factors': 20, 'regularization': 0.01, 'alpha': 0.8}
{'factors': 20, 'regularization': 0.01, 'alpha': 5}
{'factors': 30, 'regularization': 0.001, 'alpha': 0.8}
{'factors': 30, 'regularization': 0.001, 'alpha': 5}
{'factors': 30, 'regularization': 0.01, 'alpha': 0.8}
{'factors': 30, 'regularization': 0.01, 'alpha': 5}


In [31]:
model = implicit.als.AlternatingLeastSquares(factors=result[0]['factors'], regularization=result[0]['regularization'], alpha=result[0]['alpha'])

In [14]:
params = space.keys()

In [16]:
list(params)[0]

'factors'