In [104]:
import os
import math
from datetime import datetime
from scipy import sparse as sp
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm, trange

from algorithms.slim.slim import SLIM
from conf import UN_LOG_VAL_STR, UN_LOG_TE_STR, DATA_PATH, DEMO_PATH, UN_OUT_DIR, DEMO_TRAITS
from utils.data_splitter import DataSplitter
from utils.eval import eval_proced, eval_metric
from utils.helper import pickle_dump, pickle_load

import csv



In [105]:
# Gather, for each of the 5 folds, the user groups obtained by splitting
# on the 'gender' trait. One entry per fold is appended to `user_groups`.
sp_tr_data = []
user_groups = []

# One timestamp, taken once, is fed to the log-string templates of all folds.
now = datetime.now()

for fold_n in trange(5, desc='folds'):
    # Log strings for this fold (built here, not consumed inside this cell).
    log_val_str = UN_LOG_VAL_STR.format('slim', now, fold_n)
    log_te_str = UN_LOG_TE_STR.format('slim', now, fold_n)

    # Resolve the data locations of the current fold.
    ds = DataSplitter(DATA_PATH, DEMO_PATH, out_dir=UN_OUT_DIR)
    pandas_dir_path, scipy_dir_path, uids_dic_path, tids_path = ds.get_paths(fold_n=fold_n)

    # Only the pandas directory is needed to look up the per-group indexes.
    gender_groups = ds.get_user_groups_indxs(pandas_dir_path, 'gender')
    user_groups.append(gender_groups)
    

folds: 100%|██████████| 5/5 [00:04<00:00,  1.03it/s]


In [106]:
# Per-fold sizes of the two gender groups, measured as the number of entries
# in each group's `te_indxs`. Positions 0 and 1 of a fold's entry are treated
# as male and female respectively, following the variable naming used here
# -- TODO confirm against DataSplitter.get_user_groups_indxs.
male_cnts = [len(user_group[0].te_indxs) for user_group in user_groups]
female_cnts = [len(user_group[1].te_indxs) for user_group in user_groups]

# Totals over all folds. They are summed from the per-fold counts above;
# the previous version summed `male_indxs` / `female_indxs`, names that are
# not defined anywhere in the notebook and only worked through leftover
# kernel state.
male_cnt = np.sum(male_cnts)
female_cnt = np.sum(female_cnts)

# Share of each group among all counted users (the "collection" distribution).
total_cnt = male_cnt + female_cnt
male_coll_portion = male_cnt / total_cnt
female_coll_portion = female_cnt / total_cnt

print(male_cnts)
print(female_cnts)
print(male_cnt)
print(female_cnt)
print(male_coll_portion)
print(female_coll_portion)

[3137, 3066, 3115, 3112, 3126]
[858, 928, 879, 882, 868]
15556
4415
0.7789294476991638
0.22107055230083622


[3137, 3066, 3115, 3112, 3126]

In [107]:

# Load 'hparams_table.csv' column-wise: the header row (minus its first
# cell) supplies the keys, and every following row contributes one float
# per column. The first column of every row is skipped.
results_all = {}
with open('hparams_table.csv', newline='') as csvfile:
    csv_rd = csv.reader(csvfile, delimiter=',')
    for row_i, row in enumerate(csv_rd):
        cells = row[1:]
        if row_i == 0:
            # Header row: remember the column titles and start an empty list for each.
            titles = cells
            for title in titles:
                results_all[title] = []
            continue
        # Data row: append each value to the list of its column.
        for col_i, cell in enumerate(cells):
            results_all[titles[col_i]].append(float(cell))
        



In [109]:
# Show the column titles loaded from the CSV (the keys of `results_all`).
results_all.keys()

dict_keys(['test/gender/m_ndcg_at_5', 'test/gender/f_ndcg_at_5', 'test/gender/m_recall_at_5', 'test/gender/f_recall_at_5', 'test/gender/m_ndcg_at_10', 'test/gender/f_ndcg_at_10', 'test/gender/m_recall_at_10', 'test/gender/f_recall_at_10', 'test/gender/m_ndcg_at_50', 'test/gender/f_ndcg_at_50', 'test/gender/m_recall_at_50', 'test/gender/f_recall_at_50'])

In [110]:
# Aggregate the per-fold values of every column of `results_all` in two ways:
#   results_avg[metric][gender]    -> plain mean over the folds
#   results_sumqry[metric][gender] -> sum over folds of (fold value * fold group count)
results_sumqry = {}
results_avg = {}

# Per-fold group counts, selected by the gender letter parsed from the key.
_fold_cnts = {'m': male_cnts, 'f': female_cnts}

for _key, item in results_all.items():
    # The last '/'-separated component has the form '<gender letter>_<metric>',
    # e.g. 'm_ndcg_at_5' -> gender 'm', metric 'ndcg_at_5'.
    _name = _key.split('/')[-1]
    _gender, _metric = _name[0], _name[2:]

    results_avg.setdefault(_metric, {'m': 0, 'f': 0})[_gender] = np.mean(item)

    # Weight each fold's value by that fold's group count.
    _lst = []
    for fold_i, fold_value in enumerate(item):
        if _gender in _fold_cnts:
            _lst.append(fold_value * _fold_cnts[_gender][fold_i])
        else:
            # Unknown gender letter: reported once per fold value, nothing is added.
            print('error')

    results_sumqry.setdefault(_metric, {'m': 0, 'f': 0})[_gender] = np.sum(_lst)



In [111]:
# Inspect `item`. NOTE(review): this relies on the loop variable left over
# from the aggregation loop in the previous cell, i.e. the per-fold values
# of the last key that loop processed.
item

[0.19974204897880554,
 0.19544340670108795,
 0.20901627838611603,
 0.19655463099479675,
 0.20161408185958862]

In [112]:
# Show the per-fold counts stored in `male_cnts` (one entry per fold).
male_cnts

[3137, 3066, 3115, 3112, 3126]

In [86]:
# Show the count-weighted sums per metric and gender.
# NOTE(review): this cell's execution count (In [86]) is lower than that of
# the cells before it, so the output shown may come from an earlier run --
# re-run after "Restart & Run All" to confirm.
results_sumqry

{'ndcg_at_5': {'m': 3482.088018089533, 'f': 854.3125765919685},
 'recall_at_5': {'m': 3237.983302742243, 'f': 788.9000030010939},
 'ndcg_at_10': {'m': 3058.222583323717, 'f': 764.6768234819174},
 'recall_at_10': {'m': 2786.9635389894247, 'f': 709.234129935503},
 'ndcg_at_50': {'m': 2642.48194180429, 'f': 743.0262452960014},
 'recall_at_50': {'m': 2849.937973961234, 'f': 884.8376757353544}}

In [124]:
def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence KL(p || q) in bits.

    Computes ``sum_i p[i] * log2(p[i] / q[i])`` over the indices of ``p``.

    Args:
        p: Sequence of probabilities of the reference distribution.
        q: Sequence of probabilities of the compared distribution; it is
            indexed with the positions of ``p``.

    Returns:
        The divergence as a number (0 when every entry of ``p`` is zero).

    Notes:
        Terms with ``p[i] == 0`` contribute 0, following the usual
        ``0 * log(0) = 0`` convention. Without this guard such a term
        evaluates to ``0 * -inf = nan`` and poisons the whole sum.
    """
    return sum(
        p[i] * np.log2(p[i] / q[i])
        for i in range(len(p))
        if p[i] != 0  # skip zero-probability terms instead of producing nan
    )

def new_divergence(p, q):
    """Return a signed two-class divergence between ``p`` and ``q`` in bits.

    The result is the log2-ratio term of the first class minus the
    log2-ratio term of the second class:
    ``p[0] * log2(p[0] / q[0]) - p[1] * log2(p[1] / q[1])``.

    Args:
        p: Two-element sequence (only indices 0 and 1 are read).
        q: Two-element sequence (only indices 0 and 1 are read).

    Returns:
        The difference of the two terms; unlike a KL divergence it can be negative.
    """
    first_term = p[0] * np.log2(p[0] / q[0])
    second_term = p[1] * np.log2(p[1] / q[1])
    return first_term - second_term
    #return (p[0] * np.log2(p[0]/q[0])) + ((1-p[0]) * np.log2(((1-p[0])))/(1-p[1]))




# Report each gender's share of the counted users ("collection") and, per
# metric, each gender's share of the count-weighted metric sum ("gain")
# together with the divergence between the gain and collection shares.
print("**Collection Distribution**")
print("Female: %.4f" % female_coll_portion)
print("Male:   %.4f" % male_coll_portion)
print()
print("**Gain Distribution**")

# Reference distribution, in the same [male, female] order as the gain shares.
_coll_dist = [male_coll_portion, female_coll_portion]

for _m, item in results_sumqry.items():
    print(_m)
    _gain_total = item['m'] + item['f']
    male_portion = item['m'] / _gain_total
    female_portion = item['f'] / _gain_total

    # Alternative measure, kept for reference:
    #divergence = kl_divergence([male_portion, female_portion], [male_coll_portion, female_coll_portion])
    divergence = new_divergence([male_portion, female_portion], _coll_dist)

    print("Male:       %.3f" % male_portion)
    print("Female:     %.3f" % female_portion)
    print("Divergence: %.6f" % divergence)
    print()

**Collection Distribution**
Female: 0.2211
Male:   0.7789

**Gain Distribution**
ndcg_at_5
Male:       0.803
Female:     0.197
Divergence: 0.067994

recall_at_5
Male:       0.804
Female:     0.196
Divergence: 0.071034

ndcg_at_10
Male:       0.800
Female:     0.200
Divergence: 0.059637

recall_at_10
Male:       0.797
Female:     0.203
Divergence: 0.051740

ndcg_at_50
Male:       0.781
Female:     0.219
Divergence: 0.004605

recall_at_50
Male:       0.763
Female:     0.237
Divergence: -0.046294

