In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import pickle
from collections import defaultdict

from torch.utils.data import Dataset, DataLoader
from Mmetrics import *

import LTR
import datautil
import permutationgraph
import DTR
import EEL
import PPG
import PL

def df2ds(df_path):
    """Load a pickled DataFrame and expose its columns as attributes of an 'ltr' class.

    Every column is converted to a numpy array and stored as a class attribute
    under the column name. An extra attribute ``dlr`` holds the query boundaries:
    ``dlr[i]:dlr[i+1]`` is the row range of the i-th run of equal ``qid`` values.

    Parameters
    ----------
    df_path : str
        Path to a pickle file containing a pandas DataFrame with a ``qid`` column.

    Returns
    -------
    type
        A dynamically created class named ``'ltr'`` whose attributes are the
        column arrays plus ``dlr``.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open trusted files.
    with open(df_path, 'rb') as f:
        df = pickle.load(f)
    ds = {k: np.array(v) for k, v in df.to_dict(orient='list').items()}
    qid = ds['qid']
    # A new query starts wherever qid changes. Testing for `== 1` would silently
    # merge neighbouring queries whose ids are not exactly consecutive.
    boundaries = np.where(np.diff(qid) != 0)[0] + 1
    ds['dlr'] = np.concatenate([np.zeros(1), boundaries, np.array([qid.shape[0]])]).astype(int)
    return type('ltr', (object,), ds)


def dict2ds(df_path):
    """Load a pickled dict and return an 'ltr' class whose attributes are the dict entries.

    Parameters
    ----------
    df_path : str
        Path to a pickle file containing a dict with string keys.

    Returns
    -------
    type
        A dynamically created class named ``'ltr'``; each dict key becomes a
        class attribute.
    """
    with open(df_path, 'rb') as handle:
        attributes = pickle.load(handle)
    ltr_cls = type('ltr', (object,), attributes)
    return ltr_cls

# Load the full 2019 / 2020 datasets from pickled DataFrames (paths are relative
# to the notebook's working directory).
ds2019 = df2ds('LTR2019.df')
ds2020 = df2ds('LTR2020.df')
# Load previously saved subsamples.
# NOTE(review): these two files are only written by a later cell of this
# notebook, so on a fresh checkout this cell fails until that cell has run once;
# sds2019 / sds2020 are also reassigned by the subsample cell below.
sds2019 = dict2ds('s_LTR2019.df')
sds2020 = dict2ds('s_LTR2020.df')

In [None]:
def subsample(ds, percent, replace=True, seed=None):
    """Draw a random subset of queries from ``ds`` and pack them into a dict.

    Parameters
    ----------
    ds : object
        Dataset exposing numpy arrays ``lv``, ``y_pred``, ``g``, ``qid`` and the
        query boundary array ``dlr`` (query i occupies rows ``dlr[i]:dlr[i+1]``).
    percent : float
        Fraction of queries to draw; the count is ``int(n_queries * percent)``.
    replace : bool, optional
        Whether a query may be drawn more than once. Defaults to ``True``, which
        is what ``np.random.choice`` does when called without the argument.
    seed : int or None, optional
        When given, sampling uses ``np.random.RandomState(seed)`` so the result
        is reproducible; when ``None`` the global numpy random state is used.

    Returns
    -------
    dict
        Keys ``'lv'``, ``'y_pred'``, ``'qid'``, ``'g'`` (concatenated rows of the
        drawn queries, in draw order) and ``'dlr'`` (boundaries of the new layout,
        starting at 0).
    """
    n_queries = ds.dlr.shape[0] - 1
    subsample_size = int(n_queries * percent)
    rng = np.random if seed is None else np.random.RandomState(seed)
    qids = rng.choice(np.arange(n_queries), subsample_size, replace=replace)

    fields = ('lv', 'y_pred', 'qid', 'g')
    pieces = {k: [] for k in fields}
    dlr = [0]
    for qid in qids:
        s, e = ds.dlr[qid:qid+2]
        for k in fields:
            pieces[k].append(getattr(ds, k)[s:e])
        dlr.append(dlr[-1] + e - s)

    sds = {}
    for k in fields:
        # Lead with an empty slice of the source array so that concatenate also
        # works (and keeps the dtype) when no query was drawn at all.
        sds[k] = np.concatenate([getattr(ds, k)[:0]] + pieces[k])
    sds['dlr'] = np.array(dlr)
    return sds

# Draw a 10% query subsample of each year. These assignments replace the
# sds2019 / sds2020 objects loaded from disk earlier with plain dicts.
# NOTE(review): no random seed is set in this notebook, so the drawn queries
# differ from run to run.
sds2019 = subsample(ds2019, 0.1)
sds2020 = subsample(ds2020, 0.1)


In [None]:
# Persist the subsample dicts; these are the files that dict2ds() reads back in
# the first cell. Existing files with the same name are overwritten.
with open('s_LTR2019.df', 'wb') as f:
    pickle.dump(sds2019, f)

with open('s_LTR2020.df', 'wb') as f:
    pickle.dump(sds2020, f)

In [None]:

def learn_one_PPG(qid, verbose, y_pred, g, dlr, epochs, lr, exposure, grade_levels, samples_cnt, sessions_cnt):
    """Fit a PPG learner on a single query and return the result of ``learner.fit``.

    The query's rows are ``dlr[qid]:dlr[qid+1]``. Scores and groups of that query
    are passed through ``EEL.copy_sessions`` (presumably one copy per session --
    confirm against EEL), an ``EEL.EEL`` objective is built on the copies, and a
    ``PPG.Learner`` is trained against it.

    Parameters
    ----------
    qid : int
        Index of the query inside ``dlr``.
    verbose
        Forwarded to ``learner.fit``.
    y_pred, g : np.ndarray
        Per-document predicted scores and group labels for the whole dataset.
    dlr : np.ndarray
        Query boundary offsets.
    epochs, lr
        Forwarded positionally to ``learner.fit``.
    exposure, grade_levels
        Forwarded to ``EEL.EEL``.
    samples_cnt
        Forwarded to ``PPG.Learner``.
    sessions_cnt
        Forwarded to ``EEL.copy_sessions`` as ``sessions``.
    """
    start, end = dlr[qid:qid+2]
    query_scores = y_pred[start:end]

    # Initial ranking: documents ordered by descending predicted score.
    scores_rep, groups_rep, order_rep, dlr_rep = EEL.copy_sessions(
        y=query_scores,
        g=g[start:end],
        sorted_docs=query_scores.argsort()[::-1],
        sessions=sessions_cnt,
    )

    objective = EEL.EEL(
        y_pred=scores_rep,
        g=groups_rep,
        dlr=dlr_rep,
        exposure=exposure,
        grade_levels=grade_levels,
    )

    # `inter` tags every document with the start offset of the session it
    # belongs to; `intra` uses the group label (an earlier variant used
    # np.arange(g_s.shape[0]) instead).
    session_of_doc = np.repeat(dlr_rep[:-1], np.diff(dlr_rep))
    ppg_learner = PPG.Learner(
        PPG_mat=None,
        samples_cnt=samples_cnt,
        objective_ins=objective,
        sorted_docs=order_rep,
        dlr=dlr_rep,
        intra=groups_rep,
        inter=session_of_doc,
    )
    return ppg_learner.fit(epochs, lr, verbose=verbose)

def learn_all_PPG(y_pred, g, dlr, epochs, lr, exposure, grade_levels, samples_cnt, sessions_cnt):
    """Run ``learn_one_PPG`` (with verbose=0) on every query and collect the results.

    Parameters
    ----------
    y_pred, g : np.ndarray
        Per-document predicted scores and group labels for the whole dataset.
    dlr : np.ndarray
        Query boundary offsets; ``dlr.shape[0] - 1`` queries are processed.
    epochs, lr, exposure, grade_levels, samples_cnt, sessions_cnt
        Forwarded unchanged to ``learn_one_PPG``.

    Returns
    -------
    list
        One entry per query, in query order: whatever ``learn_one_PPG`` returned.
    """
    # `trange` is not imported anywhere at module level in this notebook; import
    # it here so the function does not depend on a star-import or kernel state.
    from tqdm.notebook import trange

    n_queries = dlr.shape[0] - 1
    return [
        learn_one_PPG(qid, 0, y_pred, g, dlr, epochs, lr, exposure, grade_levels, samples_cnt, sessions_cnt)
        for qid in trange(n_queries, leave=False)
    ]



In [None]:

# Position-based exposure weights 1/log2(2+i) for i = 1 .. (longest query length + 1).
# NOTE(review): `ds2020.tedlr`, `ds2020.teg`, `y_pred2020` and `epochs` are not
# defined anywhere in the visible notebook (df2ds only adds a `dlr` attribute on
# top of the DataFrame columns) -- confirm where they come from before running.
# NOTE(review): `res` is reassigned by the read_results cell further down.
exposure2020 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2020.tedlr).max()+2)])
res = learn_all_PPG(y_pred2020, ds2020.teg, ds2020.tedlr, epochs, 0.1, exposure=exposure2020,
        grade_levels=5, samples_cnt=32, sessions_cnt=20)

In [None]:
def evaluate_one(metric, qid, lv, g, dlr, output_permutation, exposure, sessions_cnt):
    """Score the learned ranking of one query with a fairness objective and NDCG@10.

    Parameters
    ----------
    metric : str
        ``'EEL'`` selects ``EEL.EEL`` (with ``grade_levels=2``); any other value
        selects ``DTR.DTR``.
    qid : int
        Query index; its rows are ``dlr[qid]:dlr[qid+1]``.
    lv, g : np.ndarray
        Per-document relevance labels and group labels for the whole dataset.
    dlr : np.ndarray
        Query boundary offsets.
    output_permutation : sequence
        ``output_permutation[qid]`` is the ranking to evaluate. It is read in
        chunks of the query length, one chunk per session, and chunk ``i`` is
        shifted back by ``i * query_length`` before indexing the labels.
    exposure : np.ndarray
        Forwarded to the objective.
    sessions_cnt : int
        Number of sessions in the permutation.

    Returns
    -------
    tuple
        ``(objective.eval(permutation), mean NDCG@10 over the sessions)``.
    """
    start, end = dlr[qid:qid+2]
    ranking = output_permutation[qid]
    labels_rep, groups_rep, _, dlr_rep = EEL.copy_sessions(
        y=lv[start:end],
        g=g[start:end],
        sorted_docs=lv[start:end].argsort()[::-1],
        sessions=sessions_cnt,
    )

    if metric == 'EEL':
        objective = EEL.EEL(y_pred=labels_rep, g=groups_rep, dlr=dlr_rep, exposure=exposure, grade_levels=2)
    else:
        objective = DTR.DTR(y_pred=labels_rep, g=groups_rep, dlr=dlr_rep, exposure=exposure)

    n_docs = end - start
    cutoff = min(n_docs, 10)
    labels = lv[start:end]
    # Rank discount log2(2 + rank) for the top `cutoff` positions.
    discounts = np.log2(2 + np.arange(cutoff))

    ideal_order = labels.argsort()[::-1]
    idcg = ((2.**labels[ideal_order][:cutoff] - 1.) / discounts).sum()

    ndcg_sum = 0
    for session in range(sessions_cnt):
        offset = session * n_docs
        # Map the session's slice of the permutation back to per-query indices.
        local_rank = ranking[offset:offset + n_docs] - offset
        ndcg_sum += ((2.**labels[local_rank][:cutoff] - 1.) / discounts).sum() / idcg

    return objective.eval(ranking), ndcg_sum / sessions_cnt
 
def evaluate_all(metric, lv, g, dlr, output_permutation, exposure, sessions_cnt):
    """Evaluate every query that contains more than one group.

    Queries whose documents all share a single group label are skipped. For the
    rest, ``evaluate_one`` is called and its two outputs are collected.

    Returns
    -------
    tuple of np.ndarray
        ``(fairness values, NDCG values)``, one entry per evaluated query.
    """
    fairness_vals = []
    ndcg_vals = []
    n_queries = dlr.shape[0] - 1
    for qid in range(n_queries):
        start, end = dlr[qid:qid+2]
        # Skip queries with a single group.
        if np.unique(g[start:end]).size == 1:
            continue
        fairness, ndcg = evaluate_one(metric, qid, lv, g, dlr, output_permutation, exposure, sessions_cnt)
        fairness_vals.append(fairness)
        ndcg_vals.append(ndcg)
    return np.array(fairness_vals), np.array(ndcg_vals)

def estimated_evaluate_one(qid, lv, g, dlr, output_permutation, exposure, sessions_cnt):
    """Return ``EEL.eval_detailed`` for one query's ranking, using ``grade_levels=5``.

    The query's rows are ``dlr[qid]:dlr[qid+1]``; its values in ``lv`` and ``g``
    are passed through ``EEL.copy_sessions`` with ``sessions=sessions_cnt`` and
    the resulting objective evaluates ``output_permutation[qid]``.
    """
    start, end = dlr[qid:qid+2]
    query_values = lv[start:end]
    values_rep, groups_rep, _, dlr_rep = EEL.copy_sessions(
        y=query_values,
        g=g[start:end],
        sorted_docs=query_values.argsort()[::-1],
        sessions=sessions_cnt,
    )
    objective = EEL.EEL(y_pred=values_rep, g=groups_rep, dlr=dlr_rep, exposure=exposure, grade_levels=5)
    ranking = output_permutation[qid]
    return objective.eval_detailed(ranking)
 
def estimated_evaluate_all(lv, g, dlr, output_permutation, exposure, sessions_cnt):
    """Run ``estimated_evaluate_one`` on every query that contains more than one group.

    Parameters
    ----------
    lv, g : np.ndarray
        Per-document values and group labels for the whole dataset.
    dlr : np.ndarray
        Query boundary offsets; query i occupies rows ``dlr[i]:dlr[i+1]``.
    output_permutation, exposure, sessions_cnt
        Forwarded unchanged to ``estimated_evaluate_one``.

    Returns
    -------
    tuple of np.ndarray
        Three arrays holding, per evaluated query, the three values returned by
        ``estimated_evaluate_one``.
    """
    eel_res, eer_res, eed_res = [], [], []
    for qid in range(dlr.shape[0] - 1):
        # Use the `dlr` argument for the boundaries, not a notebook-level `ds`
        # global, so the single-group filter looks at the same rows that are
        # evaluated afterwards.
        s, e = dlr[qid:qid+2]
        if len(np.unique(g[s:e])) == 1:
            continue
        eel, eer, eed = estimated_evaluate_one(qid, lv, g, dlr, output_permutation, exposure, sessions_cnt)
        eel_res.append(eel)
        eer_res.append(eer)
        eed_res.append(eed)
    return np.array(eel_res), np.array(eer_res), np.array(eed_res)

In [None]:
import pickle
import os

def read_results(directory = '_data/PPG'):
    """Load every pickle file found directly inside ``directory``.

    A file is considered when its name contains ``'pkl'``. The dictionary key is
    the file name with its last 12 characters removed.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).

    Returns
    -------
    dict
        Maps the truncated file name to the unpickled object.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only read trusted files.
    loaded = {}
    pickle_names = [name for name in os.listdir(directory) if 'pkl' in name]
    for name in pickle_names:
        with open(f'{directory}/{name}', 'rb') as handle:
            loaded[name[:-12]] = pickle.load(handle)
    return loaded
    
# Load all pickled result files of the test runs into `res`, keyed by truncated
# file name. This rebinds the notebook-level name `res`.
res = read_results(directory = '_data/PPG/test')

In [None]:


from tqdm.notebook import tqdm

def get_df_from_results_old(res):
    """Summarise results that follow the older naming scheme into a DataFrame.

    Each key of ``res`` is split on ``'_'``: fields 0 and 1 form the learner name
    (``field1_field0``), field 2 is the sample count, field 3 the session count
    and field 4 the metric name. Each key of the inner mapping selects the year
    (2019 if the key contains ``'2019'``, otherwise 2020) and carries the
    learning rate as its second ``'_'``-separated field.

    Evaluation reads the ``telv`` / ``teg`` / ``tedlr`` attributes of the
    notebook-level ``ds2019`` / ``ds2020`` objects.
    NOTE(review): confirm those attributes exist on the loaded datasets.

    Parameters
    ----------
    res : dict
        Maps an algorithm identifier to a mapping of run key -> permutations.

    Returns
    -------
    pandas.DataFrame
        One row per (algorithm, run key) with columns year, metric, learner,
        samples, sessions, lr, Fairness and NDCG (the last two are means over
        the evaluated queries).
    """
    exposure2020 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2020.tedlr).max()+2)])
    exposure2019 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2019.tedlr).max()+2)])

    dfl = []
    for alg in tqdm(res, leave=True):
        _res = res[alg]
        alg_params = alg.split('_')
        sessions_cnt = int(alg_params[3])
        metric = alg_params[4]
        learner = alg_params[1] + '_' + alg_params[0]
        samples = int(alg_params[2])

        for key in _res:
            # The unused `ypred = y_pred20xx` lookups were dropped: those globals
            # are not defined in this notebook and raised NameError here.
            if '2019' in key:
                year = 2019
                ds = ds2019
                exposure = exposure2019
            else:
                year = 2020
                ds = ds2020
                exposure = exposure2020

            lr = key.split('_')[1]
            eel_res, ndcg = evaluate_all(metric, ds.telv, ds.teg, ds.tedlr, _res[key], exposure, sessions_cnt=sessions_cnt)
            dfl.append({'year':year, 'metric':metric, 'learner':learner, 
            'samples':samples, 'sessions':sessions_cnt, 'lr':lr, 'Fairness':eel_res.mean(), 'NDCG':ndcg.mean()})

    return pd.DataFrame(dfl)



def get_df_from_results(res):
    """Summarise loaded results into one DataFrame row per (algorithm, run key).

    Each key of ``res`` is split on ``'_'``: fields 0 and 1 form the learner name
    (``field1_field0``), field 2 is the session count and field 3 the metric
    name. Each key of the inner mapping selects the year (2019 if the key
    contains ``'2019'``, otherwise 2020); its second ``'_'``-separated field is
    the learning rate (kept as a string) and its third is the sample count.

    Evaluation uses the ``lv`` / ``g`` / ``dlr`` attributes of the notebook-level
    ``ds2019`` / ``ds2020`` objects.

    Returns
    -------
    pandas.DataFrame
        Columns year, metric, learner, samples, sessions, lr, Fairness, NDCG;
        the last two are means over the evaluated queries.
    """
    def _exposure_for(dataset):
        # 1/log2(2+i) for i = 1 .. (longest query length + 1).
        longest = np.diff(dataset.dlr).max()
        return np.array([1./np.log2(2+i) for i in range(1, longest+2)])

    exposure2020 = _exposure_for(ds2020)
    exposure2019 = _exposure_for(ds2019)

    rows = []
    for alg in tqdm(res, leave=True):
        runs = res[alg]
        name_fields = alg.split('_')
        sessions_cnt, metric = int(name_fields[2]), name_fields[3]
        learner = name_fields[1] + '_' + name_fields[0]

        for key in runs:
            is_2019 = '2019' in key
            year = 2019 if is_2019 else 2020
            ds = ds2019 if is_2019 else ds2020
            exposure = exposure2019 if is_2019 else exposure2020

            key_fields = key.split('_')
            lr = key_fields[1]
            samples = int(key_fields[2])

            fairness, ndcg = evaluate_all(metric, ds.lv, ds.g, ds.dlr, runs[key], exposure, sessions_cnt=sessions_cnt)
            rows.append({
                'year': year,
                'metric': metric,
                'learner': learner,
                'samples': samples,
                'sessions': sessions_cnt,
                'lr': lr,
                'Fairness': fairness.mean(),
                'NDCG': ndcg.mean(),
            })

    return pd.DataFrame(rows)


# Build the summary table from the results loaded by read_results().
df = get_df_from_results(res)

In [None]:
# Display the summary ordered by metric, sessions, year and learner
# (sort_values returns a new frame; `df` itself is left unchanged).
df.sort_values(by=['metric', 'sessions', 'year', 'learner'])

In [None]:
def refine_df(df, sessions, metric, year):
    """Filter the summary table to one (sessions, metric, year) slice.

    Rows are kept when all three values match. The ``metric`` and ``year``
    columns are dropped, the remaining columns are ordered alphabetically, and
    the rows are sorted by ``samples``, ``lr`` and ``learner``.
    """
    row_mask = (df.sessions==sessions) & (df.metric==metric) & (df.year == year)
    kept_columns = sorted(set(df.columns) - {'metric', 'year'})
    selected = df.loc[row_mask, kept_columns]
    return selected.sort_values(by=['samples', 'lr', 'learner'])

# df_joint = df.merge(df_approx, on=['learner', 'lr', 'metric', 'samples', 'sessions', 'year'], how='left')
# Print the EEL / 2019 slice of the summary for each session count (at most 100
# rows each), separated by a dashed line.
for sessions in [1,2,4,8,16]:
    print(refine_df(df, sessions, 'EEL', 2019).head(100))
    print('-'*30)

In [None]:

# Ad-hoc evaluation loop: prints mean fairness and mean NDCG for every run in `res`.
# NOTE(review): this cell parses algorithm names with the same field positions
# as get_df_from_results_old (sessions = field 3, metric = field 4), whereas
# get_df_from_results reads sessions from field 2 and metric from field 3 --
# confirm which naming scheme the files currently in `res` follow.
# NOTE(review): `y_pred2019` / `y_pred2020` and the `tedlr` / `telv` / `teg`
# attributes are not defined anywhere in the visible notebook; `ypred` is only
# used by the commented-out estimated-evaluation lines.

# Position-based exposure weights 1/log2(2+i), i = 1 .. (longest query length + 1).
exposure2020 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2020.tedlr).max()+2)])
exposure2019 = np.array([1./np.log2(2+i) for i in range(1,np.diff(ds2019.tedlr).max()+2)])


for alg in res:
    _res = res[alg]
    print(alg, '-'*30)
    sessions_cnt = int(alg.split('_')[3])
    metric = alg.split('_')[4]
    for key in _res:
        # Pick dataset, predictions and exposure by the year embedded in the run key.
        if '2019' in key:
            ds = ds2019
            ypred = y_pred2019
            exposure = exposure2019
        else:
            ds = ds2020
            ypred = y_pred2020
            exposure = exposure2020

#         eel_res, eer_res, eed_res, ndcg = evaluate_all(ds.telv, ds.teg, ds.tedlr, _res[key], exposure, sessions_cnt=sessions_cnt)
        eel_res, ndcg = evaluate_all(metric, ds.telv, ds.teg, ds.tedlr, _res[key], exposure, sessions_cnt=sessions_cnt)
#         es_eel_res, es_eer_res, es_eed_res = estimated_evaluate_all(ypred, ds.teg, ds.tedlr, _res[key], exposure, sessions_cnt=20)

        # Output format: "<key> <mean fairness> ( <mean NDCG> )".
        print(f'{key}', eel_res.mean(), '(', ndcg.mean(), ')')
#         print(f'{key}', eel_res.mean(), '(', es_eel_res.mean(), ')')
#         print(f'\t', ndcg.mean())
#         print(f'\t', eer_res.mean(), '(', es_eer_res.mean(), ')')
#         print(f'\t', eed_res.mean(), '(', es_eed_res.mean(), ')')
