# Notes:

This notebook loads the metric results produced by nb1 and displays them as comparison tables.

In [1]:
import os

import pandas as pd
import numpy as np


In [2]:
from mylib.result import Result

In [3]:
# Directory that is scanned further down for saved ".result" files to load.
result_dir = "results/"

# View result dataframe

### Setup

In [4]:

def full_dataframe_from_results(result):
    """Flatten the metric results stored on ``result`` into one long-format DataFrame.

    Parameters
    ----------
    result : object
        Must expose the attributes read below: ``univariate_cont_res``,
        ``ngram_res``, ``amount_codes_res``, ``amount_code_date_res`` (nested
        dicts of metric values), plus ``gen_df_name`` and ``real_df_name``.

    Returns
    -------
    pandas.DataFrame
        One row per metric with columns ``index`` (unique row label),
        ``measures`` (what was compared), ``metric`` (metric name) and
        ``value``. The last two rows carry the generated / real dataset names
        (``measures == "name"``), so the ``value`` column mixes numbers and
        strings.

    Notes
    -----
    Rows whose ``index`` starts with ``*r-`` use only the ``<metric>_r`` value;
    the other joint rows average ``<metric>_r`` and ``<metric>_g``.

    Nested results are flattened before averaging. The mean over all cells is
    unchanged when every outer key has the same number of inner keys, and it no
    longer fails when the nesting is ragged (``np.mean`` cannot build an array
    from ragged nested lists).
    """
    records = []

    def add(index, measures, metric, value):
        records.append({"index": index, "measures": measures, "metric": metric, "value": value})

    ###          Continuous univariate          ###

    for feat in ("raw_amount", "td"):
        for metric in ("ks", "wasser"):
            add(feat + "-" + metric, feat, metric, result.univariate_cont_res[feat][metric])

    ###          N-Grams          ###

    ngram_metrics = ("jsd", "jac", "coverage_r", "coverage_g", "NED")

    # The n values found under "code" are also used to index "tcode" and "date".
    ngram_n_vals = list(result.ngram_res["code"].keys())

    for n in ngram_n_vals:
        measures = f"codes_{n}-grams"
        per_key = result.ngram_res["code"][n]
        for metric in ngram_metrics:
            mean_val = np.mean([v[metric] for v in per_key.values()])
            add(measures + "_mean-" + metric, measures, metric, mean_val)

    for n in ngram_n_vals:
        measures = f"tcode_{n}-grams"
        for metric in ngram_metrics:
            # A single value, not a mean; the "_mean-" label is kept because
            # other cells select rows by these exact index strings.
            add(measures + "_mean-" + metric, measures, metric, result.ngram_res["tcode"][n][metric])

    for n in ngram_n_vals:
        measures = f"dates_{n}-grams"
        per_key = result.ngram_res["date"][n]
        for metric in ngram_metrics:
            mean_val = np.mean([v[metric] for v in per_key.values()])
            add(measures + "_mean-" + metric, measures, metric, mean_val)

    ###          Categorical + amount          ###

    con_metrics = ("wasser", "ks")
    cont_fields = ("raw_amount",)

    for metric in con_metrics:
        for cf in cont_fields:
            per_code = result.amount_codes_res["code"]
            # mean over codes of the average of the "_r" and "_g" distances
            mean_val = np.mean(
                [(v[cf][metric + "_r"] + v[cf][metric + "_g"]) / 2. for v in per_code.values()]
            )
            add("joint-codes-" + cf + "_" + metric, "codes, " + cf, metric, mean_val)

    for metric in con_metrics:
        for cf in cont_fields:
            tcode_res = result.amount_codes_res["tcode"][cf]
            # just use expectation over real ("_r"), not the r/g average
            add("*r-joint-tcode-" + cf + "_" + metric, "tcode, " + cf, metric, tcode_res[metric + "_r"])

    ###          2 Categorical + amounts          ###

    cat_metrics = ("jsd", "jac", "coverage_r", "coverage_g")

    #####        Codes        #####

    # Two-level nesting (presumably code -> date, going by the row labels).
    # Flatten it so the mean is taken over every cell even when the inner
    # dicts have different sizes.
    code_cells = [
        cell
        for inner in result.amount_code_date_res["code"].values()
        for cell in inner.values()
    ]

    for metric in cat_metrics:
        add("joint-codes-dates_" + metric, "codes, dates", metric,
            np.mean([cell[metric] for cell in code_cells]))

    for metric in con_metrics:
        amounts = [cell["cont_metric_results"]["raw_amount"] for cell in code_cells]

        # mean between 'expectation over r, expectation over g'
        averaged = [(a[metric + "_r"] + a[metric + "_g"]) / 2. for a in amounts]
        add("joint-codes-dates-amt_" + metric, "codes, dates, amt", metric, np.mean(averaged))

        # expectation wrt R only, not the average
        real_only = [a[metric + "_r"] for a in amounts]
        add("*r-joint-codes-dates-amt_" + metric, "codes, dates, amt", metric, np.mean(real_only))

    #####        Tcode        #####

    tcode_cells = list(result.amount_code_date_res["tcode"].values())

    for metric in cat_metrics:
        add("joint-tcodes-dates_" + metric, "tcode, dates", metric,
            np.mean([cell[metric] for cell in tcode_cells]))

    for metric in con_metrics:
        # not an r/g average, just expectation over real!
        real_only = [cell["cont_metric_results"]["raw_amount"][metric + "_r"] for cell in tcode_cells]
        add("*r-joint-tcode-dates-amt_" + metric, "tcode, dates, amt", metric, np.mean(real_only))

    ###          Dataset names          ###

    add("gen_fname", "name", "None", result.gen_df_name)
    add("real_fname", "name", "None", result.real_df_name)

    return pd.DataFrame.from_records(records)


# Combine results for all datasets

In [6]:
# Load every saved result found in `result_dir` and flatten each one into a
# metrics dataframe.
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `dfs` (and therefore the value_<i> column numbering built from it) stable
# between runs and machines.
dfs = []

for fname in sorted(os.listdir(result_dir)):
    # skip anything that is not a saved result file
    if ".result" not in fname:
        continue

    r = Result.load(os.path.join(result_dir, fname))
    dfs.append(full_dataframe_from_results(r))

In [7]:
from functools import reduce

# Combine the per-dataset frames into one table: one row per metric, one
# value_<i> column per entry of `dfs` (i is the position in `dfs`).
if not dfs:
    raise ValueError("no result dataframes to merge: `dfs` is empty")

merge_keys = ['index', 'measures', 'metric']

# Number each frame's value column *before* merging. Relying on identical
# merge suffixes to create duplicate "value_" columns and renaming them
# afterwards breaks once pandas refuses suffix-induced duplicate columns.
numbered_dfs = [
    frame.rename(columns={"value": f"value_{i}"}) for i, frame in enumerate(dfs)
]

df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'),
    numbered_dfs,
)

### Values of all metrics

In [8]:
df_merged

Unnamed: 0,index,measures,metric,value_0,value_1,value_2
0,raw_amount-ks,raw_amount,ks,0.34521,0.25975,0.07205
1,raw_amount-wasser,raw_amount,wasser,116.269945,185.79836,42.565028
2,td-ks,td,ks,0.19918,0.09472,0.06134
3,td-wasser,td,wasser,0.58858,0.318746,0.25186
4,codes_1-grams_mean-jsd,codes_1-grams,jsd,0.100878,0.004329,0.007449
5,codes_1-grams_mean-jac,codes_1-grams,jac,0.309091,0.022727,0.0
6,codes_1-grams_mean-coverage_r,codes_1-grams,coverage_r,1.0,1.0,1.0
7,codes_1-grams_mean-coverage_g,codes_1-grams,coverage_g,0.690909,0.977273,1.0
8,codes_1-grams_mean-NED,codes_1-grams,NED,0.598606,0.040921,-0.100771
9,codes_3-grams_mean-jsd,codes_3-grams,jsd,0.337502,0.052853,0.062415


In [9]:
# Rank the datasets against each other for every metric: drop the filename
# rows, then rank the value columns across each row (rank 1 = smallest value).
value_cols = ["value_" + str(i) for i in range(len(dfs))]
vals_df = df_merged.loc[df_merged["measures"] != "name"]
rank_df = vals_df[value_cols].rank(axis=1)

# Put the identifying columns back next to the ranks (rows align on the index).
final_rank_df = vals_df[['index', 'measures', 'metric']].join(rank_df)

final_rank_df

Unnamed: 0,index,measures,metric,value_0,value_1,value_2
0,raw_amount-ks,raw_amount,ks,3.0,2.0,1.0
1,raw_amount-wasser,raw_amount,wasser,2.0,3.0,1.0
2,td-ks,td,ks,3.0,2.0,1.0
3,td-wasser,td,wasser,3.0,2.0,1.0
4,codes_1-grams_mean-jsd,codes_1-grams,jsd,3.0,1.0,2.0
5,codes_1-grams_mean-jac,codes_1-grams,jac,3.0,2.0,1.0
6,codes_1-grams_mean-coverage_r,codes_1-grams,coverage_r,2.0,2.0,2.0
7,codes_1-grams_mean-coverage_g,codes_1-grams,coverage_g,1.0,2.0,3.0
8,codes_1-grams_mean-NED,codes_1-grams,NED,3.0,2.0,1.0
9,codes_3-grams_mean-jsd,codes_3-grams,jsd,3.0,1.0,2.0


In [10]:
def shortname(k):
    """Return a short display name for column key ``k``.

    Looks ``k`` up in the notebook-level ``fname_map``, takes the path
    component after the first "/" and truncates it to 15 characters.
    Falls back to returning ``k`` unchanged when the lookup or the path
    split is not possible (``fname_map`` not defined yet, unknown key,
    no "/" in the name, or a non-string entry).
    """
    try:
        return fname_map[k].split("/")[1][:15]
    # Only the expected lookup failures fall back to `k`; a bare `except`
    # would also swallow KeyboardInterrupt / SystemExit.
    except (NameError, KeyError, IndexError, AttributeError, TypeError):
        return k

In [11]:
# Row labels (values of the "index" column) of the metrics reported in the paper.
paper_metrics = [
    "raw_amount-wasser",
    "td-wasser",
    "tcode_1-grams_mean-jsd",
    "tcode_3-grams_mean-jsd",
    "tcode_3-grams_mean-jac",
    "dates_1-grams_mean-jsd",
    "joint-tcode-raw_amount_wasser",
    "joint-tcode-dates-amt_wasser",
    "*r-joint-tcode-raw_amount_wasser",
    "*r-joint-tcode-dates-amt_wasser",
]

['raw_amount-wasser',
 'td-wasser',
 'tcode_1-grams_mean-jsd',
 'tcode_3-grams_mean-jsd',
 'tcode_3-grams_mean-jac',
 'dates_1-grams_mean-jsd',
 'joint-tcode-raw_amount_wasser',
 'joint-tcode-dates-amt_wasser',
 '*r-joint-tcode-raw_amount_wasser',
 '*r-joint-tcode-dates-amt_wasser']

In [12]:
# Keep only the rows whose label is one of the paper metrics, renumbered from 0.
is_paper_metric = df_merged["index"].isin(paper_metrics)
small_df = df_merged[is_paper_metric].reset_index(drop=True)

Unnamed: 0,index,measures,metric,value_0,value_1,value_2
0,raw_amount-wasser,raw_amount,wasser,116.269945,185.79836,42.565028
1,td-wasser,td,wasser,0.58858,0.318746,0.25186
2,tcode_1-grams_mean-jsd,tcode_1-grams,jsd,0.236815,0.01084,0.015011
3,tcode_3-grams_mean-jsd,tcode_3-grams,jsd,0.622138,0.13451,0.155805
4,tcode_3-grams_mean-jac,tcode_3-grams,jac,0.994861,0.641874,0.601944
5,dates_1-grams_mean-jsd,dates_1-grams,jsd,0.087306,0.033889,0.023771
6,*r-joint-tcode-raw_amount_wasser,"tcode, raw_amount",wasser,110.035898,194.943648,40.391657
7,*r-joint-tcode-dates-amt_wasser,"tcode, dates, amt",wasser,131.342361,199.600963,60.116839


In [25]:
# Map every value_<i> column to the generated-data filename stored in the
# single "gen_fname" row (.item() raises if that row is missing or duplicated).
gen_fname_row = df_merged.loc[df_merged["index"] == "gen_fname"]
fname_map = {c: gen_fname_row[c].item() for c in value_cols}

In [22]:
def round_to_digits(x, max_digits = 4):
    """Round ``x`` for display, keeping fewer decimals the larger ``|x|`` is.

    One of the ``max_digits`` is reserved for a leading / trailing zero. The
    number of decimals kept is ``max(0, max_digits - 1 - d)``, where ``d`` is
    the number of digits of ``|x|`` left of the decimal point (0 when
    ``|x| < 1``). With the default: 116.27 -> 116.0, 42.565 -> 42.6,
    0.58858 -> 0.589.

    The magnitude is taken from ``abs(x)``, so negative values are rounded
    like their positive counterparts. Zero, NaN and infinities are returned
    unchanged.
    """
    # log10 is undefined for 0 and meaningless for NaN / inf
    if x == 0 or not np.isfinite(x):
        return x

    max_digits -= 1 # leading / trailing 0
    digits_leftof_dot = max(0, np.floor(np.log10(abs(x))) + 1)
    round_to = max(0, max_digits - digits_leftof_dot)
    return round(x, int(round_to))

In [24]:
# Final table: one row per generated dataset, one column per paper metric,
# with every value rounded for display.
labelled = small_df.rename(columns=fname_map).set_index('index')
final_table = labelled.iloc[:, 2:].transpose()  # skip the two leading non-value columns
for metric_label in final_table.columns:
    rounded = final_table[metric_label].apply(round_to_digits)
    final_table[metric_label] = rounded

final_table

index,raw_amount-wasser,td-wasser,tcode_1-grams_mean-jsd,tcode_3-grams_mean-jsd,tcode_3-grams_mean-jac,dates_1-grams_mean-jsd,*r-joint-tcode-raw_amount_wasser,*r-joint-tcode-dates-amt_wasser
generated_data/tg.csv,116.0,0.589,0.237,0.622,0.995,0.087,110.0,131.0
generated_data/dg.csv,186.0,0.319,0.011,0.135,0.642,0.034,195.0,200.0
generated_data/bf.csv,42.6,0.252,0.015,0.156,0.602,0.024,40.4,60.1
