# Notes:

This notebook is used to view the metric results from nb1. 

In [1]:
import os

import pandas as pd
import numpy as np


In [2]:
from mylib.result import Result

In [3]:
result_dir = "results/"

# View result dataframe

### Setup

In [4]:

def full_dataframe_from_results(result):
    
    ###          Continous univariate          ###

    feats = ["raw_amount", "td"]
    metrics = ["ks", "wasser"]


    records = []

    for f in feats:
        for m in metrics:

            d = {"index": f + "-" + m, "measures": f, "metric":m, "value": result.univariate_cont_res[f][m]}


            records.append(d)


    ###          N-Grams          ###

    metrics = ["jsd", "jac", "coverage_r", "coverage_g", "NED"]


    ngram_n_vals = result.ngram_res["code"].keys()

    for n in ngram_n_vals:
        f = f"codes_{n}-grams"

        for m in metrics:
            mean_val = np.mean([v[m] for v in result.ngram_res["code"][n].values()]) 

            d = {"index": f + "_mean-" + m, "measures": f, "metric":m, "value": mean_val}

            records.append(d)



    for n in ngram_n_vals:
        f = f"tcode_{n}-grams"

        for m in metrics:
            val = result.ngram_res["tcode"][n][m]

            d = {"index": f + "_mean-" + m, "measures": f, "metric":m, "value": val}

            records.append(d)


    for n in ngram_n_vals:
        f = f"dates_{n}-grams"

        for m in metrics:
            mean_val = np.mean([v[m] for v in result.ngram_res["date"][n].values()]) 

            d = {"index": f + "_mean-" + m, "measures": f, "metric":m, "value": mean_val}

            records.append(d)



    ###          Categorical + amount          ###


    metrics = ["wasser", "ks"]
    cont_fields = ["raw_amount"]



    for m in metrics:
        for cf in cont_fields: 

            d = result.amount_codes_res["code"]

            # this is mean over code, of the distance between the joint distributions 
            mean_val =  np.mean([(v[cf][m + "_r"] + v[cf][m + "_g"])/2. for v in d.values()])

            record = {"index": "joint-codes-" + cf + "_" + m, "measures": "codes, " + cf , "metric":m, "value": mean_val}

            records.append(record)




    for m in metrics:
        for cf in cont_fields: 

            d = result.amount_codes_res["tcode"][cf]

            # val = (d[m + "_r"] + d[m + "_g"])/2.
            val = d[m + "_r"] # just use expectation over real!

            d = {"index": "*r-joint-tcode-" + cf+ "_" + m, "measures": "tcode, " + cf, "metric":m, "value": val}

            records.append(d)



    ###          2 Categorical + amounts          ###

    cat_metrics = ['jsd', 'jac', 'coverage_r',  'coverage_g']
    con_metrics = ['wasser', "ks"]
    cont_fields = ["raw_amount"]


    #####        Codes        #####

    d = result.amount_code_date_res["code"]

    res_d = {}

    for m in cat_metrics:
        vals = []
        res_d[m] = vals
        for field1 in d.keys():
            vals.append([])
            for field2 in d[field1].keys():
                vals[-1].append(d[field1][field2][m])



    for m in cat_metrics:
        mean_val =  np.mean(res_d[m])

        record = {"index": "joint-codes-dates_" + m, "measures": "codes, dates" , "metric":m, "value": mean_val}

        records.append(record)


    ## cont metrics

    # res_d = {}

    for m in con_metrics:

        vals = []
        vals_r = []
    #     res_d[m] = vals
        for field1 in d.keys():
            vals.append([])
            vals_r.append([])
            for field2 in d[field1].keys():

                cmr = d[field1][field2]["cont_metric_results"]["raw_amount"]
                mean_val = (cmr[m + "_r"] + cmr[m + "_g"])/2. # mean between 'expectation over r, expectation over g'

                vals[-1].append(mean_val)
                vals_r[-1].append(cmr[m + "_r"])

        mean_val = np.mean(vals)

        record = {"index": "joint-codes-dates-amt_" + m, "measures": "codes, dates, amt" , "metric":m, "value": mean_val}
        records.append(record)
        
        
        # for expectation wrt R, not ave
        mean_val = np.mean(vals_r)

        record = {"index": "*r-joint-codes-dates-amt_" + m, "measures": "codes, dates, amt" , "metric":m, "value": mean_val}
        records.append(record)





    #####        Tcode        #####
    d = result.amount_code_date_res["tcode"]

    res_d = {}

    for m in cat_metrics:
        vals = []
        res_d[m] = vals
        for field1 in d.keys():
            vals.append(d[field1][m])

    for m in cat_metrics:
        mean_val =  np.mean(res_d[m])

        record = {"index": "joint-tcodes-dates_" + m, "measures": "tcode, dates" , "metric":m, "value": mean_val}

        records.append(record)    




    ## cont metrics


    for m in con_metrics:

        vals = []
        for field1 in d.keys():


            cmr = d[field1]["cont_metric_results"]["raw_amount"]
            # mean_val = (cmr[m + "_r"] + cmr[m + "_g"])/2. # mean between 'expectation over r, expectation over g'
            mean_val = cmr[m + "_r"]  # not a mean, just expectation over real!
            

            vals.append(mean_val)

        mean_val = np.mean(vals)
        record = {"index": "*r-joint-tcode-dates-amt_" + m, "measures": "tcode, dates, amt" , "metric":m, "value": mean_val}
        records.append(record)


        
    record = {"index": "gen_fname", "measures": "name" , "metric":"None", "value": result.gen_df_name}
    records.append(record)
    
    record = {"index": "real_fname", "measures": "name" , "metric":"None", "value": result.real_df_name}
    records.append(record)

    return pd.DataFrame.from_records(records)


# Combine results for all datasets

In [5]:
dfs = []

for fname in os.listdir(result_dir):
    
    if not ".result" in fname:
        continue 
        
        
    r = Result.load(os.path.join(result_dir, fname))
    
    dfs.append(full_dataframe_from_results(r))

In [6]:
from functools import reduce

df_merged = reduce(lambda  left,right: pd.merge(left,right, on=['index', 'measures', 'metric'],
                                            how='outer', suffixes=("_", "_")), dfs)


new_cols= []
val_count = 0

for c in df_merged.columns:
    if "value" in c:
        if not c[-1] == "_":
            c += "_"
        new_cols.append(c + str(val_count))
        val_count += 1
    else:
        new_cols.append(c)
        
df_merged.columns = new_cols

  after removing the cwd from sys.path.


### Values of all metrics

In [7]:
df_merged

Unnamed: 0,index,measures,metric,value_0,value_1,value_2,value_3,value_4,value_5
0,raw_amount-ks,raw_amount,ks,0.105645,0.155274,0.161744,0.213238,0.081775,0.161694
1,raw_amount-wasser,raw_amount,wasser,1930.988833,3580.385345,1939.130183,4725.972935,2101.772203,3704.78486
2,td-ks,td,ks,0.178425,0.110206,0.767066,0.233642,0.069934,0.140243
3,td-wasser,td,wasser,1.050423,1.481053,5.275384,2.384143,0.600606,1.176085
4,codes_1-grams_mean-jsd,codes_1-grams,jsd,0.021414,0.00303,0.002671,0.012697,0.001353,0.001866
5,codes_1-grams_mean-jac,codes_1-grams,jac,0.111111,0.0,0.0,0.0,0.0,0.0
6,codes_1-grams_mean-coverage_r,codes_1-grams,coverage_r,1.0,1.0,1.0,1.0,1.0,1.0
7,codes_1-grams_mean-coverage_g,codes_1-grams,coverage_g,0.888889,1.0,1.0,1.0,1.0,1.0
8,codes_1-grams_mean-NED,codes_1-grams,NED,0.140918,0.030297,0.039662,0.078826,0.023743,0.019418
9,codes_3-grams_mean-jsd,codes_3-grams,jsd,0.137375,0.036323,0.054051,0.058461,0.013114,0.022299


In [8]:
vals_df = df_merged[df_merged["measures"] != "name"]
value_cols = [f"value_{i}" for i in range(len(dfs))]
rank_df = vals_df[value_cols].rank(axis=1)


final_rank_df = pd.concat([vals_df[['index', 'measures', 'metric']], 
                           rank_df], axis=1)

final_rank_df

Unnamed: 0,index,measures,metric,value_0,value_1,value_2,value_3,value_4,value_5
0,raw_amount-ks,raw_amount,ks,2.0,3.0,5.0,6.0,1.0,4.0
1,raw_amount-wasser,raw_amount,wasser,1.0,4.0,2.0,6.0,3.0,5.0
2,td-ks,td,ks,4.0,2.0,6.0,5.0,1.0,3.0
3,td-wasser,td,wasser,2.0,4.0,6.0,5.0,1.0,3.0
4,codes_1-grams_mean-jsd,codes_1-grams,jsd,6.0,4.0,3.0,5.0,1.0,2.0
5,codes_1-grams_mean-jac,codes_1-grams,jac,6.0,3.0,3.0,3.0,3.0,3.0
6,codes_1-grams_mean-coverage_r,codes_1-grams,coverage_r,3.5,3.5,3.5,3.5,3.5,3.5
7,codes_1-grams_mean-coverage_g,codes_1-grams,coverage_g,1.0,4.0,4.0,4.0,4.0,4.0
8,codes_1-grams_mean-NED,codes_1-grams,NED,6.0,3.0,4.0,5.0,2.0,1.0
9,codes_3-grams_mean-jsd,codes_3-grams,jsd,6.0,3.0,4.0,5.0,1.0,2.0


In [9]:
def shortname(k):
    try:
        return fname_map[k].split("/")[1][:15]
    except:
        return k

In [10]:
paper_metrics = """
raw_amount-wasser
td-wasser
tcode_1-grams_mean-jsd
tcode_3-grams_mean-jsd
tcode_3-grams_mean-jac
dates_1-grams_mean-jsd
joint-tcode-raw_amount_wasser
joint-tcode-dates-amt_wasser
*r-joint-tcode-raw_amount_wasser
*r-joint-tcode-dates-amt_wasser
""".split()

In [11]:
small_df = df_merged[df_merged["index"].apply(lambda x: x in paper_metrics)].reset_index(drop=True)

In [12]:
gen_fname_row = df_merged[df_merged["index"]== "gen_fname"]
fname_map = dict([(c, gen_fname_row[c].item()) for c in value_cols])

In [13]:
def round_to_digits(x, max_digits = 4):
    max_digits -= 1 # leading / trailing 0
    digits_leftof_dot = max(0, np.floor(np.log10(x)) + 1)
    round_to = max(0, max_digits - digits_leftof_dot)
    # print("round to", round_to)
    return round(x, int(round_to))

In [14]:
final_table = small_df.rename(columns=fname_map).set_index('index').iloc[:, 2:].T
for col in final_table.columns:
    final_table[col] = final_table[col].apply(round_to_digits)
    
final_table

index,raw_amount-wasser,td-wasser,tcode_1-grams_mean-jsd,tcode_3-grams_mean-jsd,tcode_3-grams_mean-jac,dates_1-grams_mean-jsd,*r-joint-tcode-raw_amount_wasser,*r-joint-tcode-dates-amt_wasser
generated_data/tg.csv,1931.0,1.05,0.075,0.337,0.951,0.059,1906.0,3185.0
generated_data/bf-nc.csv,3580.0,1.48,0.158,0.411,0.974,0.006,3567.0,3575.0
generated_data/dg.csv,1939.0,5.28,0.007,0.132,0.547,0.09,2003.0,3350.0
generated_data/tf-v.csv,4726.0,2.38,0.185,0.445,0.973,0.059,4590.0,4922.0
generated_data/bf.csv,2102.0,0.601,0.004,0.042,0.543,0.011,2070.0,2142.0
generated_data/bf-nd.csv,3705.0,1.18,0.009,0.059,0.691,0.059,3684.0,4471.0
