In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import os.path as osp
from itertools import cycle
from scipy.stats import norm

import bokeh.plotting as blt
from bokeh.io import output_notebook
from bokeh.resources import INLINE
output_notebook(INLINE)
from bokeh.palettes import Inferno11, Inferno7
from bokeh.models.glyphs import Text

In [None]:
# Load the per-split scores of every experiment found under ``results_dir``.
#
# Produces three parallel/module-level containers used by the cells below:
#   exp_names   - experiment directory names, in load order
#   df_list     - one scores DataFrame per experiment (same order as exp_names)
#   score_names - names of all columns that do not start with "train_"
results_dir = "results/100splits_15epochs"
exp_names = []
df_list = []
score_names = set()
# os.listdir() returns entries in arbitrary, platform-dependent order, while
# later cells select experiments by position. Sorting makes the order (and
# therefore plot order and colours) reproducible across machines.
for exp_name in sorted(os.listdir(results_dir)):
    scores_path = osp.join(results_dir, exp_name, "scores.csv")
    # Skip stray files and experiment directories without a scores file.
    if not osp.isfile(scores_path):
        continue

    df = pd.read_csv(scores_path)
    # "Unnamed: 0" is presumably an index column written by DataFrame.to_csv;
    # drop it when present, without mutating the frame in place.
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")
    # Collect score names before the bookkeeping column below is added, so
    # "exp_name" never ends up being treated as a score.
    score_names.update(col for col in df.columns if not col.startswith("train_"))
    df["exp_name"] = exp_name  # scalar is broadcast to every row
    df_list.append(df)
    exp_names.append(exp_name)

In [None]:
# Display the "student_nn" experiment under the name "copycat_nn" and exclude
# the "average_precision_macro" score from the plots.
# Both steps are written so that re-running this cell is a no-op instead of
# raising ValueError / KeyError once the changes have already been applied.
if "student_nn" in exp_names:
    exp_names[exp_names.index("student_nn")] = "copycat_nn"
score_names.discard("average_precision_macro")

In [None]:
def _split_by_set(scores_df):
    """Split one experiment's scores into per-set frames.

    Columns whose name contains "train_" form the train frame (with that
    substring removed so the names line up with the test columns); all other
    columns form the test frame. The "diff" frame is train minus test, or an
    empty frame when either side has no data.

    Returns a dict keyed by "test set", "train set" and "diff".
    """
    test_df = scores_df[[col for col in scores_df.columns if "train_" not in col]]
    train_df = scores_df[[col for col in scores_df.columns if "train_" in col]]
    train_df = train_df.rename(columns=lambda col: col.replace("train_", ""))
    if train_df.size != 0 and test_df.size != 0:
        diff_df = train_df - test_df
    else:
        diff_df = pd.DataFrame()
    return {"test set": test_df, "train set": train_df, "diff": diff_df}


# One figure per (set, score): a density histogram per experiment with a
# fitted normal curve and a mean / standard-deviation label.
palette = Inferno7
bins = 8

# Reorder the loaded experiments for display. Plain list indexing is used
# because np.array() over (str, DataFrame) pairs builds a ragged array, which
# raises ValueError on NumPy >= 1.24. Like the original, this assumes at
# least three experiments were loaded.
experiments = list(zip(exp_names, df_list))
# The split does not depend on the score or set being plotted, so do it once
# per experiment instead of once per figure.
to_plot = [(experiments[i][0], _split_by_set(experiments[i][1])) for i in (0, 2, 1)]

for set2plot in ["test set", "train set", "diff"]:
    for score_name in score_names:
        text_glyphs = []
        fig = blt.figure(title=score_name + ': ' + set2plot, width=600, height=400)
        color_gen = cycle(palette)
        for exp_name, frames_by_set in to_plot:
            set_df = frames_by_set[set2plot]
            if set_df.size == 0 or score_name not in set_df.columns:
                continue
            # np.histogram cannot derive a range from NaN values; mean() and
            # std() already ignore them, so drop them up front.
            values = set_df[score_name].dropna()
            if values.empty:
                continue

            hcounts, hedges = np.histogram(values, bins=bins, density=True)
            # A colour is consumed only for experiments that are actually drawn.
            curr_color = next(color_gen)
            fig.quad(top=hcounts, left=hedges[:-1], right=hedges[1:], bottom=0,
                     color=curr_color, alpha=0.5, legend=exp_name)

            loc_pdf = values.mean()
            scale_pdf = values.std()
            # With a single sample (std is NaN) or constant scores (std is 0)
            # the normal pdf is undefined, so skip the curve and its label.
            if not (np.isfinite(scale_pdf) and scale_pdf > 0):
                continue

            x_pdf = np.linspace(hedges[0], hedges[-1])
            y_pdf = norm.pdf(x_pdf, loc=loc_pdf, scale=scale_pdf)
            fig.line(x_pdf, y_pdf, color=curr_color, line_width=3, alpha=0.7)
            # "%.0f" rounds the mean; the previous "%d" truncated it
            # (e.g. 89.9 was shown as 89).
            # NOTE(review): y=-6 is a hard-coded offset, presumably chosen to
            # place the label below the x axis - confirm for other data.
            _text = Text(x=loc_pdf-0.5*scale_pdf, y=-6, text_color=curr_color, text_font_style="bold",
                         text=['\u03bc=%.0f%%\n\u03c3=%.1f%%' % (loc_pdf*100, scale_pdf*100)])
            text_glyphs.append(_text)

        for _text in text_glyphs:
            fig.add_glyph(_text)

        fig.legend.location = "top_left"
        blt.show(fig)

# OLD

In [None]:
# Deliberately abort "Run All" here so the older plotting cells below are not
# executed. A bare `raise` outside an except block also raises RuntimeError,
# but with the misleading message "No active exception to reraise".
raise RuntimeError("Stopping here: the cells below are old experiments and are not meant to run.")

In [None]:
# For every score, overlay normalised histograms of the test, train and
# train-minus-test values of every experiment in a single figure.
palette = Inferno11
bins = 10
for score_name in score_names:
    fig = blt.figure(title=score_name, width=600, height=400)
    color_gen = cycle(palette)
    for base_name, exp_scores in zip(exp_names, df_list):
        train_cols = [col for col in exp_scores.columns if "train_" in col]
        other_cols = [col for col in exp_scores.columns if "train_" not in col]
        test_frame = exp_scores[other_cols]
        train_frame = exp_scores[train_cols]
        # Remove the marker so train and test columns share the same names.
        train_frame.columns = [col.replace("train_", "") for col in train_cols]
        if train_frame.size != 0 and test_frame.size != 0:
            diff_frame = train_frame - test_frame
        else:
            diff_frame = pd.DataFrame()

        variants = [
            (base_name + "_testset", test_frame),
            (base_name + "_trainset", train_frame),
            (base_name + "_diff", diff_frame),
        ]
        for label, frame in variants:
            if frame.size == 0 or score_name not in frame.columns:
                continue
            counts, edges = np.histogram(frame[score_name], bins=bins)
            # Convert raw counts into fractions of the total.
            fractions = counts / float(counts.sum())
            fig.quad(top=fractions, left=edges[:-1], right=edges[1:], bottom=0,
                     color=next(color_gen), alpha=0.6, legend=label)

    fig.legend.location = "top_left"
    blt.show(fig)

In [None]:
# For every score, overlay fine-grained normalised histograms of the
# train-minus-test difference of each experiment.
# Plain indexing keeps the colours as ordinary strings (no numpy round-trip).
palette = [Inferno7[i] for i in (0, 2, 4)]
bins = 100

# Reorder the loaded experiments for display. List indexing is used because
# np.array() over (str, DataFrame) pairs builds a ragged array, which raises
# ValueError on NumPy >= 1.24. Like the original, this assumes at least three
# experiments were loaded.
experiments = list(zip(exp_names, df_list))
to_plot = [experiments[i] for i in (0, 2, 1)]

for score_name in score_names:
    fig = blt.figure(title=score_name, width=600, height=400)
    color_gen = cycle(palette)
    for exp_name, scores_df in to_plot:
        print(exp_name)
        test_df = scores_df[[col for col in scores_df.columns if "train_" not in col]]
        train_df = scores_df[[col for col in scores_df.columns if "train_" in col]]
        # Remove the marker so train and test columns share the same names.
        train_df.columns = [col.replace("train_", "") for col in train_df.columns]
        diff_df = pd.DataFrame()
        if train_df.size != 0 and test_df.size != 0:
            diff_df = train_df - test_df

        # Only the difference is plotted here; skip experiments lacking either
        # train or test values for this score.
        if diff_df.size != 0 and score_name in diff_df.columns:
            hcounts, hedges = np.histogram(diff_df[score_name], bins=bins)
            # Convert raw counts into fractions of the total.
            hcounts = hcounts / float(hcounts.sum())
            fig.quad(top=hcounts, left=hedges[:-1], right=hedges[1:], bottom=0,
                     color=next(color_gen), alpha=0.5, legend=exp_name + "_diff")

    fig.legend.location = "top_left"
    blt.show(fig)