In [66]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tabulate import tabulate

from dataset.msc_summary import MSC_Summaries

In [2]:
# Candidate values per setting. Not referenced by any other cell visible in
# this notebook; kept as a record of the available options.
configs_options = {
    "speaker_prefixes": [None, ["<other>", "<self>"]],
    "nofact_token": ['', "<nofact>"],
}

# Directory holding the MSC persona-summary files. The original author's local
# path remains the default, but it can be overridden without editing the
# notebook by setting the MSC_SUMMARY_BASEDIR environment variable.
# os.path.join(..., "") guarantees a trailing separator, matching the shape of
# the default value (how MSC_Summaries combines basedir with file names is not
# visible here, so the trailing separator is preserved to be safe).
basedir = os.path.join(
    os.environ.get(
        "MSC_SUMMARY_BASEDIR",
        "/Users/FrankVerhoef/Programming/PEX/data/msc/msc_personasummary/",
    ),
    "",
)

# Subsets that are loaded for each session; session 4 lists no 'train' subset.
subsets = {
    1: ['train', 'valid', 'test'],
    2: ['train', 'valid', 'test'],
    3: ['train', 'valid', 'test'],
    4: ['valid', 'test']
}

In [3]:
# Named configurations; each entry is unpacked as keyword arguments into
# MSC_Summaries.set() further down.
configs = dict(
    default=dict(
        speaker_prefixes=["<other>", "<self>"],
        nofact_token='',
    ),
)


In [4]:
# Pass the "default" configuration to MSC_Summaries.set() on the class itself,
# before any dataset instance is created in the next cell.
# NOTE(review): presumably this stores class-wide settings used when loading —
# confirm in dataset.msc_summary.
MSC_Summaries.set(**configs['default'])

In [5]:
# Load one MSC_Summaries dataset per (session, subset) pair listed in
# `subsets`, stored as msc_summaries[session][subset].
msc_summaries = {
    session: {
        subset: MSC_Summaries(basedir=basedir, session=session, subset=subset)
        for subset in subset_names
    }
    for session, subset_names in subsets.items()
}

In [6]:
# Measurements of every loaded dataset, stored as m[session][subset] and
# mirroring the layout of `msc_summaries`.
m = {
    session: {
        subset: msc_summaries[session][subset].measurements()
        for subset in subset_names
    }
    for session, subset_names in subsets.items()
}

## Show a few examples

In [7]:
# Print the first ten items of the session-1 training set.
for i in range(10):
    sample = msc_summaries[1]['train'][i]
    print(sample)

(["<self>I need some advice on where to go on vacation, have you been anywhere lately?\n<other>I have been all over the world. I'm military.", '<self>That is good you have alot of travel experience\n<other>Sure do. And a lot of experience blowing things up! Haha. Bora bora is nice.', "<self>I've been working non stop crazy hours and need a break.\n<other>The best breaks are spent with cute cuddly kittens.", '<self>Bora bora sounds nice, you have been there before?\n<other>Nope... Just sounds nice, and repetitive. Bora... Bora. Ha!', '<self>Kittens really? I rather be at the beach.\n<other>Only if the beach was covered in kittens!', '<self>That would be a sight to see.\n<other>Or maybe brownies... I love chocolate.', "<self>I love brownies too but I haven't quite perfected mine yet.\n<other>Well I'm available to taste test!"], "I served or serve in the military.\nI've traveled the world.\nI've blown things up.\nI've never been to Bora Bora.\nI love chocolate.")
(["<self>Hello! What are 

In [8]:
# Print the first ten items of the session-4 validation set.
for i in range(10):
    sample = msc_summaries[4]['valid'][i]
    print(sample)

(["<self>Do you have any new stunt double jobs coming  up?\n<other>No, I don't have any at this time.", '<self>You must have a lot of free time to read then.  Are you reading anything good now?\n<other>I am reading The Butterfly Garden book, and it is good.  How was Fatal Charm?', "<self>It was very good, I love true crime books!  I'm definitely on the look out for something similar.  I spend a lot of time in the library, so I'll find something.\n<other>I also love True Crime!  Do you watch movies about True Crime as well?", "<self>Of course, I'll watch or read anything true crime related.  My kindle is just filled with crime books.  What was your experience with The Butterfly Garden?\n<other>It was so creepy it still enters my dreams.  I see you like Steven King movies, but what about his books?", "<self>I love the books more!  I'll never watch a Steven King movie until I've read the book.  I think the IT movies may have ruined clowns for me forever.\n<other>I did not see IT, but hear

## Measurements overview with a pandas DataFrame

In [11]:
# Collect all item measurements in a dataframe
#
# Each (session, subset) contributes one frame built from its
# "allitem_measurements" records, tagged with the session and subset it came
# from. The frames are gathered in a list and concatenated ONCE: concatenating
# inside the loop copies the accumulated frame on every iteration, and seeding
# the result with an empty typed frame makes the final dtypes depend on how
# pandas promotes empty inputs.

# Column names of a single measurement record. The name (including its
# spelling) is kept because it is a notebook-level variable that other cells
# may reference.
df_colums = list(m[1]['train']["allitem_measurements"][0].keys())

subset_frames = []

# NOTE: `session` and `subset` deliberately stay plain loop variables. Later
# plotting cells read `session` without assigning it, so they depend on the
# value this loop leaves behind.
for session in subsets.keys():
    for subset in subsets[session]:
        subset_df = pd.DataFrame.from_dict(m[session][subset]["allitem_measurements"])

        subset_df["session"] = int(session)
        subset_df["subset"] = subset

        subset_frames.append(subset_df)

# No ignore_index: as before, every subset keeps its own 0..n-1 row labels.
df = pd.concat(subset_frames)

# Cast kept from the original cell so `session` is an integer column
# regardless of how concat promoted it.
df["session"] = df["session"].astype('int')
df.head()


Unnamed: 0,dialog_id,convai_id,inputsentences,inputwords,labelwords,labelsentences,session,subset
0,0,train:ordered_3537,7,132,24,5,1,train
1,1,train:ordered_374,8,166,27,5,1,train
2,2,train:ordered_6926,6,123,21,5,1,train
3,3,train:ordered_6883,8,187,30,6,1,train
4,4,train:ordered_5668,7,159,23,5,1,train


In [12]:
# Per (session, subset): number of dialogues plus mean and standard deviation
# of the word and sentence counts of inputs and labels.

df.groupby(["session", "subset"]).agg(
    {
        'dialog_id': ['count'],
        'inputwords': ['mean', 'std'],
        'labelwords': ['mean', 'std'],
        'inputsentences': ['mean', 'std'],
        'labelsentences': ['mean', 'std'],
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,dialog_id,inputwords,inputwords,labelwords,labelwords,inputsentences,inputsentences,labelsentences,labelsentences
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,mean,std,mean,std,mean,std
session,subset,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,test,501,144.301397,19.696167,35.307385,14.267212,6.558882,0.568354,6.357285,2.216775
1,train,4000,157.48125,23.923038,31.498,12.611017,7.48675,0.835164,5.95625,2.183467
1,valid,500,169.402,19.150192,37.976,14.460208,7.768,0.550256,6.932,2.342296
2,test,501,274.984032,75.115882,38.371257,21.222203,5.918164,0.373215,5.682635,2.635352
2,train,4000,273.887,90.725476,32.62075,18.1827,5.76925,0.650469,5.347,2.520551
2,valid,500,279.748,93.249748,38.738,20.717814,5.88,0.440441,5.822,2.600739
3,test,501,292.087824,75.504333,41.702595,23.535789,5.9002,0.407455,6.215569,3.144748
3,train,2285,298.512035,89.501727,36.600875,20.053693,5.885339,0.437894,5.64814,2.654763
3,valid,500,281.026,85.785708,34.674,18.029472,5.87,0.457732,5.424,2.509346
4,test,501,309.672655,86.254093,41.982036,23.47777,5.918164,0.362339,6.045908,2.966461


In [79]:
# Compact overview for the report: one row per session, with the dialogue
# count and mean word counts grouped per subset, printed both as plain text
# and as LaTeX.

table_df = df.groupby(["session", "subset"]).agg(
    {'dialog_id': ['count'], 'inputwords': ['mean'], 'labelwords': ['mean']}
)
# Move `subset` from the row index into the columns ...
table_df = table_df.unstack()
# ... and rotate the column levels so that subset becomes the outermost one.
table_df = table_df.swaplevel(1, 2, axis=1)
table_df = table_df.swaplevel(0, 1, axis=1)
# Fix the subset order of the column groups.
table_df = table_df[['train', 'valid', 'test']]

print(tabulate(table_df))
print(table_df.to_latex(na_rep='-', float_format="%.0f"))



-  ----  -------  --------  ---  -------  ------  ---  -------  -------
1  4000  157.481   31.498   500  169.402  37.976  501  144.301  35.3074
2  4000  273.887   32.6208  500  279.748  38.738  501  274.984  38.3713
3  2285  298.512   36.6009  500  281.026  34.674  501  292.088  41.7026
4   nan  nan      nan       500  287.86   40.104  501  309.673  41.982
-  ----  -------  --------  ---  -------  ------  ---  -------  -------
\begin{tabular}{lrrrrrrrrr}
\toprule
subset & \multicolumn{3}{l}{train} & \multicolumn{3}{l}{valid} & \multicolumn{3}{l}{test} \\
{} & dialog\_id & inputwords & labelwords & dialog\_id & inputwords & labelwords & dialog\_id & inputwords & labelwords \\
{} &     count &       mean &       mean &     count &       mean &       mean &     count &       mean &       mean \\
session &           &            &            &           &            &            &           &            &            \\
\midrule
1       &      4000 &        157 &         31 &       500 &   

  print(table_df.to_latex(na_rep='-', float_format="%.0f"))


In [None]:
# Scatter plot of label words against input words, one colour per subset.
#
# NOTE(review): `session` is not assigned in this cell (an earlier
# `session=1` assignment was commented out), so the plot shows whichever
# session value an earlier executed cell left in the kernel.

fig, ax = plt.subplots(figsize=(4, 4))
color = dict(train='blue', valid='orange', test='green')

for subset in subsets[session]:
    selection = df["session"].eq(session) & df["subset"].eq(subset)
    df_selection = df.loc[selection]
    scatter_ax = df_selection.plot.scatter(
        x='inputwords',
        y='labelwords',
        ax=ax,
        label=subset,
        c=color[subset],
        alpha=0.3,
        xlim=(0, 400),
        ylim=(0, 100),
    )


In [None]:
# Scatter plot of label sentences against input words, one colour per subset.
# NOTE(review): `session` is not assigned in this cell; it uses the value
# left in the kernel by an earlier executed cell.
fig, ax = plt.subplots(figsize=(4, 4))
color = dict(train='blue', valid='orange', test='green')

for subset in subsets[session]:
    selection = df["session"].eq(session) & df["subset"].eq(subset)
    df_selection = df.loc[selection]
    scatter_ax = df_selection.plot.scatter(
        x='inputwords',
        y='labelsentences',
        ax=ax,
        label=subset,
        c=color[subset],
        alpha=0.3,
        xlim=(0, 400),
        ylim=(0, 20),
    )


In [None]:
# Scatter plot of label sentences against input sentences, one colour per
# subset.
# NOTE(review): `session` is not assigned in this cell; it uses the value
# left in the kernel by an earlier executed cell.
fig, ax = plt.subplots(figsize=(4, 4))
color = dict(train='blue', valid='orange', test='green')

for subset in subsets[session]:
    selection = df["session"].eq(session) & df["subset"].eq(subset)
    df_selection = df.loc[selection]
    scatter_ax = df_selection.plot.scatter(
        x='inputsentences',
        y='labelsentences',
        ax=ax,
        label=subset,
        c=color[subset],
        alpha=0.3,
        xlim=(0, 20),
        ylim=(0, 20),
    )


In [None]:
# Number of rows in `df_selection` whose subset is 'valid'.
# NOTE(review): `df_selection` is whatever the most recently executed scatter
# cell left behind from its last loop iteration (a single subset), so this
# looks like a leftover debugging check — confirm whether it is still needed.
len(df_selection[df_selection['subset'] == 'valid'])

In [None]:
## Using matplotlib - horizontal

# Grid with one row per session and two columns; the left column is given
# 10/4 the width of the right one and all panels share the y axis.
fig, ax = plt.subplots(ncols=2, nrows=len(subsets.keys()), figsize=(12, 12), gridspec_kw={'width_ratios': [10, 4]}, sharey=True)

def plot_hist_bar(ax, values, session, title, bins, range):
    """Draw a density histogram with one series per subset on `ax`.

    A later cell in this notebook defines another function with the same name
    (taking `x_range` instead of `range`); whichever cell ran last wins.

    Args:
        ax: Axes-like object to draw on; returned unchanged.
        values: list with one array per subset of `subsets[session]`, in the
            same order. Each element must support len(), .mean() and .std().
        session: session number, used to look up the subset names in the
            notebook-level `subsets` dict.
        title: title set on the axes.
        bins: forwarded to `ax.hist` as `bins`.
        range: forwarded to `ax.hist` as `range` (the parameter name shadows
            the builtin inside this function; it is part of the call
            signature used by the cell below).

    Returns:
        The `ax` that was passed in.
    """
    labels = []
    for subset, vals in zip(subsets[session], values):
        labels.append(
            f"{subset}: n={len(vals)}, " + r"$\mu$" + f"={vals.mean():.1f}, " + r"$\sigma$" + f"={vals.std():.1f}"
        )

    if session == 4:
        # Session 4 gets a dummy first series: a '-' legend entry and a single
        # value of -1 (outside the ranges the callers plot). Presumably this
        # stands in for the absent 'train' subset so the remaining subsets keep
        # their position in the series order — confirm.
        labels = ['-'] + labels
        values = [-1] + values

    ax.hist(values, bins=bins, range=range, density=True, label=labels)
    ax.legend()
    ax.set_title(title)
    return ax

# One grid row per session: input word counts on the left, label word counts
# on the right. The '*_per_sample' measurements are iterated as
# (value, frequency) pairs and expanded into flat arrays of repeated values,
# one array per subset.
for i, session in enumerate([1, 2, 3, 4]):
    m_input = [
        np.hstack([[value] * count for value, count in m[session][name]['inputwords_per_sample']])
        for name in subsets[session]
    ]
    m_label = [
        np.hstack([[value] * count for value, count in m[session][name]['labelwords_per_sample']])
        for name in subsets[session]
    ]
    bar_axes = plot_hist_bar(ax[i][0], m_input, session, f"Session={session}, input", bins=25, range=(0, 500))
    bar_axes = plot_hist_bar(ax[i][1], m_label, session, f"Session={session}, label", bins=10, range=(0, 100))

fig.suptitle("Distribution of number of words per input (all words in the input utterances), and label (extracted facts)")
fig.tight_layout()

In [None]:
## Using matplotlib - horizontal

# Grid with one row per session and two equally wide columns; all panels
# share the y axis.
fig, ax = plt.subplots(ncols=2, nrows=len(subsets.keys()), figsize=(10, 12), gridspec_kw={'width_ratios': [4, 4]}, sharey=True)

def plot_hist_bar(ax, values, session, title, bins, x_range):
    """Draw a density histogram with one series per subset and integer x ticks.

    An earlier cell in this notebook defines a function with the same name
    (taking `range` instead of `x_range`); this definition replaces it once
    this cell has run.

    Args:
        ax: Axes-like object to draw on; returned unchanged.
        values: list with one array per subset of `subsets[session]`, in the
            same order. Each element must support len(), .mean() and .std().
        session: session number, used to look up the subset names in the
            notebook-level `subsets` dict.
        title: title set on the axes.
        bins: integer number of bins; also determines the tick spacing.
        x_range: (lower, upper) pair of integers, forwarded to `ax.hist` as
            `range` and used for the tick positions.

    Returns:
        The `ax` that was passed in.
    """
    labels = [
        f"{subset}: n={len(vals)}, " + r"$\mu$" + f"={vals.mean():.1f}, " + r"$\sigma$" + f"={vals.std():.1f}"
        for subset, vals in zip(subsets[session], values)
    ]

    # One tick per bin width, starting at the lower bound. The step is clamped
    # to at least 1: with fewer integer positions than bins the floor division
    # yields 0, which range() rejects with a ValueError.
    lower, upper = x_range
    tick_step = max(1, (upper - lower) // bins)
    ax.set_xticks(range(lower, upper, tick_step))

    if session == 4:
        # Session 4 gets a dummy first series: a '-' legend entry and a single
        # value of -1 (outside the ranges the callers plot). Presumably this
        # stands in for the absent 'train' subset so the remaining subsets keep
        # their position in the series order — confirm.
        labels = ['-'] + labels
        values = [-1] + values

    ax.hist(values, bins=bins, range=x_range, density=True, label=labels)
    ax.legend()
    ax.set_title(title)
    return ax

# One grid row per session: number of input sentences on the left, number of
# label sentences on the right, with one value array per subset taken from
# the matching rows of `df`.
for i, session in enumerate([1, 2, 3, 4]):
    m_input, m_label = [], []
    for subset in subsets[session]:
        selection = df["session"].eq(session) & df["subset"].eq(subset)
        m_input.append(df.loc[selection, 'inputsentences'].values)
        m_label.append(df.loc[selection, 'labelsentences'].values)
    bar_axes = plot_hist_bar(ax[i][0], m_input, session, f"Session={session}, input", bins=10, x_range=(0, 10))
    bar_axes = plot_hist_bar(ax[i][1], m_label, session, f"Session={session}, label", bins=10, x_range=(0, 20))

fig.suptitle("Distribution of number of input utterances, and number of extracted facts")
fig.tight_layout()