# Visualize samples from msc_sessions dataset

This notebook is meant for analysis and visualization of the dialogues from the Multi-Session Chat dataset.

## Import libraries and load dataset

In [1]:
import itertools
import json
import random
from collections import Counter

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from dataset.msc_sessions import MSC_Session
from dataset.msc_speechact import MSC_SpeechAct
from models.speechact_clf import SpeechactClassifier

In [2]:
# Specify options for configuration of the dataset

# Candidate values per class-level option.
# NOTE(review): `configs_options` is not referenced anywhere else in the visible part of this notebook.
configs_options = {
    "speaker_prefixes": [None, ["<other>", "<self>"]],
}
# NOTE(review): absolute, machine-specific paths; as written the notebook only runs on the author's machine.
basedir = "/Users/FrankVerhoef/Programming/PEX/data/msc/msc_dialogue/"
checkpoint_dir = "/Users/FrankVerhoef/Programming/PEX/checkpoints/"
# Maps session number -> dataset splits that are loaded for that session.
# Session 5 lists no 'train' split.
subsets = {
    1: ['train', 'valid', 'test'],
    2: ['train', 'valid', 'test'],
    3: ['train', 'valid', 'test'],
    4: ['train', 'valid', 'test'],
    5: ['valid', 'test']
}

In [3]:
# Define a set of configs to choose from

# Class-level settings that are passed to MSC_Session.set(**config).
# Note that building this dict constructs the SpeechactClassifier right away
# (the checkpoint is loaded when this cell runs, even if only 'default' is used).
configs = {
    "default": {
        "speaker_prefixes": ["<other>", "<self>"],
        "sessionbreak_token": "<sessionbreak>",
        "speechact_classifier": None
    },
    "speechacts": {
        "speaker_prefixes": ["<other>", "<self>"],
        "sessionbreak_token": "<sessionbreak>",
        "speechact_classifier": SpeechactClassifier(checkpoint_dir=checkpoint_dir, modelname="trained_speechact_bert")
    }
}

# Per-dataset keyword arguments for MSC_Session(...): which context is included in each sample.
variants = {
    "no_persona_no_hist": {"include_persona": False, "include_history": False},
    "persona_no_hist": {"include_persona": True, "include_history": False},
    "persona_and_hist": {"include_persona": True, "include_history": True},
}

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Set class-level configuration
# The 'default' config is applied here (speaker prefixes, session-break token, no speech-act classifier).

MSC_Session.set(**configs['default'])

In [5]:
# Load dataset with specified configuration
# Result layout: msc_sessions[session (int)][variant name][subset name] -> MSC_Session

max_samples = None # 100
msc_sessions = {}
for session in subsets.keys():
    # Session 1 is loaded under the compound name "1-both-revised"; every other session
    # is addressed by its plain number. Keeping the loop variable an int (instead of
    # rebinding it to the string and recovering the number with int(str(session)[0]))
    # also keeps this correct for session numbers with more than one digit.
    version = ['both', 'revised']
    session_name = '-'.join(['1'] + version) if session == 1 else session
    msc_sessions[session] = {
        option_name: {
            subset: MSC_Session(
                basedir=basedir,
                session=session_name,
                subset=subset,
                max_samples=max_samples,
                **variants[option_name],
            )
            for subset in subsets[session]
        }
        for option_name in variants.keys()
    }


2023-09-02 17:33:35,612 INFO     | Init ConvAI2 with basedir=/Users/FrankVerhoef/Programming/PEX/data/msc/msc_dialogue/ConvAI2/, version=['both', 'revised'], subset=train
2023-09-02 17:33:35,614 INFO     | For ConvAI2 dataset, use 90% of train dataset for training (rest is available as validation dataset)
2023-09-02 17:33:36,252 INFO     | Read 16090 dialogues from ConvAI2 for train dataset
2023-09-02 17:33:36,352 INFO     | Init ConvAI2 with basedir=/Users/FrankVerhoef/Programming/PEX/data/msc/msc_dialogue/ConvAI2/, version=['both', 'revised'], subset=valid
2023-09-02 17:33:36,352 INFO     | For ConvAI2 dataset, use 10% of train dataset as validation dataset
2023-09-02 17:33:37,049 INFO     | Read 1788 dialogues from ConvAI2 for valid dataset
2023-09-02 17:33:37,060 INFO     | Init ConvAI2 with basedir=/Users/FrankVerhoef/Programming/PEX/data/msc/msc_dialogue/ConvAI2/, version=['both', 'revised'], subset=test
2023-09-02 17:33:37,061 INFO     | For ConvAI2 dataset, use validation datas

In [22]:
# Measurements of every loaded dataset, laid out as m[session][variant][subset]
m = {
    session_id: {
        variant_name: {
            split: msc_sessions[session_id][variant_name][split].measurements()
            for split in session_splits
        }
        for variant_name in variants
    }
    for session_id, session_splits in subsets.items()
}

## Show a few examples

In [18]:
# Example for dataset format in session 1
# Print the keys of the first raw training dialogue, then display the full record.
print("keys :", msc_sessions[1]['no_persona_no_hist']['train'].dialogues[0].keys())
msc_sessions[1]['no_persona_no_hist']['train'].dialogues[0]

keys : dict_keys(['init_personas', 'dialog'])


{'init_personas': [['my favorite hobbies are based on old fashioned life skills.',
   'i race large felines who are in captivity to remain healthy.',
   'i was a really good runner when i was younger.',
   'i am a carnivore.'],
  ['i love to redesign houses.',
   'killing for sport is my hobby.',
   'i shot an arrow the other day !.',
   'i like to get dressed up.']],
 'dialog': [{'text': "hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .",
   'id': 'Speaker 1'},
  {'text': 'you must be very fast . hunting is one of my favorite hobbies .',
   'id': 'Speaker 2'},
  {'text': 'i am ! for my hobby i like to do canning or some whittling .',
   'id': 'Speaker 1'},
  {'text': 'i also remodel homes when i am not out bow hunting .',
   'id': 'Speaker 2'},
  {'text': "that's neat . when i was in high school i placed 6th in 100m dash !",
   'id': 'Speaker 1'},
  {'text': "that's awesome . do you have a favorite season or time of year ?",
   'id': 'Speaker 2'

In [19]:
# Example for dataset format in session 3
# Print the keys of the first raw training dialogue, then display the full record
# (per the recorded output, session-3 records carry more keys than session-1 records).
print("keys :", msc_sessions[3]['no_persona_no_hist']['train'].dialogues[0].keys())
msc_sessions[3]['no_persona_no_hist']['train'].dialogues[0]

keys : dict_keys(['personas', 'dialog', 'metadata', 'previous_dialogs', 'init_personas'])


{'personas': [["I am a mechanical engineer. I've been working a lot of extra hours. I want to break from my non-stop work.",
   'I like going to the beach.',
   'I love brownies.',
   'My cousin bought a house with unsafe wiring.',
   'I have never had to replace all the wiring in my house. I value home safety.'],
  ["I used to serve in the military. I've traveled the world.",
   "I've blown things up.",
   "I've never been to Bora Bora.",
   'I love chocolate.',
   'I am now an electrical engineer working on wiring and generators. I learned these skills in the military. I like my job, and am good at it. I test and troubleshoot equipment to ensure safety.',
   'I have children.']],
 'dialog': [{'text': "I've booked myself a week long vacation from work next month!",
   'id': 'Speaker 1',
   'convai2_id': 'train:ordered_3537'},
  {'text': 'How nice!  Where are you going?',
   'id': 'Speaker 2',
   'convai2_id': 'train:ordered_3537'},
  {'text': "Did not decided yet, I'd like to get an i

In [12]:
# Print the first 10 samples of session 1 / train, variant without persona and history.
for i in range(10):
    print(msc_sessions[1]['no_persona_no_hist']['train'][i])

("<sessionbreak>new session\n<other>hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .\n<self>you must be very fast . hunting is one of my favorite hobbies .\n<other>i am ! for my hobby i like to do canning or some whittling .\n<self>i also remodel homes when i am not out bow hunting .\n<other>that's neat . when i was in high school i placed 6th in 100m dash !\n<self>that's awesome . do you have a favorite season or time of year ?\n<other>i do not . but i do have a favorite meat since that is all i eat exclusively .\n<self>what is your favorite meat to eat ?\n<other>i would have to say its prime rib . do you have any favorite foods ?\n<self>i like chicken or macaroni and cheese .\n<other>do you have anything planned for today ? i think i am going to do some canning .\n<self>i am going to watch football . what are you canning ?\n<other>i think i will can some jam . do you also play footfall for fun ?\n", '<self>if i have time outside of hunting and 

In [13]:
# Print the first 10 samples of session 2 / train, variant with persona and history.
for i in range(10):
    print(msc_sessions[2]['persona_and_hist']['train'][i])

("<sessionbreak>personas\n<self>I work in the military.\n<self>I've been all over the world.\n<self>I like things that explode.\n<self>I also like kittens.\n<self>Brownies are my favorite dessert.\n<self>I served or serve in the military. I've traveled the world.\n<self>I've blown things up.\n<self>I've never been to Bora Bora.\n<self>I love chocolate.\n<other>I've been working a lot of extra hours. I want to break from my non-stop work.\n<other>I like going to the beach.\n<other>I love brownies.\n<sessionbreak>2 days ago\n<other>I need some advice on where to go on vacation, have you been anywhere lately?\n<self>I have been all over the world. I'm military.\n<other>That is good you have alot of travel experience\n<self>Sure do. And a lot of experience blowing things up! Haha. Bora bora is nice.\n<other>I've been working non stop crazy hours and need a break.\n<self>The best breaks are spent with cute cuddly kittens.\n<other>Bora bora sounds nice, you have been there before?\n<self>Nop

In [14]:
# Print the first 10 samples of session 4 / valid, variant with persona and history.
for i in range(10):
    print(msc_sessions[4]['persona_and_hist']['valid'][i])

("<sessionbreak>personas\n<self>I read twenty books a year.\n<self>I'm a stunt double as my second job.\n<self>I only eat kosher.\n<self>I was raised in a single parent household.\n<self>I have two jobs.\n<self>My favorite hobby is reading. I've read 20 books this year.\n<self>I work in the movies. I work as a stunt double.\n<self>I help out my mom. She is my only surviving parent. I am like my mom.\n<self>I shop at Home Depot.\n<self>Shawshank was a boring yet amazing book. I like Stephen King adaptations. I want to be in one of the movies. I am haunted by The Butterfly Garden book.\n<self>I have kindle unlimited and the libby app. Billy Straight is an amazing book. I have Amazon Prime. I like Dr. Seuss's Old Hat New Hat book.\n<self>I was a stunt double for the actor playing Rocky Balboa in a remake of Rocky.\n<other>I like cooler weather.\n<other>My son is in junior high.\n<other>I used to work in human services. My wife goes to work. I stay at home.\n<other>My dad worked at Home De

## Speechacts

In [None]:
# Selection that is reused by the speech-act cells below
sessions = [3]
variant = "no_persona_no_hist"
subset = 'train'

# Reference statistics: pair every sample's index record with its history, in random order,
# and hand the pairs to MSC_Session.calc_speechact_stats.
# NOTE(review): session 3 and 'train' are hard-coded here rather than taken from `sessions` / `subset`,
# and the permutation is not seeded, so the order differs between runs.
ref_dataset = msc_sessions[3][variant]['train']
shuffled_order = np.random.permutation(len(ref_dataset))
shuffled_dialogues = [(ref_dataset.indices[i], ref_dataset.history[i]) for i in shuffled_order]
ref_stats, ref_selfchats_results = MSC_Session.calc_speechact_stats(shuffled_dialogues)
ref_stats

In [None]:
# Load previously saved measurements for session 3; this replaces any `m` computed earlier.
# NOTE(review): absolute, machine-specific path.
filename = '/Users/FrankVerhoef/Programming/PEX/notebooks/speechact_test_s3_100.json'
with open(filename, 'r') as f:
    # JSON object keys are always strings, so the session is stored under '3';
    # re-key it with the integer 3 to match how `m` is indexed in the other cells.
    m = {3: json.load(f)['3']}

In [None]:
# Inspect the nesting of the measurements: session -> variant -> subset
print(m.keys())
print(m[3].keys())
print(m[3]['no_persona_no_hist'].keys())
# Speech-act counts for the `variant` / `subset` selected in an earlier cell
Counter(m[3][variant][subset]['speechacts'])

In [None]:
from scipy.stats import chi2_contingency, wasserstein_distance

def chi_squared(observed, floor=0):
    """
    Run a chi-squared contingency test on `observed` after dropping sparse rows.

    observed: 2-D numpy array of counts.
    floor: rows whose total is not strictly greater than max(floor, 0) are removed,
        so all-zero rows are always dropped.
    Returns the result of scipy.stats.chi2_contingency on the remaining rows.
    """
    threshold = max(floor, 0)
    row_totals = observed.sum(axis=1)
    kept_rows = observed[row_totals > threshold]
    return chi2_contingency(kept_rows)

def normalize(counter):
    """
    Turn a mapping of counts into a mapping of relative frequencies.

    Each value is divided by the sum of all values; the keys are kept as they are.
    An empty mapping yields an empty dict.
    """
    total = sum(counter.values())
    fractions = {}
    for key, count in counter.items():
        fractions[key] = count / total
    return fractions

def speechact_sim(speechacts_1, speechacts_2):
    """
    Compare two dialogues by the relative frequency of their speech acts.

    Both arguments map speech act -> count; they are turned into distributions with
    normalize(). The result is the average of the squared differences between the two
    distributions, taken over all classes in MSC_SpeechAct.classes.

    Note that, despite the name, this is a distance: 0 means identical distributions
    and larger values mean less similar dialogues.
    """
    dist_1 = normalize(speechacts_1)
    dist_2 = normalize(speechacts_2)
    diff = np.array([dist_1.get(speechact, 0) - dist_2.get(speechact, 0) for speechact in MSC_SpeechAct.classes.keys()])
    # Average over the classes that were actually compared. This used to divide by the
    # length of the notebook-global `speechact_keys`, which is only defined in later cells
    # and is rebound there to a shorter list, so the result depended on execution order.
    sim = sum(diff * diff) / len(diff)
    return sim

# Smoke test of chi_squared on a small hand-made contingency table (3 rows, 2 columns)
observations = np.array([
    [13, 15],
    [30, 13],
    [44, 65]
])

print(chi_squared(observations))


In [None]:
# Chi-squared test applied to frequency of speechacts

subset_names = ['train', 'valid', 'test']

# All speech acts that occur in any of the compared subsets, in sorted order
sorted_acts = sorted(sum([Counter(m[s][variant][name]['speechacts']) for s in sessions for name in subset_names], Counter()).keys())
x_acts = np.arange(len(sorted_acts))

# One row per subset (train, valid, test), one column per speech act.
# The counts are taken from session 3 only, so this cell assumes sessions == [3].
data = np.array([
    [m[3][variant][name]["speechacts"].get(k, 0) for k in sorted_acts]
    for name in subset_names
])

# Pairwise tests. chi_squared filters on rows, so speech acts go on the rows (hence the transpose).
# Rows of `data`: 0 = train, 1 = valid, 2 = test. The train-valid and train-test results were
# previously computed on each other's row pairs and therefore shown under the wrong label.
chi_tv = chi_squared(data[np.array([0,1])].T, floor=0)  # train vs valid
chi_tt = chi_squared(data[np.array([0,2])].T, floor=0)  # train vs test
chi_vt = chi_squared(data[np.array([1,2])].T, floor=0)  # valid vs test

fig, ax = plt.subplots(figsize=(6,3))
# The loop variable is deliberately not called `subset`, so the selection made in an
# earlier cell is not overwritten by this chart.
for i, (s, name) in enumerate(itertools.product(sessions, subset_names)):
    offset = 0.2 * i - 0.2  # centre the three bars of a group on the tick
    ax.bar(x_acts + offset, height=data[i], width=0.2, label=f"{s}-{name}")

ax.set_xticks(x_acts)
ax.set_xticklabels([MSC_SpeechAct.classes[c] for c in sorted_acts], rotation=0)
ax.set_xlabel("Speech acts")
ax.legend()

# The mathtext part is a raw string: "\c" is not a valid escape sequence in a normal string.
fig.suptitle("Frequency of speech acts in sample of 100 dialogues\n" r"$\chi^2$ "
    f"train-valid: {chi_tv.statistic:.1f} (p={chi_tv.pvalue:.2f}), "
    f"train-test: {chi_tt.statistic:.1f} (p={chi_tt.pvalue:.2f}), "
    f"valid-test: {chi_vt.statistic:.1f} (p={chi_vt.pvalue:.2f}), "
)
fig.tight_layout()


In [None]:
# Wasserstein Distance calculated between probabilities of speechacts per dialogue

subset_names = ['train', 'valid', 'test']

# Add the per-dialogue speech-act distribution to every item (this mutates `m` in place)
for name in subset_names:
    for measurements in m[3]['no_persona_no_hist'][name]['allitem_measurements']:
        measurements['speechacts_normalized'] = normalize(measurements['speechacts'])

speechact_keys = MSC_SpeechAct.classes.keys()

def _wasserstein_or_default(values_1, values_2, default=-1):
    """Wasserstein distance between two samples, or `default` when either sample is empty."""
    # The distance is undefined for an empty sample, so both sides have to be checked.
    if len(values_1) == 0 or len(values_2) == 0:
        return default
    return wasserstein_distance(values_1, values_2)

# One chart per speech act (this used to reference an undefined name `speechacts`)
fig, axs = plt.subplots(nrows=len(speechact_keys), figsize=(9,3 * len(speechact_keys)), sharex=True)
num_bins = 20
bins = np.linspace(0, 1, num_bins+1)
x = np.arange(num_bins)

for speechact, ax in zip(speechact_keys, axs):
    # Per subset: the relative frequency of this speech act in every dialogue where it occurs
    data = {
        name: np.array([
            measurements['speechacts_normalized'][speechact]
            for measurements in m[3]['no_persona_no_hist'][name]['allitem_measurements']
            if speechact in measurements['speechacts_normalized']
        ])
        for name in subset_names
    }
    for i, name in enumerate(subset_names):
        hist = np.histogram(data[name], bins=bins)
        offset = 0.2 * i + 0.3
        # NOTE(review): dividing by 100 presumably assumes 100 dialogues per subset - confirm
        ax.bar(x + offset, height=hist[0]/100, width=0.2, label=f"{3}-{name}")

    wd_tv = _wasserstein_or_default(data['train'], data['valid'])
    wd_tt = _wasserstein_or_default(data['train'], data['test'])
    wd_vt = _wasserstein_or_default(data['valid'], data['test'])

    ax.set_title(f"Speechact {speechact}: WD(train-valid)={wd_tv:.4f}, WD(train-test)={wd_tt:.4f}, WD(valid-test)={wd_vt:.4f}")
    ax.legend()
    ax.set_xlim(0, num_bins)
    ax.set_xticks(np.arange(num_bins + 1))
    ax.set_xticklabels([f"{b:.2f}" for b in bins])



In [None]:
def speechact_sim(speechacts_1, speechacts_2):
    """
    Compare two dialogues by the relative frequency of their speech acts.

    Both arguments map speech act -> count; they are turned into distributions with
    normalize(). The result is the average of the squared differences between the two
    distributions, taken over all classes in MSC_SpeechAct.classes, so 0 means identical
    distributions and larger values mean less similar dialogues.

    NOTE(review): a function with the same name is also defined in an earlier cell;
    this definition shadows it.
    """
    dist_1 = normalize(speechacts_1)
    dist_2 = normalize(speechacts_2)
    diff = np.array([dist_1.get(speechact, 0) - dist_2.get(speechact, 0) for speechact in MSC_SpeechAct.classes.keys()])
    # Average over the classes that were actually compared, instead of over the
    # notebook-global `speechact_keys`, whose content depends on which cell ran last.
    sim = sum(diff * diff) / len(diff)

    return sim


In [None]:
# Test to visualize similarity between dialogues on item level
# (but this does not really make sense)

# Add the per-dialogue speech-act distribution to every item (mutates `m` in place)
for subset in ['train', 'valid', 'test']:
    all_measurements = m[3]['no_persona_no_hist'][subset]['allitem_measurements']
    for measurements in all_measurements:
        measurements['speechacts_normalized'] = normalize(measurements['speechacts'])

# NOTE(review): this rebinds the notebook-global `speechact_keys` to a two-element list
speechact_keys = ['A', 'E'] #, 'P', 'Q', 'R', 'S']
comparisons = [['train', 'valid'], ['train', 'test'], ['valid', 'test']]

# Grid with one row per speech act and one column per pair of subsets.
# NOTE(review): nothing is drawn on these axes in this cell; the results are only printed.
fig, axs = plt.subplots(nrows=len(speechact_keys), ncols=len(comparisons), figsize=(3 * len(comparisons),3 * len(speechact_keys)), sharex=True, sharey=True)

num_bins = 20
num_samples = 100
bins = np.linspace(0, 1, num_bins+1)
x = np.arange(num_bins)

for (speechact, comparison), ax in zip(itertools.product(speechact_keys, comparisons), axs.flatten()):

    # Take random samples of two sets and calculate similarity
    # NOTE(review): `speechact` is only printed; it does not affect the computed values.
    print(speechact, comparison)
    list_1 = [item_m['speechacts'] for item_m in m[3]['no_persona_no_hist'][comparison[0]]['allitem_measurements']]
    list_2 = [item_m['speechacts'] for item_m in m[3]['no_persona_no_hist'][comparison[1]]['allitem_measurements']]
    # Items are drawn with replacement and paired at random; the draw is not seeded
    data = np.array([
        speechact_sim(item_1, item_2)
        for item_1, item_2 in zip(random.choices(list_1, k=num_samples), random.choices(list_2, k=num_samples))
    ])
    # print(data)
    hist = np.histogram(data, bins=bins)
    print(hist)



## Measurements overview with Pandas Dataframe

In [11]:
# Collect all item measurements in a dataframe
# One row per item, with the columns of the item measurements plus session / variant / subset.

# Build one frame per (session, variant, subset) and concatenate once at the end,
# instead of growing `df` with pd.concat inside the loop.
item_frames = []
for session in subsets.keys():
    for variant in variants.keys():
        for subset in subsets[session]:
            try:
                items = m[session][variant][subset]["allitem_measurements"]
            except KeyError:
                # `m` may cover only part of the configured sessions / variants / subsets
                # (e.g. when it was loaded from the session-3 JSON file), so skip what is missing.
                continue
            subset_df = pd.DataFrame.from_dict(items)

            subset_df["session"] = int(session)
            subset_df["variant"] = variant
            subset_df["subset"] = subset

            item_frames.append(subset_df)

if not item_frames:
    raise ValueError("No item measurements found in `m` for the configured sessions, variants and subsets")

# ignore_index gives every row a unique label; without it each subset would restart at 0
df = pd.concat(item_frames, ignore_index=True)

df["session"] = df["session"].astype('int')
df.describe()


Unnamed: 0,session,dialog_id,turn_id,inputwords,inputsentences,labelwords,ref_self,ref_other,ref_context
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,3.0,894.026667,10.73,264.126667,11.73,24.39,0.0,0.0,0.6
std,0.0,1141.881543,0.941659,79.243226,0.941659,12.769551,0.0,0.0,0.490716
min,3.0,6.0,7.0,103.0,8.0,5.0,0.0,0.0,0.0
25%,3.0,184.75,11.0,206.0,12.0,15.75,0.0,0.0,0.0
50%,3.0,360.0,11.0,255.0,12.0,22.0,0.0,0.0,1.0
75%,3.0,992.0,11.0,303.0,12.0,30.0,0.0,0.0,1.0
max,3.0,3966.0,11.0,578.0,12.0,90.0,0.0,0.0,1.0


In [12]:
df.head()

Unnamed: 0,session,dialog_id,turn_id,convai_id,inputwords,inputsentences,labelwords,ref_self,ref_other,ref_context,variant,subset
0,3,56,7,train:ordered_4680,292,8,25,0,0,1,no_persona_no_hist,train
1,3,99,11,train:ordered_770,234,12,20,0,0,0,no_persona_no_hist,train
2,3,122,11,train:ordered_230,272,12,13,0,0,0,no_persona_no_hist,train
3,3,218,11,train:ordered_7371,307,12,34,0,0,1,no_persona_no_hist,train
4,3,231,11,train:ordered_3986,142,12,18,0,0,0,no_persona_no_hist,train


In [None]:
# Overview of statistics, by session and subset
# Number of items, and mean / standard deviation of the size measures, per (session, variant, subset)

df.groupby(["session", "variant", "subset"]).agg({
    'turn_id': ['count'],
    'inputwords': ['mean', 'std'],
    'inputsentences': ['mean', 'std'],
    'labelwords': ['mean', 'std'],
})

In [None]:
# Grid of histograms: one row per session, one column per variant
fig, ax = plt.subplots(ncols=3, nrows=5, figsize=(12, 12))

def plot_hist_bar(ax, values, session, title, bins, range):
    """
    Overlay one density histogram per subset of `session` on `ax`.

    values: one array per entry of subsets[session], in the same order.
    title: title for the axes.
    bins, range: forwarded to ax.hist.
    Returns `ax`.
    """
    shared_kwargs = dict(bins=bins, range=range, alpha=0.5, density=True)
    if session==5:
        # Session 5 gets an extra first series with an empty label and a single value of -1
        # (outside the range used by the caller), presumably so that the remaining subsets
        # keep the same position in the colour cycle as in the other sessions.
        ax.hist([-1], label='-', **shared_kwargs)
    for subset, vals in zip(subsets[session], values):
        mu, sigma = vals.mean(), vals.std()
        label = f"{subset}: " + r"$\mu$" + f"={mu:.0f}, " + r"$\sigma$" + f"={sigma:.0f}"
        ax.hist(vals, label=label, **shared_kwargs)
    ax.legend()
    ax.set_title(title)
    return ax

for j, variant in enumerate(variants.keys()):
    for i, session in enumerate(subsets.keys()):
        # Number of input words of every item, one array per subset of this session
        m_input = []
        for subset in subsets[session]:
            selection = (df["session"] == session) & (df["variant"] == variant) & (df["subset"] == subset)
            m_input.append(df.loc[selection, 'inputwords'].values)
        bar_axes = plot_hist_bar(
            ax[i][j], m_input, session,
            title=f"Session_{session}, {variant}\nn={len(m_input[0])}",
            bins=40, range=(0,2000),
        )

fig.suptitle(f"Distribution of number of inputwords (persona sentences, history, current dialogue)")
fig.tight_layout()

In [None]:
# One chart per variant, with one line per subset
fig, ax = plt.subplots(ncols=3, figsize=(12, 4), sharey=False, sharex=True)

def plot_mean_with_std(ax, values, subset):
    """
    Plot the mean of each array in `values` against the sessions that contain `subset`,
    with a shaded band of one standard deviation above and below the mean.

    values: one array per session that has `subset`, in session order.
    Returns `ax`.
    """
    x_sessions = [session_id for session_id, splits in subsets.items() if subset in splits]
    mu = np.array([v.mean() for v in values])
    sigma = np.array([v.std() for v in values])
    ax.plot(x_sessions, mu, lw=2, label=subset)
    ax.fill_between(x_sessions, mu + sigma, mu - sigma, alpha=0.2)
    return ax

for j, variant in enumerate(variants.keys()):
    for i, subset in enumerate(['train', 'valid', 'test']):
        # Only the sessions that actually have this subset
        sessions = [k for k, splits in subsets.items() if subset in splits]
        m_input = []
        for session in sessions:
            selection = (df["session"] == session) & (df["variant"] == variant) & (df["subset"] == subset)
            m_input.append(df.loc[selection, 'inputwords'].values)
        bar_axes = plot_mean_with_std(ax[j], m_input, subset)
    ax[j].legend()
    ax[j].set_title(f"Variant: {variant}")

fig.suptitle(f"Distribution of number of inputwords, per subset and variant")
fig.tight_layout()

In [None]:
# Grid with one row per subset and one column per variant
fig, ax = plt.subplots(ncols=3, nrows=3, figsize=(12, 12), sharey=True, sharex=True)

def plot_mean_with_std(ax, values, subset, title):
    """
    Plot the mean of each array in `values` against the sessions that contain `subset`,
    with a shaded band of one standard deviation above and below the mean, and put
    `title` on the axes.

    This redefines the three-argument version from the previous cell.
    Returns `ax`.
    """
    x_sessions = [session_id for session_id, splits in subsets.items() if subset in splits]
    mu = np.array([v.mean() for v in values])
    sigma = np.array([v.std() for v in values])
    ax.plot(x_sessions, mu, lw=2, label='mean')
    ax.fill_between(x_sessions, mu + sigma, mu - sigma, alpha=0.5)
    ax.legend()
    ax.set_title(title)
    return ax

for j, variant in enumerate(variants.keys()):
    for i, subset in enumerate(['train', 'valid', 'test']):
        # Only the sessions that actually have this subset
        sessions = [k for k, splits in subsets.items() if subset in splits]
        m_input = []
        for session in sessions:
            selection = (df["session"] == session) & (df["variant"] == variant) & (df["subset"] == subset)
            m_input.append(df.loc[selection, 'inputwords'].values)
        bar_axes = plot_mean_with_std(ax[i][j], m_input, subset, title=f"Subset: {subset}, {variant}")

fig.suptitle(f"Distribution of number of inputwords, per subset and variant")
fig.tight_layout()

In [None]:
# One chart per session that has the chosen subset, comparing the variants
subset = 'valid'
sessions = [session_id for session_id, splits in subsets.items() if subset in splits]
fig, ax = plt.subplots(ncols=1, nrows=len(sessions), figsize=(12, 12))

def plot_hist_bar(ax, values, session, title, bins, range):
    """
    Overlay one density histogram per variant on `ax`.

    values: one array per variant, in the order of variants.keys().
    session: not used by this version of the function.
    title: title for the axes.
    bins, range: forwarded to ax.hist.
    Returns `ax`.
    """
    for variant, vals in zip(variants.keys(), values):
        stats = r"$\mu$" + f"={vals.mean():.0f}, " + r"$\sigma$" + f"={vals.std():.0f}"
        ax.hist(vals, bins=bins, range=range, alpha=0.5, label=f"{variant}: " + stats, density=True)
    ax.legend()
    ax.set_title(title)
    return ax

for i, session in enumerate(sessions):
    # Number of input words of every item, one array per variant
    m_input = []
    for variant in variants.keys():
        selection = (df["session"] == session) & (df["variant"] == variant) & (df["subset"] == subset)
        m_input.append(df.loc[selection, 'inputwords'].values)
    bar_axes = plot_hist_bar(
        ax[i], m_input, session,
        title=f"Session_{session}/{subset}\nn={len(m_input[0])}",
        bins=100, range=(0,2000),
    )

fig.suptitle(f"Distribution of number of inputwords (persona sentences, history, current dialogue)")
fig.tight_layout()

### Test charts

In [None]:
# Quick histogram of the number of input words for one session / variant / subset
session=3
variant='persona_and_hist'
subset='train'

selection = (df["session"] == session) & (df["variant"] == variant)& (df["subset"] == subset)
ax = df[selection]["inputwords"].plot.hist(bins=10, alpha=0.5)

In [None]:
# Compare the variants for one session / subset in a single chart
session=5
subset='valid'

# One axes is enough here. Previously a column of len(subsets) axes was created and the
# histograms were drawn with plt.hist, which only uses the current axes, so the
# other axes stayed empty.
fig, ax = plt.subplots(figsize=(12, 4))

selection = (df["session"] == session) & (df["subset"] == subset)

for variant in variants.keys():
    subset_df = df.loc[selection & (df["variant"] == variant), "inputwords"]
    label = f"{variant}: "+ r"$\mu$" + f"={subset_df.mean():.0f}, " + r"$\sigma$" + f"={subset_df.std():.0f}"
    ax.hist(subset_df, alpha=0.5, label=label, density=True, bins=40)
ax.legend()
# n is the number of items of the last variant in the loop
ax.set_title(f"Distribution of number of inputwords per variant\nDataset: session_{session}/{subset}, n={len(subset_df)}")

In [None]:
# One density histogram of the number of input words per variant, side by side, for one session / subset
session=4
subset='train'

selection = (df["session"] == session) & (df["subset"] == subset)

df_hist = df[selection]
df_hist['inputwords'].hist(by=df_hist['variant'], bins=20, layout=(1,3), figsize=(12,3), density=True, sharey=True)

## Visualization with matplotlib

In [None]:
## Using matplotlib - horizontal
# One chart per session that has the chosen subset; the variants are passed to a single hist call

subset = 'valid'
sessions = [session_id for session_id, splits in subsets.items() if subset in splits]
fig, ax = plt.subplots(ncols=1, nrows=len(sessions), figsize=(12, 12))

def plot_hist_bar(ax, values, session, title, bins, range):
    """
    Draw the density histograms of all arrays in `values` on `ax` with one ax.hist call.

    values: one array per variant, in the order of variants.keys().
    session: not used by this version of the function.
    title: title for the axes.
    bins, range: forwarded to ax.hist.
    Returns `ax`.
    """
    labels = []
    for variant, vals in zip(variants.keys(), values):
        labels.append(f"{variant}: " + r"$\mu$" + f"={vals.mean():.0f}, " + r"$\sigma$" + f"={vals.std():.0f}")
    ax.hist(values, bins=bins, range=range, density=True, label=labels)
    ax.legend()
    ax.set_title(title)
    return ax

for i, session in enumerate(sessions):
    # Number of input words of every item, one array per variant
    m_input = []
    for variant in variants.keys():
        selection = (df["session"] == session) & (df["variant"] == variant) & (df["subset"] == subset)
        m_input.append(df.loc[selection, 'inputwords'].values)
    bar_axes = plot_hist_bar(
        ax[i], m_input, session,
        title=f"Session_{session}/{subset}\nn={len(m_input[0])}",
        bins=40, range=(0,2000),
    )

fig.suptitle(f"Distribution of number of inputwords (persona sentences, history, current dialogue)")
fig.tight_layout()

In [None]:
## Using matplotlib - horizontal
# One row per session: input word counts on the left, label word counts on the right

fig, ax = plt.subplots(ncols=2, nrows=len(subsets.keys()), figsize=(12, 12), gridspec_kw={'width_ratios': [10, 4]}, sharey=False)

def plot_hist_bar(ax, values, session, title, bins, range):
    """
    Draw the density histograms of all arrays in `values` on `ax` with one ax.hist call.

    values: list with one array per entry of subsets[session], in the same order.
    title: title for the axes.
    bins, range: forwarded to ax.hist.
    Returns `ax`.
    """
    labels = []
    for subset, vals in zip(subsets[session], values):
        labels.append(f"{subset}: n={len(vals)}, " + r"$\mu$" + f"={vals.mean():.0f}, " + r"$\sigma$" + f"={vals.std():.0f}")
    series = values
    if session==5:
        # Extra first series with an empty label and a value outside the plotted range
        labels = ['-'] + labels
        series = [-1] + values
    ax.hist(series, bins=bins, range=range, density=True, label=labels)
    ax.legend()
    ax.set_title(title)
    return ax

for i, session in enumerate(subsets.keys()):
    # The measurements store (value, frequency) pairs; expand them to one entry per sample
    m_input = [
        np.hstack([[val] * freq for val, freq in m[session]['no_persona_no_hist'][subset]['inputwords_per_sample']])
        for subset in subsets[session]
    ]
    # Same for the labels, leaving out the pairs whose value is 0
    m_label = [
        np.hstack([[val] * freq for val, freq in m[session]['no_persona_no_hist'][subset]['labelwords_per_sample'] if val != 0])
        for subset in subsets[session]
    ]
    bar_axes = plot_hist_bar(ax[i][0], m_input, session, title=f"Session={session}, input", bins=25, range=(0,500))
    bar_axes = plot_hist_bar(ax[i][1], m_label, session, title=f"Session={session}, label", bins=10, range=(0,60))

fig.suptitle("Distribution of number of words per input sentence (complete dialogue, except last utterance), and label (last utterance)")
fig.tight_layout()

### Other charts

In [None]:
## Using matplotlib - horizontal
# One chart per session with the number of input words per sample, for every subset

fig, ax = plt.subplots(ncols=1, nrows=len(subsets.keys()), figsize=(10, 12), sharex=True)

def plot_hist_bar(ax, values, session):
    """
    Draw the density histograms of all arrays in `values` on `ax` with one ax.hist call
    (30 bins over the range 0-500) and title the axes with the session number.

    values: list with one array per entry of subsets[session], in the same order.
    Returns `ax`.
    """
    labels = []
    for subset, vals in zip(subsets[session], values):
        labels.append(f"{subset}: n={len(vals)}, " + r"$\mu$" + f"={vals.mean():.0f}, " + r"$\sigma$" + f"={vals.std():.0f}")
    series = values
    if session==5:
        # Extra first series with an empty label and a value outside the plotted range
        labels = ['-'] + labels
        series = [-1] + values
    ax.hist(series, bins=30, range=(0,500), density=True, label=labels)
    ax.legend()
    ax.set_title(f"Session={session}")
    return ax

for i, session in enumerate(subsets.keys()):
    # The measurements store (value, frequency) pairs; expand them to one entry per sample
    values = [
        np.hstack([[val] * freq for val, freq in m[session]['no_persona_no_hist'][subset]['inputwords_per_sample']])
        for subset in subsets[session]
    ]
    bar_axes = plot_hist_bar(ax[i], values, session)

fig.suptitle("Distribution of number of words per input sentence (complete dialogue, except last utterance)")
fig.tight_layout()

In [None]:
# One chart per session with the number of words per label, for every subset
fig, ax = plt.subplots(ncols=1, nrows=len(subsets.keys()), figsize=(4,12), sharey=True)

def plot_hist_bar(ax, values, session):
    """
    Draw the density histograms of all arrays in `values` on `ax` with one ax.hist call
    (10 bins over the range 0-50) and title the axes with the session number.

    values: list with one array per entry of subsets[session], in the same order.
    Returns `ax`.
    """
    labels = []
    for subset, vals in zip(subsets[session], values):
        labels.append(f"{subset}: n={len(vals)}, " + r"$\mu$" + f"={vals.mean():.0f}, " + r"$\sigma$" + f"={vals.std():.0f}")
    series = values
    if session==5:
        # Extra first series with an empty label and a value outside the plotted range
        labels = ['-'] + labels
        series = [-1] + values
    ax.hist(series, bins=10, range=(0,50), density=True, label=labels)
    ax.legend()
    ax.set_title(f"Session={session}")
    return ax

for i, session in enumerate(subsets.keys()):
    values = []
    for subset in subsets[session]:
        per_sample = m[session]['no_persona_no_hist'][subset]['labelwords_per_sample']
        # Expand the (value, frequency) pairs to one entry per sample;
        # plot only for sentences that contain a fact (value != 0)
        values.append(np.hstack([[val] * freq for val, freq in per_sample if val != 0]))
    bar_axes = plot_hist_bar(ax[i], values, session)

fig.suptitle("Distribution of number of words per label sentence (next utterance)")
fig.tight_layout()

## Test to draw dialogue

In [21]:
# Take the first element (the dialogue text) of sample 10 of session 1 / train, variant with persona and history.
# NOTE(review): the recorded output of this cell is a KeyError for 'persona_and_hist'.
dialogue = msc_sessions[1]['persona_and_hist']['train'][10][0]
print(dialogue)

KeyError: 'persona_and_hist'

In [13]:
# Select dataset
session = 1
subset = 'test'
variant = 'no_persona_no_hist'

# Print and plot first dialogue
# NOTE(review): the sample index 140 is hard-coded; the recorded output of this cell is an
# "IndexError: list index out of range".
print("History:\n{}\nNext utterance:\n{}\n".format(*msc_sessions[session][variant][subset][140]))
msc_sessions[session][variant][subset].save_dialogue_fig(140)


IndexError: list index out of range

In [None]:
# Select dialogue
session = 1
subset = 'test'
variant = 'persona_and_hist'
dialog_index = 1

# Make plot
# The second argument "./" is presumably the output directory - confirm against save_dialogue_fig
msc_sessions[session][variant][subset].save_dialogue_fig(dialog_index, "./")

In [None]:
# Show the index record of sample 1 for the session / variant / subset selected in the previous cell
msc_sessions[session][variant][subset].indices[1]

In [None]:
import textwrap

# Markers that can start a line of a dialogue: index 0 and 1 are the two speaker prefixes
# of the 'default' config, index 2 is the session-break token.
prefixes = configs['default']['speaker_prefixes'] + [configs['default']['sessionbreak_token']]

# Vertical space reserved in the figure per wrapped text line and per turn
PER_LINE = 0.22 # inch
PER_TURN = 0.15 # inch

def split_speaker_and_text(turn):
    """
    Split a dialogue line into its leading marker and its text.

    turn: a string that starts with one of the entries of the global `prefixes`.
    Returns a tuple (prefix, lines), where `lines` is the rest of the string wrapped
    to lines of at most 45 characters.
    Raises AssertionError if the line starts with none of the prefixes.
    """
    for speaker in prefixes:
        if turn.startswith(speaker):
            return speaker, textwrap.wrap(turn[len(speaker):], width=45)
    # Raise explicitly: an `assert False` statement is removed when Python runs with -O,
    # in which case the function would silently return None.
    raise AssertionError(f"None of the speaker prefixes {prefixes} found in turn: {turn}")


def plot_dialogue(turns, next_utterance, title):
    """
    Draw a dialogue as a column of text boxes, similar to a chat window.

    turns: list of dialogue lines, each starting with one of the global `prefixes`.
    next_utterance: text of the utterance to predict; it is appended as a final turn
        with prefixes[1] and drawn with a dashed outline.
    title: title of the figure.
    Returns the matplotlib figure.
    """
    wrapped_turns = [split_speaker_and_text(t) for t in turns+ [prefixes[1] + next_utterance]]
    total_lines = sum([len(t[1]) for t in wrapped_turns])

    # Setup figure
    # The height grows with the number of turns and the number of wrapped text lines
    fig_height = 0.5 + len(wrapped_turns) * PER_TURN + total_lines * PER_LINE
    fig, ax = plt.subplots(figsize=(6, fig_height))
    fig.patch.set_facecolor('ghostwhite')

    # Determine triangle coordinates based on figure size
    # (y offsets are given in inches and divided by the figure height)
    triangle = np.array([[0.02, -0.05/fig_height], [0.05, -0.25/fig_height], [0.12, -0.25/fig_height]])

    ypos = 0.2 / fig_height 
    for i, (speaker, wrapped_turn) in enumerate(wrapped_turns):

        # Set alignment alternating left or right
        # prefixes index 0 -> left, 1 -> right, 2 (session break) -> centered
        speaker_index = prefixes.index(speaker)
        alignment = {0: 'left', 1: 'right', 2: 'center'}[speaker_index]
        xpos = {0: 0.05, 1: 0.95, 2: 0.5}[speaker_index]
        bbox_style = dict(
            boxstyle="round", 
            fc={0: 'antiquewhite', 1: 'antiquewhite', 2: 'lightsteelblue'}[speaker_index], 
            ec='tab:blue'
        )
        # The last turn is the next utterance: lighter fill and dashed outline
        if i == len(wrapped_turns) - 1:
            bbox_style['fc'] = 'floralwhite'
            bbox_style['linestyle'] = '--'

        # Plot the text
        text = ax.text(xpos, ypos, '\n'.join(wrapped_turn), 
            horizontalalignment=alignment,
            verticalalignment='top',
            wrap=True, 
            multialignment=alignment,
            bbox=bbox_style
        )

        # Increase ypos, for next utterance, depending on number of lines in current turn
        ypos += PER_TURN / fig_height + PER_LINE / fig_height * len(wrapped_turn)

        # Add speaker triangle, except for session breaks
        # (ypos has already been advanced, so the triangle is placed relative to the new position)
        if speaker_index != 2:
            # Plot triangle below utterance, pointing left or right depending on alignment
            if alignment == 'left':
                triangle_patch = matplotlib.patches.Polygon(np.array([[0, ypos]]) + triangle)
            else:
                # Mirror the triangle horizontally and anchor it at the right edge
                triangle_patch = matplotlib.patches.Polygon(np.array([[1, ypos]]) + triangle * np.array([[-1, 1]]))
            ax.add_patch(triangle_patch)

    # Final formatting
    # The y-axis is inverted so that increasing ypos runs from top to bottom
    ax.invert_yaxis()
    ax.set_title(title)
    plt.axis('off')
    return fig

# Select dialogue
session = 4
subset = 'valid'
variant = 'persona_and_hist'
dialog_index = 4
# Each sample unpacks into the dialogue text and the next utterance
dialogue, next_utterance = msc_sessions[session][variant][subset][dialog_index]

# Make plot
dialog_id = msc_sessions[session][variant][subset].indices[dialog_index]
title=f"Dataset: session_{session}/{subset}, dialog_id: {dialog_id['dialog_id']}\nvariant: {variant}"
fig = plot_dialogue(dialogue[:-1].split('\n'), next_utterance, title)  # remove trailing '\n' from dialogue
# print(dialogue)

### Analyse ngram frequency

In [None]:
from collections import Counter

# Reconfigure MSC_Session (called on the class, not on an instance) to use empty
# speaker prefixes for the n-gram analysis below.
# NOTE(review): presumably this keeps the "<other>"/"<self>" markers out of the
# utterances that get counted, and the setting stays active for every cell that
# is run afterwards (including cells that read `msc_sessions`) - confirm against
# the implementation of MSC_Session.set.
MSC_Session.set(speaker_prefixes=['', ''])

In [None]:
# Collect the target utterances (second element of every sample) for each
# session and subset, without persona or history in the input.
#   labels[session_number][subset_name] -> list of target utterances
option_name = 'no_persona_no_hist'
version = ['both', 'revised']

labels = {}
for session, session_subsets in subsets.items():
    # Session 1 is requested from the dataset under the name '1-both-revised';
    # all other sessions are requested by their number. Keeping the dataset
    # argument in its own variable leaves `session` as the numeric dict key,
    # so it never has to be parsed back out of a string.
    session_arg = '-'.join(['1'] + version) if session == 1 else session
    labels[session] = {}
    for subset in session_subsets:
        msc = MSC_Session(basedir=basedir, session=session_arg, subset=subset, **variants[option_name])
        labels[session][subset] = [msc[i][1] for i in range(len(msc))]

In [None]:
def get_ngrams_freq(n, sentence_list):
    """Count how often each word n-gram occurs over all sentences.

    Args:
        n: size of the n-grams (number of words).
        sentence_list: iterable of sentences (strings).

    Returns:
        A Counter mapping each n-gram (tuple of words, as produced by
        `get_ngrams`) to its total number of occurrences.
    """
    return Counter(
        ngram
        for sentence in sentence_list
        for ngram in get_ngrams(n, sentence)
    )

def get_ngrams(n, sentence):
    """Return all contiguous word n-grams of a sentence, in order of occurrence.

    The sentence is split on whitespace; every window of `n` consecutive
    words becomes one tuple.

    Args:
        n: size of the n-grams (number of words), expected to be >= 1.
        sentence: the sentence (string) to extract n-grams from.

    Returns:
        List of n-grams, each a tuple of `n` words. Empty if the sentence
        has fewer than `n` words.
    """
    words = sentence.split()
    # A sentence of k words contains k - n + 1 windows of n words; the upper
    # bound must include the window that ends on the last word. range() is
    # empty when the bound is <= 0, which covers sentences shorter than n.
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

def get_repeating_ngrams(n, sentence_list):
    """Find n-grams that occur more than once within a single sentence.

    Args:
        n: size of the n-grams (number of words).
        sentence_list: iterable of sentences (strings).

    Returns:
        List of (ngram, count) tuples sorted by descending count, where
        count is the number of sentences in which that n-gram is repeated.
        N-grams with equal counts keep the order in which they were first seen.
    """
    sentences_per_ngram = Counter()
    for sentence in sentence_list:
        occurrences_in_sentence = Counter(get_ngrams(n, sentence))
        # Each n-gram is counted at most once per sentence, however often it repeats
        sentences_per_ngram.update(
            ngram
            for ngram, occurrences in occurrences_in_sentence.items()
            if occurrences > 1
        )
    return sentences_per_ngram.most_common()

In [None]:
# Repeated 4-grams within the target utterances of session 4, train subset
all_targets = labels[4]['train']
duplicates = get_repeating_ngrams(4, all_targets)

# Cell output: (distinct repeating 4-grams per target, their number, the full list)
(
    len(duplicates) / len(all_targets),
    len(duplicates),
    duplicates,
)

In [None]:
# Per session and subset: number of distinct repeating 4-grams, absolute and
# relative to the number of target utterances in that subset
for session, session_subsets in subsets.items():
    for subset in session_subsets:
        duplicates = get_repeating_ngrams(4, labels[session][subset])
        print(f"session_{session:1}/{subset:6} : {len(duplicates):3d}   {len(duplicates)/len(labels[session][subset]):.2%}")

In [None]:
# Number of distinct n-grams that repeat within a single target utterance,
# for n = 1..5: one panel per session, one line per subset.
n_range = np.array([1, 2, 3, 4, 5])
# One column per session; squeeze=False + ravel() keeps `axs` a 1-D array of
# Axes even if `subsets` would contain a single session.
fig, axs = plt.subplots(ncols=len(subsets), figsize=(16, 3), sharey=True, sharex=True, squeeze=False)
axs = axs.ravel()
for i, session in enumerate(subsets.keys()):
    for subset in subsets[session]:
        num_duplicates = [len(get_repeating_ngrams(n, labels[session][subset])) for n in n_range]
        axs[i].plot(n_range, num_duplicates, label=subset)
    # Label each panel so the figure can be read without the code
    axs[i].set_title(f"session_{session}")
    axs[i].set_xlabel("n-gram size (n)")
    axs[i].set_xticks(n_range)
    axs[i].legend()
    axs[i].grid(axis='y', which='major')
# The y-axis is shared, so a single label on the leftmost panel suffices
axs[0].set_ylabel("# distinct repeating n-grams")


In [None]:
# Number of distinct n-grams that repeat within a single target utterance,
# divided by the number of target utterances in the subset, for n = 1..5:
# one panel per session, one line per subset.
n_range = np.array([1, 2, 3, 4, 5])
# One column per session; squeeze=False + ravel() keeps `axs` a 1-D array of
# Axes even if `subsets` would contain a single session.
fig, axs = plt.subplots(ncols=len(subsets), figsize=(16, 3), sharey=True, sharex=True, squeeze=False)
axs = axs.ravel()
for i, session in enumerate(subsets.keys()):
    for subset in subsets[session]:
        perc_duplicates = [len(get_repeating_ngrams(n, labels[session][subset])) / len(labels[session][subset]) for n in n_range]
        axs[i].plot(n_range, perc_duplicates, label=subset)
    # Label each panel so the figure can be read without the code
    axs[i].set_title(f"session_{session}")
    axs[i].set_xlabel("n-gram size (n)")
    axs[i].set_xticks(n_range)
    axs[i].legend()
    axs[i].grid(axis='y', which='major')
# The y-axis is shared, so a single label on the leftmost panel suffices
axs[0].set_ylabel("distinct repeating n-grams\nper target utterance")


### Some examples as input for ChatGPT

In [None]:
# Pick one sample and print its two parts: the dialogue (element 0) followed by
# the next utterance (element 1)
session = 5
variant = 'persona_and_hist'
subset = 'test'
dialog_id = 0  # used as positional index into the selected dataset
for field in (0, 1):
    print(msc_sessions[session][variant][subset][dialog_id][field])