In [1]:
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn import metrics

import json
import os
import pandas as pd

from util import gen_model_preds_df

# Load the dialogue-act JSON files for the two corpora.  The encoding is
# passed explicitly: JSON text is UTF-8, whereas open() without an
# encoding falls back to the platform's locale encoding, which can
# mis-decode non-ASCII characters on some systems.
with open('SWDA_dialogue-acts.json', encoding='utf-8') as f:
    swda_tags = json.load(f)
with open('AMI-DA_dialogue-acts.json', encoding='utf-8') as f:
    ami_tags = json.load(f)

## In-domain / cross-domain pre-training 
First we want to see how in-domain pre-training compares to cross-domain pre-training.

In [2]:
def report_metrics(frames, conditions, columns=('SWBD', 'AMI')):
    """Tabulate classification metrics for every condition and corpus.

    Parameters
    ----------
    frames : sequence of pandas.DataFrame
        One frame per corpus.  Each frame needs a ``'da_tag'`` column,
        which is passed to the scikit-learn scorers as the true labels,
        and one column per entry of ``conditions``, which is passed as
        the predicted labels.
    conditions : sequence of str
        Names of the prediction columns to score.
    columns : sequence of str, optional
        Output column label for each frame, in the same order as
        ``frames``.  Defaults to ``('SWBD', 'AMI')``.

    Returns
    -------
    pandas.DataFrame
        One row per (condition, metric) pair, with conditions as the
        outer index level, and one column per frame.

    Raises
    ------
    ValueError
        If ``columns`` does not supply exactly one label per frame.
    """
    frames = list(frames)
    conditions = list(conditions)
    columns = list(columns)
    if len(columns) != len(frames):
        raise ValueError(
            f'got {len(frames)} frame(s) but {len(columns)} column label(s)')

    # (row label, scorer) pairs; the order here fixes the row order
    # within each condition.
    scorers = [
        ('macro precision',
         lambda y_true, y_pred: metrics.precision_score(y_true, y_pred, average='macro')),
        ('macro recall',
         lambda y_true, y_pred: metrics.recall_score(y_true, y_pred, average='macro')),
        ('macro f1',
         lambda y_true, y_pred: metrics.f1_score(y_true, y_pred, average='macro')),
        # For single-label predictions, micro-averaged precision is the
        # fraction of correct predictions, i.e. plain accuracy, so call
        # the accuracy scorer directly.
        ('micro accuracy',
         lambda y_true, y_pred: metrics.accuracy_score(y_true, y_pred)),
    ]

    # One row per (condition, metric); one cell per frame.
    table = [
        [score(frame['da_tag'], frame[cond]) for frame in frames]
        for cond in conditions
        for _, score in scorers
    ]
    index = pd.MultiIndex.from_product(
        [conditions, [label for label, _ in scorers]])
    return pd.DataFrame(table, columns=columns, index=index)

In [3]:
# Prediction columns to compare: in-domain pre-training versus
# pre-training on the combined AMI + Switchboard corpus.
conditions = ['in-domain', 'AMI+SWBD']

# Switchboard (SWDA) models, one directory per pre-training corpus.
pre_corpora = ['SWBD-pre', 'AMI+SWBD-pre']
model_dirs = [
    f'../../models/SWDA-L_bert_{corpus}_2019-12-03/'
    for corpus in pre_corpora
]
dfs = gen_model_preds_df('SWDA', conditions, model_dirs)

# AMI models, one directory per pre-training corpus.
pre_corpora = ['AMI-pre', 'AMI+SWBD-pre']
model_dirs = [
    f'../../models/AMI-DA-L_bert_{corpus}_2019-12-03/'
    for corpus in pre_corpora
]
dfa = gen_model_preds_df('AMI-DA', conditions, model_dirs)
# Keep only the AMI rows whose 'da_tag' is not null.
dfa = dfa.dropna(subset=['da_tag'])

report_metrics([dfs, dfa], conditions)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Unnamed: 1,SWBD,AMI
in-domain,macro precision,0.562046,0.523854
in-domain,macro recall,0.417171,0.453892
in-domain,macro f1,0.45476,0.465609
in-domain,micro accuracy,0.770237,0.686575
AMI+SWBD,macro precision,0.589477,0.543774
AMI+SWBD,macro recall,0.441669,0.469115
AMI+SWBD,macro f1,0.477812,0.487214
AMI+SWBD,micro accuracy,0.773529,0.68584


Pre-training on the combined corpus offers a modest improvement in macro-averaged F1 over simple in-domain pre-training, though there is little effect on micro-averaged performance.
This indicates that pre-training on the larger, more diverse dataset is helpful for classification of lower-frequency tags.
A larger pre-training corpus may lead to more significant gains.

In [4]:
# Prediction columns to compare; each entry pairs with the model
# directory at the same position in the lists below.
conditions = [
    'in-domain',
    'standard',
    'in-domain_frozen',
    'standard_frozen',
]

# Switchboard (SWDA) model directories.
model_dirs = [
    '../../models/SWDA-L_bert_SWBD-pre_2019-12-03/',
    '../../models/SWDA-L_bert_2019-11-20',
    '../../models/SWDA-L_bert_SWBD-pre_frozen_2019-12-03',
    '../../models/SWDA-L_bert_frozen_2019-11-20',
]
dfs = gen_model_preds_df('SWDA', conditions, model_dirs)

# AMI model directories.
model_dirs = [
    '../../models/AMI-DA-L_bert_AMI-pre_2019-12-03/',
    '../../models/AMI-DA-L_bert_2019-11-20',
    '../../models/AMI-DA-L_bert_AMI-pre_frozen_2019-12-03',
    '../../models/AMI-DA-L_bert_frozen_2019-11-20',
]
dfa = gen_model_preds_df('AMI-DA', conditions, model_dirs)
# Keep only the AMI rows whose 'da_tag' is not null.
dfa = dfa.dropna(subset=['da_tag'])

report_metrics([dfs, dfa], conditions)

Unnamed: 0,Unnamed: 1,SWBD,AMI
in-domain,macro precision,0.562046,0.523854
in-domain,macro recall,0.417171,0.453892
in-domain,macro f1,0.45476,0.465609
in-domain,micro accuracy,0.770237,0.686575
standard,macro precision,0.561119,0.587054
standard,macro recall,0.43047,0.483137
standard,macro f1,0.459891,0.500326
standard,micro accuracy,0.769267,0.669527
in-domain_frozen,macro precision,0.077394,0.270771
in-domain_frozen,macro recall,0.077295,0.174683


The effect of additional pre-training is mixed. For AMI it appears that in-domain pre-training offers a modest performance boost, but there is no discernible effect in the case of Switchboard. Indeed, when BERT is frozen during fine-tuning, the model that received no additional pre-training performs better by more than 3 percentage points.

In [5]:
# Prediction columns to compare; each entry pairs with the model
# directory at the same position in the lists below.
conditions = [
    'addl-pre',
    'standard',
    'addl-pre-NL',
    'standard-NL',
]

# Switchboard (SWDA) model directories: the "-L" and "-NL" variants.
model_dirs = [
    '../../models/SWDA-L_bert_SWBD-pre_2019-12-03/',
    '../../models/SWDA-L_bert_2019-11-20',
    '../../models/SWDA-NL_bert_SWBD-pre_2019-12-03/',
    '../../models/SWDA-NL_bert_2019-11-20',
]
dfs = gen_model_preds_df('SWDA', conditions, model_dirs)

# AMI model directories: the "-L" and "-NL" variants.
model_dirs = [
    '../../models/AMI-DA-L_bert_AMI-pre_2019-12-03/',
    '../../models/AMI-DA-L_bert_2019-11-20',
    '../../models/AMI-DA-NL_bert_AMI-pre_2019-12-03/',
    '../../models/AMI-DA-NL_bert_2019-11-20',
]
dfa = gen_model_preds_df('AMI-DA', conditions, model_dirs)
# Keep only the AMI rows whose 'da_tag' is not null.
dfa = dfa.dropna(subset=['da_tag'])

report_metrics([dfs, dfa], conditions)

Unnamed: 0,Unnamed: 1,SWBD,AMI
addl-pre,macro precision,0.562046,0.523854
addl-pre,macro recall,0.417171,0.453892
addl-pre,macro f1,0.45476,0.465609
addl-pre,micro accuracy,0.770237,0.686575
standard,macro precision,0.561119,0.587054
standard,macro recall,0.43047,0.483137
standard,macro f1,0.459891,0.500326
standard,micro accuracy,0.769267,0.669527
addl-pre-NL,macro precision,0.56016,0.55105
addl-pre-NL,macro recall,0.4332,0.456565
