In [None]:
import os
import re
import sys
import random
import time
import json
import farmhash # https://github.com/veelion/python-farmhash
import numpy as np
import pandas as pd
import torch
import warnings
from collections import defaultdict

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed: integer passed unchanged to each library's seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)
    
def k_folds_partition(data, hash_column = 'text', n_splits = 10, seed = 43):
    """Assign each row to a fold by hashing one of its columns.

    The fold of a row depends only on the value in `hash_column` and on
    `seed`, so identical values always land in the same fold.

    Args:
        data: DataFrame containing `hash_column`.
        hash_column: column whose values are hashed.
        n_splits: number of folds; the hash is reduced modulo this value.
        seed: seed given to the hash function; also used to re-seed the
            global random number generators via `set_seed`.

    Returns:
        Series aligned with `data` holding the fold number of every row.
    """
    set_seed(seed)
    frame = data.copy()  # work on a copy; the caller's frame is not touched

    def seeded_hash(value):
        return farmhash.hash64withseed(value, seed)

    hashed = frame[hash_column].apply(seeded_hash)
    return np.abs(hashed % n_splits)

def remove_tabs_newlines(x):
    """Delete every newline, tab and carriage return from `x`.

    The characters are removed outright (not replaced by a space), so the
    text on either side of them is joined together.
    """
    line_breaks_and_tabs = re.compile(r"[\n\t\r]*")
    return line_breaks_and_tabs.sub("", x)
def remove_multi_spaces(x):
    """Collapse every run of two or more whitespace characters into one space."""
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", x)
def remove_punctuation(text):
    """Replace each run of characters other than ASCII letters/digits with one space.

    Spaces themselves are part of such runs, so the result never contains two
    consecutive spaces, but it can start or end with a single space.
    """
    return re.sub('[^A-Za-z0-9]+', ' ', text)

In [None]:
import sys
sys.path.append("../311_text_classifier/src")
from models.evaluate import get_metrics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and other libraries.
warnings.filterwarnings("ignore")

In [None]:
# Show up to 999 rows/columns when displaying frames.
# The fully-qualified "display.*" keys are required: pandas treats option names
# as patterns, and the bare "max_rows"/"max_columns" also match the
# "styler.render.*" options in newer pandas, which raises OptionError.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [None]:
# Global seed: used here to seed the RNGs and later as the hash seed
# for the fold partitioning and in the written metadata.
seed = 43
set_seed(seed)

In [None]:
# Prep data for D2 task

# d1_files = [
#     "/hub/data_unprocessed/311/ManholeComplainD2POC.csv",
#     "/hub/311_text_classifier/data/raw/Qualifiers.csv"
# ]

# qualifiers = {
#     'Sign Types':['Stop Sign', 'Yield Sign', 'One Way Sign', 'Parking Sign',
#                   'Speed Limit Sign','Pedestrian Sign','Unknown Sign',
#                   'Advance Traffic Control Sign','Public Safety Sign',
#                   'Street Name Sign','Unspecified Sign','Other Sign'],
#     'Locations':['Unspecified Location','In Ditch','On Sidewalk','In Road',
#                  'In Traffic Lane','In Park','Path','In Yard','In Water',
#                  'Roadside','Parking Lot','On Parking Strip','In Median',
#                  'On Private Property','On Shoulder','In Bike Lane','In Street Gutter',
#                  'Sight Obstruction','In Building','In Driveway','Unknown Location',
#                  'Alley','Vacant Lot','Trail','Curb'],
#     'Hazards':['Ice Hazard','Electrical Hazard'],
#     'Graffiti Tags':['Gang','Ideological'],
#     'Vehicle Types':['VehicleType:Bicycle'],
#     'Manhole Complaints':['Cover Missing', 'Sunken Manhole', 'Displaced Cover',
#                           'Damaged Cover', 'Concrete Issue', 'Insecure Cover',
#                           'Raised Manhole', 'Structure Damage', 'Noisy Cover']
# }
# qualifier_map = {v:k for k in qualifiers.keys() for v in qualifiers[k]}

# data = []
# for file in d1_files:
#     temp=pd.read_csv(file)
#     temp.columns = ['text', 'category']
#     data.append(temp)
# data = pd.concat(data, axis=0)
# data['daupler_generated'] = np.nan
# data['qualifier'] = data['category'].map(qualifier_map)

# data.to_csv(f"/hub/311_text_classifier/data/raw/PW-D2-{date_tag}-PROD.csv", index=False)

In [None]:
# Separate D2 data into individual qualifier files

# task = 'D2'
# date_tag = "2021_04_08"
# file = f"/hub/311_text_classifier/data/raw/PW-{task}-{date_tag}-PROD.csv"

# # Break D2 task into files by qualifier
# data = pd.read_csv(file)

# d2 = {
#     'Manhole Complaints':'MANC',
#     'Locations':'LOC', 
#     'Hazards':'HAZ', 
#     'Sign Types':'SIGNT', 
#     'Graffiti Tags':'GTAG', 
#     'Vehicle Types':'VEHT'
# }

# qual = {}
# for qualifier in data.qualifier.unique():
#     qual[d2[qualifier]] = data[data['qualifier']==qualifier].copy().reset_index(drop=True)
    
# for key in qual.keys():
#     print(key)
#     print(qual[key].shape)
#     print(qual[key]['category'].nunique())
#     display(qual[key].head(1))
    
    

# # dict_keys(['MANC', 'LOC', 'HAZ', 'SIGNT', 'GTAG', 'VEHT'])
    
# for key in qual.keys():
#     task = key
#     file = f"/hub/311_text_classifier/data/raw/PW-{task}-{date_tag}-PROD.csv"
#     print(file)
#     qual[key].to_csv(file, index=False)

### Data with punctuation removed

In [None]:
tasks = ['D0', 'D1', 'D2', 'MANC', 'LOC', 'HAZ', 'SIGNT', 'GTAG', 'VEHT']
fold = 8
date_tag = "2021_04_08"

# Columns written to the TSV files for each task: D0/D1 keep the record ids,
# D2 and the per-qualifier tasks keep the qualifier column instead.
# Built once here instead of on every loop iteration.
id_cols = ['text', 'category', 'internal_id', 'external_id']
qualifier_cols = ['text', 'category', 'qualifier']
out_cols = {
    'D0': id_cols,
    'D1': id_cols,
    'D2': qualifier_cols,
    'MANC': qualifier_cols,
    'LOC': qualifier_cols,
    'HAZ': qualifier_cols,
    'SIGNT': qualifier_cols,
    'GTAG': qualifier_cols,
    'VEHT': qualifier_cols,
}

# For every task: clean the raw text, split it into folds by text hash and
# write train / train-dev / dev / test TSVs plus a metadata.json.
# `seed` is the notebook-level seed defined in an earlier cell.
for task in tasks:
    data_dir = f"/hub/CA-MTL/data/{task}"
    file = f"/hub/311_text_classifier/data/raw/PW-{task}-{date_tag}-PROD.csv"

    out_dir = f"{data_dir}/{date_tag}_WO_PUNCT"
    train_file_out = f"{out_dir}/train.tsv"
    train_dev_file_out = f"{out_dir}/train-dev.tsv"
    dev_file_out = f"{out_dir}/dev.tsv"
    test_file_out = f"{out_dir}/test.tsv"
    metadata_file_out = f"{out_dir}/metadata.json"

    metadata = dict(
        raw_data_file = file,
        data_version = date_tag,
        task_name = task,
        file_paths = {
            'train':train_file_out,
            'train-dev':train_dev_file_out,
            'dev':dev_file_out,
            'test':test_file_out
        },
        partition_rules = [
            f'external/daupler together; 10-fold split; seed = {seed}; selected fold = {fold}',
            'selected fold used for train_dev, dev, and test as placeholders'
        ]
    )

    # Create the output folder together with any missing parents. An existing
    # folder is fine; any other failure (e.g. permissions) is raised instead
    # of being reported as "already exists".
    os.makedirs(out_dir, exist_ok=True)

    # Read data and remove all tabs, multi-spaces, new lines and punctuation
    data = pd.read_csv(file)

    data['text'] = data['text'].apply(remove_tabs_newlines)
    data['text'] = data['text'].apply(remove_multi_spaces)
    data['text'] = data['text'].apply(remove_punctuation)
    data = data.drop_duplicates('text').reset_index(drop=True)

    # Handle remapping tasks
    if task == 'D0':
        data = data[data['category'] != 'Fire'].reset_index(drop=True)
        data['category'] = np.where(
            data['category']=='Parks', 'Defer', data['category'])
    elif task == 'D1':
        data['category'] = np.where(
            data['category']=='Water Meter Issue', 'Meter Issue', data['category'])

    # Partition the data: every fold except `fold` is training data; the
    # selected fold is written three times (train-dev, dev and test).
    data['partition'] = k_folds_partition(
        data, hash_column = 'text', n_splits = 10, seed = seed)

    train_condition = data['partition']!=fold
    train = data[train_condition].reset_index(drop=True)
    train_dev = data[~train_condition].reset_index(drop=True)
    dev = data[~train_condition].reset_index(drop=True)
    test = data[~train_condition].reset_index(drop=True)

    # Generate Metadata
    metadata['labels'] = data['category'].sort_values().unique().tolist()

    # Write data
    splits = [
        (train, train_file_out),
        (train_dev, train_dev_file_out),
        (dev, dev_file_out),
        (test, test_file_out),
    ]
    for split_frame, split_path in splits:
        split_frame[out_cols[task]].to_csv(split_path, sep='\t', index=False)

    # Context manager so the metadata file is flushed and closed right away.
    with open(metadata_file_out, 'w') as metadata_fh:
        json.dump(metadata, metadata_fh)

```
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/mock_models \
--tasks D0 D1 MANC LOC SIGNT \
--overwrite_cache \
--task_data_folders D0/2021_04_08_WO_PUNCT D1/2021_04_08_WO_PUNCT MANC/2021_04_08_WO_PUNCT LOC/2021_04_08_WO_PUNCT SIGNT/2021_04_08_WO_PUNCT \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 1500 \
--save_total_limit 1 \
--seed 43
```

### Additional task data
- Water
- Sewer
- Defer
- Code Enforcement

In [None]:
tasks = ['Water', 'Sewer', 'Defer', 'Code Enforcement']
fold = 8
date_tag = "2021_04_08"
parent_files = []

# Read D0 data and process
high_task = 'D0'
file = f"/hub/311_text_classifier/data/raw/PW-{high_task}-{date_tag}-PROD.csv"
parent_files.append(file)
data = pd.read_csv(file)

data['text'] = data['text'].apply(remove_tabs_newlines)
data['text'] = data['text'].apply(remove_multi_spaces)
data = data.drop_duplicates('text').reset_index(drop=True)

# Handle remapping tasks
if high_task == 'D0':
    data = data[data['category'] != 'Fire'].reset_index(drop=True)
    data['category'] = np.where(
        data['category']=='Parks', 'Defer', data['category'])

# Break data up into sub-tasks: one frame of D0 rows per D0 category
cat_data = {}
for cat in tasks:
    cat_data[cat] = data[data.category == cat].copy().reset_index(drop=True)
    print(f"{cat}: {cat_data[cat].shape}")

# Read D1 data and process
high_task = 'D1'
file = f"/hub/311_text_classifier/data/raw/PW-{high_task}-{date_tag}-PROD.csv"
parent_files.append(file)

# Read data and remove all tabs, multi-spaces, and new lines
data = pd.read_csv(file)

data['text'] = data['text'].apply(remove_tabs_newlines)
data['text'] = data['text'].apply(remove_multi_spaces)
data = data.drop_duplicates('text').reset_index(drop=True)

# Partition Data (`seed` is the notebook-level seed defined in an earlier cell)
data['partition'] = k_folds_partition(
    data, hash_column = 'text', n_splits = 10, seed = seed)

# Handle remapping tasks
if high_task == 'D1':
    data['category'] = np.where(
        data['category']=='Water Meter Issue', 'Meter Issue', data['category'])

# Select data for task based on D0 groupings: keep the D1 rows whose cleaned
# text also appears under the given D0 category.
d1_cat_data = {}
for cat in tasks:
    cat_text = cat_data[cat].text.to_list()
    d1_cat_data[cat] = data[data.text.isin(cat_text)]

metadata_review = []
out_cols = ['text',
            'category',
            'internal_id',
            'external_id']

# Write out new tasks
for task in tasks:
    # Folder/task name: upper-cased with spaces removed, e.g. 'CODEENFORCEMENT'
    task_name = task.upper().replace(' ', '')
    data_dir = f"/hub/CA-MTL/data/{task_name}"

    out_dir = f"{data_dir}/{date_tag}"
    train_file_out = f"{out_dir}/train.tsv"
    train_dev_file_out = f"{out_dir}/train-dev.tsv"
    dev_file_out = f"{out_dir}/dev.tsv"
    test_file_out = f"{out_dir}/test.tsv"
    metadata_file_out = f"{out_dir}/metadata.json"

    metadata = dict(
        raw_data_file = parent_files,
        data_version = date_tag,
        task_name = task_name,
        file_paths = {
            'train':train_file_out,
            'train-dev':train_dev_file_out,
            'dev':dev_file_out,
            'test':test_file_out
        },
        partition_rules = [
            f'external/daupler together; 10-fold split; seed = {seed}; selected fold = {fold}',
            'selected fold used for train_dev, dev, and test as placeholders',
            'task category selected from full dataset'
        ]
    )
    metadata_review.append(metadata)

    # Create the task folder and the dated sub-folder in one call. Existing
    # folders are fine; any other failure is raised instead of being
    # reported as "already exists".
    os.makedirs(out_dir, exist_ok=True)

    # Every fold except `fold` is training data; the selected fold is written
    # three times (train-dev, dev and test).
    task_frame = d1_cat_data[task]
    train_condition = task_frame['partition']!=fold
    train = task_frame[train_condition].reset_index(drop=True)
    train_dev = task_frame[~train_condition].reset_index(drop=True)
    dev = task_frame[~train_condition].reset_index(drop=True)
    test = task_frame[~train_condition].reset_index(drop=True)

    # Generate Metadata (the same dict object is already in metadata_review)
    metadata['labels'] = task_frame['category'].sort_values().unique().tolist()

    # Write data
    splits = [
        (train, train_file_out),
        (train_dev, train_dev_file_out),
        (dev, dev_file_out),
        (test, test_file_out),
    ]
    for split_frame, split_path in splits:
        split_frame[out_cols].to_csv(split_path, sep='\t', index=False)

    # Context manager so the metadata file is flushed and closed right away.
    with open(metadata_file_out, 'w') as metadata_fh:
        json.dump(metadata, metadata_fh)

# Review: task name and number of labels written for each new task
for review in metadata_review:
    print(review['task_name'])
    print(len(review['labels']))

```
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/mock_models \
--tasks D0 D1 MANC LOC SIGNT WATER SEWER DEFER CODEENFORCEMENT \
--overwrite_cache \
--task_data_folders D0/2021_04_08 D1/2021_04_08 MANC/2021_04_08 LOC/2021_04_08 SIGNT/2021_04_08 WATER/2021_04_08 SEWER/2021_04_08 DEFER/2021_04_08 CODEENFORCEMENT/2021_04_08  \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 1500 \
--save_total_limit 1 \
--seed 43
```

# Run

```
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/mock_models \
--tasks D0 D1 MANC LOC SIGNT \
--overwrite_cache \
--task_data_folders D0/2021_04_08_WO_PUNCT D1/2021_04_08_WO_PUNCT MANC/2021_04_08_WO_PUNCT LOC/2021_04_08_WO_PUNCT SIGNT/2021_04_08_WO_PUNCT \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 1500 \
--save_total_limit 1 \
--seed 43
```

```
python run_inference.py \
--model_name_or_path /hub/CA-MTL/mock_models/vital-smoke-40-9000 \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/data/SCORED \
--overwrite_cache \
--task_data_folders TOSCORE \
--do_predict
```

# Move data

In [None]:
# Move dev set to output folder

# Evaluate 10-Fold Results

Set up dictionary of files for data loading

In [None]:
# Folder that holds the prediction files of all runs.
model_dir = "/hub/CA-MTL/mock_models/"
# Names of the runs to evaluate.
# NOTE(review): presumably one run per held-out fold of the 10-fold split — confirm.
runs = [
    'vibrant-river-1',
    'colorful-elevator-2',
    'super-deluge-3',
    'elated-wave-5',
    'magic-lake-6',
    'neat-wind-7',
    'faithful-music-8',
    'winter-bush-9',
    'stellar-paper-10',
    'still-glade-11'
]
# Names of everything directly inside model_dir; filtered per run below.
files = os.listdir(model_dir)

In [None]:
# Index the prediction files as run -> task -> partition -> [file names].
# A file belongs to a run when its name contains the run name and starts
# with "<task>_<partition>".
run_files = defaultdict(defaultdict)
for run in runs:
    for d in ['D0', 'D1']:
        run_files[run][d] = {
            partition: [
                file for file in files
                if (run in file) and file.startswith(f"{d}_{partition}")]
            for partition in ['test', 'dev']
        }

Load data for each partition run

In [None]:
# For every run and task, take the first matching "dev" file as the base
# frame and attach the `prediction` column of the first matching "test" file
# (aligned by row position).
run_data = defaultdict(defaultdict)
for run in runs:
    for d in ['D0', 'D1']:
        dev_name = run_files[run][d]['dev'][0]
        test_name = run_files[run][d]['test'][0]
        labelled = pd.read_csv(f"{model_dir}/{dev_name}", sep="\t")
        predicted = pd.read_csv(f"{model_dir}/{test_name}", sep="\t")
        labelled['prediction'] = predicted['prediction']
        labelled['run'] = run
        run_data[run][d] = labelled

Append data into a single frame for metric analysis

In [None]:
# Stack the per-run frames of each task into one frame with a fresh index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0 and
# re-copied the growing frame on every iteration.
d0_data = pd.concat(
    [run_data[run]['D0'] for run in run_data.keys()], ignore_index=True)
d1_data = pd.concat(
    [run_data[run]['D1'] for run in run_data.keys()], ignore_index=True)

##### Calculate metrics across all folds

In [None]:
# Metrics over the pooled predictions of all runs, one entry per task.
metrics = {
    level: get_metrics(
        data=pooled, true_y="category", predicted_y="prediction",
        labels=pooled["category"].unique().tolist())
    for level, pooled in (('D0', d0_data), ('D1', d1_data))
}

In [None]:
metrics['D0']

In [None]:
metrics['D1']

##### Calculate metrics by fold and plot


In [None]:
d0_data.head()

In [None]:
# Metrics per task and run: run_metrics[task][run] -> get_metrics output
# computed on that run's rows only.
run_metrics = {'D0': {}, 'D1': {}}
for run in runs:
    for level, pooled in (('D0', d0_data), ('D1', d1_data)):
        run_rows = pooled[pooled['run'] == run].copy()
        run_metrics[level][run] = get_metrics(
            data=run_rows, true_y="category", predicted_y="prediction",
            labels=run_rows["category"].unique().tolist())

In [None]:
# Long-format frame for plotting: one row per (task, run) holding the `f1`
# value found in row 0 of that run's metrics table.
# NOTE(review): row 0 is presumably the overall/weighted row of get_metrics — confirm.
d0_f1 = [run_metrics['D0'][run].loc[0, 'f1'] for run in runs]
d1_f1 = [run_metrics['D1'][run].loc[0, 'f1'] for run in runs]

# Built directly; the former DataFrame.append call no longer exists in pandas 2.0.
plot_metrics = pd.DataFrame({
    "wghtd_f1": d0_f1 + d1_f1,
    "D": ["D0"]*len(runs) + ["D1"]*len(runs)})

In [None]:
# Distribution of the per-run weighted F1 for D0 and D1.
# The title previously duplicated the "Daupler vs External" title of the
# by-source plot further down; this figure is not split by source.
f = plt.figure(figsize=(15, 8))
f.suptitle("Weighted F1 Performance by Fold: D0 vs D1\n(CA-MTL: tinyBERT)",
           fontsize=18)
gs = f.add_gridspec(1, 1)
f.patch.set_facecolor('white')
with sns.axes_style("whitegrid"):
    ax = f.add_subplot(gs[0, 0])
    # Draw on the axes created above rather than relying on the current axes.
    sns.violinplot(x="D", y="wghtd_f1", data=plot_metrics, ax=ax)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

##### Calculate metrics by fold and source and plot

In [None]:
# Flag rows whose external_id contains 'daupler' (1) versus all others (0).
# na=False makes missing ids count as "not daupler"; without it str.contains
# returns NaN for them, which np.where treats as true.
d0_data['daupler_generated'] = np.where(
    d0_data['external_id'].str.contains('daupler', na=False), 1, 0)
d1_data['daupler_generated'] = np.where(
    d1_data['external_id'].str.contains('daupler', na=False), 1, 0)

In [None]:
# Metrics per task, run and data source:
#   source_metrics[task][run]['dau'] -> rows flagged daupler_generated == 1
#   source_metrics[task][run]['ext'] -> all other rows
# Each task's subsets are sliced from that task's own frame. (The D1 external
# subset used to be taken from d0_data with a d1_data mask.)
task_frames = {'D0': d0_data, 'D1': d1_data}
source_metrics = {level: defaultdict(defaultdict) for level in task_frames}
for run in runs:
    for level, pooled in task_frames.items():
        in_run = pooled['run'] == run
        is_daupler = pooled['daupler_generated'] == 1
        subsets = {
            'dau': pooled[in_run & is_daupler].copy(),
            'ext': pooled[in_run & ~is_daupler].copy(),
        }
        for source, subset in subsets.items():
            source_metrics[level][run][source] = get_metrics(
                data=subset, true_y="category", predicted_y="prediction",
                labels=subset["category"].unique().tolist())

In [None]:
# Long-format frame for plotting: one row per (task, source, run) holding the
# `f1` value found in row 0 of the corresponding metrics table.
# NOTE(review): row 0 is presumably the overall/weighted row of get_metrics — confirm.
source_frames = []
for source in ['ext', 'dau']:
    d0_f1 = [source_metrics['D0'][run][source].loc[0, 'f1'] for run in runs]
    d1_f1 = [source_metrics['D1'][run][source].loc[0, 'f1'] for run in runs]

    source_frames.append(pd.DataFrame({
        "wghtd_f1": d0_f1 + d1_f1,
        "D": [f"D0_{source}"]*len(runs) + [f"D1_{source}"]*len(runs)}))

# One concat at the end; DataFrame.append was removed in pandas 2.0.
plot_metrics = pd.concat(source_frames, ignore_index=True)

In [None]:
# Distribution of the per-run weighted F1, one violin per task/source group.
f = plt.figure(figsize=(15, 8))
f.patch.set_facecolor('white')
f.suptitle(
    "Weighted F1 Performance on Daupler vs External Data\n(CA-MTL: tinyBERT)",
    fontsize=18)
gs = f.add_gridspec(1, 1)
with sns.axes_style("whitegrid"):
    ax = f.add_subplot(gs[0, 0])
    sns.violinplot(data=plot_metrics, x="D", y="wghtd_f1")
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(fontsize=16)

# Evaluate Qualifier Task Runs

In [None]:
# Folder that holds the prediction files of all runs.
model_dir = "/hub/CA-MTL/mock_models/"
# Run name -> tasks whose prediction files are loaded for that run.
# Note: `runs` was a list in the 10-fold section above and is a dict from here on.
runs = {
    'stellar-paper-10': ['D0', 'D1'],
    'kind-field-12': ['D0', 'D1', 'D2'],
    'lilac-dream-14': ['D0', 'D1', 'MANC', 'LOC', 'SIGNT'],
    'happy-monkey-15': ['D1', 'MANC', 'LOC', 'SIGNT'],
    'dandy-gorge-16': ['D0', 'MANC', 'LOC', 'SIGNT'],
    'cool-puddle-17': ['D0', 'D1', 'MANC', 'LOC', 'SIGNT'],
    'bright-yogurt-18': ['D0', 'D1'],
    }
# Names of everything directly inside model_dir; filtered per run below.
files = os.listdir(model_dir)

In [None]:
# Map every run/task to its prediction files:
# run -> task -> partition -> [file names containing the run name and
# starting with "<task>_<partition>"].
run_files = defaultdict(defaultdict)
for run, run_tasks in runs.items():
    for task in run_tasks:
        run_files[run][task] = {
            partition: [
                file for file in files
                if (run in file) and file.startswith(f"{task}_{partition}")]
            for partition in ['test', 'dev']
        }

# Read data: the first "dev" file is the base frame; the `prediction` column
# comes from the first "test" file (aligned by row position).
run_data = defaultdict(defaultdict)
for run, task_files in run_files.items():
    for task, partition_files in task_files.items():
        dev_path = f"{model_dir}{partition_files['dev'][0]}"
        test_path = f"{model_dir}{partition_files['test'][0]}"
        data = pd.read_csv(dev_path, sep="\t")
        data['prediction'] = pd.read_csv(test_path, sep="\t")['prediction']
        data['run'] = run
        run_data[run][task] = data

##### Get Metrics

In [None]:
# metrics[run][task]  -> get_metrics output for that run/task, tagged with a
#                        `task` column
# run_metrics[run]    -> all task tables of the run stacked (original row
#                        indices kept)
metrics = defaultdict(defaultdict)
run_metrics = {}
for run in run_data.keys():
    task_tables = []
    for task in run_data[run].keys():
        metrics[run][task] = get_metrics(
            data=run_data[run][task], true_y="category", predicted_y="prediction",
            labels=run_data[run][task]["category"].unique().tolist())
        metrics[run][task]['task'] = task
        task_tables.append(metrics[run][task])
    # Single concat per run; DataFrame.append was removed in pandas 2.0.
    # A run without tasks still gets an empty frame, as before.
    run_metrics[run] = pd.concat(task_tables) if task_tables else pd.DataFrame()

In [None]:
run_metrics.keys()

##### Get DistilBERT Fold 8 metrics for each task for comparison

Get a list of text to match for each task in order to make an apples to apples comparison

In [None]:
# Texts scored by the reference run, per task; used below to restrict the
# DistilBERT predictions to the same records.
reference_run = run_data['lilac-dream-14']
text_to_match = {
    task: reference_run[task].text.tolist() for task in reference_run.keys()
}

Load data

In [None]:
dbert_dir = "/hub/311_text_classifier/models"
# Task -> CSV with the DistilBERT validation predictions for that task.
dbert = {
    'D0': f"{dbert_dir}/D0_development_v1.0_2021_04_08_data_val_preds.csv",
    'D1': f"{dbert_dir}/D1_development_v1.0_2021_04_08_data_val_preds.csv",
    'LOC': f"{dbert_dir}/D2_distilbert_locations_prototype_val_preds.csv",
    'MANC': f"{dbert_dir}/D2_distilbert_manhole_complaint_prototype_val_preds.csv",
    'SIGNT': f"{dbert_dir}/D2_distilbert_sign_types_prototype_val_preds.csv",
}

dbert_data = {}
for task, preds_path in dbert.items():
    # Apply the same text cleaning as the CA-MTL data so texts can be matched
    # (unfortunately this new data cleaning was not done originally).
    preds = pd.read_csv(preds_path)
    preds['text'] = (
        preds['text'].apply(remove_tabs_newlines).apply(remove_multi_spaces))
    preds = preds.drop_duplicates('text').reset_index(drop=True)

    # Keep only the texts also present in the reference run (fold 8);
    # reset_index() keeps the previous row number as an `index` column.
    in_reference = preds['text'].isin(text_to_match[task])
    dbert_data[task] = preds[in_reference].copy().reset_index()

Get metrics
- we are going to be adding these to the metrics dictionaries started above

In [None]:
# Per task: names of the actual-label ('act') and predicted-label ('pred')
# columns in the DistilBERT prediction files. The three qualifier tasks use
# the same D2_* column names.
task_cat_pred = {
    'D0':{'act': 'D0_category', 'pred': 'D0_predicted_category'},
    'D1':{'act': 'D1_category', 'pred': 'D1_predicted_category'},
    'LOC':{'act': 'D2_category', 'pred': 'D2_predicted_category'},
    'MANC':{'act': 'D2_category', 'pred': 'D2_predicted_category'},
    'SIGNT':{'act': 'D2_category', 'pred': 'D2_predicted_category'},
}

In [None]:
# Add the DistilBERT results to `metrics` / `run_metrics` under the pseudo-run
# name 'dbert', in the same layout as the CA-MTL runs.
run = 'dbert'
dbert_tables = []
for task in dbert.keys():
    metrics[run][task] = get_metrics(
        data=dbert_data[task], true_y=task_cat_pred[task]['act'],
        predicted_y=task_cat_pred[task]['pred'],
        labels=dbert_data[task][task_cat_pred[task]['act']].unique().tolist())
    metrics[run][task]['task'] = task
    dbert_tables.append(metrics[run][task])
# Single concat; DataFrame.append was removed in pandas 2.0.
run_metrics[run] = pd.concat(dbert_tables) if dbert_tables else pd.DataFrame()

In [None]:
run_metrics['dbert']

### Evaluate All Iterations

In [None]:
def prep_data_for_merge(data, run, comp_cols, index_col=['task', 'index']):
    """Reshape one run's metrics table so it can be merged with other runs.

    Args:
        data: metrics frame containing at least the columns in `comp_cols`.
        run: run name; becomes the top level of the column MultiIndex.
        comp_cols: columns to keep (must include the `index_col` columns).
        index_col: columns moved into the row index.

    Returns:
        A new frame indexed by `index_col` whose remaining columns are
        labelled `(run, original column name)`. `data` is left unchanged.
    """
    indexed = data[comp_cols].set_index(index_col)
    run_level = [run] * len(indexed.columns)
    indexed.columns = pd.MultiIndex.from_arrays((run_level, indexed.columns))
    return indexed

In [None]:
# Side-by-side F1 table: rows are (task, label), columns are (run, 'f1').
# Runs are outer-merged on the row index, so a label missing from a run
# shows up as NaN for that run.
runs = list(run_metrics.keys())
comp_cols = ['index','f1', 'task']
run_tables = [
    prep_data_for_merge(data=run_metrics[run], run=run, comp_cols=comp_cols)
    for run in runs
]

comparison = run_tables[0]
for run_table in run_tables[1:]:
    comparison = comparison.merge(
        run_table, how = 'outer', left_index=True, right_index=True)

All metrics from all tasks

In [None]:
comparison

Overall metrics from all tasks

In [None]:
comparison[comparison.index.get_level_values('index').isin(['Overall'])]

Metrics from specific task

In [None]:
comparison[comparison.index.get_level_values('task').isin(['D1'])]

Specific label from specific task

In [None]:
task = 'D1'
label = 'Water Leaking'
pd.DataFrame(comparison.loc[(task, label)])

# Focused, deep-dive comparison

Subset to relevant tasks, drop any task/labels for which all models are NaN, and compress hierarchical column index for ease of slicing...

In [None]:
# The two runs compared in the deep dive. Both names must be keys of
# `run_metrics`; 'dbert' is the DistilBERT pseudo-run added above.
# comp_model_1 = "bright-yogurt-18"
comp_model_1 = "dbert"
comp_model_2 = 'cool-puddle-17'

In [None]:
# F1 columns of the two compared models; rows where both are NaN are dropped.
dd_tasks = [(model, 'f1') for model in (comp_model_2, comp_model_1)]
dd_comp = comparison[dd_tasks].copy()
dd_comp = dd_comp.dropna(how='all')

In [None]:
# Flatten the (run, metric) column MultiIndex to just the run name.
dd_comp.columns = [col[0] for col in dd_comp.columns]

Find which model did the best for each label in each task

In [None]:
# Name of the column (model) with the highest value in each row.
# NOTE(review): not re-runnable as-is — once `winner` exists, idxmax would also
# see that string column; re-create dd_comp before running this cell again.
dd_comp['winner'] = dd_comp.idxmax(axis=1)

In [None]:
# Separate the per-task 'Overall' rows from the per-label rows.
is_overall = dd_comp.index.get_level_values('index').isin(['Overall'])
dd_overall = dd_comp[is_overall]
dd_comp = dd_comp[~is_overall]

In [None]:
# Number of labels won by each model, across all tasks.
with sns.axes_style("whitegrid"):
    winner_counts = dd_comp['winner'].value_counts().sort_index()
    pd.DataFrame(winner_counts).plot(
        title='All Tasks: Count of labels where model metrics are highest',
        kind='bar')

In [None]:
# Number of labels won by each model, one bar chart per task.
for task in dd_comp.index.get_level_values('task').unique():
    in_task = dd_comp.index.get_level_values('task').isin([task])
    task_comp = dd_comp[in_task]
    with sns.axes_style("whitegrid"):
        winner_counts = task_comp['winner'].value_counts().sort_index()
        pd.DataFrame(winner_counts).plot(
            title=f'{task}: Count of labels where model metrics are highest',
            kind='bar')

How much better or worse is DistilBERT doing?

In [None]:
# Size of the F1 gap between the two models, regardless of which one is ahead.
dd_comp['abs_diff'] = np.abs(dd_comp[comp_model_2] - dd_comp[comp_model_1])

In [None]:
# Distribution of the absolute F1 gap, grouped by which model won the label.
# The title names the two compared models instead of hard-coding "DistilBERT",
# so it stays correct when comp_model_1 is changed, and it matches the wording
# of the per-task plots.
f = plt.figure(figsize=(15, 8))
f.suptitle(
    f"All Tasks: Absolute difference between {comp_model_1} metrics and {comp_model_2} metrics (by winner)",
    fontsize=18)
gs = f.add_gridspec(1, 1)
f.patch.set_facecolor('white')
with sns.axes_style("whitegrid"):
    ax = f.add_subplot(gs[0, 0])
    # Draw on the axes created above rather than relying on the current axes.
    sns.violinplot(x="winner", y="abs_diff", data=dd_comp, ax=ax)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

In [None]:
# Same violin plot as above, one figure per task.
for task in dd_comp.index.get_level_values('task').unique():
    in_task = dd_comp.index.get_level_values('task').isin([task])
    task_comp = dd_comp[in_task].sort_values('winner')

    f = plt.figure(figsize=(15, 8))
    f.patch.set_facecolor('white')
    f.suptitle(
        f"{task}: Absolute difference between {comp_model_1} metrics and {comp_model_2} metrics (by winner)",
        fontsize=18)
    gs = f.add_gridspec(1, 1)
    with sns.axes_style("whitegrid"):
        ax = f.add_subplot(gs[0, 0])
        sns.violinplot(data=task_comp, x="winner", y="abs_diff")
        for set_ticks in (plt.xticks, plt.yticks):
            set_ticks(fontsize=16)

What are the top 10 labels that each model is doing better at?

In [None]:
dd_comp[dd_comp['winner'] == comp_model_1].sort_values(
    by = 'abs_diff', ascending = False).head(10)

In [None]:
dd_comp[dd_comp['winner'] == comp_model_2].sort_values(
    by = 'abs_diff', ascending = False).head(10)

##### By task comparisons

In [None]:
dd_comp[dd_comp['winner'] == comp_model_1].sort_index()

In [None]:
dd_comp[dd_comp['winner'] == comp_model_2].sort_index()

In [None]:
# Signed F1 gap: positive where comp_model_1 scores higher than comp_model_2.
dd_comp['diff'] = dd_comp[comp_model_1] - dd_comp[comp_model_2]

In [None]:
# One horizontal bar chart per task: signed F1 difference per label, sorted
# by difference and coloured by the winning model.
winner_colors = {comp_model_1: 'blue', comp_model_2: 'orange'}
for task in dd_comp.index.get_level_values('task').unique():
    in_task = dd_comp.index.get_level_values('task').isin([task])
    task_comp = dd_comp[in_task].sort_values('winner').copy()
    # Drop the task level so the bars are labelled by label name only.
    task_comp = task_comp.reset_index(0, drop=True).sort_values(by='diff')

    # D1 gets a much taller figure than the other tasks.
    figsize = (15, 35) if task == 'D1' else (15, 8)
    f = plt.figure(figsize=figsize)
    f.patch.set_facecolor('white')
    f.suptitle(
        f"{task}: F1 Metric Difference ({comp_model_1} - {comp_model_2})\n {comp_model_1}=Blue",
        fontsize=18)
    gs = f.add_gridspec(1, 1)
    with sns.axes_style("whitegrid"):
        ax = f.add_subplot(gs[0, 0])
        bar_colors = task_comp.winner.map(winner_colors)
        task_comp['diff'].plot(kind='barh', color=bar_colors)
        plt.axvline(x=0, color = 'black')
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.show()


# Text Comparisons
- When the models are wrong, are they wrong in the same way?
- When one model is right and the other is wrong, how is it wrong?
- When the comparison model is right and the DistilBERT model is wrong, is the DistilBERT model very confident?

In [None]:
# Per model family and task: the [text, actual label, predicted label] columns
# to pull from that model's prediction frame.
# 'dbert' is used when comp_model_1 == 'dbert'; 'other' for any other run.
task_cols = {
    'dbert':{
        'D0': ['text', 'D0_category', 'D0_predicted_category'],
        'D1': ['text', 'D1_category', 'D1_predicted_category'],
        'LOC': ['text', 'D2_category', 'D2_predicted_category'],
        'SIGNT': ['text', 'D2_category', 'D2_predicted_category'],
        'MANC': ['text', 'D2_category', 'D2_predicted_category'],
    },
    'other':{
        'D0': ['text', 'category', 'prediction'],
        'D1': ['text', 'category', 'prediction'],
        'LOC': ['text', 'category', 'prediction'],
        'SIGNT': ['text', 'category', 'prediction'],
        'MANC': ['text', 'category', 'prediction'],
    },
    

}
# Common names the selected columns are renamed to (the "_1" suffix marks comp_model_1).
cols = ['text', 'category_1', 'prediction_1']

In [None]:
# Start from comp_model_2's per-task frames. This is a shallow copy: the
# dict is new, the frames inside are the same objects as in run_data.
comp_data = run_data[comp_model_2].copy()

Merge comparison model and distilbert model data together for direct comparison
- Calculate where
    - Models disagree
    - Models disagree and DistilBERT is wrong
    - Models disagree and comparison model is wrong
    - Models disagree and both are wrong

In [None]:
# Join comp_model_1's labels/predictions onto comp_model_2's frame (left join
# on `text`) for every task both models have, then add boolean flags for the
# agreement / correctness combinations. Columns suffixed "_1" belong to
# comp_model_1 and "_2" to comp_model_2. Tasks that comp_model_1 does not
# have are removed from comp_data.
non_present_tasks = []
uses_dbert = comp_model_1 =='dbert'
if uses_dbert:
    comp_temp = dbert_data.copy()
    source_cols = task_cols['dbert']
else:
    comp_temp = run_data[comp_model_1].copy()
    source_cols = task_cols['other']

for task in comp_data.keys():
    if task not in comp_temp.keys():
        non_present_tasks += [task]
        continue

    temp = comp_temp[task][source_cols[task]].copy()
    temp.columns = cols
    merged = comp_data[task].merge(temp, how = 'left', on = 'text')

    disagree = merged['prediction'] != merged['prediction_1']
    wrong_1 = merged['category_1'] != merged['prediction_1']
    wrong_2 = merged['category'] != merged['prediction']

    merged['disagree'] = disagree
    merged['wrong_1'] = wrong_1
    merged['wrong_2'] = wrong_2
    merged['disagree_w_1_wrong'] = disagree & wrong_1
    merged['disagree_w_1_wrong_2_right'] = disagree & wrong_1 & (~wrong_2)
    merged['disagree_w_2_wrong'] = disagree & wrong_2
    merged['disagree_w_2_wrong_1_right'] = disagree & wrong_2 & (~wrong_1)
    merged['disagree_w_both_wrong'] = disagree & wrong_1 & wrong_2
    merged['agree_w_both_wrong'] = (~disagree) & wrong_1 & wrong_2

    comp_data[task] = merged.rename(
        columns = {'category':'category_2', 'prediction':'prediction_2'})

for task in non_present_tasks:
    comp_data.pop(task)

How often do the two models disagree?

In [None]:
# One horizontal bar chart per task: fraction of rows where the two models'
# predictions differ (True) versus match (False).
for task, task_frame in comp_data.items():
    disagree_share = task_frame['disagree'].value_counts() / task_frame.shape[0]
    with sns.axes_style("whitegrid"):
        disagree_share.to_frame().plot.barh(
            title=f"{task}: Model Agreement Percentages")

How often do they disagree but comp_model_1 is right?

In [None]:
# One horizontal bar chart per task: fraction of rows where the models disagree,
# comp_model_2 is wrong and comp_model_1 is right.
for task, task_frame in comp_data.items():
    model_1_right_share = (
        task_frame['disagree_w_2_wrong_1_right'].value_counts() / task_frame.shape[0]
    )
    with sns.axes_style("whitegrid"):
        model_1_right_share.to_frame().plot.barh(
            title=f"{task}: Model Disagreement Percentages, {comp_model_1} right, {comp_model_2} wrong")

How often do they disagree but comp_model_2 is right?

In [None]:
# One horizontal bar chart per task: fraction of rows where the models disagree,
# comp_model_1 is wrong and comp_model_2 is right.
for task, task_frame in comp_data.items():
    model_2_right_share = (
        task_frame['disagree_w_1_wrong_2_right'].value_counts() / task_frame.shape[0]
    )
    with sns.axes_style("whitegrid"):
        model_2_right_share.to_frame().plot.barh(
            title=f"{task}: Model Disagreement Percentages, {comp_model_1} wrong, {comp_model_2} right")

How often do they disagree when they are both wrong?

In [None]:
# One horizontal bar chart per task: fraction of rows where the two models
# disagree with each other and both are wrong.
# The title previously read "Agreement" although the plotted flag is
# 'disagree_w_both_wrong'; it now matches the plotted column.
for task in comp_data.keys():
    with sns.axes_style("whitegrid"):
        pd.DataFrame(
            comp_data[task]['disagree_w_both_wrong'].value_counts()/comp_data[task].shape[0]).plot(
            kind='barh', title = f"{task}: Model Disagreement Percentages when both are wrong")

In [None]:
# For every task, list the records on which both models are wrong and also
# disagree with each other, preceded by a per-category count.
for task, task_frame in comp_data.items():
    print("\nBOTH MODELS ARE WRONG AND DISAGREE")
    banner = "+" * 40
    print(banner + " " + task + " " + banner)
    temp = (
        task_frame.loc[task_frame['disagree_w_both_wrong']]
        .copy()
        .reset_index(drop=True)
        .sort_values('category_2')
    )
    display(temp['category_1'].value_counts())
    for _row_label, record in temp.iterrows():
        print("=" * 80)
        print(record['text'])
        print(f"Category: {record['category_2']}")
        print(f"Prediction: {comp_model_1} = {record['prediction_1']}; {comp_model_2} = {record['prediction_2']}")
        print("\n")

Where is model 1 right but model 2 wrong?

In [None]:
# Split the D1 comparison frame into the rows where exactly one model is right,
# then tabulate those rows per category_1 value with a "Total" row appended.
task = 'D1'
right_1 = (~comp_data[task]['wrong_1'])
wrong_2 = comp_data[task]['wrong_2']
wrong_1 = comp_data[task]['wrong_1']
right_2 = (~comp_data[task]['wrong_2'])
right_1_wrong_2 = comp_data[task][right_1 & wrong_2]
wrong_1_right_2 = comp_data[task][wrong_1 & right_2]

print(f"Count of {task} where {comp_model_1} is right and {comp_model_2} is wrong {right_1_wrong_2.shape[0]}")
print(f"Count of {task} where {comp_model_1} is wrong and {comp_model_2} is right {wrong_1_right_2.shape[0]}")

# Name the counts explicitly instead of renaming a 'category_1' column afterwards:
# pandas >= 2.0 names the value_counts() result 'count', so the old
# rename(columns={'category_1': ...}) silently did nothing there and the merge
# below produced 'count_x' / 'count_y' columns.
wrong_comp = (
    wrong_1_right_2['category_1'].value_counts().rename('wrong_1_right_2').to_frame())
right_comp = (
    right_1_wrong_2['category_1'].value_counts().rename('right_1_wrong_2').to_frame())
# Outer join on the category index so categories present on only one side are
# kept; their missing count becomes 0.
right_wrong_comp = wrong_comp.merge(right_comp, how = 'outer',
                                    left_index=True, right_index=True).fillna(0).astype(int)
right_wrong_comp.loc["Total"] = right_wrong_comp.sum()
right_wrong_comp

In [None]:
# category_1 value used to filter right_1_wrong_2 / wrong_1_right_2 in the following cells.
category = 'Traffic Study'

In [None]:
# Print every record of the selected category that comp_model_1 got right and
# comp_model_2 got wrong.
temp = right_1_wrong_2.loc[right_1_wrong_2['category_1'] == category]
for _row_label, record in temp.iterrows():
    print(
        "=" * 80,
        record['text'],
        f"Category: {record['category_1']} ({record['category_2']})",
        f"Prediction: {comp_model_2} = {record['prediction_2']}; {comp_model_1} = {record['prediction_1']}",
        "\n",
        sep="\n",
    )


In [None]:
# Print every record of the selected category that comp_model_1 got wrong and
# comp_model_2 got right.
temp = wrong_1_right_2.loc[wrong_1_right_2['category_1'] == category]
for _row_label, record in temp.iterrows():
    print(
        "=" * 80,
        record['text'],
        f"Category: {record['category_1']} ({record['category_2']})",
        f"Prediction: {comp_model_2} = {record['prediction_2']}; {comp_model_1} = {record['prediction_1']}",
        "\n",
        sep="\n",
    )


In [None]:
# Show the D1 comparison row(s) whose text exactly equals the sentence below.
text = "The caller forgot to pay her water bill and she is stating that she will come in tomorrow to pay it in full."
comp_data['D1'].loc[lambda frame: frame['text'] == text]

# TESTING

In [None]:
# python run.py \
# --model_name_or_path CA-MTL-tiny \
# --data_dir /hub/CA-MTL/data \
# --output_dir /hub/CA-MTL/mock_models \
# --tasks D0 D1 MANC LOC SIGNT \
# --overwrite_cache \
# --task_data_folders D0/2021_04_08 D1/2021_04_08 MANC/2021_04_08 LOC/2021_04_08 SIGNT/2021_04_08 \
# --do_train \
# --do_eval \
# --do_predict \
# --evaluate_during_training \
# --per_device_train_batch_size 32 \
# --per_device_eval_batch_size 32 \
# --learning_rate 5e-5 \
# --adam_epsilon 1e-8 \
# --num_train_epochs 7 \
# --warmup_steps 0 \
# --save_steps 1500 \
# --save_total_limit 1 \
# --seed 43

In [None]:
# Entries in the mock_models directory whose name contains 'royal-lion'.
[
    entry
    for entry in os.listdir('/hub/CA-MTL/mock_models/')
    if 'royal-lion' in entry
]

In [None]:
import ast

# Load the tab-separated D0 test output and expand its 'logits' column into
# one column per element.
test = pd.read_csv('/hub/CA-MTL/mock_models/D0_test_iter_royal-lion-29.tsv', sep='\t')
# Each 'logits' cell is presumably a stringified list of numbers -- confirm.
# ast.literal_eval parses Python literals only, so unlike eval() it cannot
# execute arbitrary code embedded in the file.
logits = test['logits'].apply(lambda x: pd.Series(ast.literal_eval(x)))

In [None]:
# Preview the first rows of the loaded test-output table.
test.head()

In [None]:
# Preview the first rows of the expanded logits frame.
logits.head()