In [None]:
import os
import re
import sys
import random
import time
import json
import farmhash # https://github.com/veelion/python-farmhash
import numpy as np
import pandas as pd
import torch
import warnings
from collections import defaultdict

def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator,
    and PyTorch (``torch.manual_seed`` plus ``torch.cuda.manual_seed_all``).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
def k_folds_partition(data, hash_column = 'text', n_splits = 10, seed = 43):
    """Assign each row to a fold by hashing one of its columns.

    Args:
        data: DataFrame containing ``hash_column``.
        hash_column: Column whose values are passed to ``farmhash.hash64withseed``.
        n_splits: Number of folds; the label is the hash modulo this value.
        seed: Seed passed to the hash function and to ``set_seed``.

    Returns:
        Series aligned with ``data`` holding each row's fold label.
    """
    # Side effect kept from the original: calling this re-seeds the global RNGs.
    set_seed(seed)
    # The frame is only read here, so the former full-frame copy was dropped.
    row_hashes = data[hash_column].apply(
        lambda value: farmhash.hash64withseed(value, seed))
    return np.abs(row_hashes % n_splits)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and others.
warnings.filterwarnings("ignore")

In [None]:
# Use fully qualified option names: the short forms "max_rows" / "max_columns"
# are pattern-matched and become ambiguous (OptionError) on pandas versions
# that also define styler.render.max_rows / styler.render.max_columns.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", 999)

In [None]:
# Notebook-wide seed; also passed to k_folds_partition as the hash seed.
seed = 43
set_seed(seed)

In [None]:
# Task identifier; selects the raw file, the output folder and the exported columns.
task = "D2"
# Partition label that is held out of the training split.
fold = 8
# Data version tag embedded in the raw file name and the output directory.
date_tag = "2021_04_08"
data_dir = f"/hub/CA-MTL/data/{task}"
# NOTE(review): absolute, machine-specific paths — confirm before running elsewhere.
file = f"/hub/311_text_classifier/data/raw/PW-{task}-{date_tag}-PROD.csv"

In [None]:
# Output locations: one TSV per split plus a JSON metadata file, all under
# {data_dir}/{date_tag}.
out_dir = f"{data_dir}/{date_tag}"
train_file_out = f"{out_dir}/train.tsv"
train_dev_file_out = f"{out_dir}/train-dev.tsv"
dev_file_out = f"{out_dir}/dev.tsv"
test_file_out = f"{out_dir}/test.tsv"
metadata_file_out = f"{out_dir}/metadata.json"

In [None]:
# Provenance record written next to the split files; a 'labels' entry is added
# further down once the data has been loaded.
metadata = dict(
    raw_data_file = file, 
    data_version = date_tag, 
    task_name = task, 
    file_paths = {
        'train':train_file_out, 
        'train-dev':train_dev_file_out, 
        'dev':dev_file_out, 
        'test':test_file_out
    },
    # NOTE(review): this rule text does not match the partition code in this
    # notebook, which hashes 'text' into 10 folds and uses the single held-out
    # fold for train-dev, dev and test alike — confirm and update the wording.
    partition_rules = [
        'external/daupler seperate; train/train_dev 0.85/0.15; dev/test 0.5/0.5'
    ]
)

In [None]:
# Create the output directory, including any missing parent directories.
# An existing directory is accepted silently; genuine failures (for example a
# permission error) now propagate instead of being reported as
# "Directory already exists" and ignored.
os.makedirs(out_dir, exist_ok=True)

Read data and remove all tabs, multi-spaces, and new lines

In [None]:
# Load the raw CSV for the configured task; later cells read its 'text' and
# 'category' columns.
data = pd.read_csv(file)

In [None]:
def remove_tabs_newlines(x):
    """Return ``x`` with every newline, tab and carriage return removed.

    The characters are deleted, not replaced by a space.
    NOTE(review): words separated only by one of these characters are fused
    ("a\\nb" -> "ab") — confirm that this is intended.
    """
    # '+' instead of '*': the old pattern also matched the empty string at
    # every position, performing pointless empty substitutions.
    return re.sub(r"[\n\t\r]+", "", x)
def remove_multi_spaces(x):
    """Collapse every run of two or more whitespace characters into one space."""
    whitespace_run = re.compile(r"\s\s+")
    return whitespace_run.sub(" ", x)
# Normalise the text column (strip tabs/newlines first, then collapse
# whitespace runs) and keep only the first row for each distinct cleaned text.
data['text'] = (
    data['text']
    .apply(remove_tabs_newlines)
    .apply(remove_multi_spaces)
)
data = data.drop_duplicates('text').reset_index(drop=True)

Partition External Data into Train and Train-Dev

In [None]:
# Label every row with a hash-based partition (10 splits) computed from its
# cleaned text, using the notebook-wide seed.
data['partition'] = k_folds_partition(
    data, hash_column = 'text', n_splits = 10, seed = seed)

In [None]:
# Every row outside the held-out fold goes to train. The held-out fold is
# materialised three times, so train-dev, dev and test hold identical rows.
train_condition = data['partition'].ne(fold)
train = data[train_condition].reset_index(drop=True)
held_out = data[~train_condition].reset_index(drop=True)
train_dev = held_out.copy()
dev = held_out.copy()
test = held_out.copy()

In [None]:
# (rows, columns) of the training split.
train.shape

In [None]:
# (rows, columns) of the dev split, i.e. the held-out fold.
dev.shape

Generate Metadata

In [None]:
# Record the distinct category values, taken from the sorted column, as the label list.
metadata['labels'] = data['category'].sort_values().unique().tolist()

In [None]:
# Columns exported for each task. D0 and D1 share the same layout; each task
# still gets its own list object.
_id_task_cols = ['text', 'category', 'internal_id', 'external_id']
out_cols = {
    'D0': list(_id_task_cols),
    'D1': list(_id_task_cols),
    'D2': ['text', 'category', 'qualifier'],
}

In [None]:
# Write each split as a tab-separated file containing only the task's columns.
split_outputs = (
    (train, train_file_out),
    (train_dev, train_dev_file_out),
    (dev, dev_file_out),
    (test, test_file_out),
)
for split_frame, split_path in split_outputs:
    split_frame[out_cols[task]].to_csv(split_path, sep='\t', index=False)

# Use a context manager so the metadata file is flushed and closed right away;
# the previous bare open() left the handle open for the life of the kernel.
with open(metadata_file_out, 'w') as metadata_fh:
    json.dump(metadata, metadata_fh)

In [None]:
# Number of distinct non-null categories in the de-duplicated data.
data['category'].nunique()

# Run

```
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/mock_models \
--tasks D0 D1 \
--overwrite_cache \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 10000 \
--seed 43
```

# Move data

In [None]:
# Move dev set to output folder

# Evaluate 10-Fold Results

Set up dictionary of files for data loading

In [None]:
# Directory whose entries are scanned for per-run output files.
model_dir = "/hub/CA-MTL/mock_models/"
# Run names used to match output files by substring.
# NOTE(review): presumably one run per fold; there is no "-4" run in this
# list — confirm that is expected.
runs = [
    'vibrant-river-1',
    'colorful-elevator-2',
    'super-deluge-3',
    'elated-wave-5',
    'magic-lake-6',
    'neat-wind-7',
    'faithful-music-8',
    'winter-bush-9',
    'stellar-paper-10',
    'still-glade-11'
]
# All entry names in model_dir; os.listdir returns them in arbitrary order.
files = os.listdir(model_dir)

In [None]:
# Map run -> task -> partition -> file names in model_dir that contain the run
# name and start with "<task>_<partition>".
run_files = defaultdict(defaultdict)
for run in runs:
    for d in ['D0', 'D1']:
        run_files[run][d] = {
            partition: [
                fname for fname in files
                if run in fname and fname.startswith(f"{d}_{partition}")
            ]
            for partition in ['test', 'dev']
        }

Load data for each partition run

In [None]:
# For each run and task: load the first matching dev file and attach the
# 'prediction' column from the first matching test file, row by row.
run_data = defaultdict(defaultdict)
for run in runs:
    for d in ['D0', 'D1']:
        # os.path.join avoids the double slash that model_dir's trailing "/"
        # produced with plain string formatting.
        dev_path = os.path.join(model_dir, run_files[run][d]['dev'][0])
        test_path = os.path.join(model_dir, run_files[run][d]['test'][0])

        # A dedicated name keeps the notebook-level `data` frame intact.
        run_frame = pd.read_csv(dev_path, sep="\t")
        test_predictions = pd.read_csv(test_path, sep="\t")['prediction']

        # Predictions are matched to rows purely by position, so a length
        # mismatch would silently yield missing or misassigned predictions.
        if len(test_predictions) != len(run_frame):
            raise ValueError(
                f"Row count mismatch for run {run!r}, task {d!r}: "
                f"{len(run_frame)} dev rows vs {len(test_predictions)} predictions")

        run_frame['prediction'] = test_predictions
        run_frame['run'] = run
        run_data[run][d] = run_frame

Append data into a single frame for metric analysis

In [None]:
# Stack the per-run frames of each task into one frame with a fresh index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0 and
# re-copied the accumulated frame on every loop iteration.
d0_data = pd.concat(
    [run_data[run]['D0'] for run in run_data], ignore_index=True)
d1_data = pd.concat(
    [run_data[run]['D1'] for run in run_data], ignore_index=True)

##### Calculate metrics across all folds

In [None]:
import sys
sys.path.append("../311_text_classifier/src")
from models.evaluate import get_metrics

In [None]:
# Metrics over all runs pooled together, one entry per task. The label set is
# whatever categories occur in that task's pooled frame.
metrics = {}
for task_name, task_frame in (('D0', d0_data), ('D1', d1_data)):
    metrics[task_name] = get_metrics(
        data=task_frame,
        true_y="category",
        predicted_y="prediction",
        labels=task_frame["category"].unique().tolist())

In [None]:
# Pooled metrics for task D0.
metrics['D0']

In [None]:
# Pooled metrics for task D1.
metrics['D1']

##### Calculate metrics by fold and plot


In [None]:
# Peek at the pooled D0 frame (includes the 'prediction' and 'run' columns).
d0_data.head()

In [None]:
# Metrics per task and per run: run_metrics[task][run] is the get_metrics
# result for that run's rows, with labels taken from those rows only.
run_metrics = {'D0': {}, 'D1': {}}
for run in runs:
    for task_name, task_frame in (('D0', d0_data), ('D1', d1_data)):
        run_rows = task_frame[task_frame['run'] == run].copy()
        run_metrics[task_name][run] = get_metrics(
            data=run_rows,
            true_y="category",
            predicted_y="prediction",
            labels=run_rows["category"].unique().tolist())

In [None]:
# Long-format frame for plotting: one row per (task, run) holding the 'f1'
# value found in row 0 of that run's metrics table.
# NOTE(review): presumably row 0 of get_metrics' output is the weighted
# average — confirm against models.evaluate.
d0_f1 = [run_metrics['D0'][run].loc[0, 'f1'] for run in runs]
d1_f1 = [run_metrics['D1'][run].loc[0, 'f1'] for run in runs]

# Built in one step; the old DataFrame.append onto an empty frame relied on a
# method that was removed in pandas 2.0.
plot_metrics = pd.DataFrame({
    "wghtd_f1": d0_f1 + d1_f1,
    "D": ["D0"] * len(runs) + ["D1"] * len(runs)})

In [None]:
# Violin plot of the per-run F1 values, one violin per task (D0, D1).
f = plt.figure(figsize=(15, 8))
# The title now describes this figure: its x axis is the task, whereas the
# previous "Daupler vs External Data" wording matches the by-source figure
# later in the notebook.
f.suptitle("Weighted F1 Performance by Task (D0 vs D1)\n(CA-MTL: tinyBERT)",
           fontsize=18)
gs = f.add_gridspec(1, 1)
f.patch.set_facecolor('white')
with sns.axes_style("whitegrid"):
    ax = f.add_subplot(gs[0, 0])
    # Draw on the axes created above rather than on the implicit current axes.
    sns.violinplot(x="D", y="wghtd_f1", data=plot_metrics, ax=ax)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)

##### Calculate metrics by fold and source and plot

In [None]:
# Flag rows whose external_id contains the substring 'daupler' (1) versus all
# other rows (0), for both pooled frames.
# NOTE(review): str.contains returns NaN for missing external_id values and
# np.where appears to treat NaN as truthy, so such rows would be flagged 1 —
# confirm whether missing ids occur and how they should be classified.
for source_frame in (d0_data, d1_data):
    source_frame['daupler_generated'] = np.where(
        source_frame['external_id'].str.contains('daupler'), 1, 0)

In [None]:
# Metrics per task, run and data source:
# source_metrics[task][run]['dau' | 'ext'] is the get_metrics result for the
# rows of that run that are / are not flagged daupler_generated.
source_metrics = {}
source_metrics['D0'] = defaultdict(defaultdict)
source_metrics['D1'] = defaultdict(defaultdict)
for run in runs:
    # Each task's masks are built from, and applied to, that task's own frame.
    # The previous code sliced d0_data with a d1_data mask for the D1 'ext'
    # subset, so D1 external metrics were computed on D0 rows.
    for task_name, task_frame in (('D0', d0_data), ('D1', d1_data)):
        in_run = task_frame['run'] == run
        is_daupler = task_frame['daupler_generated'] == 1
        source_masks = (
            ('dau', in_run & is_daupler),
            ('ext', in_run & ~is_daupler),
        )
        for source, source_mask in source_masks:
            source_rows = task_frame[source_mask].copy()
            source_metrics[task_name][run][source] = get_metrics(
                data=source_rows,
                true_y="category",
                predicted_y="prediction",
                labels=source_rows["category"].unique().tolist())

In [None]:
# Long-format frame for plotting: one row per (task, source, run) holding the
# 'f1' value found in row 0 of the corresponding metrics table.
# NOTE(review): presumably row 0 of get_metrics' output is the weighted
# average — confirm against models.evaluate.
source_frames = []
for source in ['ext', 'dau']:
    d0_f1 = [source_metrics['D0'][run][source].loc[0, 'f1'] for run in runs]
    d1_f1 = [source_metrics['D1'][run][source].loc[0, 'f1'] for run in runs]
    source_frames.append(pd.DataFrame({
        "wghtd_f1": d0_f1 + d1_f1,
        "D": [f"D0_{source}"] * len(runs) + [f"D1_{source}"] * len(runs)}))

# One concat at the end replaces the repeated DataFrame.append calls, a method
# that was removed in pandas 2.0.
plot_metrics = pd.concat(source_frames, ignore_index=True)

In [None]:
# Violin plot of the per-run F1 values, one violin per task/source group.
f = plt.figure(figsize=(15, 8))
f.suptitle(
    "Weighted F1 Performance on Daupler vs External Data\n(CA-MTL: tinyBERT)",
    fontsize=18,
)
gs = f.add_gridspec(1, 1)
f.patch.set_facecolor('white')
with sns.axes_style("whitegrid"):
    ax = f.add_subplot(gs[0, 0])
    # The subplot just added is the current axes, so passing it explicitly
    # draws on the same axes as before.
    sns.violinplot(x="D", y="wghtd_f1", data=plot_metrics, ax=ax)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)