In [None]:
# Mount Google Drive into the Colab VM and switch the working directory to the
# experiment folder, so the relative paths used in later cells resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Dissertation/experiments/idiom_principle_on_magpie_corpus/

Mounted at /content/drive
/content/drive/MyDrive/Dissertation/experiments/idiom_principle_on_magpie_corpus


In [None]:
!pip install -r ./requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.7.0
  Downloading transformers-4.7.0-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 5.3 MB/s 
[?25hCollecting datasets==1.6.1
  Downloading datasets-1.6.1-py3-none-any.whl (220 kB)
[K     |████████████████████████████████| 220 kB 57.6 MB/s 
[?25hCollecting tqdm==4.49.0
  Downloading tqdm-4.49.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 6.9 MB/s 
[?25hCollecting nltk==3.6.2
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 57.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 62.9 MB/s 
Collecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manyli

# Experiment: Exp0

In [None]:
import os
import sys

import pandas as pd

# Allow up to 500 characters per cell so long sentences are not cut off when frames are displayed.
pd.set_option('display.max_colwidth', 500)

In [None]:
from exp_helpers import run_glue_f1_macro

## Experiment Setup

In [None]:
# Experiment identity and hyper-parameters shared by the cells below.
exp_name = 'exp0'  # used to build the experiment output directory
exp_model = 'bert-base-cased'  # passed to the training script as --model_name_or_path
exp_seed = 26  # passed to the training script as --seed

In [None]:
# Input data location.
base_dir = 'data/magpie/'
data_file = f'{base_dir}processed_MAGPIE_filtered_split_typebased.csv'

# NOTE: This notebook should ideally modify only the contents of this exp_dir.
exp_dir = f'experiments/{exp_name}/'

# Sub-directories of exp_dir for the intermediate CSVs and the model checkpoints.
tmp_dir = f'{exp_dir}tmp/'
model_checkpoint_dir = f'{exp_dir}models/'

In [None]:
# %rm -rf $exp_dir

In [None]:
# Refuse to run on top of the results of a previous run of this experiment.
# An explicit exception is used instead of `assert`, which is skipped under `python -O`.
if os.path.isdir(exp_dir):
    raise FileExistsError(
        f"ERROR! The experiment directory {exp_dir} already exists! "
        "Run '%rm -rf $exp_dir' to remove it before re-running this notebook."
    )

# Create the working directories (and, implicitly, exp_dir itself).
os.makedirs(tmp_dir, exist_ok=True)
os.makedirs(model_checkpoint_dir, exist_ok=True)

In [None]:
# Load the full dataset from `data_file`; its `split` column marks each row as
# training / development / test and is used below to partition the data.
df_data = pd.read_csv(data_file)
df_data

Unnamed: 0,sentence_0,idiom,confidence,label,split,variant_type
0,"For example , with fell running and mountain marathons gaining in popularity , how about some ideas for safe running off the beaten track ?",off the beaten track,1.000000,i,training,identical
1,I 'd keep him well in the running .,in the running,0.770109,i,training,identical
2,"He gives me the creeps , so I looked round , hmm hmm .",give someone the creeps,1.000000,i,training,combined-inflection
3,"‘ He 's done us proud , as well,’ says Granville .",do someone proud,1.000000,i,training,combined-inflection
4,"People quickly embraced formal democracy , but the tolerance and compromise that is at the heart of the democratic process took time to take root .",take root,1.000000,i,training,identical
...,...,...,...,...,...,...
48390,Many also have second or third jobs to make ends meet .,make ends meet,0.854973,i,test,identical
48391,"Take people to objections , take them to where you want them to be and bear in mind you 're always looking for an objection",bear in mind,1.000000,i,training,identical
48392,"Indeed we are rarely aware of them as rules , until they are broken , since they are typical of the settings in which we received our moral training .",as a rule,1.000000,l,training,deletion-determiner
48393,"Unlike in a firm that is a jack of all trades , the supplier is an independent business subject to market disciplines rather than another bit of a big bureaucracy .",jack of all trades,1.000000,i,training,identical


In [None]:
# Names of the data columns of the loaded table.
columns = [
    'sentence_0',
    'idiom',
    'confidence',
    'label',
    'split',
    'variant_type',
]

## Prepare & save the train, dev & test sets

In [None]:
# Integer class ids for the raw label strings
# (presumably 'i' = idiomatic and 'l' = literal usage -- TODO confirm against the corpus docs).
label_to_id = {
    'i': 0,
    'l': 1,
}

In [None]:
# Check how many rows fall into each split before partitioning the data.
df_data['split'].value_counts()

training       38715
test            4840
development     4840
Name: split, dtype: int64

In [None]:
# Keep only the columns needed to build the sentence-classification datasets.
df_tmp = df_data[['sentence_0', 'label', 'split']]

# Partition the rows by the value of the `split` column.
df_train = df_tmp.loc[df_tmp['split'].eq('training')]
df_dev = df_tmp.loc[df_tmp['split'].eq('development')]
df_test = df_tmp.loc[df_tmp['split'].eq('test')]

def clean_df(df, label_map=None):
    """Drop the ``split`` column and encode the string labels as integer ids.

    Args:
        df: DataFrame containing at least the ``label`` and ``split`` columns.
        label_map: Optional mapping from raw label value to integer id.
            Defaults to the notebook-level ``label_to_id``.

    Returns:
        A new DataFrame without ``split`` whose ``label`` column holds the
        integer ids. The input frame is not modified.

    Raises:
        ValueError: If ``df`` contains a label that is missing from the mapping.
    """
    if label_map is None:
        label_map = label_to_id

    encoded = df['label'].map(label_map)

    # Series.map yields NaN for values missing from the mapping; fail loudly
    # instead of silently writing NaN labels into the training files.
    unknown = df.loc[encoded.isna(), 'label'].unique().tolist()
    if unknown:
        raise ValueError(f'Unmapped label value(s) in the data: {unknown}')

    return df.drop(columns=['split']).assign(label=encoded.astype('int64'))

# Run the same cleaning step over every split
df_train, df_dev, df_test = map(clean_df, (df_train, df_dev, df_test))

In [None]:
# Destination files for the three splits inside the experiment's tmp directory
train_csv = tmp_dir + 'train.csv'
dev_csv = tmp_dir + 'dev.csv'
test_csv = tmp_dir + 'test.csv'

# Write each split without the pandas index column
for split_frame, split_path in (
    (df_train, train_csv),
    (df_dev, dev_csv),
    (df_test, test_csv),
):
    split_frame.to_csv(split_path, index=False)

print(f'Saved the files to {tmp_dir}')

Saved the files to experiments/exp0/tmp/


# Training & Evaluation

## Fine-tune, Save & Evaluate

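TODO: Verify this note. It says the `dev_csv` file is not used anywhere here, but the command below passes it to the script as `--validation_file`, so the remark appears to be stale.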

In [None]:
# Run the helper script that trains, saves the sentence classification model
!python exp_helpers/run_glue_f1_macro.py \
        --model_name_or_path $exp_model \
    	--do_train \
    	--do_eval \
        --do_predict \
    	--max_seq_length 128 \
    	--per_device_train_batch_size 32 \
    	--learning_rate 2e-5 \
    	--num_train_epochs 6 \
    	--evaluation_strategy "epoch" \
    	--output_dir $model_checkpoint_dir \
    	--seed $exp_seed \
    	--train_file      $train_csv \
    	--validation_file $dev_csv \
        --test_file       $test_csv \
        --test_metrics \
        --evaluation_strategy "epoch" \
        --save_strategy "epoch"  \
        --load_best_model_at_end \
        --metric_for_best_model "f1" \
        --save_total_limit 3

07/14/2022 13:49:06 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_steps=500,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
greater_is_better=True,
group_by_length=False,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=2e-05,
length_column_name=length,
load_best_model_at_end=True,
local_rank=-1,
log_on_each_node=True,
logging_dir=runs/Jul14_13-49-06_ec4a5acc10d9,
logging_first_step=False,
logging_steps=500,
logging_strategy=IntervalStrategy.STEPS,
lr_scheduler_type=SchedulerType.LINEAR,
max_grad_norm=1.0,
max_