https://github.com/huggingface/transformers/blob/v2.11.0/src/transformers/training_args.py  
https://github.com/huggingface/transformers/blob/v2.11.0/src/transformers/trainer.py

In [None]:
import transformers

### Playing around with GLUE

In [None]:
from transformers import glue_tasks_num_labels

In [None]:
glue_tasks_num_labels['sst-2']

In [None]:
import jsonlines

In [None]:
from itertools import islice

# Peek at the first records of the RTE test split to see what a raw GLUE
# example looks like. islice stops quietly if the file holds fewer than 10
# records, and a single iterator is created instead of one per record.
with jsonlines.open("/hub/CA-MTL/glue_data/data/rte/test.jsonl") as f:
    for record in islice(f.iter(), 10):
        print(record)

### What should the input mimic?

In [None]:
from transformers import (
    BertTokenizer
)
from transformers.data.processors import InputExample

In [None]:
# Toy single-sentence inputs used to see what the tokenizer/model expects.
text_a = [
    'this is one sentence',
    'this is another',
    'there are flowers in her hair',
    'hair is weird',
    'i am glad that it is spring time',
    'things are really uncertain',
]
label = [1, 2, 1, 1, 2, 2]

# One InputExample per sentence; the guid is simply the position in the list
# and there is no second sentence (text_b=None).
examples = [
    InputExample(guid=guid, text_a=sentence, text_b=None, label=tag)
    for guid, (sentence, tag) in enumerate(zip(text_a, label))
]

In [None]:
examples

In [None]:
# Encode every toy example in one call. Each item is a (text_a, text_b) pair;
# text_b is None here, so these are single-sentence inputs.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
batch_encoding = tokenizer.batch_encode_plus(
    [(ex.text_a, ex.text_b) for ex in examples],
    max_length=256,
    pad_to_max_length=True,
)

In [None]:
 # Re-slice the batched encoding into one feature dict per example and show it.
 for i in range(len(examples)):
     inputs = {key: batch_encoding[key][i] for key in batch_encoding}
     print(inputs)

In [None]:
inputs.keys()

In [None]:
import sys
sys.path.append('.')
from src.data.task_data_processors import task_processors

In [None]:
# Which task to inspect and which dated snapshot of its data to read.
task = "D1"
date_tag = "2021_04_08"
set_type = "train"

# Data for a task lives under <root>/<task>/<date_tag>.
data_dir = f"/hub/CA-MTL/data/{task}/{date_tag}"

In [None]:
processor = task_processors[task]()

In [None]:
test_examples = processor.get_test_examples(data_dir)

In [None]:
processor.get_labels(data_dir)

In [None]:
len(processor.get_labels(data_dir))

In [None]:
import os
import sys
import re
import json
import logging

import torch
from transformers import (
    HfArgumentParser,
    set_seed,
#     AutoTokenizer,
#     AutoConfig,
    EvalPrediction,
    BertConfig, 
    BertTokenizer,
    AutoTokenizer
)

from src.model.ca_mtl import CaMtl, CaMtlArguments
from src.utils.misc import MultiTaskDataArguments, Split
from src.mtl_trainer import MultiTaskTrainer, MultiTaskTrainingArguments
from src.data.mtl_dataset import MultiTaskDataset
from src.data.task_dataset import TaskDataset

# Three-task setup: D0, D1 and LOC, each read from its 2021_04_08 folder.
# (An earlier two-task variant used only D0 and D1 with the same folders.)
data_args = MultiTaskDataArguments(
    data_dir="/hub/CA-MTL/data",
    tasks=["D0", "D1", "LOC"],
    task_data_folders=[
        "D0/2021_04_08",
        "D1/2021_04_08",
        "LOC/2021_04_08",
    ],
)
model_args = CaMtlArguments(model_name_or_path="CA-MTL-tiny")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# limit_length=None is passed through unchanged from the original cell.
train_dataset = MultiTaskDataset(data_args, tokenizer, limit_length=None)

In [None]:
# Show each task name followed by its position in data_args.tasks.
for i, task in enumerate(data_args.tasks):
    print(task, i, sep="\n")

In [None]:
# Spot-check two items of the multi-task dataset, at positions 10 and 3308.
for i, batch in enumerate(train_dataset):
    if i in (10, 3308):
        print(batch)

### How should I think about the DataCollator

In [None]:
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
from transformers import (
    Trainer, 
    TrainingArguments, 
    EvalPrediction, 
    DataCollator,
    DefaultDataCollator,
)
from torch.utils.data.dataloader import DataLoader
local_rank = -1
train_batch_size = 32

# Pick the sampler from local_rank: -1 selects plain random sampling, any
# other value selects the distributed sampler.
if local_rank == -1:
    train_sampler = RandomSampler(train_dataset)
else:
    train_sampler = DistributedSampler(train_dataset)

# Batches are assembled by the default collator's collate_batch method.
data_collator = DefaultDataCollator()
data_loader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=train_batch_size,
    collate_fn=data_collator.collate_batch,
)

In [None]:
# Walk the loader and print only the collated batch at index 1000.
for i, batch in enumerate(data_loader):
    if i != 1000:
        continue
    print(batch)

In [None]:
data_loader

In [None]:
import numpy as np

# Row-wise softmax over a random 2x3 matrix, kept both as a torch tensor and
# as a numpy array so the two can be compared in the following cells.
m = torch.nn.Softmax(dim=1)

input = np.random.rand(2, 3)
input_t = torch.Tensor(input)

output_t = m(input_t)
# Computed with a separate forward pass so the numpy result does not share
# storage with output_t.
output = m(input_t).numpy()

In [None]:
input

In [None]:
output

In [None]:
output_t

In [None]:
import torch

# A lookup table with 5 rows of width 768, indexed by integer task id.
task_embedding = torch.nn.Embedding(num_embeddings=5, embedding_dim=768)

# A batch of 16 task ids; embedding lookups need an integer (long) tensor.
task_ids = torch.tensor(
    [0, 1, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 0],
    dtype=torch.long,
)

In [None]:
task_embedding(task_ids).shape

# Trial Run

This doesn't work just yet because of missing arguments in the mt uncertainty sampling... waiting on feedback from the authors to help

In [None]:
--save_steps
--save_strategy

In [None]:
python run.py \
--model_name_or_path CA-MTL-base \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43 \
--use_mt_uncertainty

Everything looks good at first but when we go to run we run out of GPU memory

In [None]:
python run.py \
--model_name_or_path CA-MTL-base \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43 

TinyBERT Run without Uncertainty Sampling

In [None]:
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

TinyBERT Run without Uncertainty Sampling with Evaluation and Prediction
- prediction works but I don't want to look at the test set... I need a way to look at the evaluation set in detail... evaluation data does not write out
- this was an easy fix... I just replaced the "dev" set with the "train_dev" set and then used the "dev" set as the "test" set in the code... should have thought of this immediately... the "test" set can be evaluated easily in the future by switching this up, but for now this allows us to get metrics on both... the "test" (i.e., "dev" for us) set metrics do not log to wandb yet so I need to get that set up

- batch sizes: 8, 16, 32, 64, 128
- learning rates: 3e-4, 1e-4, 5e-5, 3e-5

In [None]:
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

In [None]:
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 14 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 1025

NOTE: If you run with a limit length, your .lock files remain and the data won't be regenerated... you have to delete the .lock files and cached* directories first

NOTE: prediction files get overwritten as well... naming has to be controlled somewhere so I will need to figure out where that is and create a way to control it... they also always say "_test_" which I do not want

In [None]:
python run.py \
--model_name_or_path CA-MTL-base \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

Run without CUDA to look for BERT model load issues

In [None]:
python run.py \
--model_name_or_path CA-MTL-base \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--max_seq_length 128 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

In [None]:
python run.py \
--model_name_or_path CA-MTL-base \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D1 \
--overwrite_cache \
--max_seq_length 128 \
--task_data_folders D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

In [None]:
python run.py \
--model_name_or_path CA-MTL-base-uncased \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--max_seq_length 128 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

Retesting uncertainty sampling after author update

In [None]:
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 D1 \
--task_data_folders D0/2021_04_08 D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43 \
--use_mt_uncertainty

Single Runs
- HAVE TO OVERWRITE THE CACHE ANY TIME YOU MAKE A CHANGE TO THE TASKS

In [None]:
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D0 \
--overwrite_cache \
--task_data_folders D0/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

In [None]:
python run.py \
--model_name_or_path CA-MTL-tiny \
--data_dir /hub/CA-MTL/data \
--output_dir /hub/CA-MTL/models \
--tasks D1 \
--overwrite_cache \
--task_data_folders D1/2021_04_08 \
--do_train \
--do_eval \
--do_predict \
--evaluate_during_training \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--learning_rate 5e-5 \
--adam_epsilon 1e-8 \
--num_train_epochs 7 \
--warmup_steps 0 \
--save_steps 8000 \
--seed 43

In [None]:
python run.py --model_name_or_path CA-MTL-base-uncased --data_dir /hub/CA-MTL/data --output_dir /hub/CA-MTL/models --tasks D1 --overwrite_cache --max_seq_length 128 --task_data_folders D1/2021_04_08 --do_train --do_eval --do_predict --evaluate_during_training --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 5e-5 --adam_epsilon 1e-8 --num_train_epochs 7 --warmup_steps 100 --save_steps 8000 --seed 43

In [None]:
python run.py --model_name_or_path bert-base-uncased --data_dir /hub/CA-MTL/data --output_dir /hub/CA-MTL/models --tasks D0 D1 --overwrite_cache --max_seq_length 128 --task_data_folders D0/2021_04_08 D1/2021_04_08 --do_train --do_eval --do_predict --evaluate_during_training --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 5e-5 --adam_epsilon 1e-8 --num_train_epochs 7 --warmup_steps 0 --save_steps 8000 --seed 43

In [None]:
python run.py --model_name_or_path bert-base-uncased --data_dir /hub/CA-MTL/data --output_dir /hub/CA-MTL/models --tasks D1 --overwrite_cache --max_seq_length 128 --task_data_folders D1/2021_04_08 --do_train --do_eval --do_predict --evaluate_during_training --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 5e-5 --adam_epsilon 1e-8 --num_train_epochs 7 --warmup_steps 0 --save_steps 8000 --seed 43

In [None]:
python run.py --model_name_or_path bert-base-cased --data_dir /hub/CA-MTL/data --output_dir /hub/CA-MTL/models --tasks D0 D1 --overwrite_cache --max_seq_length 128 --task_data_folders D0/2021_04_08 D1/2021_04_08 --do_train --do_eval --do_predict --evaluate_during_training --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 5e-5 --adam_epsilon 1e-8 --num_train_epochs 7 --warmup_steps 0 --save_steps 8000 --seed 43

In [None]:
python run_stl.py --model_name_or_path distilbert-base-uncased --data_dir /hub/CA-MTL/data --output_dir /hub/CA-MTL/models --tasks D0 --overwrite_cache --max_seq_length 128 --task_data_folders D0/2021_04_08 --do_train --do_eval --do_predict --evaluate_during_training --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 5e-5 --adam_epsilon 1e-8 --num_train_epochs 7 --warmup_steps 0 --save_steps 8000 --seed 43

In [None]:
python run.py --model_name_or_path CA-MTL-tiny --encoder_type CA-MTL-tiny --data_dir /hub/CA-MTL/data --output_dir /hub/CA-MTL/mock_models --tasks D0 D1 MANC LOC SIGNT --overwrite_cache --task_data_folders D0/2021_04_08 D1/2021_04_08 MANC/2021_04_08 LOC/2021_04_08 SIGNT/2021_04_08 --do_train --do_eval --do_predict --evaluate_during_training --per_device_train_batch_size 32 --per_device_eval_batch_size 32 --learning_rate 5e-5 --adam_epsilon 1e-8 --num_train_epochs 7 --warmup_steps 0 --save_steps 1500 --save_total_limit 1 --seed 43
