In [1]:
import os
import warnings

CUDA_VISIBLE = '5'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = CUDA_VISIBLE
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

### Training params, you descriptions allows to not make cli and use notebook so I used it)


In [2]:
experiment_name = 'vanila_attn_mnli'
custom_attention = False
log_dir = './logs'
checkpoints_dir = './runs'
task = 'mnli'
lr = 4e-5
min_lr = 1e-6
warmup_epochs = 0.5
lr_decay_epochs = 3.5
num_cpu = 16
batch_size = 40
grad_acc_steps = 1
eval_acc_steps = 250
log_per_steps = 100
eval_per_steps = 1000
save_per_steps = 1000
weight_decay = 0.01
no_progress_stop = 5
checkpoint_path = None
seed = 42
num_gpu = sum([1 if x else 0 for x in CUDA_VISIBLE.split(',')])

In [3]:
output_dir = os.path.join(checkpoints_dir, experiment_name)
logging_dir = os.path.join(log_dir, experiment_name)
best_models_store_dir = os.path.join('./best_models', experiment_name)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)
        
if not os.path.exists(logging_dir):
    os.makedirs(logging_dir)
    
if not os.path.exists(best_models_store_dir):
    os.makedirs(best_models_store_dir)

In [4]:
import logging
import numpy as np
import random
import torch
import transformers
from transformers import AutoTokenizer, DataCollatorWithPadding, EarlyStoppingCallback, set_seed, \
                        TrainingArguments, Trainer, AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
from datasets import concatenate_datasets, DatasetDict
from model import get_model, MODEL_NAME
from data import get_processed_dataset, change_labels
from utils import compute_metrics, save_labels_to_csv

In [5]:
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
set_seed(seed)
torch.backends.cudnn.enabled=False
torch.backends.cudnn.deterministic=True

In [6]:
#added logging

file_formatter = logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
                                       datefmt="%m/%d/%Y %H:%M:%S", )
file_handler = logging.FileHandler(
    os.path.join(logging_dir, f"log.{os.getpid()}.txt"))
file_handler.setFormatter(file_formatter)
logging.root.addHandler(file_handler)

In [7]:
if checkpoint_path:
    tokenizer = AutoTokenizer.from_pretrained(best_models_store_dir)
    model = get_model(custom_attention=custom_attention,
                      glue_task=task,
                      path=checkpoint_path)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = get_model(custom_attention=custom_attention,
                      glue_task=task)
dataset = get_processed_dataset(tokenizer, task, seed)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

train_data = dataset["train"]
if task == "mnli":
    val_data = concatenate_datasets([dataset["validation_matched"], dataset["validation_mismatched"]])
    test_data = DatasetDict({'matched': dataset["test_matched"].map(lambda x: change_labels(x)),
                            'mismatched': dataset["test_mismatched"].map(lambda x: change_labels(x))
    })
else:
    val_data = dataset["validation"]
    test_data = dataset["test"].map(lambda x: change_labels(x))

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight', 'classification_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
effective_batch = batch_size * num_gpu * grad_acc_steps
step_size = batch_size * grad_acc_steps
warmup_steps = len(train_data) * warmup_epochs // step_size
decay_steps = len(train_data) * lr_decay_epochs // step_size
cos_cycles = np.arccos(min_lr/ lr) / np.pi
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay, no_deprecation_warning=True)
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, warmup_steps, decay_steps, cos_cycles)

# Training

In [9]:
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir=logging_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=grad_acc_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=np.ceil(warmup_epochs + lr_decay_epochs),
    evaluation_strategy="steps",
    eval_steps=eval_per_steps,
    eval_accumulation_steps=eval_acc_steps,
    save_strategy="steps",
    save_steps=save_per_steps,
    save_total_limit=5,
    logging_strategy ="steps",
    logging_steps=log_per_steps,
    dataloader_num_workers=num_cpu,
    load_best_model_at_end=True,
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=no_progress_stop)]
)

In [10]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
1000,0.7448,0.631562,0.736092
2000,0.5955,0.518636,0.795389
3000,0.572,0.482763,0.810251
4000,0.5489,0.481939,0.815697
5000,0.5438,0.491543,0.817275
6000,0.5048,0.453921,0.830101
7000,0.4978,0.442716,0.832901
8000,0.4756,0.435667,0.835751
9000,0.5009,0.433115,0.838194
10000,0.399,0.455467,0.837227


TrainOutput(global_step=23000, training_loss=0.4633067027382229, metrics={'train_runtime': 2940.9591, 'train_samples_per_second': 534.114, 'train_steps_per_second': 13.353, 'total_flos': 5.528273905454573e+16, 'train_loss': 0.4633067027382229, 'epoch': 2.34})

In [11]:
trainer.save_model(best_models_store_dir)

# INFERENCE
### If not runned consequently with training rerun cells above training to initialize all required variables and import necessary libraries

In [9]:
tokenizer = AutoTokenizer.from_pretrained(best_models_store_dir)
model = get_model(custom_attention=custom_attention,
                  glue_task=task,
                  path=best_models_store_dir)

In [10]:
test_args = TrainingArguments(
    output_dir = "evals",
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = batch_size,
    eval_accumulation_steps = eval_acc_steps,
    dataloader_drop_last = False,
    report_to="none"
)

trainer = Trainer(
    model = model, 
    args = test_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics = compute_metrics)

In [11]:
#added at the last moment since huggingface test mnli dataset not equal to real mnli test

import pandas as pd    
from datasets import Dataset
from data import tokenize

drop_columns=["annotator_labels", "genre", "gold_label", "promptID", "sentence1_binary_parse",
              "sentence1_parse", "sentence2_binary_parse", "sentence2_parse"]
rename_map = {"pairID": "idx", "sentence1": "premise", "sentence2": "hypothesis"}

def get_test_data_mnli(matched=True):
    if matched:
        f_name = 'multinli_0.9_test_matched_unlabeled.jsonl'
    else:
        f_name = 'multinli_0.9_test_mismatched_unlabeled.jsonl'
        
    with open(f_name, 'r') as f:
        df = pd.read_json(f, lines=True)
    df = df.drop(drop_columns, axis=1)
    df = df.rename(columns=rename_map)
    df["label"] = 0
    data_dict = df.to_dict(orient='list')
    dataset = Dataset.from_dict(data_dict)
    tokenized_dataset = dataset.map(lambda x: tokenize("mnli", tokenizer, x), batched=True)
    return tokenized_dataset

In [12]:
mapping = {ix: val for ix, val in enumerate(dataset["train"].features["label"]._int2str)}

if task == "mnli":
    test_data_matched = get_test_data_mnli(matched=True)
    raw_matched = trainer.predict(test_data_matched)
    predictions_matched = np.argmax(raw_matched[0][0], axis=1)
    idx_matched = []
    for example in test_data_matched:
        idx_matched.append(example["idx"])
    idx_matched = np.array(idx_matched)
    test_data_mismatched = get_test_data_mnli(matched=False)
    raw_mismatched = trainer.predict(test_data_mismatched)
    predictions_mismatched = np.argmax(raw_mismatched[0][0], axis=1)
    idx_mismatched = []
    for example in test_data_mismatched:
        idx_mismatched.append(example["idx"])
    idx_mismatched = np.array(idx_mismatched)
    sps = (raw_matched[2]['test_samples_per_second'] + raw_mismatched[2]['test_samples_per_second']) / 2
    save_labels_to_csv(idx_matched, predictions_matched, mapping, experiment_name + "_matched.csv")
    save_labels_to_csv(idx_mismatched, predictions_mismatched, mapping, experiment_name + "_mismatched.csv")
else:
    raw_predicts = trainer.predict(test_data)
    predictions = np.argmax(raw_predicts[0][0], axis=1)
    idx = []
    for example in test_data:
        idx.append(example["idx"])
    sps = raw_predicts[2]['test_samples_per_second']
    save_labels_to_csv(idx, predictions, mapping, experiment_name + ".csv")

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [13]:
# samples per second
print(sps)

449.29650000000004
