In [1]:
#!pip install "adapter-transformers@git+https://github.com/akufeldt/adapter-transformers.git@debug#egg=adapter-transformers&subdirectory=adapter-transformers"

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn import Softmax

from typing import List, Optional, Tuple, Union, Dict, Any

from datasets import load_dataset, Dataset, DatasetDict, load_metric, load_from_disk
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers import PreTrainedModel, TrainingArguments, Trainer
from transformers.adapters import AdapterTrainer

import pandas as pd
import numpy as np
import evaluate

import random
import math
import time
from tqdm import tqdm
import os
import json

In [3]:
from datasets import concatenate_datasets

In [4]:
seed = 42
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
_numpy_rng = np.random.default_rng(seed)
random.seed(seed)
np.random.seed(seed)
torch.use_deterministic_algorithms(False)
os.environ['PYTHONHASHSEED'] = str(seed)

In [5]:
os.environ["WANDB_DISABLED"] = "true"

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load in model

In [7]:
model_name = 'm2m100_418M'
experiment = 'en-ha_finetune_base_model-1'
dataset_name = 'data/en-ha'

In [8]:
model = M2M100ForConditionalGeneration.from_pretrained(f"facebook/{model_name}")
# model = torch.nn.DataParallel(model, device_ids=[2, 3, 4])
model = model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"facebook/{model_name}")

# Prepare data

In [12]:
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [13]:
dataset = DatasetDict({'train':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_train.csv')).shuffle(seed=seed),
                        'validation':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/cleaned_dev.csv')).shuffle(seed=seed),
                        'test':Dataset.from_pandas(pd.read_csv(f'{dataset_name}/test.csv')).shuffle(seed=seed),
})

In [14]:
dataset['test'] = dataset['test'].rename_column('sentence_eng_Latn','en')
dataset['test'] = dataset['test'].rename_column('sentence_hau_Latn','ha')

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ha'],
        num_rows: 9818
    })
    validation: Dataset({
        features: ['en', 'ha'],
        num_rows: 1113
    })
    test: Dataset({
        features: ['en', 'ha'],
        num_rows: 1012
    })
})

In [16]:
def preprocess_function(examples):
    inputs = [example for example in examples[src_lang]]
    targets = [example for example in examples[tgt_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")
    return model_inputs

In [17]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names['train'])

Map:   0%|          | 0/9818 [00:00<?, ? examples/s]

Map:   0%|          | 0/1113 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [18]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9818
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1113
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1012
    })
})

# Training Setup

In [19]:
sacrebleu = evaluate.load("sacrebleu")
wer = evaluate.load("wer")

In [20]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    labels = eval_preds.label_ids
    pred_ids = eval_preds.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    
    preds = np.argmax(pred_ids, axis=-1)

    # removeme
    #import warnings
    #warnings.warn(f"unprocessed preds: {preds[0]}\n)")
    
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 

    # removeme
    #warnings.warn(f"unprocessed decoded labels: {tokenizer.batch_decode(labels)[0]}\n)")

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # remove me
    #inputs = eval_preds.input_ids
    #decoded_inputs = tokenizer.batch_decode(inputs)
    
    # Removeme
    import warnings
    warnings.warn(f"preds: {decoded_preds[0]}\n)")
    warnings.warn(f"labels: {decoded_labels[0]}\n)")

    bleu_result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    metrics = {"bleu": bleu_result["score"]}

    flattened_decoded_labels = [' '.join([str(x) for x in l]) for l in decoded_labels]
    wer_score = wer.compute(predictions=decoded_preds, references=flattened_decoded_labels)
    metrics["wer"] = wer_score

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    metrics["gen_len"] = np.mean(prediction_lens)
    metrics = {k: round(v, 4) for k, v in metrics.items()}
    return metrics


In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [23]:
training_args = TrainingArguments(
    f"./lang_adapters/{experiment}/model",
    # evaluation_strategy="steps",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=15,
    warmup_steps=0,
    # lr_scheduler_type='cosine_with_restarts',
    # gradient_accumulation_steps=4,
    eval_accumulation_steps=16,
    # gradient_checkpointing=True,
    # predict_with_generate=True,
    fp16=True,
    do_train=True,
    do_eval=True,
    logging_steps=5,
    # eval_steps=5,
    save_strategy="epoch",
    metric_for_best_model="bleu",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using cuda_amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 9818
  Num Epochs = 15
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2310
  Number of trainable parameters = 4757760


Epoch,Training Loss,Validation Loss,Bleu,Wer,Gen Len
1,10.9566,10.893058,4.1028,0.9942,256.0
2,10.5878,10.351295,4.1008,0.9934,256.0
3,9.8436,9.64674,3.9236,0.9908,256.0
4,9.1804,8.45971,2.9432,0.9914,256.0
5,7.7179,7.466567,1.0299,1.3373,256.0
6,7.2357,7.052766,0.6462,1.7914,256.0
7,7.0111,6.922141,0.5818,1.9943,256.0
8,7.0047,6.872288,0.5721,2.0267,256.0


***** Running Evaluation *****
  Num examples = 1113
  Batch size = 16
)
)
Saving model checkpoint to ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-154
Configuration saved in ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-154/enc_en/adapter_config.json
Module weights saved in ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-154/enc_en/pytorch_adapter.bin
Configuration saved in ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-154/enc_en/head_config.json
Module weights saved in ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-154/enc_en/pytorch_model_head.bin
Configuration saved in ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-154/dec_ha/adapter_config.json
Module weights saved in ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-154/dec_ha/pytorch_adapter.bin
Configuration saved in ./lang_adapters/en-ha_adapters_new_wd01_lr5e-6_e15/model/checkpoint-

In [None]:
# Save model
if not os.path.exists(f'./base_model/{experiment}'):
    os.mkdir(f'./base_model/{experiment}')
    
trainer.save_model(f"./base_model/{experiment}")

# Eval finetuned model on test set

In [None]:
# Evaluate performance
src_lang = 'en'
tgt_lang = 'ha'
tokenizer.src_lang = "en"
tokenizer.tgt_lang = "ha"

In [None]:
model = M2M100ForConditionalGeneration.from_pretrained(f"./base_model/{experiment}")
# model = torch.nn.DataParallel(model, device_ids=[2, 3, 4])
model = model.to(device)
tokenizer = M2M100Tokenizer.from_pretrained(f"./base_model/{experiment}")

In [None]:
test_outputs = trainer.evaluate(tokenized_dataset['test']) #, forced_bos_token_id=tokenizer.get_lang_id("ha")
#test_output_texts = tokenizer.batch_decode(torch.LongTensor(test_outputs.predictions), skip_special_tokens=True)

In [None]:
test_outputs