In [None]:
# Install/upgrade the third-party packages this notebook depends on into the kernel's environment.
# NOTE(review): versions are unpinned, so a later run may pick up different releases — consider pinning.
%pip install --upgrade transformers Keras-Preprocessing wandb pytorch-lightning sacremoses sentencepiece

In [None]:
import wandb
import os
# Startup wait (in seconds) for the W&B background service — presumably raised for slow environments.
os.environ["WANDB__SERVICE_WAIT"] = "300"

# SECURITY: never hard-code the API key in the notebook. It is read from the
# WANDB_API_KEY environment variable instead; when that is unset, `key=None`
# lets wandb fall back to its own credential lookup (stored login or prompt).
# NOTE(review): the key that used to be committed on this line is exposed in
# history and should be revoked and rotated.
key = os.environ.get("WANDB_API_KEY")
wandb.login(key=key)

In [None]:
import os
import random
import pandas as pd
import numpy as np
import csv
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
import textwrap
import progressbar
import keras
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import json
from sklearn.metrics import (accuracy_score, f1_score, recall_score, precision_score, confusion_matrix)

In [None]:
# Read the ILDC corpus once, then slice it by the value of its `split` column.
df = pd.read_csv('ILDC.csv')  # path to multi_dataset
split_col = df['split']
train_set = df[split_col == 'train']
test_set = df[split_col == 'test']
validation_set = df[split_col == 'dev']

In [None]:
# Registry of supported model families. Each entry is a
# (sequence-classification model class, tokenizer class, config class) triple;
# the entry chosen by `model_type` is unpacked into model_class /
# tokenizer_class / config_class below.
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig
from transformers import XLMRobertaForSequenceClassification
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    # The 'xlm' entry is used with the XLM-RoBERTa tokenizer/config and the
    # 'xlm-roberta-large' checkpoint, so the model class must be the
    # XLM-RoBERTa one. XLMForSequenceClassification is a different architecture
    # and cannot load these weights correctly. The key itself is kept as 'xlm'
    # because it is used in run names and checkpoint paths.
    'xlm': (XLMRobertaForSequenceClassification, XLMRobertaTokenizer, XLMRobertaConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)}

model_type = 'xlm' ###--> CHANGE WHAT MODEL YOU WANT HERE!!! <--###
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]
model_name = 'xlm-roberta-large'

In [None]:
# Load the pretrained tokenizer for `model_name` using the tokenizer class selected from MODEL_CLASSES.
tokenizer = tokenizer_class.from_pretrained(model_name)

In [None]:
def input_id_maker(dataf, tokenizer, max_len=512):
  """Tokenize ``dataf['text']`` into fixed-length rows of token ids.

  Each document is tokenized, cut down to its LAST ``max_len - 2`` tokens when
  it is longer than that, wrapped in the tokenizer's CLS/SEP tokens, converted
  to ids and finally right-padded with id 0 to ``max_len``.

  Args:
    dataf: DataFrame with a ``text`` column of strings.
    tokenizer: tokenizer exposing ``tokenize``, ``convert_tokens_to_ids``,
      ``cls_token`` and ``sep_token``.
    max_len: total row length including CLS and SEP (default 512, the value
      that used to be hard-coded).

  Returns:
    ``(input_ids, lengths)``: the padded id matrix produced by
    ``pad_sequences`` (one row per document, ``max_len`` columns) and a list
    with the unpadded length of every row.

  Raises:
    ValueError: if ``max_len`` cannot hold the two special tokens.
  """
  if max_len < 2:
    raise ValueError("max_len must be at least 2 to hold the CLS and SEP tokens")

  # Loop invariants: the special tokens and the content budget between them.
  cls_token = tokenizer.cls_token
  sep_token = tokenizer.sep_token
  body_limit = max_len - 2

  input_ids = []
  lengths = []

  texts = dataf['text']
  for i in progressbar.progressbar(range(len(texts))):
    tokens = tokenizer.tokenize(texts.iloc[i])
    if len(tokens) > body_limit:
      # Keep the tail of the document, dropping tokens from the front.
      tokens = tokens[len(tokens) - body_limit:]

    encoded_sent = tokenizer.convert_tokens_to_ids([cls_token] + tokens + [sep_token])
    input_ids.append(encoded_sent)
    lengths.append(len(encoded_sent))

  # Rows are already <= max_len, so only the post-padding takes effect here.
  # NOTE: padding uses id 0 (not tokenizer.pad_token_id); attention masks must
  # be derived with that in mind.
  input_ids = pad_sequences(input_ids, maxlen=max_len, value=0, dtype="long", truncating="pre", padding="post")
  return input_ids, lengths

In [None]:
# Tokenize the training and validation splits: padded id matrices plus per-document unpadded lengths.
train_input_ids, train_lengths = input_id_maker(train_set, tokenizer)
validation_input_ids, validation_lengths = input_id_maker(validation_set, tokenizer)

In [None]:
def att_masking(input_ids, pad_token_id=0):
  """Build attention masks (1 = real token, 0 = padding) for post-padded rows.

  The mask is 1 for every position up to and including the last token that
  differs from ``pad_token_id`` and 0 for the trailing padding after it.
  Testing ``token_id > 0`` per position is not enough: some vocabularies give
  a real token id 0 (e.g. the leading ``<s>``/CLS token of XLM-RoBERTa), and
  that token would be masked out.

  Args:
    input_ids: iterable of equal- or variable-length id sequences, padded at
      the END with ``pad_token_id``.
    pad_token_id: id used for padding (default 0, matching the padding value
      used when the id matrix is built).

  Returns:
    A list with one list of 0/1 ints per input row, same length as the row.
  """
  attention_masks = []
  for sent in input_ids:
    # Index just past the last non-padding token; 0 for an all-padding row.
    n_real = 0
    for pos, token_id in enumerate(sent):
      if token_id != pad_token_id:
        n_real = pos + 1
    attention_masks.append([1] * n_real + [0] * (len(sent) - n_real))
  return attention_masks

In [None]:
# Attention masks for both splits, derived from the padded id matrices.
train_attention_masks = att_masking(train_input_ids)
validation_attention_masks = att_masking(validation_input_ids)

# Class labels as integer NumPy arrays, in the same row order as the corresponding split.
train_labels = train_set['label'].to_numpy().astype('int')
validation_labels = validation_set['label'].to_numpy().astype('int')

In [None]:
# Convert ids, attention masks and labels of each split into torch tensors.
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)

validation_inputs = torch.tensor(validation_input_ids)
validation_masks = torch.tensor(validation_attention_masks)
validation_labels = torch.tensor(validation_labels)

In [None]:
# Mini-batch size; also recorded in the W&B run name/config and in the checkpoint folder name.
batch_size = 8

In [None]:
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Let the checkpoint's own config pick the architecture. `xlm-roberta-large`
# is an XLM-RoBERTa checkpoint; loading it through XLMForSequenceClassification
# (the unrelated XLM architecture) does not load the pretrained weights
# correctly. Using `model_name` also keeps the model in sync with the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

In [None]:
def _load_or_build_dataloader(path, inputs, masks, labels, shuffle):
    """Return the DataLoader stored at `path`, or build one from the tensors.

    Batches are (input_ids, attention_mask, labels) triples, the order the
    training/validation loops unpack. `shuffle` selects a RandomSampler
    (training) or a SequentialSampler (validation) for a freshly built loader.
    """
    if os.path.exists(path):
        # A pickled DataLoader is not a plain weights file, so it needs
        # weights_only=False. This unpickles arbitrary Python objects —
        # only load files you created yourself.
        return torch.load(path, weights_only=False)
    dataset = TensorDataset(inputs, masks, labels)
    sampler = RandomSampler(dataset) if shuffle else SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)


# File names follow the configured model and batch size instead of hard-coded
# values, so changing either cannot silently load a mismatching loader. When no
# cached file exists the loaders are built from the tensors prepared above,
# which keeps the notebook runnable from a fresh checkout.
train_dataloader = _load_or_build_dataloader(
    f"train_{model_name}_dataloader_batch_size_{batch_size}.pt",
    train_inputs, train_masks, train_labels, shuffle=True)
validation_dataloader = _load_or_build_dataloader(
    f"validation_{model_name}_dataloader_batch_size_{batch_size}.pt",
    validation_inputs, validation_masks, validation_labels, shuffle=False)

In [None]:
# Optimisation hyper-parameters.
lr = 2e-6
max_grad_norm = 1.0  # maximum gradient norm for clipping
epochs = 15
num_total_steps = len(train_dataloader)*epochs
num_warmup_steps = 1000
# Share of all optimizer steps spent in warm-up. Informational only: the
# scheduler below takes the absolute step count. Guarded so an empty
# dataloader does not raise ZeroDivisionError.
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) if num_total_steps else 0.0

# torch.optim.AdamW replaces the deprecated transformers.AdamW. `eps` and
# `weight_decay` are set explicitly to the old class's defaults (1e-6 and 0.0)
# because torch's own defaults differ; bias correction is always applied by
# torch, matching the previous `correct_bias=True`.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# Linear warm-up for `num_warmup_steps`, then linear decay to 0 at `num_total_steps`.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_total_steps)

def flat_accuracy(preds, labels):
    """Return the fraction of examples whose arg-max class equals the label.

    preds:  array of per-class scores, one row per example (arg-max over axis 1).
    labels: NumPy array of class ids; flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

# Seed every random number generator in play so runs are repeatable:
# Python's `random`, NumPy, and torch on CPU and all CUDA devices.
# NOTE(review): this cell runs after the model and optimizer were created, so
# it cannot influence any random initialisation done while loading the model —
# consider seeding before the model is instantiated.
seed_val = 2212

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Fine-tune the model, evaluating on the validation set after every epoch.
# Per-epoch metrics are written to JSON files and logged to Weights & Biases;
# once all epochs finish, the model and tokenizer are saved with save_pretrained.
loss_values = []  # average training loss of every completed epoch

# Output locations are defined before the loop so the final save below also
# works when `epochs` is 0.
checkpoint_folder = f"./Transformers_GPU/{model_type}/{model_name}_L{lr}_E{epochs}_B{batch_size}"
epochs_folder = f"{checkpoint_folder}/epochs"
os.makedirs(epochs_folder, exist_ok=True)

wandb.init(
        project=f"{model_type}",
        name=f"{model_name}_L{lr}_B{batch_size}_E{epochs}",

        config={
        "architecture": model_name,
        "dataset": "ILDC",
        "learning_rate": lr,
        "epochs": epochs,
        "batch_size": batch_size,
        "token_length": 512,
        "model_name": model_name
        }
    )

try:
    # For each epoch...
    for epoch_i in range(0, epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # ---------------- training pass ----------------
        total_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}. '.format(step, len(train_dataloader)))

            # Batches are (input_ids, attention_mask, labels).
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            # With `labels` supplied, the first output is the loss.
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()

            # Clip with the configured threshold instead of a hard-coded 1.0.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            scheduler.step()

        # max(..., 1) avoids ZeroDivisionError on an empty training loader.
        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

        print("")
        print("Running Validation...")

        # ---------------- validation pass ----------------
        model.eval()

        batch_accuracies = []            # per-batch accuracy, for "flat accuracy"
        all_preds, all_labels = [], []   # collected over the WHOLE validation set

        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

            # Without `labels`, the first output holds the logits.
            logits = outputs[0].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            batch_accuracies.append(flat_accuracy(logits, label_ids))
            all_preds.append(np.argmax(logits, axis=1).flatten())
            all_labels.append(label_ids.flatten())

        if not all_labels:
            raise ValueError("validation_dataloader yielded no batches; cannot compute metrics")

        # Mean of the per-batch accuracies (same quantity as before).
        avg_eval_accuracy = sum(batch_accuracies) / len(batch_accuracies)

        # Report the final accuracy for this validation run.
        print("  Accuracy: {0:.2f}".format(avg_eval_accuracy))

        # Metrics over every validation example, not only the last batch.
        pred_flat = np.concatenate(all_preds)
        labels_flat = np.concatenate(all_labels)

        # scikit-learn metrics take (y_true, y_pred); passing them the other
        # way round exchanges precision and recall.
        macro_f1 = f1_score(labels_flat, pred_flat, average='macro')
        micro_f1 = f1_score(labels_flat, pred_flat, average='micro')
        accuracy = accuracy_score(labels_flat, pred_flat)
        precision = precision_score(labels_flat, pred_flat)
        recall = recall_score(labels_flat, pred_flat)
        confusion = confusion_matrix(labels_flat, pred_flat)
        epoch_metrics = {
            'epoch': epoch_i,
            'macro_f1': float(macro_f1),
            'micro_f1': float(micro_f1),
            "flat_accuracy": float(avg_eval_accuracy),
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            "Avg Train Loss": avg_train_loss,
            'confusion_matrix': confusion.tolist()
        }
        # Report the epoch's average loss rather than the last batch's loss tensor.
        print(f"epoch={epoch_i}, macro_f1: {macro_f1}, micro_f1: {micro_f1}, accuracy={accuracy}, precision: {precision}, recall: {recall}, loss={avg_train_loss}")

        with open(f'{epochs_folder}/epoch{epoch_i}_metrics.json', 'w') as json_file:
            json.dump(epoch_metrics, json_file, indent=4)

        print(f"epoch_{epoch_i}_metrics.json saved to {epochs_folder}\n")

        wandb.log({"Flat Accuracy":avg_eval_accuracy, "Accuracy":accuracy , "Macro_f1":macro_f1,"Micro_f1":micro_f1, "Precision":precision, "Recall":recall, "Avg Train Loss": avg_train_loss})

        print(f"epoch_{epoch_i} logging is done...\n")
except BaseException:
    # Close the W&B run as failed on errors and interrupts, then re-raise.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

print(f"Now Training for {model_name} is Completed.......\n")

print("Saving model to %s\n" % checkpoint_folder)

# Unwrap a DataParallel-style wrapper (which exposes the real model as
# `.module`) before saving.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(checkpoint_folder)
tokenizer.save_pretrained(checkpoint_folder)

print("Model Saved to %s\n" % checkpoint_folder)

In [None]:
# Additionally pickle the whole model object into the checkpoint folder, next to the `save_pretrained` files.
# NOTE(review): a pickled module can only be loaded where the same class definitions are importable — confirm this extra file is needed.
torch.save(model_to_save, f"{checkpoint_folder}/model.pt")