## Objective

Fine-tune BERT on word-level labels (3, 4, 5).

First, the data must be tokenized and the word-level labels aligned to the resulting subword tokens, as described [here](https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt#preparing-the-data)

Then, prepare an architecture for NER, as in BertForTokenClassification.
[original impl](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py#L1691)

In [None]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive (only works inside a Colab runtime).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
!pip install pytorch_lightning



In [None]:

import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import pytorch_lightning as pl
from torch.utils.data import DataLoader, TensorDataset
import torchmetrics
import math
from torch.optim.lr_scheduler import ReduceLROnPlateau
from pytorch_lightning.callbacks import EarlyStopping
from sklearn.metrics import f1_score, accuracy_score, classification_report
# NOTE(review): this module-level callback is not referenced by the cells shown here;
# run_experiment builds its own EarlyStopping instance for every run.
early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=True, mode='min')
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.decomposition import PCA

In [None]:
import numpy as np

### Custom Tokenization

1. Make all data into texts of split words and corresponding labels


In [None]:
def _load_fragments(csv_path):
  """Read an aligned-fragments CSV and keep only the rows whose '0' column holds a str."""
  frame = pd.read_csv(csv_path)
  # rows whose '0' cell is not a string (e.g. NaN from an empty cell) are dropped
  is_text = frame.apply(lambda row: type(row['0']) == str, axis = 1)
  return frame[is_text]


frag_train = _load_fragments('../data/all_train_aligned.csv')
frag_dev = _load_fragments('../data/all_dev_aligned.csv')
frag_test = _load_fragments('../data/all_test_aligned.csv')

In [None]:
frag_dev['0']

0                                      إبراهيم#3 الكاتب#3
1                      إبراهيم#3 عبد#3 القادر#3 المازني#3
2                                         الفصل#3 الأول#3
3                                           وكان#3 مساء#3
4       شوشو#3 فتاة#3 يقول#3 لك#3 جسمها#3 إنها#3 ناهزت...
                              ...                        
2962               فإلى#3 الغد#3 في#3 مثل#3 هذا#3 الوقت#3
2963                                     فوقفوا#3 جميعا#3
2964    وقالوا#3 بصوت#3 واحد#3 حفظ#3 الله#3 الملك#3 ال...
2965                                       ثم#3 انصرفوا#3
2966                           وفي#3 كل#3 نفس#3 هواجسها#4
Name: 0, Length: 2948, dtype: object

In [None]:
# Single source of truth for the checkpoint name: the tokenizer here and the
# encoder loaded later both come from the same identifier.
model_checkpoint = 'CAMeL-Lab/bert-base-arabic-camelbert-msa'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# we will tag all subwords with the label of the word

def tokenize_step1(data, label_offset = 3):
  """Split '<word>#<label>' fragments into parallel lists of words and class ids.

  Args:
    data: iterable of strings, each a space-separated sequence of
      '<word>#<label>' items (e.g. 'foo#3 bar#4').
    label_offset: value subtracted from every integer label so that the
      smallest label maps to class id 0 (default 3, i.e. 3/4/5 -> 0/1/2).

  Returns:
    (words, tags): two lists with one inner list per fragment; words holds the
    word strings and tags the shifted integer labels, in the same order.

  Raises:
    IndexError: if an item carries no '#<label>' suffix.
    ValueError: if the label part is not an integer.
  """
  words = []
  tags = []
  for frag in data:
    # Drop empty pieces produced by doubled/leading/trailing spaces, and split on
    # the LAST '#' only so a word that itself contains '#' keeps its full text.
    splits = [w.rsplit('#', 1) for w in frag.split(' ') if w]

    words.append([f[0] for f in splits])
    tags.append([int(f[1]) - label_offset for f in splits])

  return words, tags

def tokenize_unmarked_whole(data, max_length = 30):
  """Tokenize unlabeled fragments with the module-level `tokenizer`.

  Args:
    data: iterable of strings; each one is split on single spaces into words.
    max_length: every sequence is padded or truncated to this many tokens.

  Returns:
    The tokenizer output for the pre-split words.
  """
  words = [frag.split(' ') for frag in data]
  # padding='max_length' is the supported spelling of the deprecated pad_to_max_length=True
  tokenized_inputs = tokenizer(words, max_length = max_length, padding = 'max_length', truncation=True, is_split_into_words=True)

  return tokenized_inputs


def align_labels(labels, word_ids, ignore_index = -100):
  """Expand word-level labels to one label per token.

  Every token that belongs to a word receives that word's label. Tokens with
  no source word (word id None: special tokens and padding) receive
  `ignore_index`, so they are not scored as a real class. -100 is the default
  `ignore_index` of nn.CrossEntropyLoss and the value `benchmark` filters out.

  Args:
    labels: sequence of labels, indexable by word id.
    word_ids: per-token word ids; None marks a token without a source word.
    ignore_index: label assigned to tokens whose word id is None.

  Returns:
    List of labels with the same length as `word_ids`.
  """
  aligned_labels = []
  for word_id in word_ids:
    if word_id is None:
      aligned_labels.append(ignore_index)
    else:
      aligned_labels.append(labels[word_id])

  return aligned_labels

def tokenize_and_align_labels(fragments, labels_lists, max_length = 30):
  """Tokenize pre-split fragments and attach token-level labels.

  Args:
    fragments: list of word lists (one list per fragment).
    labels_lists: list of label lists, parallel to `fragments`.
    max_length: every sequence is padded or truncated to this many tokens.

  Returns:
    The tokenizer output with an extra "labels" entry holding, for each
    fragment, the result of `align_labels` on its word ids.
  """
  # padding='max_length' is the supported spelling of the deprecated pad_to_max_length=True
  tokenized_inputs = tokenizer(fragments, max_length = max_length, padding = 'max_length', truncation=True, is_split_into_words=True)
  new_labels = []
  for i, labels in enumerate(labels_lists):
    word_ids = tokenized_inputs.word_ids(i)
    new_labels.append(align_labels(labels, word_ids))

  tokenized_inputs["labels"] = new_labels
  return tokenized_inputs


In [None]:
### non marked only
def get_inference_dl(test):
  """Build an unshuffled DataLoader over unlabeled fragments.

  Returns:
    (loader, word_id_lists): the loader yields (input_ids, attention_mask)
    batches of size 32; word_id_lists holds the per-token word ids of each
    fragment, in input order.
  """
  encoded = tokenize_unmarked_whole(test)
  word_id_lists = [encoded.word_ids(row) for row in range(len(test))]

  dataset = TensorDataset(
      torch.tensor(encoded['input_ids']),
      torch.tensor(encoded['attention_mask']),
  )
  loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=2)

  return loader, word_id_lists


In [None]:
### apply undersampling??

def _labeled_tensor_dataset(encoded):
  """Pack input ids, attention mask and labels of one tokenized split into a TensorDataset."""
  return TensorDataset(
      torch.tensor(encoded['input_ids']),
      torch.tensor(encoded['attention_mask']),
      torch.tensor(encoded['labels']),
  )


def get_all_dataloaders(train, test, dev, batch_size = 32):
  """Tokenize the three labeled splits and wrap them in DataLoaders.

  Args:
    train, test, dev: iterables of '<word>#<label>' fragment strings.
      Note the parameter order: test comes before dev.
    batch_size: batch size used by all three loaders.

  Returns:
    (train_loader, dev_loader, test_loader, weights, all_ids) where
    - only train_loader is shuffled,
    - weights is a float tensor of 'balanced' class weights computed from the
      word-level training tags (before expansion to subword tokens),
    - all_ids holds the per-token word ids of every fragment in `test`.
  """
  train_words_step1, train_labels_step1 = tokenize_step1(train)
  test_words_step1, test_labels_step1 = tokenize_step1(test)
  dev_words_step1, dev_labels_step1 = tokenize_step1(dev)

  train_ds = tokenize_and_align_labels(train_words_step1, train_labels_step1)
  test_ds = tokenize_and_align_labels(test_words_step1, test_labels_step1)
  dev_ds = tokenize_and_align_labels(dev_words_step1, dev_labels_step1)

  all_ids = [test_ds.word_ids(i) for i in range(len(test))]

  train_data = _labeled_tensor_dataset(train_ds)
  dev_data = _labeled_tensor_dataset(dev_ds)
  test_data = _labeled_tensor_dataset(test_ds)

  # flatten the word-level tags once; used both for the class list and the counts
  flat_train_labels = np.concatenate(train_labels_step1)
  class_weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(flat_train_labels), y = flat_train_labels)
  weights = torch.tensor(class_weights,dtype=torch.float)

  train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
  dev_loader = DataLoader(dev_data, batch_size=batch_size, shuffle=False, num_workers=2)
  test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=2)

  return train_loader, dev_loader, test_loader, weights, all_ids



### Experimental setup

#### Model architecture (BertForTokenClassification)
Run two experiment variants: with the BERT encoder unfrozen and with it frozen.

In [None]:
class BertForTokenClassification(pl.LightningModule):
  """Encoder plus a linear per-token classification head, trained with cross-entropy.

  Args:
    bert: encoder module; calling it with (tokens, attention_mask=mask) must
      return an output whose element 0 is the per-token hidden states.
    lr: learning rate for AdamW.
    weighted: if True, use `weights` as per-class weights in the loss.
    weights: per-class weights (tensor or sequence); required when `weighted`.
    num_labels: number of output classes.

  Attributes:
    all_pred / all_gt: per-batch logits and labels collected during testing
      (detached, on CPU); cleared at the start of every test run.
    all_train_loss / all_dev_loss: detached per-step loss values.
  """
  def __init__(self, bert, lr = 1e-6, weighted = False, weights = None, num_labels = 3):
    super(BertForTokenClassification, self).__init__()
    self.bert = bert
    self.num_labels = num_labels
    # take the encoder width from its config when available; 768 is the previous fixed value
    hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
    self.tok_classifier = nn.Linear(hidden_size, num_labels)

    if weighted:
      if weights is None or len(weights) == 0:
        raise ValueError('weighted=True requires non-empty class weights')
      self.lossFn = nn.CrossEntropyLoss(weight = torch.as_tensor(weights, dtype = torch.float))
    else:
      self.lossFn = nn.CrossEntropyLoss()
    self.lr = lr

    self.all_pred = []
    self.all_gt = []
    self.all_train_loss = []
    self.all_dev_loss = []
    self.initialize_weights()

  def forward(self, tokens, mask):
    """Return per-token logits for a batch of token ids and its attention mask."""
    bert_output = self.bert(tokens, attention_mask = mask)
    sequence_output = bert_output[0]
    logits = self.tok_classifier(sequence_output)

    return logits

  def loss(self, logits, labels):
    """Cross-entropy over all token positions, flattened across the batch."""
    return self.lossFn(logits.reshape(-1, self.num_labels), labels.reshape(-1))

  def training_step(self, train, i):
    x, mask, y = train
    logits = self.forward(x, mask)
    loss = self.loss(logits, y)
    self.log('train_loss', loss)
    # keep only the value: storing the live tensor would retain every step's autograd graph
    self.all_train_loss.append(loss.detach())
    return loss

  def validation_step(self, val, i):
    x, mask, y = val
    logits = self.forward(x, mask)
    loss = self.loss(logits, y)
    self.log('val_loss', loss)
    self.all_dev_loss.append(loss.detach())
    return loss

  def on_test_start(self):
    # start each test run from empty accumulators so repeated runs do not pile up
    self.all_pred = []
    self.all_gt = []

  def test_step(self, test, i):
    x, mask, y = test
    logits = self(x, mask)

    # collect on CPU so the accumulated outputs do not occupy accelerator memory
    self.all_pred.append(logits.detach().cpu())
    self.all_gt.append(y.detach().cpu())

    loss = self.loss(logits, y)
    self.log('test_loss', loss)

  def predict_step(self, predict, i):
    x, mask = predict
    logits = self.forward(x, mask)
    return logits


  def initialize_weights(self):
    """Xavier-uniform weights and zero bias for the classification head."""
    nn.init.xavier_uniform_(self.tok_classifier.weight)
    nn.init.zeros_(self.tok_classifier.bias)


  def configure_optimizers(self):
      """AdamW plus a ReduceLROnPlateau scheduler driven by the logged 'val_loss'."""
      optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
      scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4)
      # The scheduler must be returned to take effect; previously it was built and discarded.
      # strict=False: if 'val_loss' is unavailable (no validation loader) only warn.
      return {
          'optimizer': optimizer,
          'lr_scheduler': {
              'scheduler': scheduler,
              'monitor': 'val_loss',
              'strict': False,
          },
      }


### Experimental protocol

In [None]:
from sklearn.metrics import classification_report

def train(model, train_loader, dev_loader, trainer):
  """Fit `model` by calling `trainer.fit` with `train_loader` and `dev_loader` passed positionally, in that order."""
  trainer.fit(model, train_loader, dev_loader)

def benchmark(model, test_loader, trainer, freeze = False, weighted = False):
  """Run the test loop and compute token-level classification metrics.

  Args:
    model: module exposing `all_pred` / `all_gt` lists that its test step fills
      with per-batch logits and labels.
    test_loader: DataLoader passed to `trainer.test`.
    trainer: object providing `test(model, loader)`.
    freeze, weighted: accepted for call compatibility; not used here.

  Returns:
    dict with
      'pred'   - list of per-batch logit tensors (detached, CPU),
      'apt'    - all logits concatenated into one numpy array,
      'agt'    - all labels concatenated into one numpy array,
      'labels' - predicted class per position whose label is not -100,
      'gt'     - true class for those same positions,
      'report' - sklearn classification_report as a dict.
  """
  # clear leftovers from any earlier test run so they are not counted twice
  model.all_pred = []
  model.all_gt = []
  trainer.test(model, test_loader)

  batch_logits = [x.detach().cpu() for x in model.all_pred]
  all_pred_test = torch.cat(batch_logits).numpy()
  all_gt_test = torch.cat([x.detach().cpu() for x in model.all_gt]).numpy()
  pred_labels = np.argmax(all_pred_test, axis=2)

  # boolean mask over (sequence, position); row-major order matches a nested loop
  scored = all_gt_test != -100
  all_labs = list(pred_labels[scored])
  all_gts = list(all_gt_test[scored])

  return {
      'pred': batch_logits,
      'apt': all_pred_test,
      'agt': all_gt_test,
      'labels': all_labs,
      'gt': all_gts,
      'report': classification_report(all_gts, all_labs, output_dict = True)
  }

def run_experiment(model, train_set, test_set, dev_set, freeze = False, weighted = False, n = 0):
  """Load an encoder, fine-tune a token classifier on it and benchmark the result.

  Args:
    model: checkpoint name/path given to AutoModel.from_pretrained.
    train_set: fragments used for training.
    test_set: fragments used for the final benchmark.
    dev_set: fragments used for validation (drives early stopping).
    freeze: if True, the encoder parameters are excluded from gradient updates.
    weighted: if True, train with class-weighted cross-entropy.
    n: not used inside this function.

  Returns:
    (benchmark_result, trained_module, test_word_ids) where test_word_ids are
    the per-token word ids of `test_set`.
  """
  print('Importing model and tokenizer...')
  bert_model = AutoModel.from_pretrained(model)
  # fall back to CPU instead of crashing on runtimes without a GPU
  use_gpu = torch.cuda.is_available()
  device = torch.device("cuda" if use_gpu else "cpu")
  bert_model = bert_model.to(device)

  if freeze:
    for param in bert_model.parameters():
      param.requires_grad = False

  print('Setting up data...')
  train_dl, dev_dl, test_dl, weights, test_word_ids = get_all_dataloaders(train_set, test_set, dev_set)
  # a fresh callback per run: EarlyStopping keeps state (best score, wait count)
  early_stopping = EarlyStopping(monitor='val_loss', patience=8, verbose=True, mode='min')

  print('Setting up architecture...')

  arch = BertForTokenClassification(bert_model, 5e-5, weighted = weighted, weights = weights)
  trainer = pl.Trainer(callbacks=[early_stopping], accelerator="gpu" if use_gpu else "cpu", max_epochs = 10)

  print('Training start')
  train(arch, train_dl, dev_dl, trainer)
  return benchmark(arch, test_dl, trainer, freeze = freeze, weighted = weighted), arch, test_word_ids





In [None]:
# Two fine-tuning runs that differ only in whether the loss is class-weighted.
# NOTE(review): frag_dev is passed as test_set (the split that gets benchmarked) and
# frag_test as dev_set (the split used for validation) — confirm this is intended.
shared_run_kwargs = dict(
    train_set = frag_train['0'],
    test_set = frag_dev['0'],
    dev_set = frag_test['0'],
    freeze = False,
)
res_1, model_1, dev_ids = run_experiment(model_checkpoint, weighted = False, n = 1, **shared_run_kwargs)
res_2, model_2, _ = run_experiment(model_checkpoint, weighted = True, n = 2, **shared_run_kwargs)


Importing model and tokenizer...
Setting up data...


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type             | Params
----------------------------------------------------
0 | bert           | BertModel        | 109 M 
1 | tok_classifier | Linear           | 2.3 K 
2 | lossFn         | CrossEntropyLoss | 0     
----------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
436.335   Total estimated model params size (MB)


Setting up architecture...
Training start


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 0.099


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_loss did not improve in the last 8 records. Best score: 0.099. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

Importing model and tokenizer...
Setting up data...


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name           | Type             | Params
----------------------------------------------------
0 | bert           | BertModel        | 109 M 
1 | tok_classifier | Linear           | 2.3 K 
2 | lossFn         | CrossEntropyLoss | 0     
----------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
436.335   Total estimated model params size (MB)


Setting up architecture...
Training start


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 0.386


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_loss did not improve in the last 8 records. Best score: 0.386. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

In [None]:
import pickle
# Results of both runs, in run order: unweighted loss first, class-weighted loss second.
all_res = [res_1, res_2]
def results_to_csv(result_arr):
  """Flatten each run's classification report into one numeric row.

  Row layout: (f1-score, precision, recall) for classes '0', '1' and '2', in
  that order, followed by the overall accuracy and the macro-averaged F1.
  Despite the name, nothing is written to disk.

  Args:
    result_arr: iterable of dicts, each with a 'report' entry in the
      dict format of sklearn's classification_report.

  Returns:
    List with one numpy array per run.
  """
  class_keys = ('0', '1', '2')
  metric_keys = ('f1-score', 'precision', 'recall')

  def _flatten(report):
    values = [report[cls][metric] for cls in class_keys for metric in metric_keys]
    values.append(report['accuracy'])
    values.append(report['macro avg']['f1-score'])
    return np.array(values)

  return [_flatten(run['report']) for run in result_arr]

In [None]:
# Column names in the order results_to_csv emits the values: per-class
# f1 / precision / recall for labels 3, 4 and 5, then accuracy and macro F1.
RESULT_COLUMNS = [
    'f1_3', '3_prec', '3_recall',
    'f1_4', '4_prec', '4_recall',
    'f1_5', '5_prec', '5_recall',
    'accuracy', 'f1_macro',
]

all_rows = results_to_csv(all_res)
df_results = pd.DataFrame(all_rows, columns = RESULT_COLUMNS)


Results on the Dev Set - Word Classification

In [None]:
df_results

Unnamed: 0,f1_3,3_prec,3_recall,f1_4,4_prec,4_recall,f1_5,5_prec,5_recall,accuracy,f1_macro
0,0.985368,0.983631,0.987112,0.673475,0.69897,0.649775,0.613532,0.619951,0.607245,0.964552,0.757459
1,0.983743,0.984785,0.982703,0.653369,0.6875,0.622466,0.614404,0.554235,0.689228,0.961273,0.750505


### Get word-wise results instead of subword-wise

In [None]:
# Collapse subword predictions to one decision per word: a word takes the
# highest class id predicted for any of its subword tokens.
res_wordwise_decisions = []
for res in all_res:
  token_preds = np.argmax(res['apt'], axis=2)
  frags = []
  for subwords, ids in zip(token_preds, dev_ids):
    # number of words that have at least one token in this sequence
    n_words = max(wid for wid in ids if wid is not None) + 1
    w = [0] * n_words
    for x, wid in zip(subwords, ids):
      if wid is None:
        # token without a source word: nothing to vote on
        continue
      w[wid] = max(w[wid], x)
    frags.append(w)
  res_wordwise_decisions.append(frags)


### save wordwise decisions
from pathlib import Path

# Create the output directory first so the dump does not fail when it is missing.
wordwise_path = Path('pickled_results/res_wordwise_decisions.pkl')
wordwise_path.parent.mkdir(parents = True, exist_ok = True)
with open(wordwise_path, 'wb') as f:
  pickle.dump(res_wordwise_decisions, f)

