<a href="https://colab.research.google.com/github/DatNguyen2084/DLDH-Metaphor-detection/blob/main/MBERT_Token_classification_VUA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intermediate-task fine-tuning of mBERT for metaphor detection
Here we apply the principle of transfer learning: we first fine-tune the base mBERT model on an intermediate task using the VUA dataset. The resulting model is then used for the final task on the KontextBruch dataset.

## Install transformers and import packages

In [None]:
!pip install -q sentence_transformers
!pip install -q datasets
!pip install seqeval



In [None]:
# Start from a clean slate: collect unreachable Python objects first, then
# ask PyTorch to empty its CUDA cache.
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [None]:
import pandas as pd
import os
import os.path
import numpy as np
import torch
from sklearn.model_selection import train_test_split

## Utils

### Mount to data folder

In [None]:
# Mount Google Drive at /content/drive so the paths configured below resolve.
# The following data is needed: https://drive.google.com/drive/folders/1uPnLexQh8kbV5ErVR7ksagVKP_wDd4a0?usp=sharing
# Add a shortcut to that shared folder in your own Drive
# ("Add shortcut to Drive" -> "My Drive"; in the German UI:
# "Drive-Verknüpfung hinzufügen" -> "Meine Ablage").
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project folder on the mounted Drive.
ROOT_PATH = '/content/drive/My Drive/DLDH'
# Data sub-folder; used as ROOT_PATH + DATA_PATH when reading the VUA CSV.
DATA_PATH = '/data'
# Presumably the sub-folder for saved models — not referenced in the cells shown here.
MODEL_PATH = '/model'
# Folder holding the MIP annotations — not referenced in the cells shown here.
MIP_PATH = '/content/drive/My Drive/Annotationen - MIP - 11 Datensätze'

### Load MBERT and Tokenizer

In [None]:
from transformers import BertTokenizerFast, BertForTokenClassification, AutoTokenizer

# Tokenizer of the 'bert-base-multilingual-uncased' checkpoint — the same
# checkpoint name the model cell further down loads.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

## VUA Intermediate-task Fine-tuning

### Convert VUA data set for token classification

In [None]:
def transform_data(csv_path=None):
  """Collapse the token-level VUA training CSV into one row per sentence.

  The CSV must provide the columns ``sentence`` (grouping key), ``word`` and
  ``label``. All rows sharing the same ``sentence`` value are merged, in file
  order, into a single space-joined string plus a list of their labels.

  Args:
    csv_path: Optional path of the CSV file. Defaults to
      ROOT_PATH + DATA_PATH + '/VUA/VUA_train_features2.csv'.

  Returns:
    A DataFrame with the columns ``sentence`` (words joined by single spaces)
    and ``labels`` (list of word labels wrapped in a leading and a trailing
    -100, the placeholders later lined up with the tokenizer's special
    tokens). Sentences appear in order of first occurrence in the CSV.
  """
  if csv_path is None:
    csv_path = ROOT_PATH + DATA_PATH + '/VUA/VUA_train_features2.csv'
  vua_df = pd.read_csv(csv_path, sep=',', header=0)

  sentences_array = []
  labels_array = []
  # A single groupby pass replaces the former per-sentence boolean scan and
  # the label-based lookups through .iloc, which were only correct for a
  # default RangeIndex. sort=False keeps first-occurrence order.
  for _, token_rows in vua_df.groupby('sentence', sort=False):
    sentences_array.append(' '.join(token_rows['word']))
    labels_array.append([-100] + token_rows['label'].tolist() + [-100])

  bert_vua_df = pd.DataFrame({'sentence': sentences_array, 'labels': labels_array})
  return bert_vua_df

In [None]:
# Build the sentence-level frame and preview its first ten rows.
bert_vua_df = transform_data()
bert_vua_df.head(10)

Unnamed: 0,sentence,labels
0,Latest corporate unbundler reveals laid-back a...,"[-100, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, ..."
1,By FRANK KANE,"[-100, 0, 0, 0, -100]"
2,IT SEEMS that Roland Franklin the latest unbun...,"[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,He has not properly investigated the target 's...,"[-100, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -100]"
4,The 63-year-old head of Pembridge Investments ...,"[-100, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, ..."
5,If he had taken his own rule seriously he woul...,"[-100, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,There are other things he has on his own admis...,"[-100, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, ..."
7,When the bid was launched last week Mr Frankli...,"[-100, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ..."
8,He regards the charges as unfounded,"[-100, 0, 1, 0, 1, 0, 0, -100]"
9,On property he is blunt,"[-100, 1, 0, 0, 0, 1, -100]"


### Define Dataset

In [None]:
class VUADataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-token labels.

    Args:
        encodings: Mapping from feature name (must include 'input_ids') to a
            per-example sequence of values, e.g. the tokenizer output.
        labels: Per-example label lists. Position i of a label list is paired
            with position i of 'input_ids'; no sub-word alignment is done
            here, so callers must supply labels in token order.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as tensors, plus 'labels'.

        The label list is padded with -100 (the value treated as "ignore"
        throughout this notebook) or cut so that it has exactly as many
        entries as 'input_ids'.
        """
        item = {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items()
        }
        seq_len = len(item['input_ids'])
        # Copy before padding: appending to the stored list would silently
        # modify the caller's data (e.g. the DataFrame the labels came from).
        token_labels = list(self.labels[idx])[:seq_len]
        token_labels.extend([-100] * (seq_len - len(token_labels)))
        item["labels"] = torch.tensor(token_labels)

        return item

    def __len__(self):
        return len(self.labels)

### Compute metrics

In [None]:
from datasets import load_metric
from sklearn.metrics import accuracy_score, f1_score
# Load the "seqeval" metric through the `datasets` library.
metric = load_metric("seqeval")
# Tag names by class index; presumably 'N' = non-metaphor (0) and
# 'M' = metaphor (1) — TODO confirm.
label_names = ['N', 'M']
def compute_metrics(model_outputs):
    """Token-level precision, recall, F1 and accuracy for the binary task.

    The previous implementation passed the integer class ids to seqeval,
    which scores IOB-style entity chunks; no chunks are found in plain 0/1
    labels, so precision, recall and F1 were always reported as 0.0. The
    scores are now computed directly per token, with class 1 as the
    positive class.

    Args:
        model_outputs: Pair ``(logits, labels)``; ``logits`` has the class
            scores on its last axis, ``labels`` has the same shape as
            ``logits`` without that axis. Positions labelled -100 are
            ignored.

    Returns:
        Dict with the float entries "precision", "recall", "f1" and
        "accuracy". A score whose denominator is zero is reported as 0.0.
    """
    logits, labels = model_outputs
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Drop padding / special-token positions before scoring.
    keep = labels != -100
    y_pred = predictions[keep]
    y_true = labels[keep]

    true_pos = int(np.sum((y_pred == 1) & (y_true == 1)))
    false_pos = int(np.sum((y_pred == 1) & (y_true != 1)))
    false_neg = int(np.sum((y_pred != 1) & (y_true == 1)))

    predicted_pos = true_pos + false_pos
    actual_pos = true_pos + false_neg
    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / actual_pos if actual_pos else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall) else 0.0
    )
    accuracy = float(np.mean(y_pred == y_true)) if y_true.size else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

### Load MBERT and train model

In [None]:
# Multilingual uncased BERT with a token-classification head for two classes.
# The id2label / label2id mappings are left commented out, so the model config
# keeps its default label names.
model = BertForTokenClassification.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=2,
        #id2label=id2tag,
        #label2id=tag2id,
    )

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint a

In [None]:
from transformers import EarlyStoppingCallback, TrainingArguments, Trainer

# Upper bound on the tokenised sequence length; longer inputs are truncated.
max_length = 512
train_df, testdf = train_test_split(bert_vua_df, test_size=0.2, random_state=32)


def encode_with_aligned_labels(df):
    """Tokenise the sentences of ``df`` and align word labels to word pieces.

    The labels in ``df['labels']`` are one per whitespace-separated word
    (wrapped in a leading and trailing -100), whereas the tokenizer may split
    a word into several pieces. Attaching the word labels positionally would
    therefore shift them against the tokens. Here every word's label is put
    on its first word piece; all other positions (special tokens, padding,
    continuation pieces) get -100.

    Returns:
        ``(encodings, labels)`` where ``labels`` holds one list per sentence
        with exactly as many entries as the sentence's ``input_ids``.
    """
    # The sentences were built by joining the words with single spaces, so
    # splitting on ' ' recovers the original word list.
    word_lists = [sentence.split(' ') for sentence in df['sentence']]
    encodings = tokenizer(
        word_lists,
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    aligned_labels = []
    for row, padded_labels in enumerate(df['labels']):
        word_labels = padded_labels[1:-1]  # strip the two -100 placeholders
        previous_word = None
        token_labels = []
        for word_id in encodings.word_ids(batch_index=row):
            if word_id is None or word_id == previous_word:
                token_labels.append(-100)
            else:
                token_labels.append(int(word_labels[word_id]))
            previous_word = word_id
        aligned_labels.append(token_labels)
    return encodings, aligned_labels


train_encodings, train_labels = encode_with_aligned_labels(train_df)
test_encodings, test_labels = encode_with_aligned_labels(testdf)

train_dataset = VUADataset(train_encodings, train_labels)
val_dataset = VUADataset(test_encodings, test_labels)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    save_strategy="epoch",
    logging_strategy="epoch",
    #metric_for_best_model="f1",
    #load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()
# Store model and tokenizer side by side so both can be reloaded from one folder.
trainer.save_model(ROOT_PATH + "/intermediate-task-vua/model")
tokenizer.save_pretrained(ROOT_PATH + "/intermediate-task-vua/model")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 8727
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3276


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1307,0.344708,0.0,0.0,0.0,0.897516
2,0.1287,0.298036,0.0,0.0,0.0,0.904613
3,0.1208,0.284521,0.0,0.0,0.0,0.907464
4,0.0794,0.363127,0.0,0.0,0.0,0.901508
5,0.0507,0.404966,0.0,0.0,0.0,0.903916
6,0.0312,0.474509,0.0,0.0,0.0,0.904106


***** Running Evaluation *****
  Num examples = 2182
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./results/checkpoint-546
Configuration saved in ./results/checkpoint-546/config.json
Model weights saved in ./results/checkpoint-546/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2182
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
Saving model checkpoint to ./results/checkpoint-1092
Configuration saved in ./results/checkpoint-1092/config.json
Model weights saved in ./results/checkpoint-1092/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-546] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2182
  Batch size = 16
  _warn_prf(

('/content/drive/My Drive/DLDH/intermediate-task-vua/model/tokenizer_config.json',
 '/content/drive/My Drive/DLDH/intermediate-task-vua/model/special_tokens_map.json',
 '/content/drive/My Drive/DLDH/intermediate-task-vua/model/vocab.txt',
 '/content/drive/My Drive/DLDH/intermediate-task-vua/model/added_tokens.json',
 '/content/drive/My Drive/DLDH/intermediate-task-vua/model/tokenizer.json')