In [1]:
# Mount Google Drive at /content/drive so the project folder stored there is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd "/content/drive/MyDrive/GUC/NLP_Final"

/content/drive/MyDrive/GUC/NLP_Final


# <center>Model Training</center>

The pipeline consists of 5 stages as follows:
1. Prepare the train/dev/test sets:
> - The original test split is divided into dev and test sets, and the data is shuffled with a seed of 42.
> - Each context and response is then tokenized and padded/truncated to a fixed length of 150 tokens to train the model.
2. Load the PreTrained 'arabert' model:
> - Load "aubmindlab/bert-base-arabert" version of the model and fine-tune it.
3. Configure the model to train:
> - Configure the needed hyperparameters of the model such as `batch size`, `pad_token_id`, `max_length`, and `vocab_size` 
> - Build the `compute_metrics` function for the trainer and initialize the trainer.
4. Fine-tune the model:
> - Start the model training.
5. Evaluate the model: 
> - Evaluate the model by calculating the BLEU score and loss for the model against our testing data.

## Install Needed libraries

In [None]:
%%capture
!pip install numpy
!pip install pandas
!pip install csv
!pip install git-python==1.0.3
!pip install sacrebleu==1.4.2
!pip install rouge_score
!pip install farasapy
!pip install pyarabic
!pip install datasets
!pip install -U transformers==4.5.1

!git clone https://github.com/aub-mind/arabert

## Import Needed Libraries

In [4]:
import os
import numpy as np
import pandas as pd
from datasets import load_dataset 
from transformers import AutoTokenizer
from transformers import EncoderDecoderModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from sacrebleu import corpus_bleu
import torch
import torch.nn as nn
from transformers import pipeline

Use the `Dataset_Structure.py` loading script to load the data.

In [10]:
# Build the dataset from the local loading script; every split is kept together in `all_data`.
all_data = load_dataset("Dataset_Structure.py")



Downloading and preparing dataset arabic_emp_conv_dataset/arabic_emp_3_history_conv (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/arabic_emp_conv_dataset/arabic_emp_3_history_conv/1.0.0/ecdaa4fe528fc3e25b5709ebe52973cc2bada0b9a7469a0e460d14d52a9ae67a...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset arabic_emp_conv_dataset downloaded and prepared to /root/.cache/huggingface/datasets/arabic_emp_conv_dataset/arabic_emp_3_history_conv/1.0.0/ecdaa4fe528fc3e25b5709ebe52973cc2bada0b9a7469a0e460d14d52a9ae67a. Subsequent calls will reuse this data.


In [11]:
# Display the loaded dataset: its splits, feature names and row counts.
all_data

DatasetDict({
    train: Dataset({
        features: ['context', 'emotion', 'response'],
        num_rows: 37663
    })
    test: Dataset({
        features: ['context', 'emotion', 'response'],
        num_rows: 5247
    })
})

In [12]:
# Shuffle the training split, then divide the original test split once into
# dev (the 90% 'train' part of the split) and test (the 10% 'test' part).
train_data = all_data['train'].shuffle(seed=42)

# Compute the split a single time and index both halves, instead of repeating the same seeded split.
dev_test_split = all_data['test'].train_test_split(test_size=0.1, seed=42)
dev_data = dev_test_split['train']
test_data = dev_test_split['test']

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/arabic_emp_conv_dataset/arabic_emp_3_history_conv/1.0.0/ecdaa4fe528fc3e25b5709ebe52973cc2bada0b9a7469a0e460d14d52a9ae67a/cache-effe3cb103398796.arrow and /root/.cache/huggingface/datasets/arabic_emp_conv_dataset/arabic_emp_3_history_conv/1.0.0/ecdaa4fe528fc3e25b5709ebe52973cc2bada0b9a7469a0e460d14d52a9ae67a/cache-60b41c1e0cae88f3.arrow


In [13]:
# Report how many examples ended up in each split.
for split_name, split in (("train", train_data), ("dev", dev_data), ("test", test_data)):
    print(f"Length of {split_name} data", len(split))

Length of train data 37663
Length of dev data 4722
Length of test data 525


## Model

### Prepare the train/dev/test sets

In [14]:
# Set the encoder/decoder maximum sequence lengths and the batch size, then load the
# tokenizer that belongs to the pretrained checkpoint named in `model_name`.
encoder_max_length=150
decoder_max_length=150
batch_size=32
model_name = "aubmindlab/bert-base-arabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
# Tokenize context/response pairs into the numeric inputs the encoder-decoder model trains on.
def process_data_to_model_inputs(batch):
    """Tokenize one batch of examples into encoder inputs, decoder inputs and labels.

    Args:
        batch: Batched examples providing the "context" and "response" text columns.

    Returns:
        The same batch, extended with "input_ids", "attention_mask",
        "decoder_input_ids", "labels" and "decoder_attention_mask".
    """
    pad_id = tokenizer.pad_token_id

    # Both sides are padded/truncated to a fixed length; the tokenizer inserts its own special tokens.
    encoded_context = tokenizer(
        batch["context"], padding="max_length", truncation=True, max_length=encoder_max_length
    )
    encoded_response = tokenizer(
        batch["response"], padding="max_length", truncation=True, max_length=decoder_max_length
    )

    batch["input_ids"] = encoded_context.input_ids
    batch["attention_mask"] = encoded_context.attention_mask
    batch["decoder_input_ids"] = encoded_response.input_ids

    # Labels mirror the decoder ids, with every padding position replaced by -100 to mask it out of the loss.
    masked_labels = []
    for sequence in encoded_response.input_ids:
        masked_labels.append([-100 if token_id == pad_id else token_id for token_id in sequence])
    batch["labels"] = masked_labels

    batch["decoder_attention_mask"] = encoded_response.attention_mask
    return batch

# Columns handed to the model as torch tensors once a split has been tokenized.
MODEL_INPUT_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def tokenize_split(split):
    """Tokenize one dataset split and expose its model inputs as torch tensors.

    Args:
        split: A dataset split that still holds the raw "context" and "response" columns.

    Returns:
        The mapped split, with the raw text columns removed and its output format
        restricted to MODEL_INPUT_COLUMNS as torch tensors.
    """
    tokenized = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["context", "response"],
    )
    tokenized.set_format(type="torch", columns=MODEL_INPUT_COLUMNS)
    return tokenized


# Apply the same preprocessing to the train, dev and test splits.
train_data = tokenize_split(train_data)
dev_data = tokenize_split(dev_data)
test_data = tokenize_split(test_data)

HBox(children=(FloatProgress(value=0.0, max=1177.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=148.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=17.0), HTML(value='')))




### Load PreTrained arabert model

In [18]:
# Warm-start an encoder-decoder model using the same pretrained checkpoint for both the
# encoder and the decoder, without tying their weights (tie_encoder_decoder=False).
arabert2arabert = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name, tie_encoder_decoder=False)

Some weights of the model checkpoint at aubmindlab/bert-base-arabert were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias

### Configure the model to train

In [19]:
# set special tokens: the decoder starts from the tokenizer's CLS id, ends at its SEP id,
# and pads with its pad id
arabert2arabert.config.decoder_start_token_id = tokenizer.cls_token_id
arabert2arabert.config.eos_token_id = tokenizer.sep_token_id
arabert2arabert.config.pad_token_id = tokenizer.pad_token_id

# set decoding params
# NOTE(review): generation is capped at 64 tokens while responses are tokenized up to
# decoder_max_length — confirm this cap is intended for the BLEU evaluation.
arabert2arabert.config.max_length = 64
# NOTE(review): early_stopping presumably only affects beam search, and num_beams is 1 here — confirm.
arabert2arabert.config.early_stopping = True
arabert2arabert.config.num_beams = 1
# mirror the encoder's vocabulary size on the top-level config
arabert2arabert.config.vocab_size = arabert2arabert.config.encoder.vocab_size

In [20]:
# Metric function the trainer uses to score generated responses during evaluation.
def compute_metrics(pred):
    """Compute the corpus-level BLEU score of the predictions against the labels.

    Args:
        pred: Prediction object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 at masked positions).

    Returns:
        A dict with a single "bleu" entry, rounded to 4 decimal places.
    """
    # Decode the generated ids, dropping special tokens.
    pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)

    # -100 marks label positions masked out of the loss; replace them with the pad id so
    # they decode cleanly. np.where builds a new array, leaving pred.label_ids unmodified.
    labels_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    return {"bleu": round(corpus_bleu(pred_str, [label_str]).score, 4)}

In [21]:
# Set the training arguments: batch sizes, evaluation/logging/checkpoint schedule and epoch count.
training_args = Seq2SeqTrainingArguments(
    output_dir="./model",
    per_device_train_batch_size=batch_size,
    # evaluation uses half the training batch size
    per_device_eval_batch_size=batch_size//2,
    gradient_accumulation_steps =3,
    predict_with_generate=True,
    do_eval=True,
    evaluation_strategy ="epoch",
    do_train=True,
    logging_steps=500,  
    # NOTE(review): 32965 does not match the 37663 training rows loaded in this notebook —
    # presumably a stale dataset size; confirm the intended checkpoint interval.
    save_steps= 32965 // ( batch_size * 2),  
    warmup_steps=100,
    # NOTE(review): eval_steps presumably has no effect while evaluation_strategy is "epoch" — confirm.
    eval_steps=10,
    num_train_epochs=2,
    overwrite_output_dir=True,
    # NOTE(review): confirm how a limit of 0 is interpreted (no limit vs. keep no checkpoints).
    save_total_limit=0,
)

In [22]:
# Build the trainer: fine-tune on train_data, evaluate on dev_data, and score with compute_metrics.
trainer = Seq2SeqTrainer(
    model=arabert2arabert,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=dev_data,
    tokenizer=tokenizer
)

### Train the model

In [None]:
# Run the fine-tuning loop with the trainer built above.
trainer.train()

In [None]:
# Save the fine-tuned model and its tokenizer.
# Use the public Trainer.save_model API rather than the private _save helper.
# NOTE(review): both paths are absolute (rooted at "/"), so they land outside the project
# folder this notebook works from — confirm that location is intended.
trainer.save_model("/trained_model/model")
tokenizer.save_pretrained("/trained_model/tokenizer")

### Evaluate The Model

In [None]:
# Evaluate on the held-out test split. Without an explicit dataset, evaluate() falls back
# to the trainer's eval_dataset, which is the dev split already used during training.
trainer.evaluate(eval_dataset=test_data)