In [1]:
# cell-1  
#load and clean the data (removing diacritics and unwanted text)

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 
import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)

df = pd.read_csv('../AraPoems_Dataset.csv')
df.fillna('', inplace=True)
display(len(df))


def remove_diacritics(a):    
    return araby.strip_diacritics(a)

df['first_hemistich'] = df['first_hemistich'].apply(remove_diacritics)
df['second_hemistich'] = df['second_hemistich'].apply(remove_diacritics)

def normalizeBeforeTraining(df):
    df['first_hemistich'] = df['first_hemistich'].str.replace('النابغـة: ', '')
    df['second_hemistich'] = df['second_hemistich'].str.replace('الـربيع: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('عبيــد: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('امـرؤ القيسـ: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('امرؤ القيس: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(جلال الــــدين الــــرومي):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(لـوك الفيلسـوف الإنكليزي):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(كانت الفيلسوف الألماني ):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(بركســــــــــــــــون):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الحـــــــــــــــور):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الشــــــــــــــاعر):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الإنســـــــــــــــان):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('العلم):', '', regex=False)
    df['first_hemistich'] = df['first_hemistich'].str.replace('(العشــــــــــــــــق):', '', regex=False)
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الزهــــــــــــــــــرة):', '', regex=False)
    df['second_hemistich'] = df['second_hemistich'].str.replace('التوأم اليشكري: ', '', regex=False)  
    df['first_hemistich'] = df['first_hemistich'].str.replace('آ', 'أ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('آ', 'أ')
    df['first_hemistich'] = df['first_hemistich'].str.replace('[/":?،؟]', '')
    df['second_hemistich'] = df['second_hemistich'].str.replace('[/":?،؟]', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('  ', ' ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('  ', ' ')
    df['first_hemistich'] = df['first_hemistich'].str.replace('  ', ' ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('  ', ' ')


normalizeBeforeTraining(df)
df.drop(df[(df['first_hemistich'] == '') & (df['second_hemistich'] == '')].index, inplace=True)

#if first_hemistich == '', then copy the text from second_hemistich. then delete the text in the second_hemistich
df['first_hemistich'] = df.apply(lambda x: x['second_hemistich'] if x['first_hemistich'] == '' else x['first_hemistich'], axis=1)
df['second_hemistich'] = df.apply(lambda x: '' if x['first_hemistich'] == x['second_hemistich'] else x['second_hemistich'], axis=1)

df.reset_index(drop=True, inplace=True)

display(len(df))
# display(df[:10])
print('done')

2023-10-01 00:52:15.205219: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-01 00:52:15.224415: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2090907

2090907

done


In [2]:
# cell-2 
# preparing data for finetuning


df['second_hemistich'].replace('', 'E', inplace=True)
dfc = df[['first_hemistich', 'second_hemistich', 'meter', 'sub_meter', 'link']].copy()
dfc['text'] = dfc['first_hemistich'] + ' S ' + dfc['second_hemistich']

#removing verses without a meter
dfc = dfc[dfc['meter'] != ''] 
dfc = dfc[dfc['meter'] != 'unspecified']
dfc = dfc[dfc['meter'] != 'mixed']



classic = ['taweel', 'kamel', 'baseet', 'khafif', 'wafer', 'rajaz', 'ramel', 'mutaqarib',
           'saree', 'munsarih', 'mujtath', 'hazaj', 'madeed', 'mutadarak', 'muqtadab', 'mudari'] 

#including only verses with classical meters
dfc = dfc[dfc['meter'].isin(classic)]

dfc.reset_index(drop=True, inplace=True)

dfc['complete_meter'] = dfc['meter'] + ' ' + dfc['sub_meter']

dfc = dfc[dfc['complete_meter'] != 'kamel maktuu']
dfc = dfc[dfc['complete_meter'] != 'mutadarak manhuk']
dfc = dfc[dfc['complete_meter'] != 'baseet mashture']
dfc = dfc[dfc['complete_meter'] != 'munsarih manhuk']
dfc = dfc[dfc['complete_meter'] != 'mutaqarib majzuu']
dfc = dfc[dfc['complete_meter'] != 'mutadarak majzuu']
dfc = dfc[dfc['complete_meter'] != 'rajaz manhuk']
dfc = dfc[dfc['complete_meter'] != 'baseet majzuu']


dfc.reset_index(drop=True, inplace=True)


dfc['complete_meter'] = dfc['complete_meter'].astype('category')
# display(dfc['meter'].unique())

dfc['label'] = dfc['complete_meter'].cat.codes #assign cat_value for each meter type
dftrain, dftest = train_test_split(dfc, test_size=0.20, random_state=42, stratify=dfc['label'])
ytrain = dftrain['label'].values.tolist()
ytest = dftest['label'].values.tolist()



max_sequence_length = 32
train_batch_size = 256
classes_num = len(dfc['complete_meter'].unique())

display(classes_num)
display(len(dfc))
display(len(dftrain))
display(len(dftest))

25

1850027

1480021

370006

In [3]:
#cell-3
#loading the tokenizer and the model

from transformers import AutoTokenizer,TFBertModel, BertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained('UBC-NLP/ARBERT')
# bert = TFBertModel.from_pretrained('faisalq/bert-base-arapoembert', from_pt=True)
model = BertForSequenceClassification.from_pretrained('UBC-NLP/ARBERT',
                                                      num_labels=classes_num).to('cuda')                                        


Some weights of the model checkpoint at UBC-NLP/ARBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at UB

In [4]:
#cell-4
#tokenizing the data

xtrain = tokenizer(
    text=dftrain['text'].tolist(),
    add_special_tokens=True,
    max_length = max_sequence_length,
    truncation=True,
    padding='max_length', 
    return_tensors='pt',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


xtest = tokenizer(
    text=dftest['text'].tolist(),
    add_special_tokens=True,
    max_length = max_sequence_length,
    truncation=True,
    padding='max_length', 
    return_tensors='pt',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [5]:
#group text and labels and create train_ds and test_ds

import torch

class NewGroupDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['label'] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

train_ds = NewGroupDataset(xtrain, ytrain)
test_ds = NewGroupDataset(xtest, ytest)

from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return { 'accuracy': acc }

In [6]:
from transformers import Trainer, TrainingArguments


epochs = 7
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256

training_args = TrainingArguments(
    output_dir = 'ARBERT_base_submeter/',
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 5, #only save the last 5 checkpoints
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    logging_steps = 3000, #50_000
    evaluation_strategy = 'steps',
    # evaluate_during_training = True,
    eval_steps = 3000
    
)

trainer = Trainer(
    model = model,
    args = training_args,
    # data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics = compute_metrics
)


# trainer.train(resume_from_checkpoint=True)
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
3000,0.5052,0.183693,0.948174
6000,0.1718,0.14549,0.960655
9000,0.1288,0.137695,0.962974
12000,0.1178,0.127936,0.966563
15000,0.0972,0.125391,0.966317
18000,0.0921,0.125639,0.967528
21000,0.0788,0.123077,0.968073
24000,0.0733,0.126283,0.968395
27000,0.0627,0.126223,0.968644
30000,0.0579,0.138042,0.966425


TrainOutput(global_step=40474, training_loss=0.11335297373553807, metrics={'train_runtime': 3078.7112, 'train_samples_per_second': 3365.092, 'train_steps_per_second': 13.146, 'total_flos': 1.70402007743059e+17, 'train_loss': 0.11335297373553807, 'epoch': 7.0})

In [7]:
trainer.save_model('ARBERT_base_submeter/')