In [None]:
!pip install pandas
!pip install numpy==1.23

In [None]:
!pip install tensorflow==2.12
!pip install pyarabic

In [None]:
!pip install transformers

In [None]:
!pip install ipywidgets
!pip install datasets
!pip install transformers[torch]
!pip install nvidia-ml-py3

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)

2023-09-14 20:09:20.906226: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-14 20:09:21.346551: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# cell-1  
#load and clean the data (removing diacritics and unwanted text)

df = pd.read_csv('AraPoems_Dataset.csv')
df.fillna('', inplace=True)
display(len(df))


def remove_diacritics(a):    
    return araby.strip_diacritics(a)

df['first_hemistich'] = df['first_hemistich'].apply(remove_diacritics)
df['second_hemistich'] = df['second_hemistich'].apply(remove_diacritics)

def normalizeBeforeTraining(df):
    df['first_hemistich'] = df['first_hemistich'].str.replace('النابغـة: ', '')
    df['second_hemistich'] = df['second_hemistich'].str.replace('الـربيع: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('عبيــد: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('امـرؤ القيسـ: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('امرؤ القيس: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(جلال الــــدين الــــرومي):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(لـوك الفيلسـوف الإنكليزي):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(كانت الفيلسوف الألماني ):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(بركســــــــــــــــون):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الحـــــــــــــــور):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الشــــــــــــــاعر):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الإنســـــــــــــــان):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('العلم):', '', regex=False)
    df['first_hemistich'] = df['first_hemistich'].str.replace('(العشــــــــــــــــق):', '', regex=False)
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الزهــــــــــــــــــرة):', '', regex=False)
    df['second_hemistich'] = df['second_hemistich'].str.replace('التوأم اليشكري: ', '', regex=False)  
    df['first_hemistich'] = df['first_hemistich'].str.replace('آ', 'أ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('آ', 'أ')
    df['first_hemistich'] = df['first_hemistich'].str.replace('[/":?،؟]', '')
    df['second_hemistich'] = df['second_hemistich'].str.replace('[/":?،؟]', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('  ', ' ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('  ', ' ')
    df['first_hemistich'] = df['first_hemistich'].str.replace('  ', ' ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('  ', ' ')


normalizeBeforeTraining(df)
df.drop(df[(df['first_hemistich'] == '') & (df['second_hemistich'] == '')].index, inplace=True)

#if first_hemistich == '', then copy the text from second_hemistich. then delete the text in the second_hemistich
df['first_hemistich'] = df.apply(lambda x: x['second_hemistich'] if x['first_hemistich'] == '' else x['first_hemistich'], axis=1)
df['second_hemistich'] = df.apply(lambda x: '' if x['first_hemistich'] == x['second_hemistich'] else x['second_hemistich'], axis=1)

df.reset_index(drop=True, inplace=True)

display(len(df))
# display(df[:10])
print('done')

2090907

2090907

done


In [3]:
# cell-2 
# preparing data for finetuning


df['second_hemistich'].replace('', 'E', inplace=True)
dfc = df[['first_hemistich', 'second_hemistich', 'meter', 'link']].copy()
dfc['text'] = dfc['first_hemistich'] + ' S ' + dfc['second_hemistich']

#removing verses without a meter
dfc = dfc[dfc['meter'] != ''] 
dfc = dfc[dfc['meter'] != 'unspecified']
dfc = dfc[dfc['meter'] != 'mixed']



classic = ['taweel', 'kamel', 'baseet', 'khafif', 'wafer', 'rajaz', 'ramel', 'mutaqarib',
           'saree', 'munsarih', 'mujtath', 'hazaj', 'madeed', 'mutadarak', 'muqtadab', 'mudari'] 

#including only verses with classical meters
# dfc = dfc[dfc['meter'].isin(classic)]

dfc.reset_index(drop=True, inplace=True)

dfc['meter'] = dfc['meter'].astype('category')
# display(dfc['meter'].unique())

dfc['label'] = dfc['meter'].cat.codes #assign cat_value for each meter type
dftrain, dftest = train_test_split(dfc, test_size=0.20, random_state=42, stratify=dfc['label'])
ytrain = to_categorical(dftrain['label']).astype('int32')
ytest = to_categorical(dftest['label']).astype('int32')

max_sequence_length = 32
train_batch_size = 256
classes_num = len(dfc['meter'].unique())

display(len(dfc))
display(len(dftrain))
display(len(dftest))

1911853

1529482

382371

In [4]:
#cell-3
#loading the tokenizer and the model

from transformers import AutoTokenizer,TFBertModel

tokenizer = AutoTokenizer.from_pretrained('UBC-NLP/ARBERT')
bert = TFBertModel.from_pretrained('UBC-NLP/ARBERT', from_pt=True)


2023-09-14 20:10:01.400625: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-14 20:10:01.520974: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-14 20:10:01.521161: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [5]:
#cell-4
#tokenizing the data

xtrain = tokenizer(
    text=dftrain['text'].tolist(),
    add_special_tokens=True,
    max_length = max_sequence_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


xtest = tokenizer(
    text=dftest['text'].tolist(),
    add_special_tokens=True,
    max_length = max_sequence_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [7]:
# display(xtest)
display(xtest['input_ids'][0])

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([    2,  2914,   277,   151,  2890,  1110,    83, 31553,  8365,
           9, 25033,   122,    85,  4147,  5444,   678,   321, 40125,
           3,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)>

In [6]:
#cell-5
# building classifier model

input_ids = Input(shape=(max_sequence_length,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_sequence_length,), dtype=tf.int32, name="attention_mask")

output = bert([input_ids, input_mask])[1] #pooled_output
output = tf.keras.layers.Dense(classes_num, activation='softmax', name='output')(output)
   
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)

optimizer = Adam(learning_rate=5e-05)


loss =CategoricalCrossentropy(from_logits = True)
metric = CategoricalAccuracy('balanced_accuracy'),

model.compile(optimizer = optimizer, loss = loss, metrics = metric)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 32)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 32)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  162841344   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 32,                                            

In [7]:
tf.config.experimental_run_functions_eagerly(True)
tf.config.run_functions_eagerly(True)

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [8]:
#train the model (all meters)

train_history = model.fit(
    x ={'input_ids':xtrain['input_ids'],'attention_mask':xtrain['attention_mask']}, y = ytrain,
    validation_data = ({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']}, 
    ytest), epochs=6, batch_size=train_batch_size )


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [9]:
#model evaluation (all meters)

pred = model.predict({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']})

y_pred = np.argmax(pred, axis = 1)
y_pred = to_categorical(y_pred, num_classes=classes_num).astype('int32')

print(classification_report(ytest, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9843    0.9907    0.9875     53009
           1     0.7030    0.5311    0.6051      1480
           2     0.8790    0.8790    0.8790       587
           3     0.8552    0.7183    0.7808      1686
           4     1.0000    0.1000    0.1818        10
           5     0.8020    0.8331    0.8172      1833
           6     0.9813    0.9750    0.9781     81382
           7     0.0000    0.0000    0.0000        12
           8     0.9740    0.9864    0.9802     34821
           9     0.0000    0.0000    0.0000         1
          10     0.8093    0.8946    0.8498      1755
          11     0.7465    0.2062    0.3232       257
          12     0.5789    0.3793    0.4583       377
          13     0.7643    0.5436    0.6353      7682
          14     0.4286    0.2917    0.3471        72
          15     0.8655    0.9525    0.9069      3871
          16     0.9415    0.9620    0.9516      5973
          17     0.8788    

In [8]:
#train the model (classical meters)

train_history = model.fit(
    x ={'input_ids':xtrain['input_ids'],'attention_mask':xtrain['attention_mask']}, y = ytrain,
    validation_data = ({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']}, 
    ytest), epochs=6, batch_size=train_batch_size )


Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [9]:
#model evaluation (classical meters)

pred = model.predict({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']})

y_pred = np.argmax(pred, axis = 1)
y_pred = to_categorical(y_pred, num_classes=classes_num).astype('int32')

print(classification_report(ytest, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9903    0.9918    0.9911     53009
           1     0.8393    0.8603    0.8497      1833
           2     0.9826    0.9837    0.9832     81382
           3     0.9868    0.9862    0.9865     34821
           4     0.9395    0.8496    0.8923      1755
           5     0.7391    0.2361    0.3579        72
           6     0.9563    0.9380    0.9471      3871
           7     0.9287    0.9675    0.9477      5973
           8     0.8596    0.8053    0.8315       190
           9     0.8705    0.8824    0.8764      1326
          10     0.9759    0.9816    0.9787     13905
          11     0.9429    0.9210    0.9318     22267
          12     0.9624    0.9768    0.9696     17971
          13     0.9596    0.9533    0.9564     12252
          14     0.9937    0.9953    0.9945     88553
          15     0.9875    0.9859    0.9867     30891

   micro avg     0.9803    0.9803    0.9803    370071
   macro avg     0.9322   

In [10]:

# classifier_path = 'finetuned/classic_meters_classifierTF_arbert.h5'
classifier_path = 'finetuned/all_meters_classifierTF_arbert.h5'


In [11]:
#saving finetuned model locally

model.save_weights(classifier_path)