In [None]:
!pip install pandas
!pip install numpy==1.23

In [None]:
!pip install tensorflow==2.12
!pip install pyarabic

In [None]:
!pip install transformers

In [None]:
!pip install ipywidgets
!pip install datasets
!pip install transformers[torch]
!pip install nvidia-ml-py3

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0" 

In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 1000)

In [3]:
# cell-1  
#load and clean the data (removing diacritics and unwanted text)

df = pd.read_csv('poemsDataset.csv')
df.fillna('', inplace=True)
display(len(df))


def remove_diacritics(a):    
    return araby.strip_diacritics(a)

df['first_hemistich'] = df['first_hemistich'].apply(remove_diacritics)
df['second_hemistich'] = df['second_hemistich'].apply(remove_diacritics)

def normalizeBeforeTraining(df):
    df['first_hemistich'] = df['first_hemistich'].str.replace('النابغـة: ', '')
    df['second_hemistich'] = df['second_hemistich'].str.replace('الـربيع: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('عبيــد: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('امـرؤ القيسـ: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('امرؤ القيس: ', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(جلال الــــدين الــــرومي):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(لـوك الفيلسـوف الإنكليزي):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(كانت الفيلسوف الألماني ):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(بركســــــــــــــــون):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الحـــــــــــــــور):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الشــــــــــــــاعر):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الإنســـــــــــــــان):', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('العلم):', '', regex=False)
    df['first_hemistich'] = df['first_hemistich'].str.replace('(العشــــــــــــــــق):', '', regex=False)
    df['first_hemistich'] = df['first_hemistich'].str.replace('(الزهــــــــــــــــــرة):', '', regex=False)
    df['second_hemistich'] = df['second_hemistich'].str.replace('التوأم اليشكري: ', '', regex=False)  
    df['first_hemistich'] = df['first_hemistich'].str.replace('آ', 'أ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('آ', 'أ')
    df['first_hemistich'] = df['first_hemistich'].str.replace('[/":?،؟]', '')
    df['second_hemistich'] = df['second_hemistich'].str.replace('[/":?،؟]', '')
    df['first_hemistich'] = df['first_hemistich'].str.replace('  ', ' ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('  ', ' ')
    df['first_hemistich'] = df['first_hemistich'].str.replace('  ', ' ')
    df['second_hemistich'] = df['second_hemistich'].str.replace('  ', ' ')


normalizeBeforeTraining(df)
df.drop(df[(df['first_hemistich'] == '') & (df['second_hemistich'] == '')].index, inplace=True)

#if first_hemistich == '', then copy the text from second_hemistich. then delete the text in the second_hemistich
df['first_hemistich'] = df.apply(lambda x: x['second_hemistich'] if x['first_hemistich'] == '' else x['first_hemistich'], axis=1)
df['second_hemistich'] = df.apply(lambda x: '' if x['first_hemistich'] == x['second_hemistich'] else x['second_hemistich'], axis=1)

df.reset_index(drop=True, inplace=True)

display(len(df))
# display(df[:10])
print('done')

2090907

2090907

done


In [4]:
# cell-2 
# preparing data for finetuning


df['second_hemistich'].replace('', 'E', inplace=True)
dfc = df[['first_hemistich', 'second_hemistich', 'meter', 'link']].copy()
dfc['text'] = dfc['first_hemistich'] + ' S ' + dfc['second_hemistich']

#removing verses without a meter
dfc = dfc[dfc['meter'] != ''] 
dfc = dfc[dfc['meter'] != 'unspecified']
dfc = dfc[dfc['meter'] != 'mixed']


nonclassic100 = ['luaihani', 'sakhri', 'hajini', 'kankan', 'zajal'] #5 #meters with >100 verse
nonclassic = ['masehube', 'selselah', 'mawalia', 'doubeet', 'colloquial', 'free_form', 'muashah'] #7

classic = ['taweel', 'kamel', 'baseet', 'khafif', 'wafer', 'rajaz', 'ramel', 'mutaqarib',
           'saree', 'munsarih', 'mujtath', 'hazaj', 'madeed', 'mutadarak', 'muqtadab', 'mudari'] #16


#removing non-classical meters (comment to be included in the finetuning process)
# dfc = dfc[~dfc['meter'].isin(nonclassic100)]
# dfc = dfc[~dfc['meter'].isin(nonclassic)]

#including only verses with classical meters
dfc = dfc[dfc['meter'].isin(classic)]

dfc.reset_index(drop=True, inplace=True)

dfc['meter'] = dfc['meter'].astype('category')
# display(dfc['meter'].unique())

dfc['label'] = dfc['meter'].cat.codes #assign cat_value for each meter type
dftrain, dftest = train_test_split(dfc, test_size=0.20, random_state=42, stratify=dfc['label'])
ytrain = to_categorical(dftrain['label']).astype('int32')
ytest = to_categorical(dftest['label']).astype('int32')

max_sequence_length = 32
train_batch_size = 256
classes_num = len(dfc['meter'].unique())

display(len(dfc))
display(len(dftrain))
display(len(dftest))

1850351

1480280

370071

In [None]:
#cell-3
#loading the tokenizer and the model

from transformers import AutoTokenizer,TFBertModel

tokenizer = AutoTokenizer.from_pretrained('faisalq/bert-medium-arapoembert')
bert = TFBertModel.from_pretrained('faisalq/bert-medium-arapoembert', from_pt=True)


In [6]:
#cell-4
#tokenizing the data

xtrain = tokenizer(
    text=dftrain['text'].tolist(),
    add_special_tokens=True,
    max_length = max_sequence_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


xtest = tokenizer(
    text=dftest['text'].tolist(),
    add_special_tokens=True,
    max_length = max_sequence_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [7]:
# display(xtest)
display(xtest['input_ids'][0])

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([    2,  2914,   277,   151,  2890,  1110,    83, 31553,  8365,
           9, 25033,   122,    85,  4147,  5444,   678,   321, 40125,
           3,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0], dtype=int32)>

In [7]:
#cell-5
# building classifier model

input_ids = Input(shape=(max_sequence_length,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_sequence_length,), dtype=tf.int32, name="attention_mask")

output = bert([input_ids, input_mask])[1] #pooled_output
output = tf.keras.layers.Dense(classes_num, activation='softmax', name='output')(output)
   
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)

optimizer = Adam(learning_rate=5e-05)


loss =CategoricalCrossentropy(from_logits = True)
metric = CategoricalAccuracy('balanced_accuracy'),

model.compile(optimizer = optimizer, loss = loss, metrics = metric)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 32)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 32)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  95721216    ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 32,                                            

In [8]:
tf.config.experimental_run_functions_eagerly(True)
tf.config.run_functions_eagerly(True)

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


In [9]:
#train the model (classic meters)

train_history = model.fit(
    x ={'input_ids':xtrain['input_ids'],'attention_mask':xtrain['attention_mask']}, y = ytrain,
    validation_data = ({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']}, 
    ytest), epochs=3, batch_size=train_batch_size )


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
#model evaluation (classic meters)

pred = model.predict({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']})

y_pred = np.argmax(pred, axis = 1)
y_pred = to_categorical(y_pred, num_classes=classes_num).astype('int32')

print(classification_report(ytest, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9961    0.9942    0.9952     53009
           1     0.9000    0.8445    0.8714      1833
           2     0.9903    0.9914    0.9909     81382
           3     0.9922    0.9944    0.9933     34821
           4     0.9589    0.9048    0.9311      1755
           5     0.5714    0.3333    0.4211        72
           6     0.9698    0.9713    0.9706      3871
           7     0.9690    0.9784    0.9737      5973
           8     0.8908    0.8158    0.8516       190
           9     0.9076    0.9480    0.9273      1326
          10     0.9897    0.9902    0.9900     13905
          11     0.9718    0.9527    0.9622     22267
          12     0.9807    0.9876    0.9841     17971
          13     0.9634    0.9835    0.9733     12252
          14     0.9967    0.9974    0.9970     88553
          15     0.9897    0.9915    0.9906     30891

   micro avg     0.9888    0.9888    0.9888    370071
   macro avg     0.9399   

In [10]:
#train the model (all meters)

train_history = model.fit(
    x ={'input_ids':xtrain['input_ids'],'attention_mask':xtrain['attention_mask']}, y = ytrain,
    validation_data = ({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']}, 
    ytest), epochs=3, batch_size=train_batch_size )


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
#model evaluation (all meters)

pred = model.predict({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']})

y_pred = np.argmax(pred, axis = 1)
y_pred = to_categorical(y_pred, num_classes=classes_num).astype('int32')

print(classification_report(ytest, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9932    0.9901    0.9916     53009
           1     0.8682    0.6230    0.7254      1480
           2     0.9307    0.9148    0.9227       587
           3     0.8223    0.8671    0.8441      1686
           4     0.0000    0.0000    0.0000        10
           5     0.8385    0.8723    0.8551      1833
           6     0.9864    0.9914    0.9889     81382
           7     0.0000    0.0000    0.0000        12
           8     0.9877    0.9927    0.9902     34821
           9     0.0000    0.0000    0.0000         1
          10     0.9499    0.8860    0.9169      1755
          11     0.6621    0.5642    0.6092       257
          12     0.5766    0.6286    0.6015       377
          13     0.7475    0.7018    0.7239      7682
          14     0.5652    0.3611    0.4407        72
          15     0.9227    0.9646    0.9432      3871
          16     0.9749    0.9689    0.9719      5973
          17     0.8785    

In [None]:

# classifier_path = 'finetuned/classic_meters_classifierTF_8L12H.h5'
classifier_path = 'finetuned/all_meters_classifierTF_8L12H.h5'


In [25]:
#saving finetuned model locally

model.save_weights(classifier_path)

In [26]:
#loading the finetuned model locally
# it's necessary to create new model similar to the saved one, then load it

from transformers import TFBertModel

def create_model():
    input_ids = Input(shape=(max_sequence_length,), dtype=tf.int32, name="input_ids")
    input_mask = Input(shape=(max_sequence_length,), dtype=tf.int32, name="attention_mask")
    bert = TFBertModel.from_pretrained('faisalq/bert-medium-arapoembert', from_pt=True)
    
    output = bert([input_ids, input_mask])[1] #pooled_output
    output = tf.keras.layers.Dense(classes_num, activation='softmax', name='output')(output)
    
    model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=output)
    
    optimizer = Adam(learning_rate=5e-05)
     
    loss =CategoricalCrossentropy(from_logits = True)
    metric = CategoricalAccuracy('balanced_accuracy'),
    
    model.compile(optimizer = optimizer, loss = loss, metrics = metric)
    
    return model

model = create_model()
model.load_weights(classifier_path)

In [None]:
#model evaluation

pred = model.predict({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']})

y_pred = np.argmax(pred, axis = 1)
y_pred = to_categorical(y_pred, num_classes=classes_num).astype('int32')

print(classification_report(ytest, y_pred, digits=4))

In [None]:

# further finetuning the model

train_history = model.fit(
    x ={'input_ids':xtrain['input_ids'],'attention_mask':xtrain['attention_mask']}, y = ytrain,
    validation_data = ({'input_ids':xtest['input_ids'],'attention_mask':xtest['attention_mask']}, 
    ytest), epochs=3, batch_size=train_batch_size )
