In [None]:
!pip install pandas
!pip install numpy==1.23

In [None]:
!pip install tensorflow==2.12
!pip install pyarabic

In [None]:
!pip install transformers

In [None]:
!pip install ipywidgets
!pip install datasets
!pip install transformers[torch]
!pip install nvidia-ml-py3

In [1]:
import os
# Expose only the GPU with index 1 to CUDA libraries loaded later by this kernel.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
import pyarabic.araby as araby
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy


# Render every column and every row of a frame, and allow up to 1000
# characters per cell before truncating.
pd.set_option(
    'display.max_columns', None,
    'display.max_rows', None,
    'display.max_colwidth', 1000,
)

In [3]:
# cell-1
# Load the poems dataset and replace missing values with empty strings.
# Diacritics and unwanted text are removed further down in this cell.

df = pd.read_csv('poemsDataset.csv').fillna('')
display(len(df))


def remove_diacritics(a):
    """Return `a` after passing it through `araby.strip_diacritics`."""
    stripped = araby.strip_diacritics(a)
    return stripped

# Strip diacritics from every value of both hemistich columns.
df['first_hemistich'] = df['first_hemistich'].map(remove_diacritics)
df['second_hemistich'] = df['second_hemistich'].map(remove_diacritics)

def normalizeBeforeTraining(df):
    """Clean the two hemistich text columns of `df` in place.

    Steps, applied in this order:
      1. Delete known speaker/section labels (matched as literal text) from
         the column each label is listed for.
      2. Replace alef-with-madda 'آ' by alef-with-hamza 'أ'.
      3. Delete the punctuation characters / " : ? ، ؟
      4. Collapse every run of two or more spaces into a single space.

    Every `str.replace` call passes `regex=` explicitly. The default of that
    flag differs between pandas versions, so relying on it either turns the
    parenthesised labels into regex groups that never match the literal
    text, or turns the punctuation character class into a literal string
    that never matches.

    Args:
        df: DataFrame with string columns 'first_hemistich' and
            'second_hemistich'. Both columns are overwritten.

    Returns:
        None. `df` is modified in place.
    """
    first, second = 'first_hemistich', 'second_hemistich'

    # (column, literal label) pairs. The labels contain ':' and therefore
    # must be removed before the punctuation step below deletes colons.
    labels = (
        (first, 'النابغـة: '),
        (second, 'الـربيع: '),
        (first, 'عبيــد: '),
        (first, 'امـرؤ القيسـ: '),
        (first, 'امرؤ القيس: '),
        (first, '(جلال الــــدين الــــرومي):'),
        (first, '(لـوك الفيلسـوف الإنكليزي):'),
        (first, '(كانت الفيلسوف الألماني ):'),
        (first, '(بركســــــــــــــــون):'),
        (first, '(الحـــــــــــــــور):'),
        (first, '(الشــــــــــــــاعر):'),
        (first, '(الإنســـــــــــــــان):'),
        # The original pattern lacked the opening parenthesis and left a stray
        # '(' behind; the full form is tried first, the old form is kept as a
        # fallback.
        (first, '(العلم):'),
        (first, 'العلم):'),
        (first, '(العشــــــــــــــــق):'),
        (first, '(الزهــــــــــــــــــرة):'),
        (second, 'التوأم اليشكري: '),
    )
    for column, label in labels:
        df[column] = df[column].str.replace(label, '', regex=False)

    for column in (first, second):
        cleaned = df[column].str.replace('آ', 'أ', regex=False)
        # Character class: must be evaluated as a regular expression.
        cleaned = cleaned.str.replace('[/":?،؟]', '', regex=True)
        # Collapses runs of any length, not only runs of up to four spaces.
        cleaned = cleaned.str.replace(' {2,}', ' ', regex=True)
        df[column] = cleaned


normalizeBeforeTraining(df)

# Discard verses in which both hemistichs are empty.
both_empty = (df['first_hemistich'] == '') & (df['second_hemistich'] == '')
df.drop(df[both_empty].index, inplace=True)

# If the first hemistich is empty, move the text of the second hemistich into it.
# Vectorised with Series.mask instead of a row-wise df.apply(axis=1), which is
# slow on a frame of this size and does not return a Series for an empty frame.
first_missing = df['first_hemistich'] == ''
df['first_hemistich'] = df['first_hemistich'].mask(first_missing, df['second_hemistich'])

# Then blank the second hemistich wherever it equals the first one. This covers
# the rows just filled above and also rows whose two hemistichs were already identical.
same_text = df['first_hemistich'] == df['second_hemistich']
df['second_hemistich'] = df['second_hemistich'].mask(same_text, '')

df.reset_index(drop=True, inplace=True)

display(len(df))
print('done')

2090907

2090907

done


In [4]:
# cell-2
# Prepare the data for pretraining (all poems are used, even those without a labeled meter).

# An empty second hemistich is encoded as the placeholder 'E'. The result is
# assigned back to the column: calling replace(..., inplace=True) on the
# selection df['second_hemistich'] is a chained in-place update, which pandas
# warns about and which leaves `df` unchanged under Copy-on-Write.
df['second_hemistich'] = df['second_hemistich'].replace('', 'E')
dfc = df[['first_hemistich', 'second_hemistich', 'meter']].copy()
# One training line per verse: first hemistich, the separator ' S ', second hemistich.
dfc['text'] = dfc['first_hemistich'] + ' S ' + dfc['second_hemistich']

dfc.reset_index(drop=True, inplace=True)


In [None]:
# Run this cell only if no pretrained tokenizer is available yet.
# It trains a tokenizer (fed by an iterator function) and then saves it locally.

from tqdm import tqdm
from transformers import BertTokenizerFast

# Load the stock BERT fast tokenizer; it serves as the base for the new tokenizer.
tokenizer = BertTokenizerFast.from_pretrained(
    'bert-base-uncased',
)

def batch_iterator(batch_size=10000):
    """Yield the 'text' column of the global `dfc` in consecutive batches.

    Args:
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        A plain Python list holding the 'text' values of the next
        `batch_size` rows (the last batch may be shorter).
    """
    texts = dfc['text']
    for start in tqdm(range(0, len(texts), batch_size)):
        # A list of texts rather than a pandas Series slice: this is the batch
        # shape the tokenizer trainer is documented to consume.
        yield texts.iloc[start:start + batch_size].tolist()
# Train a new tokenizer with a vocabulary size of 50000 on the batches
# produced by batch_iterator(), then save it to a local directory.
bert_tokenizer = tokenizer.train_new_from_iterator(
    text_iterator=batch_iterator(),
    vocab_size=50000,
)
bert_tokenizer.save_pretrained('araPoemBERT_tokenizer')

In [5]:
# cell-3
#tokenizing the whole text 

import tokenizers
from transformers import Trainer, TrainingArguments, LineByLineTextDataset, BertModel
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import AutoTokenizer

# Sequences are tokenized with a block size of 32.
max_seq_length = 32
tokenizer = AutoTokenizer.from_pretrained('faisalq/bert-medium-arapoembert')

# NOTE(review): 'poem_text.txt' is not written by any cell visible here;
# presumably it was exported from dfc['text'] beforehand. Confirm.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='poem_text.txt',
    block_size=max_seq_length,
)
display(len(dataset))

2090851

In [15]:
# Show the first five tokenized examples as a sanity check.
dataset[:5]

[{'input_ids': tensor([    2,   909,  3603,    53,  1472,  8993,     9,   126,  8979,  8989,
          11435,   133,     3])},
 {'input_ids': tensor([    2,   126,  1094, 15305,  9899,   133,     9,  1599,   136,  1814,
           2611,  5667,     3])},
 {'input_ids': tensor([    2, 17860, 13222, 13908,  1684,     9, 11696, 11360,   126,  3682,
             85, 14474,     3])},
 {'input_ids': tensor([    2,   124,   609,  6877,  3410,   340, 31589,     9,     8,     3])},
 {'input_ids': tensor([    2,    92, 11630,  8859,    84, 26039,     9, 31405,   167,   121,
           4450, 42093,    51,     3])}]

In [6]:
# cell-4
# Model configuration: 8 hidden layers, 12 attention heads, hidden size 768,
# a vocabulary of 50000 entries and at most 32 positions per sequence.
config = BertConfig(
    vocab_size=50000,
    hidden_size=768,
    num_hidden_layers=8,
    num_attention_heads=12,
    max_position_embeddings=32,
)

# Masked-LM model built from the config alone (no pretrained weights are loaded here).
model = BertForMaskedLM(config)
display(model.num_parameters())

95772752

In [7]:
# cell-5
# Pretrain the model with the masked-language-modelling objective.

import os

from transformers.trainer_utils import get_last_checkpoint

# Masking is applied by the collator; 15% of the tokens are selected for the MLM objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
                                               mlm_probability=0.15)
output_dir = 'araPoemBERT_8L12H/'
epochs = 1293
save_steps = 10000 #save checkpoint every 10000 steps
batch_size = 256

training_args = TrainingArguments(
    output_dir = output_dir,
    overwrite_output_dir=True,
    num_train_epochs = epochs,
    per_device_train_batch_size = batch_size,
    save_steps = save_steps,
    save_total_limit = 5, #only save the last 5 checkpoints
    fp16=True,
    learning_rate = 5e-5,  # 5e-5 is the default
    logging_steps = 300 #50_000
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

# NOTE(review): wildcard import; nothing it provides is used in this cell.
# Kept in case later cells rely on the names it brings in.
from pynvml import *

# Resume from the newest checkpoint in output_dir when there is one and start
# from scratch otherwise. Passing resume_from_checkpoint=True unconditionally
# raises on a first run, because no checkpoint exists yet.
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)
trainer.save_model(output_dir)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10560300,2.1249
10560600,2.1485
10560900,2.1366
10561200,2.1493


In [6]:
# Manually save the trainer's current model to the output directory.
trainer.save_model('araPoemBERT_8L12H/')