# Arood Version3

## Imports

In [1]:
import tensorflow as tf
import numpy as np
import os
import re
from random import shuffle
from pyarabic import araby
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

## Check GPU Availability

In [2]:
print(tf.config.experimental.list_physical_devices())
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Num GPUs Available:  1


## Load DataSet

The dataset is in the file final_baits it is splitted into two files for training train.txt and testing test.txt and the labels of the meters are saved in the file labels.txt.

## Read label

Read label names from a file and clean them

In [3]:
with open('final_baits/labels.txt', 'r') as f:
  label2name = f.readlines()
  label2name = [name.replace('\n', '') for name in label2name]

### Preprocessing DataSet

In [4]:
def preprocess_prosody(text: str) -> str:
    # Rule 1: Replace tanween with silent noon
    tanween_to_noon = {
        r'[ً]': 'ن',  # Tanween Fatha
        r'[ٌ]': 'ن',  # Tanween Damma
        r'[ٍ]': 'ن',  # Tanween Kasra
    }
    for pattern, replacement in tanween_to_noon.items():
        text = re.sub(pattern, replacement, text)

    # Rule 2: Handle shadda by duplicating letters
    text = re.sub(r'(.)ّ', r'\1\1', text)  # Replace shadda with duplicated letters

    # Rule 3: Special handling: Saturate doubled letters at the end of the first hemistich
    text = re.sub(r'(.)ّ$', r'\1\1', text, flags=re.MULTILINE)

    # Rule 4: Add Alif in specific contexts
    alif_replacements = {
        r'هذا': 'هاذا',
        r'هذه': 'هاذه',
        r'هذان': 'هاذان',
        r'هذين': 'هاذين',
        r'ذلك': 'ذالك',
        r'الله': 'اللاه',
        r'الرحمن': 'اَرْرحمان',
        r'إله': 'إلاه',
        r'لكنْ': 'لاكنْ',
        r'لكنَّ': 'لاكنْنَ',
        r'طه': 'طاها'
    }
    for original, prosodic in alif_replacements.items():
        text = text.replace(original, prosodic)

    # Rule 5: Handle Solar and Lunar Lam
    # Define solar and lunar letters
    solar_letters = r'تثدذرشصضطظلن'
    lunar_letters = r'ابجحخعغفقكملوه'

    # Remove the "ل" in "ال" when followed by solar letters
    text = re.sub(r'\bال([' + re.escape(solar_letters) + r'])', r'ا\1', text)

    # Ensure "ال" remains unchanged for lunar letters
    text = re.sub(r'\bال([' + re.escape(lunar_letters) + r'])', r'ال\1', text)

    return text

Extracts labeled text data from the provided file.
Preprocesses text by removing diacritics and unwanted characters.

In [5]:
def extract_data(path, on_shatrs=False):
    global vocab
    text = ""
    X, y = [], []

    # Read the file with UTF-8 encoding
    with open(path, 'r', encoding='utf-8') as file:
        t = file.read()

    t = preprocess_prosody(t)
    t = araby.strip_tatweel(t)
    
    # Remove unwanted characters
    excluded_chars = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ'
    cleaned_text = ''.join([char for char in t if char not in excluded_chars])
    
    text += cleaned_text
    baits = cleaned_text.split('\n')
    for line in baits:
        if len(line) <= 1:  # Skip empty or short lines
            continue
        label, bait = line.split(' ', 1)  # Split label and text
        label = int(label)
        bait = bait.strip()
        if on_shatrs:
            # Further split text into parts (shatrs)
            shatrs = bait.split('#')
            for shatr in shatrs:
                X.append(shatr.strip())
                y.append(label)
        else:
            X.append(bait.strip())
            y.append(label)
    
    # Create a sorted vocabulary from the dataset
    vocab = sorted(set(' '.join(X)))

    # Shuffle the data to avoid order bias
    X, y = shuffle(X, y)
    return X, y

### Read Train Data & Test Data

File paths for training and testing datasets

In [6]:
train_file = os.path.join('./final_baits', 'train.txt')
test_file = os.path.join('./final_baits', 'test.txt')

Extract and preprocess the training data

In [7]:
X, y = extract_data(train_file, on_shatrs=False)

Display the first few data samples for verification

In [8]:
for i in range(5):
  print(X[i], ' ', label2name[y[i]])

كللما حننتْ لأرضِ المُنحنى # وكَلاها أقرحَ السسَوْقُ كُلاها   ramal
فكاتب يقام إجلالا له # وكاتب لا نستحى أن نصفعه   rajaz
فَيا لِلنَصارى إِذا أَمسَكوا # وَيا لِليَهودِ إِذا أَسبَتوا   mutakareb
كأن ذاك اشرار من ذهب # قُراضةن تستطير من نُقَر   munsareh
أنا مفتاحُ المَلاهِي والطربْ # هيئتي ظَرْفن وأحوالي عَجَب   ramal


Split the training data into train and validation sets

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=41)

Extract and preprocess the test data

In [10]:
X_test, y_test = extract_data(test_file, on_shatrs=False)

Map each character in the vocabulary to an index

In [11]:
char2idx = {u:i+1 for i, u in enumerate(vocab)}

### Convert text to sequences

Converts text data to sequences of indices

In [12]:
def to_sequences(X):
  X = [[char2idx[char] for char in line] for line in X]
  X = pad_sequences(X, padding='post', value=0, maxlen = 100)
  return X

Convert text data into sequences of indices

In [13]:
X_train = to_sequences(X_train)
X_valid = to_sequences(X_valid)

Convert labels to numpy arrays for compatibility

In [14]:
y_train = np.array(y_train)
y_valid = np.array(y_valid)

## Create the model

In [15]:
model = Sequential([
    Input((100,)),  # Input layer with maxlen=100
    Embedding(len(char2idx) + 1, 256),  # Embedding layer with vocab size + 1
    Bidirectional(GRU(units=256, return_sequences=True)),  # First Bi-GRU layer
    Bidirectional(GRU(units=256, return_sequences=True)),  # Second Bi-GRU layer
    Bidirectional(GRU(units=256)),  # Third Bi-GRU layer
    Dense(128, activation='relu'),  # Dense layer for feature extraction
    Dropout(0.3),  # Dropout for regularization
    Dense(len(label2name), activation='softmax')  # Output layer for classification
])

Compile the model with Adam optimizer and categorical crossentropy loss

In [16]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Display the model summary

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 256)          11264     
                                                                 
 bidirectional (Bidirectiona  (None, 100, 512)         789504    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100, 512)         1182720   
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 512)              1182720   
 nal)                                                            
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                        

Verify model input-output shape

In [18]:
model(tf.zeros((10, 100))).shape

TensorShape([10, 14])

### Callbacks

Define callbacks for learning rate adjustment and model checkpointing

In [19]:
callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_delta=0.0001, min_lr=0.0001),
    tf.keras.callbacks.ModelCheckpoint('full_verse.keras', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
]

### Train Model

In [20]:
model.fit(X_train, y_train, validation_data= (X_valid, y_valid), epochs = 15, batch_size= 128, shuffle = True, callbacks=callbacks)

Epoch 1/15

Epoch 1: val_accuracy improved from -inf to 0.61423, saving model to full_verse.keras
Epoch 2/15

Epoch 2: val_accuracy improved from 0.61423 to 0.87551, saving model to full_verse.keras
Epoch 3/15

Epoch 3: val_accuracy improved from 0.87551 to 0.91498, saving model to full_verse.keras
Epoch 4/15

Epoch 4: val_accuracy improved from 0.91498 to 0.93026, saving model to full_verse.keras
Epoch 5/15

Epoch 5: val_accuracy improved from 0.93026 to 0.93761, saving model to full_verse.keras
Epoch 6/15

Epoch 6: val_accuracy improved from 0.93761 to 0.94115, saving model to full_verse.keras
Epoch 7/15

Epoch 7: val_accuracy improved from 0.94115 to 0.94186, saving model to full_verse.keras
Epoch 8/15

Epoch 8: val_accuracy did not improve from 0.94186
Epoch 9/15

Epoch 9: val_accuracy improved from 0.94186 to 0.94242, saving model to full_verse.keras
Epoch 10/15

Epoch 10: val_accuracy did not improve from 0.94242
Epoch 11/15

Epoch 11: val_accuracy improved from 0.94242 to 0.9516

<keras.callbacks.History at 0x20e415f7850>

## Evaluate the model

In [21]:
X_test = to_sequences(X_test)
y_test = np.array(y_test)

# Predict on test data
y_pred_probs = model.predict(X_test)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Display classification metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Test Accuracy: 0.9518
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       731
           1       0.94      0.94      0.94       760
           2       0.97      0.98      0.98       758
           3       0.93      0.93      0.93       178
           4       0.98      0.94      0.96       681
           5       0.95      0.83      0.89       230
           6       0.95      0.96      0.95       303
           7       0.97      0.96      0.96       719
           8       0.97      0.98      0.98       752
           9       0.97      0.98      0.97       759
          10       0.98      0.99      0.99       752
          11       0.96      0.97      0.97       769
          12       0.91      0.95      0.93       168
          13       0.90      0.88      0.89       756

    accuracy                           0.95      8316
   macro avg       0.95      0.95      0.95      8316
weighted avg       0.95      0.95  

## Classification Function

Classify a single sentence using the trained model

In [22]:
def classify(sentence):
    sentence = preprocess_prosody(sentence)
    sentence = araby.strip_tatweel(sentence)
    sequence = [char2idx[char] for char in sentence]
    sequence = pad_sequences([sequence], maxlen = X_train.shape[1], padding='post', value=0)

    pred = model.predict(sequence)[0]
    print(label2name[np.argmax(pred, 0).astype('int')], np.max(pred))

## Tests

In [24]:
classify("ما تردون على هذا المحب # دائبا يشكو إليكم في الكتب")
classify("ولد الهدى فالكائنات ضياء # وفم الزمان تبسم وسناء")
classify("لك يا منازل في القلوب منازل # أقفرت أنت وهن منك أواهل")
classify("ومن لم يمت بالسيف مات بغيره # تعددت الأسباب والموت واحد")
classify("أنا النبي لا كذب # أنا ابن عبد المطلب")
classify("قَد تَقَطَرنَ بالعبيرِ ومَسكٍ # وَتَكَبَينَ بالكباءِ ذكيا")
classify("رُبَّما ضَربَةٍ بسيفٍ صَقِيلٍ # دُونَ بُصرَى وَطَعْنَةٍ نَجلاءِ")
classify("أَيا هِندُ لا تَنكِحي بَوهَةَ # عَلَيهِ عَقيقَتُهُ أَحسَبا")
classify("أَكَلتُ شَبابي فَأَفنَيتُهُ # وَأَفنَيتُ بَعدَ شُهورٍ شُهورا")
classify("بان شبابٌ لمَّا يكن شابا # ويحي ولم أقضِ منه آرابا")
classify("عوجوا إِلى بَيتِ عَمرو # إِلى سَماعٍ وَخَمرِ")
classify("إِذا ما اِتَّقَينا رَمقَةً مِن مُبَلِّغٍ # فَأَعيُنُنا عَنّا تُجيبُ وَتَفهَمُ")

ramal 0.9303919
kamel 0.9826461
kamel 0.99910456
taweel 0.8783224
mujtath 0.9995054
khafeef 0.9937691
khafeef 0.9998223
mutakareb 0.99994004
mutakareb 0.99996233
khafeef 0.99200964
mujtath 0.99999654
taweel 0.9996996
