# Load Arood Models

## Imports

In [1]:
import tensorflow as tf
import numpy as np
import os
import re
import shap
from random import shuffle
from pyarabic import araby
from tensorflow.keras.layers import GRU, Embedding, Dense, Input, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from lime.lime_text import LimeTextExplainer

## Load DataSet

### Read label

In [2]:
# Read the class names, one per line. The position of a name in the list is
# the integer class id that label2name is indexed with after prediction.
label_file = os.path.join('./final_baits', 'labels.txt')
# Explicit UTF-8 keeps decoding independent of the platform's default locale,
# matching how extract_data opens the dataset files.
with open(label_file, 'r', encoding='utf-8') as f:
    # Blank lines are kept (as '') on purpose: dropping them would shift the
    # index -> name mapping.
    label2name = [line.strip() for line in f]

### Preprocessing DataSet

In [3]:
def preprocess_prosody(text: str) -> str:
    """Rewrite Arabic text so its spelling is closer to how it is scanned.

    Surrounding whitespace is stripped, then three substitutions are applied
    in this order:

    1. every tanween mark is replaced by the letter noon,
    2. a character followed by a shadda is written twice (the shadda is
       dropped),
    3. at a word boundary, alef-lam followed by one of the letters listed in
       the pattern loses its lam.

    Returns an empty string when the input holds nothing but whitespace.

    NOTE(review): the letter class of the third rule does not contain zay or
    seen, which are usually counted among the sun letters. Confirm whether
    that is intentional before changing it, since models trained on this
    preprocessing would see different input afterwards.
    """
    stripped = text.strip()
    if stripped == "":
        return ""

    # (pattern, replacement) pairs; the order of application is significant.
    rewrites = (
        (r'[ًٌٍ]', 'ن'),
        (r'(.)ّ', r'\1\1'),
        (r'\bال([تثدذرشصضطظلن])', r'ا\1'),
    )
    for pattern, replacement in rewrites:
        stripped = re.sub(pattern, replacement, stripped)
    return stripped

In [4]:
def extract_data(path, on_shatrs=False):
    """Load a labelled verse file and return shuffled samples and labels.

    Every line longer than one character is expected to look like
    ``<integer label> <verse text>``. Before the text is split into lines the
    whole file is passed through ``preprocess_prosody`` and
    ``araby.strip_tatweel`` and the characters in ``excluded_chars`` are
    deleted.

    Args:
        path: Path of the UTF-8 text file to read.
        on_shatrs: When true, each verse is split on ``'#'`` and every part
            becomes its own sample carrying the verse's label; otherwise the
            verse is kept whole (including any ``'#'``).

    Returns:
        ``(X, y)``: the texts and their integer labels, shuffled together.

    Raises:
        ValueError: If a line has no space separating label and text, or its
            label is not an integer.

    Side effects:
        Rebinds the module-level ``vocab`` to the sorted set of characters of
        the samples read by *this* call, replacing whatever an earlier call
        stored there.
    """
    global vocab
    X, y = [], []

    # Read the file with UTF-8 encoding
    with open(path, 'r', encoding='utf-8') as file:
        raw = file.read()

    raw = preprocess_prosody(raw)
    raw = araby.strip_tatweel(raw)

    # Remove unwanted characters in a single pass over the text.
    excluded_chars = '!()*-ـ.:=o[]«»;؛,،~?؟\u200f\ufeffـ\xa0'
    cleaned_text = raw.translate(str.maketrans('', '', excluded_chars))

    # Line numbers refer to the cleaned text, which has lost any leading
    # blank lines of the file to the strip() inside preprocess_prosody.
    for line_no, line in enumerate(cleaned_text.split('\n'), start=1):
        if len(line) <= 1:  # Skip empty or short lines
            continue
        label_text, separator, bait = line.partition(' ')
        if not separator:
            raise ValueError(
                f"{path}: line {line_no} has no space between label and text: {line!r}"
            )
        try:
            label = int(label_text)
        except ValueError as err:
            raise ValueError(
                f"{path}: line {line_no} has a non-integer label {label_text!r}"
            ) from err
        bait = bait.strip()
        if on_shatrs:
            # One sample per shatr, each inheriting the verse's label.
            for shatr in bait.split('#'):
                X.append(shatr.strip())
                y.append(label)
        else:
            X.append(bait)
            y.append(label)

    # Joining with ' ' puts the space character into the vocabulary whenever
    # there is more than one sample.
    vocab = sorted(set(' '.join(X)))

    # Shuffle the data to avoid order bias. `shuffle` is sklearn.utils.shuffle
    # (imported after random.shuffle, so it is the binding in effect); it is
    # called without a seed here.
    X, y = shuffle(X, y)
    return X, y

### Read Train Data & Test Data

In [5]:
# Paths of the training and test splits inside the dataset folder.
train_file, test_file = (
    os.path.join('./final_baits', split_name)
    for split_name in ('train.txt', 'test.txt')
)

In [6]:
# Load the training file as whole verses (no split into shatrs).
# extract_data also rebinds the global `vocab` as a side effect.
X, y = extract_data(train_file, on_shatrs=False)

In [7]:
# Hold out 15% of the loaded training samples for validation, using a fixed
# seed for the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=41,
    test_size=0.15,
)

In [8]:
# Load the test file as whole verses.
# NOTE(review): this call rebinds the global `vocab` to the characters of the
# test file, overwriting the value left by any earlier extract_data call.
# Confirm this is the vocabulary the saved models were trained with.
X_test, y_test = extract_data(test_file, on_shatrs=False)

In [9]:
# Give every vocabulary character a positive integer id; id 0 stays free
# because to_sequences uses it as the padding value.
# NOTE(review): `vocab` holds whatever the most recent extract_data call
# stored. Verify it matches the vocabulary used when the models were trained.
char2idx = {char: index for index, char in enumerate(vocab, start=1)}

In [10]:
def to_sequences(X, maxlen=100):
    """Encode texts as fixed-length rows of character ids.

    Args:
        X: Iterable of strings; every character must be a key of the global
            ``char2idx``.
        maxlen: Length every row is padded or truncated to. Defaults to 100,
            the value that used to be hard-coded.

    Returns:
        The array produced by ``pad_sequences``, one row per text. Rows
        shorter than ``maxlen`` are filled with 0 at the end; longer rows are
        cut according to Keras' default ``truncating`` setting.

    Raises:
        KeyError: If a character is missing from ``char2idx``.
    """
    encoded = [[char2idx[char] for char in line] for line in X]
    return pad_sequences(encoded, padding='post', value=0, maxlen=maxlen)

In [11]:
# Replace each text split with its padded matrix of character ids.
# NOTE: the names are rebound in place, so after this cell they no longer hold
# strings; reload the data before running it again.
X_train = to_sequences(X_train)
X_valid = to_sequences(X_valid)
X_test = to_sequences(X_test)

In [12]:
# Turn the three label lists into NumPy arrays.
y_train, y_valid, y_test = (
    np.array(labels) for labels in (y_train, y_valid, y_test)
)

## Load AroodV1, AroodV2 and AroodV3

In [13]:
# Load the three saved Arood models (V1, V2, V3) from the working directory.
AroodV1 = tf.keras.models.load_model('AroodV1.keras')
AroodV2 = tf.keras.models.load_model('AroodV2.keras')
AroodV3 = tf.keras.models.load_model('AroodV3.keras')

## Classify Function

### classify for AroodV1

In [14]:
def classify_V1(sentence):
    """Predict the metre of ``sentence`` with the AroodV1 model.

    Tashkeel and tatweel are stripped, then the text is encoded with the
    global ``char2idx`` and padded to the width of ``X_train``.

    Args:
        sentence: The verse text to classify.

    Returns:
        ``(label, confidence)``: the ``label2name`` entry of the
        highest-scoring class and that class's score.
    """
    sentence = araby.strip_tashkeel(sentence)
    sentence = araby.strip_tatweel(sentence)
    # Characters missing from the vocabulary fall back to the '<UNK>' id if
    # one exists, else to 0, exactly as classify_V2 does, instead of raising
    # KeyError on user-supplied text.
    unknown_id = char2idx.get('<UNK>', 0)
    sequence = [char2idx.get(char, unknown_id) for char in sentence]
    sequence = pad_sequences([sequence], maxlen=X_train.shape[1], padding='post', value=0)

    # predict() is given a batch of one; keep only that row of scores.
    pred = AroodV1.predict(sequence)[0]
    label = label2name[int(np.argmax(pred))]
    confidence = np.max(pred)

    # Instead of print, return the result
    return label, confidence

### classify for AroodV2

In [15]:
def classify_V2(sentence):
    """Predict the metre of ``sentence`` with the AroodV2 model.

    Tatweel and diacritics are stripped, then the text is encoded with the
    global ``char2idx`` and padded to the width of ``X_train``. Characters
    that are not in the vocabulary are encoded with the ``'<UNK>'`` id when
    the vocabulary has one, otherwise with 0.

    Args:
        sentence: The verse text to classify.

    Returns:
        ``(label, confidence)``: the ``label2name`` entry of the
        highest-scoring class and that class's score.
    """
    cleaned = araby.strip_diacritics(araby.strip_tatweel(sentence))

    unknown_id = char2idx.get('<UNK>', 0)
    encoded = [char2idx.get(char, unknown_id) for char in cleaned]
    batch = pad_sequences([encoded], maxlen=X_train.shape[1], padding='post', value=0)

    # predict() is given a batch of one; keep only that row of scores.
    scores = AroodV2.predict(batch)[0]
    return label2name[np.argmax(scores)], np.max(scores)

### classify for AroodV3

In [16]:
def classify_V3(sentence):
    """Predict the metre of ``sentence`` with the AroodV3 model.

    The text goes through ``preprocess_prosody`` and tatweel stripping,
    non-breaking and zero-width spaces become regular spaces, and the result
    is encoded with the global ``char2idx`` and padded to the width of
    ``X_train``.

    Args:
        sentence: The verse text to classify.

    Returns:
        ``(label, confidence)``: the ``label2name`` entry of the
        highest-scoring class and that class's score.
    """
    sentence = preprocess_prosody(sentence)
    sentence = araby.strip_tatweel(sentence)
    sentence = re.sub(r'[\xa0\u200b\u200c]', ' ', sentence)  # Replace non-breaking space and zero-width spaces with regular spaces
    # Characters missing from the vocabulary fall back to the '<UNK>' id if
    # one exists, else to 0, exactly as classify_V2 does, instead of raising
    # KeyError on user-supplied text.
    unknown_id = char2idx.get('<UNK>', 0)
    sequence = [char2idx.get(char, unknown_id) for char in sentence]
    sequence = pad_sequences([sequence], maxlen=X_train.shape[1], padding='post', value=0)

    # predict() is given a batch of one; keep only that row of scores.
    pred = AroodV3.predict(sequence)[0]
    label = label2name[int(np.argmax(pred))]
    confidence = np.max(pred)

    # Instead of print, return the result
    return label, confidence

## Test

In [17]:
def predict_metre(sentence):
    """Combine the three Arood classifiers into a single metre prediction.

    Each model's label and confidence are printed. If at least two models
    give the same label, that label is chosen and the most confident of the
    agreeing models is reported. If all three disagree, the single most
    confident model decides. Ties in confidence go to the lower-numbered
    model.

    Args:
        sentence: The verse text to classify.

    Returns:
        ``(best_model, chosen_metre, confidence_percent)``: the name of the
        reported model, the chosen label, and that model's confidence
        multiplied by 100.
    """
    # Query every model first, then report, so the output order is stable.
    label_v1, confidence_v1 = classify_V1(sentence)
    label_v2, confidence_v2 = classify_V2(sentence)
    label_v3, confidence_v3 = classify_V3(sentence)

    print(f"AroodV1 predicted: {label_v1} with confidence {confidence_v1*100:.6f}%")
    print(f"AroodV2 predicted: {label_v2} with confidence {confidence_v2*100:.6f}%")
    print(f"AroodV3 predicted: {label_v3} with confidence {confidence_v3*100:.6f}%")

    votes = (
        ("AroodV1", label_v1, confidence_v1),
        ("AroodV2", label_v2, confidence_v2),
        ("AroodV3", label_v3, confidence_v3),
    )
    labels = [label_v1, label_v2, label_v3]

    # Models whose label is shared with at least one other model. With three
    # voters this is empty exactly when all labels differ, in which case every
    # model competes on confidence alone.
    agreeing = [vote for vote in votes if labels.count(vote[1]) >= 2]
    candidates = agreeing if agreeing else votes

    # max() returns the first maximal entry, so ties favour the earlier model.
    best_model, chosen_metre, highest_confidence = max(candidates, key=lambda vote: vote[2])

    print(f"Predicted metre: {chosen_metre} with highest confidence from {best_model} ({highest_confidence*100:.6f}%)")

    return best_model, chosen_metre, highest_confidence * 100

In [18]:
# Run the three-model ensemble on a sample verse.
# NOTE: predict_metre returns (best_model, chosen_metre, confidence_percent),
# so `chosen_metre` holds that whole tuple, not just the metre name.
chosen_metre = predict_metre("قَد طَالَ شَوقِي وَعَادَني طَرَبِي # مِن ذِكِر خَودٍ كَريمَةِ الحَسَبِ")

AroodV1 predicted: munsareh with confidence 99.986231%
AroodV2 predicted: munsareh with confidence 99.988794%
AroodV3 predicted: munsareh with confidence 99.958760%
Predicted metre: munsareh with highest confidence from AroodV2 (99.988794%)


In [19]:
# Call each classifier directly on the same verse and print the raw
# (label, confidence) tuple it returns.
print(classify_V1("قَد طَالَ شَوقِي وَعَادَني طَرَبِي # مِن ذِكِر خَودٍ كَريمَةِ الحَسَبِ"))
print(classify_V2("قَد طَالَ شَوقِي وَعَادَني طَرَبِي # مِن ذِكِر خَودٍ كَريمَةِ الحَسَبِ"))
print(classify_V3("قَد طَالَ شَوقِي وَعَادَني طَرَبِي # مِن ذِكِر خَودٍ كَريمَةِ الحَسَبِ"))

('munsareh', 0.9998623)
('munsareh', 0.99988794)
('munsareh', 0.9995876)
