# Import Libraries

In [17]:
# Built in Libraries
import re
import csv
import joblib
import sqlite3
from string import punctuation

#nlp
import nltk
import pyarabic.araby as araby


# Preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical


# Modeling and Evaluation
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


# Deeplearning
import tensorflow as tf
import tensorflow_addons as tfa

# Custom Setup
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

# Setting Random Seed
tf.random.set_seed(42)
tf.keras.utils.set_random_seed(42)

# Load Dataset

In [18]:
conn = sqlite3.connect("../data/dialects_database.db")
df_label = pd.read_sql_query("SELECT * FROM id_text", conn)
df_target = pd.read_sql_query("SELECT * FROM id_dialect", conn)
df = pd.merge(df_label, df_target, on="id")
conn.close()

# Machine Learning

## Data Preprocessing

In [19]:
STOP_WORDS = set(nltk.corpus.stopwords.words("arabic"))

def replace_punctuation(text: str) -> str:
    added_punctuation = punctuation + "؟،"
    return re.sub(rf"[{added_punctuation}]", " ", text)


def remove_arabic_diatrics(text: str) -> str:
    text = araby.strip_tashkeel(text)
    text = araby.normalize_ligature(text)
    return text


def keep_arabic(text: str) -> str:
    return re.sub(r"[^\u0600-\u06FF ]+", " ", text)


def remove_stop_words(text: str) -> str:
    return " ".join(word for word in text.split() if word not in STOP_WORDS)


def replace_repeated_chars(text: str) -> str:
    return re.sub(r"(\w)\1{2,}", r"\1\1", text)


def preprocess(text: str) -> str:
    text = replace_punctuation(text)
    text = remove_arabic_diatrics(text)
    text = keep_arabic(text)
    text = remove_stop_words(text)
    text = replace_repeated_chars(text)
    return text


def wrangle_ml(df: pd.DataFrame) -> pd.DataFrame:
    df["text"] = df["text"].apply(preprocess)
    return df


In [20]:
df_clean = wrangle_ml(df)

## Save Cleaned Data

In [21]:
df_clean.to_csv('../data/dialects_cleaned.csv')

## Splitting Data

In [22]:
def split_data(df):
    X = df["text"]
    y = df["dialect"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
    return X_train, X_test, y_train, y_test

In [23]:
X_train, X_test, y_train, y_test = split_data(df_clean)

## Modeling

### Pipeline

In [24]:
final_model = ComplementNB(alpha=0.3)
pipe = Pipeline([("Vectorizer", TfidfVectorizer()), ("classifier", final_model)])
pipe.fit(X_train, y_train)

### Save Model

In [25]:
ML_MODEL_PATH = '../models/ml_model.pkl'

In [28]:
def save_ml_model(model, path=ML_MODEL_PATH):
    joblib.dump(model, path)


def load_ml_model(path=ML_MODEL_PATH):
    model = joblib.load(path)
    return model

In [29]:
save_ml_model(pipe, ML_MODEL_PATH)
model = load_ml_model(ML_MODEL_PATH)

## Evaluation

In [30]:
def eval_ml(path=ML_MODEL_PATH):
    model = load_ml_model(path)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1_score = f1_score(y_test, y_pred, average="macro")
    print(f"Testing ML:\nAccuracy: {accuracy}")
    print(f"Macro F1 score: {macro_f1_score}")
    return accuracy, macro_f1_score

In [31]:
ml_accuracy, ml_macro_f1_score = eval_ml(ML_MODEL_PATH)

Testing ML:
Accuracy: 0.8376768428890543
Macro F1 score: 0.7994041658446861


### Prediction

In [16]:
def predict_ml(sentence: str, path=ML_MODEL_PATH):
    """For wrangled data"""
    model = load_ml_model(path)
    predict_label = model.predict([sentence])
    predict_probabiltiy = {
        country: prob * 100
        for country, prob in zip(model.classes_, model.predict_proba([sentence])[0])
    }
    print(f"Dialect Prediction: {predict_label}\nProbabily: {predict_probabiltiy}")
    return predict_label, predict_probabiltiy

In [17]:
sentence = 'الحمد لله'

model_prediction, predict_probabiltiy = predict_ml(sentence)

Dialect Prediction: ['EG']
Probabily: {'EG': 23.97523662167075, 'LB': 15.804329054132648, 'LY': 20.511809886955135, 'MA': 21.387492799872508, 'SD': 18.321131637368932}


# DeepLearning

In [18]:
NUM_CLASSES = 5
EPOCHS = 2
BATCH_SIZE = 32
MAX_WORDS = 10_000
INPUT_LENGTH = MAX_SEQUENCE_LEN = -1
DL_MODEL_PATH = '../models/LSTM'
SRNN_MODEL_PATH = '../models/SimpleRNN'
GRU_MODEL_PATH = '../models/GRU'
DL_LABELS_PATH = '../models/dl_labels.pkl'

## Data Preprocessing and Splitting

In [19]:
def wrangle_dl(df):
    df_clean = wrangle_ml(df)
    X_train, X_test, y_train, y_test = split_data(df_clean)
    
    #preprocess
    max_sequence_len = max(len(sentence) for sentence in X_train)
    global INPUT_LENGTH, MAX_SEQUENCE_LEN
    INPUT_LENGTH = max_sequence_len
    MAX_SEQUENCE_LEN = max_sequence_len
    
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    joblib.dump(le.classes_, DL_LABELS_PATH)
    
    tok = Tokenizer(num_words=MAX_WORDS)
    tok.fit_on_texts(X_train)

    sequences = tok.texts_to_sequences(X_train)
    X_train_padded = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LEN)
    y_train_ = to_categorical(y_train)

    test_sequences = tok.texts_to_sequences(X_test)
    X_test_padded = sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LEN)
    y_test_ = to_categorical(y_test)

    return X_train_padded, X_test_padded, y_train_, y_test_



X_train_dl, X_test_dl, y_train_dl, y_test_dl = wrangle_dl(df)

In [20]:
X_train_dl.shape, y_train_dl.shape, X_test_dl.shape, y_test_dl.shape

((132952, 275), (132952, 5), (14773, 275), (14773, 5))

### Save and load Model

In [21]:
def save_dl_model(model, path=DL_MODEL_PATH):
    model.save(path)


def load_dl_model(path=DL_MODEL_PATH):
    model = tf.keras.models.load_model(path)
    return model

### Training Model

In [22]:
def apply_dl(layer, path):
    model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(MAX_WORDS, 64, input_length=INPUT_LENGTH),
    layer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')
])
    model.summary()
    
    
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy', tfa.metrics.F1Score(average='macro', num_classes=NUM_CLASSES)])
    
    history = model.fit(X_train_dl, y_train_dl, epochs=EPOCHS, batch_size=BATCH_SIZE)
    
    save_dl_model(model, path)
    return history

## Simple Vanilla Rnn

In [23]:
history_srnn = apply_dl(tf.keras.layers.SimpleRNN(64), SRNN_MODEL_PATH)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 275, 64)           640000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 64)                8256      
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 5)                 325       
                                                                 
Total params: 652,741
Trainable params: 652,741
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: ../models/SimpleRNN/assets


### Evaluate Model

In [24]:
def eval_dl(X_test, y_test, path=DL_MODEL_PATH):
    model = load_dl_model(path)
    accuracy, macro_f1_score = model.evaluate(X_test, y_test)[1:]
    print(f"Testing DL:\nAccuray: {accuracy}\nMacro F1 Score: {macro_f1_score}")
    return accuracy, macro_f1_score

In [25]:
srnn_accuracy, srnn_macro_f1_score = eval_dl(X_train_dl, y_train_dl, SRNN_MODEL_PATH)

Testing DL:
Accuray: 0.8472681641578674
Macro F1 Score: 0.8180634379386902


## Prediction

In [26]:
def predict_dl(sentence: str, path=DL_MODEL_PATH):
    model = load_dl_model(path)
    tokenizer = Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts([sentence])
    input_seq = tokenizer.texts_to_sequences([sentence])
    padded_seq = pad_sequences(input_seq, maxlen=MAX_SEQUENCE_LEN)
    labels = joblib.load(DL_LABELS_PATH)
    _predict_probabiltiy = dict(zip(labels, model.predict(padded_seq)[0]))
    predict_probabiltiy = {k: v * 100 for k, v in _predict_probabiltiy.items()}
    predict_label = max(predict_probabiltiy, key=predict_probabiltiy.get)
    print(f"Dialect Prediction: {predict_label}\nProbabily: {predict_probabiltiy}")
    return predict_label, predict_probabiltiy

In [27]:
srnn_prediction, srnn_probabiltiy = predict_dl(sentence, path=SRNN_MODEL_PATH)

Dialect Prediction: LY
Probabily: {'EG': 37.45577335357666, 'LB': 2.936970070004463, 'LY': 55.25919795036316, 'MA': 2.8830254450440407, 'SD': 1.4650274999439716}


## GRU

In [28]:
history_gru = apply_dl(tf.keras.layers.GRU(64), GRU_MODEL_PATH)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 275, 64)           640000    
                                                                 
 gru (GRU)                   (None, 64)                24960     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 5)                 325       
                                                                 
Total params: 669,445
Trainable params: 669,445
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
 501/4155 [==>...........................] - ETA: 4:26 - loss: 0.4571 - accuracy: 0.8369 - f1_score: 0.803

### Evaluate Model

In [29]:
eval_dl(X_train_dl, y_train_dl, GRU_MODEL_PATH)

Testing DL:
Accuray: 0.8645375967025757
Macro F1 Score: 0.8387897610664368


(0.8645375967025757, 0.8387897610664368)

### Prediction

In [30]:
gru_prediction, gru_probabiltiy = predict_dl(sentence, path=GRU_MODEL_PATH)

Dialect Prediction: LY
Probabily: {'EG': 39.78680670261383, 'LB': 9.225909411907196, 'LY': 46.57643735408783, 'MA': 3.0645091086626053, 'SD': 1.3463341630995274}


## LSTM

In [31]:
history_lstm = apply_dl(tf.keras.layers.LSTM(64), DL_MODEL_PATH)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 275, 64)           640000    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dense_5 (Dense)             (None, 5)                 325       
                                                                 
Total params: 677,509
Trainable params: 677,509
Non-trainable params: 0
_________________________________________________________________
Epoch 1/2
Epoch 2/2
 500/4155 [==>...........................] - ETA: 4:44 - loss: 0.4582 - accuracy: 0.8349 - f1_score: 0.803

### Evaluate Model

In [32]:
eval_dl(X_train_dl, y_train_dl, DL_MODEL_PATH)

Testing DL:
Accuray: 0.8603706359863281
Macro F1 Score: 0.8333292007446289


(0.8603706359863281, 0.8333292007446289)

### Prediction

In [33]:
lstm_prediction, lstm_probabiltiy = predict_dl(sentence, path=DL_MODEL_PATH)

Dialect Prediction: LY
Probabily: {'EG': 39.47210907936096, 'LB': 5.76515793800354, 'LY': 51.47704482078552, 'MA': 2.0826809108257294, 'SD': 1.2030051089823246}
