### Import Libraries

In [43]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
import re
from sklearn import metrics
from lightgbm import LGBMClassifier
import tensorflow_addons as tfa
from sklearn.preprocessing import LabelEncoder

### Load Dataset

In [44]:
import sqlite3
import pandas as pd

In [50]:
conn = sqlite3.connect("../data/dialects_database.db")
df_label = pd.read_sql_query("SELECT * FROM id_text", conn)
df_target = pd.read_sql_query("SELECT * FROM id_dialect", conn)
df = pd.merge(df_label, df_target, on="id")
conn.close()

**Save as csv**

In [46]:
df.to_csv('../data/dialects.csv')

**Read csv file**

In [47]:
import csv

In [49]:
# ls = [row[1:] for row in csv.reader(open('../data/dialects.csv'))]
# df = pd.DataFrame(ls)
# df.columns = df.iloc[0]
# df = df.drop(0, axis=0)
# df

# Deep Learning

In [51]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [52]:
X = df["text"]
y = df["dialect"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

In [53]:
NUM_CLASSES = 5
EPOCHS = 5
BATCH_SIZE = 32
MAX_WORDS = 10_000
INPUT_LENGTH = MAX_SEQUENCE_LEN = max(len(sentence) for sentence in X_train)

In [54]:
def wrangle_dl(df):
    #spilt 
    X = df["text"]
    y = df["dialect"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
    
    #preprocess
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)
    
    tok = Tokenizer(num_words=MAX_WORDS)
    tok.fit_on_texts(X_train)

    sequences = tok.texts_to_sequences(X_train)
    X_train_padded = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LEN)
    y_train_ = to_categorical(y_train)

    test_sequences = tok.texts_to_sequences(X_test)
    X_test_padded = sequence.pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LEN)
    y_test_ = to_categorical(y_test)

    return X_train_padded, X_test_padded, y_train_, y_test_


X_train, X_test, y_train, y_test = wrangle_dl(df)

In [36]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(MAX_WORDS, 64, input_length=INPUT_LENGTH),
    tf.keras.layers.LSTM(64),
#     tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')
])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 698, 64)           640000    
                                                                 
 lstm_1 (LSTM)               (None, 64)                33024     
                                                                 
 dense_4 (Dense)             (None, 5)                 325       
                                                                 
Total params: 673,349
Trainable params: 673,349
Non-trainable params: 0
_________________________________________________________________


In [37]:
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy', tfa.metrics.F1Score(average='macro', num_classes=NUM_CLASSES)])

In [38]:
history = model.fit(X_train, y_train, epochs=1, batch_size=BATCH_SIZE)



In [39]:
print(f'F1 score for testing : {model.evaluate(X_test, y_test)[2]}')

F1 score for testing : 0.8114644885063171


In [41]:
model.save('../models/LSTM')



INFO:tensorflow:Assets written to: ../models/LSTM/assets


INFO:tensorflow:Assets written to: ../models/LSTM/assets


In [42]:
# model = tf.saved_model.load("../models/SimpleRNN/")
model = tf.keras.models.load_model('../models/LSTM/')
print(f'F1 score for testing : {model.evaluate(X_test, y_test)[2]}')

F1 score for testing : 0.8114644885063171


# Machine Learning

In [None]:
import re
import pandas as pd


def remove_user(text: str) -> str:
    return re.sub(r"@\w+", " ", text)

def replace_spaces(text: str) -> str:
    return re.sub(r"\s+", " ", str(text))

def preprocess(text: str) -> str:
    text = remove_user(text)
    text = replace_spaces(text)
    return text

def wrangle_ml(df: pd.DataFrame) -> pd.DataFrame:
    df["text"] = df["text"].apply(preprocess)
    X = df["text"]
    y = df["dialect"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)
    return X_train, X_test, y_train, y_test

In [None]:
import nltk

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('arabic')
print(stopwords_list)

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import joblib

def fit_ml(X_train, X_test, y_train, y_test):
    final_model = LinearSVC(random_state=42)
    pipe = Pipeline([("Vectorizer", TfidfVectorizer(ngram_range=(1, 2), stop_words='arabic')), ("classifier", LinearSVC(random_state=42))])
    pipe.fit(X_train, y_train)
#     joblib.dump(pipe, "models/ml_model.pkl")
    return pipe

In [None]:
X_train, X_test, y_train, y_test = wrangle_ml(df)

le = LabelEncoder()
y_train_prep = le.fit_transform(y_train)
y_test_prep = le.transform(y_train)

model = fit_ml(X_train, X_test, y_train_prep, y_test_prep)

In [None]:
from sklearn.metrics import f1_score, classification_report, classification_report
from sklearn.preprocessing import LabelEncoder

def eval_ml(pipe, X_test, y_test):
    y_pred = pipe.predict(X_test)
    y_pred = le.inverse_transform(y_pred)
    print(classification_report(y_test, y_pred))
    print(f"ML Macro F1 score for testing: {f1_score(y_test, y_pred, average='macro')}")

In [None]:
eval_ml(model, X_test, y_test)