## Importing Libraries

In [1]:
from nltk.corpus import stopwords
import pandas as pd
from preprocessing import preprocessing_pipeline, train_val_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from utils import confusion_matrix, report

#Supress Warnings:
import warnings
warnings.filterwarnings('ignore')

## Loading and splitting the data

In [2]:
# Load the raw dialect dataset; the first row of the CSV supplies the column names.
df = pd.read_csv(
    "Datasets/dialect_dataset.csv",
    header=0,
)

In [3]:
# Partition the tweets ("text") and their labels ("dialect") into train,
# validation and test sets with the project helper from `preprocessing`.
# NOTE(review): the split ratios (and any random seed) live inside the helper;
# the per-class supports in the reports below suggest roughly 80/10/10 — confirm there.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = train_val_test_split(df["text"], df["dialect"])

## Building the pipeline

In [5]:
# Arabic stop-word filtering is currently switched off; kept here for reference:
# stopwords_list = set(stopwords.words('arabic'))

# Feature extraction: Arabic text normalization followed by TF-IDF over
# 1- to 5-grams, dropping terms whose document frequency is below 10.
# `victorizer_kwarg` is spelled exactly as this call has always passed it —
# presumably it matches the helper's signature; verify before renaming.
preprocessing = preprocessing_pipeline(
    ["normalization", "tfidf"],
    victorizer_kwarg={"ngram_range": (1, 5), "min_df": 10},
)

# Classifier: logistic regression with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [6]:
# Chain feature extraction and the classifier into a single estimator, then
# train it on the training split only.
pipeline = make_pipeline(preprocessing, model)
pipeline.fit(X_train, y_train)
pipeline  # echo the fitted pipeline (the same object `fit` returns) as the cell output

Pipeline(steps=[('pipeline',
                 Pipeline(steps=[('arabictextnormalizer',
                                  ArabicTextNormalizer()),
                                 ('tfidfvectorizer',
                                  TfidfVectorizer(min_df=10,
                                                  ngram_range=(1, 5)))])),
                ('logisticregression', LogisticRegression())])

## Model performance assessment

In [10]:
# Predict dialect labels for every split with the fitted pipeline, in the
# order train -> validation -> test.
y_train_pred, y_val_pred, y_test_pred = (
    pipeline.predict(features) for features in (X_train, X_val, X_test)
)

In [11]:
# Per-class precision / recall / F1 / support on the training split
# (`report` is the project helper imported from `utils`).
report("Training", y_train, y_train_pred)

  Training set Classification Report:
              precision    recall  f1-score   support

          AE     0.5407    0.5012    0.5202     21036
          BH     0.5156    0.4206    0.4633     21034
          DZ     0.6807    0.5410    0.6028     12947
          EG     0.6867    0.8941    0.7768     46108
          IQ     0.7332    0.5640    0.6376     12397
          JO     0.5747    0.4103    0.4788     22337
          KW     0.5110    0.7140    0.5957     33687
          LB     0.6824    0.7184    0.6999     22093
          LY     0.6599    0.7438    0.6993     29199
          MA     0.8379    0.5736    0.6810      9231
          OM     0.5646    0.3987    0.4674     15293
          PL     0.5122    0.6502    0.5730     34994
          QA     0.5697    0.5704    0.5700     24855
          SA     0.4971    0.5427    0.5189     21466
          SD     0.8040    0.5741    0.6699     11548
          SY     0.6565    0.3622    0.4669     12994
          TN     0.7971    0.3834    0.5178

In [14]:
# Per-class precision / recall / F1 / support on the validation split
# (`report` is the project helper imported from `utils`).
report("Validation", y_val, y_val_pred)

  Validation set Classification Report:
              precision    recall  f1-score   support

          AE     0.4230    0.3856    0.4034      2630
          BH     0.3564    0.2765    0.3114      2629
          DZ     0.5994    0.4734    0.5290      1618
          EG     0.6396    0.8562    0.7322      5764
          IQ     0.6637    0.4800    0.5571      1550
          JO     0.4237    0.2915    0.3454      2792
          KW     0.4214    0.6200    0.5018      4211
          LB     0.6050    0.6365    0.6203      2762
          LY     0.5933    0.6751    0.6316      3650
          MA     0.8112    0.5546    0.6588      1154
          OM     0.4323    0.2957    0.3511      1911
          PL     0.4280    0.5578    0.4844      4374
          QA     0.4463    0.4445    0.4454      3107
          SA     0.3907    0.4078    0.3991      2683
          SD     0.7490    0.5149    0.6103      1443
          SY     0.4975    0.2438    0.3273      1624
          TN     0.7366    0.3265    0.45

In [None]:
# Compare training and validation predictions using the project helper imported
# from `utils` (this is not scikit-learn's function of the same name).
# NOTE(review): this cell has no execution count or output in this export — rerun it.
confusion_matrix(y_train, y_train_pred, y_val, y_val_pred)

In [15]:
# Per-class precision / recall / F1 / support on the held-out test split
# (`report` is the project helper imported from `utils`).
report("Testing", y_test, y_test_pred)

  Testing set Classification Report:
              precision    recall  f1-score   support

          AE     0.4283    0.3802    0.4028      2630
          BH     0.3661    0.2864    0.3214      2629
          DZ     0.5851    0.4376    0.5007      1618
          EG     0.6504    0.8522    0.7378      5764
          IQ     0.6775    0.4987    0.5745      1550
          JO     0.4379    0.3044    0.3592      2792
          KW     0.4177    0.6037    0.4938      4211
          LB     0.6045    0.6459    0.6245      2762
          LY     0.5952    0.6797    0.6347      3650
          MA     0.7735    0.5208    0.6225      1154
          OM     0.4402    0.2965    0.3544      1912
          PL     0.4358    0.5649    0.4920      4374
          QA     0.4537    0.4641    0.4589      3107
          SA     0.3917    0.4372    0.4132      2683
          SD     0.7652    0.5239    0.6220      1443
          SY     0.5165    0.2691    0.3538      1624
          TN     0.7488    0.3323    0.4603 

In [None]:
# Confusion matrix for the test split, drawn with the "cividis" colormap and
# vertically oriented x-axis tick labels.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_test_pred,
    xticks_rotation="vertical",
    cmap="cividis",
)