## Importing Libraries

In [1]:
from nltk.corpus import stopwords
import pandas as pd
from preprocessing import preprocessing_pipeline, train_val_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from utils import confusion_matrix, report

#Supress Warnings:
import warnings
warnings.filterwarnings('ignore')

## Loading and splitting the data

In [2]:
# Load the dialect dataset; row 0 of the CSV supplies the column names.
# The "text" and "dialect" columns are the ones consumed by the split cell.
df = pd.read_csv(
    r"Datasets/dialect_dataset.csv",
    header=0,
)

In [3]:
# Partition the tweets (features) and their dialect labels into train / validation / test.
# NOTE(review): split ratios, stratification and seeding are decided inside the project
# helper `train_val_test_split` and are not visible here — confirm the split is reproducible.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = train_val_test_split(
    df["text"],
    df["dialect"],
)

## Building the pipeline

In [4]:
# Arabic stop words from NLTK, collapsed into a set.
# NOTE(review): `stopwords_list` is not referenced by any cell visible in this notebook —
# confirm whether it was meant to be handed to the vectorizer, or whether it can be dropped.
stopwords_list = set(stopwords.words('arabic'))

# Feature extraction: a TF-IDF stage using n-grams of size 1 through 5 and min_df=10
# (the fitted-pipeline repr below shows these options reaching TfidfVectorizer).
preprocessing = preprocessing_pipeline(
    ["tfidf"],
    ngram_range=(1, 5),
    min_df=10,
)

# Classifier with scikit-learn's default hyper-parameters.
# NOTE(review): warnings are silenced globally in the import cell, so a solver
# convergence warning (if any) would not surface here — verify convergence separately.
model = LogisticRegression()

In [5]:
# Chain the TF-IDF feature extractor and the classifier into one estimator so that
# raw text can be passed directly to fit / predict.
pipeline = make_pipeline(
    preprocessing,
    model,
)

# Train on the training split only. The call is kept as the cell's last expression
# so the notebook renders the fitted pipeline's repr.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('pipeline',
                 Pipeline(steps=[('tfidfvectorizer',
                                  TfidfVectorizer(min_df=10,
                                                  ngram_range=(1, 5)))])),
                ('logisticregression', LogisticRegression())])

## Model performance assessment

In [6]:
# Predict dialect labels for every split with the fitted pipeline
# (evaluated in order: train, validation, test).
y_train_pred, y_val_pred, y_test_pred = map(
    pipeline.predict,
    (X_train, X_val, X_test),
)

In [7]:
# Per-dialect precision / recall / F1 / support on the training split; this is the
# baseline against which the validation and test reports reveal over-fitting.
report(
    "Training",
    y_train,
    y_train_pred,
)

  Training set Classification Report:
              precision    recall  f1-score   support

          AE     0.6639    0.5909    0.6253     21036
          BH     0.6622    0.5257    0.5861     21034
          DZ     0.7996    0.6323    0.7061     12947
          EG     0.7233    0.9170    0.8087     46108
          IQ     0.7864    0.6252    0.6966     12397
          JO     0.7440    0.4908    0.5914     22337
          KW     0.5310    0.8074    0.6407     33687
          LB     0.7484    0.7700    0.7590     22093
          LY     0.7426    0.7888    0.7650     29199
          MA     0.8780    0.6527    0.7488      9231
          OM     0.7004    0.4858    0.5737     15293
          PL     0.5677    0.7465    0.6450     34994
          QA     0.6887    0.6681    0.6783     24855
          SA     0.5581    0.5887    0.5729     21466
          SD     0.8459    0.5910    0.6959     11548
          SY     0.7505    0.4643    0.5737     12994
          TN     0.8356    0.4596    0.5931

In [10]:
# Per-dialect precision / recall / F1 / support on the held-out validation split.
report(
    "Validation",
    y_val,
    y_val_pred,
)

  Validation set Classification Report:
              precision    recall  f1-score   support

          AE     0.5695    0.5000    0.5325      2630
          BH     0.5413    0.4009    0.4607      2629
          DZ     0.7418    0.5698    0.6445      1618
          EG     0.6827    0.8878    0.7719      5764
          IQ     0.7255    0.5594    0.6317      1550
          JO     0.6391    0.3843    0.4800      2792
          KW     0.4629    0.7402    0.5696      4211
          LB     0.6851    0.7028    0.6938      2762
          LY     0.6747    0.7285    0.7006      3650
          MA     0.8590    0.6282    0.7257      1154
          OM     0.6145    0.4171    0.4969      1911
          PL     0.4930    0.6822    0.5724      4374
          QA     0.6214    0.5806    0.6003      3107
          SA     0.4613    0.4715    0.4664      2683
          SD     0.7908    0.5343    0.6377      1443
          SY     0.6340    0.3627    0.4614      1624
          TN     0.8083    0.4011    0.53

In [None]:
# Confusion matrices for the training and validation predictions, via the project helper
# from `utils`.
# NOTE(review): presumably renders the two matrices together — confirm against
# `utils.confusion_matrix`; this cell has no recorded execution count or output.
confusion_matrix(
    y_train, y_train_pred,
    y_val, y_val_pred,
)

In [11]:
# Per-dialect precision / recall / F1 / support on the untouched test split.
report(
    "Testing",
    y_test,
    y_test_pred,
)

  Testing set Classification Report:
              precision    recall  f1-score   support

          AE     0.5668    0.4985    0.5304      2630
          BH     0.5613    0.4199    0.4804      2629
          DZ     0.7384    0.5494    0.6300      1618
          EG     0.6870    0.8874    0.7745      5764
          IQ     0.7292    0.5716    0.6409      1550
          JO     0.6400    0.4044    0.4956      2792
          KW     0.4557    0.7141    0.5564      4211
          LB     0.6733    0.7067    0.6896      2762
          LY     0.6844    0.7296    0.7063      3650
          MA     0.8251    0.6049    0.6980      1154
          OM     0.6281    0.4168    0.5011      1912
          PL     0.5003    0.6710    0.5732      4374
          QA     0.6114    0.5961    0.6037      3107
          SA     0.4635    0.4920    0.4773      2683
          SD     0.8114    0.5426    0.6503      1443
          SY     0.6404    0.3805    0.4774      1624
          TN     0.8009    0.3961    0.5301 

In [None]:
# Test-set confusion matrix built straight from true vs. predicted labels.
# Tick labels on the x-axis are rotated to vertical; colours come from the "cividis" map.
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_test_pred,
    cmap="cividis",
    xticks_rotation="vertical",
)