# Required libraries

In [36]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

# Reading the preprocessing step's output and splitting the data

In [37]:
# Load the CSV written by the preprocessing step.
data = pd.read_csv('outputs/output_2.csv')

# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs. 'Text' is the model input, 'dialect' the target.
X_train, X_test, y_train, y_test = train_test_split(
    data['Text'],
    data['dialect'],
    test_size=0.1,
    random_state=42,
)

# Naive Bayes

In [39]:
# Baseline model: TF-IDF features fed into a multinomial Naive Bayes classifier.
# Pipeline.fit returns the pipeline itself, so construction and fitting are chained.
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(X_train, y_train)

# Evaluate on the held-out split: overall accuracy first, then per-class metrics.
prediction = pipe.predict(X_test)
print(f"Accuracy score is {accuracy_score(y_test, prediction):.2f}")
print(classification_report(y_test, prediction))

Accuracy score is 0.40
              precision    recall  f1-score   support

           1       0.36      0.97      0.52      5759
           2       0.34      0.67      0.45      4520
           3       0.28      0.81      0.42      4167
           4       0.66      0.57      0.61      3653
           5       0.52      0.38      0.44      3065
           6       0.81      0.07      0.12      2754
           7       0.81      0.42      0.56      2731
           8       0.64      0.14      0.23      2562
           9       0.71      0.15      0.25      2551
          10       0.78      0.08      0.15      2637
          11       0.95      0.03      0.06      1972
          12       0.98      0.03      0.06      1621
          13       0.92      0.17      0.29      1738
          14       0.94      0.10      0.19      1548
          15       1.00      0.06      0.10      1462
          16       0.98      0.24      0.38      1166
          17       1.00      0.01      0.02       990
    

In [25]:
# Spot-check: predict the label of a single sample sentence with `pipe`.
# NOTE(review): this cell's execution count (25) is lower than the fit cell's (39),
# so the saved output may come from an earlier model -- re-run after fitting to confirm.
pipe.predict(["شكون احنا"])

array([18], dtype=int64)

In [28]:
# Spot-check: predict the label of a single sample sentence with `pipe`.
# NOTE(review): execution count (28) predates the fit cell (39); the saved output
# may be stale -- re-run after fitting to confirm.
pipe.predict(["مااخذ اي بشر وحدي"])

array([14], dtype=int64)

In [29]:
# Spot-check: predict the label of a single sample sentence with `pipe`.
# NOTE(review): execution count (29) predates the fit cell (39); the saved output
# may be stale -- re-run after fitting to confirm.
pipe.predict(["ربي يعطيك ويعطيهم الصحه مع قناعتي انك تنصح ولاتُنصح"])

array([13], dtype=int64)

In [26]:
# Spot-check: predict the label of a single sample sentence with `pipe`.
# NOTE(review): execution count (26) predates the fit cell (39); the saved output
# may be stale -- re-run after fitting to confirm.
pipe.predict(["اني تعبت هسه من الحديث معهم"])

array([14], dtype=int64)

# Using ComplementNB instead of MultinomialNB, as it deals better with imbalanced data

In [40]:
# Same TF-IDF front end, but with a Complement Naive Bayes classifier.
# Rebinding `pipe` replaces the previously fitted pipeline; Pipeline.fit returns
# the pipeline itself, so construction and fitting are chained.
pipe = make_pipeline(TfidfVectorizer(), ComplementNB()).fit(X_train, y_train)

# Evaluate on the held-out split: overall accuracy first, then per-class metrics.
prediction = pipe.predict(X_test)
print(f"Accuracy score is {accuracy_score(y_test, prediction):.2f}")
print(classification_report(y_test, prediction))

Accuracy score is 0.53
              precision    recall  f1-score   support

           1       0.58      0.92      0.71      5759
           2       0.48      0.55      0.51      4520
           3       0.44      0.62      0.51      4167
           4       0.66      0.67      0.67      3653
           5       0.44      0.51      0.47      3065
           6       0.48      0.25      0.33      2754
           7       0.57      0.69      0.63      2731
           8       0.44      0.40      0.42      2562
           9       0.44      0.39      0.41      2551
          10       0.43      0.30      0.36      2637
          11       0.48      0.25      0.33      1972
          12       0.54      0.23      0.32      1621
          13       0.66      0.52      0.58      1738
          14       0.67      0.52      0.58      1548
          15       0.71      0.46      0.56      1462
          16       0.70      0.64      0.67      1166
          17       0.45      0.14      0.22       990
    

In [57]:
# Spot-check: predict the label of a single sample sentence with `pipe`
# (most recently assigned to the TF-IDF + ComplementNB pipeline).
pipe.predict(["ايش تجول انت"])


array([2], dtype=int64)

# Save the model as a .pkl file to be used in model deployment

In [42]:

# Persist the whole pipeline object (vectorizer + classifier) to disk.
# The context manager closes the file handle deterministically, so the bytes
# are flushed even if pickling raises; the original passed a bare open(...)
# and never closed it.
with open('outputs/output_4.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)