### The Machine Learning model:

In [12]:
#Importing libraries
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC # SVC took a very long time with very bad accuracy
from sklearn.naive_bayes import ComplementNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

In [3]:
#Import the clean dataset
data = pd.read_csv("data/clean_dialect_dataset.csv", index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
data.head()

Unnamed: 0,id,dialect,text
0,1.175358e+18,IQ,لكن بالنهايه ينتفض يغير
1,1.175416e+18,IQ,يعني هذا محسوب علي البشر حيونه ووحشيه وتطلبو...
2,1.17545e+18,IQ,مبين من كلامه خليجي
3,1.175471e+18,IQ,يسلملي مرورك وروحك الحلوه
4,1.175497e+18,IQ,وين هل الغيبه اخ محمد


In [5]:
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)
data.shape

(458187, 3)

In [6]:
#Label encoding the dialect
encoder = LabelEncoder()
data["dialect_transformed"] = encoder.fit_transform(data["dialect"])

In [7]:
#Will save for future use in the server
output = open('model/dialect_encoder.pkl', 'wb')
pickle.dump(encoder, output)
output.close()

In [8]:
#Splitting data into three datasets
training_data, testing_data = train_test_split(data, train_size=0.8,
                                               stratify=data.loc[:,"dialect_transformed"],
                                               random_state=77)

training_data, validation_data = train_test_split(training_data, train_size=0.85,
                                                  stratify=training_data.loc[:,"dialect_transformed"],
                                                  random_state=77)

print(training_data.shape,validation_data.shape, testing_data.shape)

(311566, 4) (54983, 4) (91638, 4)


In [9]:
#Read the Arabic stopwords ->> Source SpaCy Library Stop words
from data.stopwords import stopwords

In [10]:
#lets' create the TF-IDF vectorizer of the text and using n-grams (1-4)
vectorizer = TfidfVectorizer(analyzer='char_wb', stop_words=stopwords,
                             ngram_range=(1,4),)

### First we we'll try a baseline model. Let's try the Logistic regression:

In [10]:
clf1 = LogisticRegression(random_state=7, penalty= "l1",
                          verbose=True, n_jobs=-1, solver='liblinear')
pipe1=make_pipeline(vectorizer,clf1)

In [11]:
pipe1.fit(training_data.text,training_data.dialect_transformed)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear]

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4),
                                 stop_words=['،', 'ء', 'ءَ', 'آ', 'آب', 'آذار',
                                             'آض', 'آل', 'آمينَ', 'آناء',
                                             'آنفا', 'آه', 'آهاً', 'آهٍ', 'آهِ',
                                             'أ', 'أبدا', 'أبريل', 'أبو', 'أبٌ',
                                             'أجل', 'أجمع', 'أحد', 'أخبر',
                                             'أخذ', 'أخو', 'أخٌ', 'أربع',
                                             'أربعاء', 'أربعة', ...])),
                ('logisticregression',
                 LogisticRegression(n_jobs=-1, penalty='l1', random_state=7,
                                    solver='liblinear', verbose=True))])

In [12]:
pred=pipe1.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

              precision    recall  f1-score   support

           0       0.45      0.40      0.42      3156
           1       0.38      0.28      0.32      3155
           2       0.64      0.48      0.54      1942
           3       0.66      0.86      0.75      6916
           4       0.64      0.51      0.57      1860
           5       0.43      0.28      0.34      3351
           6       0.44      0.63      0.52      5053
           7       0.61      0.66      0.64      3314
           8       0.59      0.70      0.64      4380
           9       0.76      0.55      0.64      1385
          10       0.46      0.34      0.39      2294
          11       0.45      0.58      0.51      5249
          12       0.48      0.46      0.47      3728
          13       0.39      0.43      0.41      3220
          14       0.71      0.52      0.60      1732
          15       0.55      0.25      0.35      1949
          16       0.76      0.36      0.49      1108
          17       0.55    

The score is very bad. Let's try a quick hyperparameter tuning to see if we can make some improvement.

In [None]:
# define search space
space = {"logisticregression__C":[1.0,2.0,3.0, 5.0, 6.0, 7.0]}

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)

# define search
search = GridSearchCV(pipe1, space, scoring='accuracy')

# execute search
result = search.fit(training_data.text,training_data.dialect_transformed)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [None]:
# summarize result
print(f'Best Score: {result.best_score_}')
print(f'Best Hyperparameters: {result.best_params_}')

In [None]:
#Let's check the accuracy improvement on validation dataset
pred=search.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

There is a slight improvement. Let's try what will happen if we used the Count Vectorizer instead of Tf-idf to avoid decreasing the weight of the most common words in most of the dialects.

In [None]:
#Let's try the count vectorizer of the text and using n-grams (1-4)
count_vectorizer = CountVectorizer(analyzer='char_wb', stop_words=stopwords,
                                   ngram_range=(1,4))

In [None]:
clf1 = LogisticRegression(random_state=7,penalty= "l1", verbose=True, solver='liblinear')
pipe1=make_pipeline(count_vectorizer,clf1)
pipe1.fit(training_data.text,training_data.dialect_transformed)

In [None]:
pred=pipe1.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

Changing the Vectorization method didn't affect the results, so let's try another model.

### Naive Bayes Classifier:

In [11]:
clf2=ComplementNB(fit_prior=True)
pipe2=make_pipeline(vectorizer,clf2)
pipe2.fit(training_data.text,training_data.dialect_transformed)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4),
                                 stop_words=['،', 'ء', 'ءَ', 'آ', 'آب', 'آذار',
                                             'آض', 'آل', 'آمينَ', 'آناء',
                                             'آنفا', 'آه', 'آهاً', 'آهٍ', 'آهِ',
                                             'أ', 'أبدا', 'أبريل', 'أبو', 'أبٌ',
                                             'أجل', 'أجمع', 'أحد', 'أخبر',
                                             'أخذ', 'أخو', 'أخٌ', 'أربع',
                                             'أربعاء', 'أربعة', ...])),
                ('complementnb', ComplementNB())])

In [12]:
pred=pipe2.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

              precision    recall  f1-score   support

           0       0.41      0.28      0.34      3156
           1       0.42      0.22      0.29      3155
           2       0.54      0.42      0.47      1942
           3       0.51      0.91      0.65      6916
           4       0.63      0.45      0.53      1860
           5       0.49      0.15      0.23      3351
           6       0.38      0.63      0.48      5053
           7       0.54      0.64      0.59      3314
           8       0.59      0.65      0.62      4380
           9       0.66      0.55      0.60      1385
          10       0.55      0.20      0.30      2294
          11       0.39      0.56      0.46      5249
          12       0.41      0.47      0.44      3728
          13       0.42      0.32      0.36      3220
          14       0.67      0.33      0.44      1732
          15       0.56      0.12      0.20      1949
          16       0.66      0.30      0.42      1108
          17       0.57    

In [None]:
# define search space
space = {"complementnb__alpha":[0.2, 0.25, 0.3, 0.35, 0.40, 0.45]}

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)

# define search
search2 = GridSearchCV(pipe2, space, scoring='accuracy', n_jobs=-1)

# execute search
result2 = search2.fit(training_data.text,training_data.dialect_transformed)

In [None]:
# summarize result
print(f'Best Score: {result2.best_score_}')
print(f'Best Hyperparameters: {result2.best_params_}')

In [None]:
#Let's check the accuracy improvement on validation dataset
pred=search2.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

### Random Forest Classifier:

In [13]:
clf3=RandomForestClassifier(verbose=True, n_jobs=-9, random_state=77)
pipe3=make_pipeline(vectorizer,clf3)
pipe3.fit(training_data.text,training_data.dialect_transformed)

[Parallel(n_jobs=-9)]: Using backend ThreadingBackend with 28 concurrent workers.
[Parallel(n_jobs=-9)]: Done 100 out of 100 | elapsed:  9.7min finished


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4),
                                 stop_words=['،', 'ء', 'ءَ', 'آ', 'آب', 'آذار',
                                             'آض', 'آل', 'آمينَ', 'آناء',
                                             'آنفا', 'آه', 'آهاً', 'آهٍ', 'آهِ',
                                             'أ', 'أبدا', 'أبريل', 'أبو', 'أبٌ',
                                             'أجل', 'أجمع', 'أحد', 'أخبر',
                                             'أخذ', 'أخو', 'أخٌ', 'أربع',
                                             'أربعاء', 'أربعة', ...])),
                ('randomforestclassifier',
                 RandomForestClassifier(n_jobs=-9, random_state=77,
                                        verbose=True))])

In [14]:
pred=pipe3.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

[Parallel(n_jobs=28)]: Using backend ThreadingBackend with 28 concurrent workers.


              precision    recall  f1-score   support

           0       0.29      0.14      0.19      3156
           1       0.31      0.17      0.22      3155
           2       0.64      0.24      0.35      1942
           3       0.40      0.87      0.55      6916
           4       0.63      0.22      0.32      1860
           5       0.31      0.12      0.17      3351
           6       0.28      0.59      0.38      5053
           7       0.54      0.47      0.50      3314
           8       0.42      0.47      0.45      4380
           9       0.79      0.27      0.40      1385
          10       0.37      0.10      0.16      2294
          11       0.30      0.51      0.38      5249
          12       0.39      0.31      0.34      3728
          13       0.36      0.19      0.25      3220
          14       0.77      0.18      0.29      1732
          15       0.66      0.06      0.12      1949
          16       0.76      0.12      0.21      1108
          17       0.62    

[Parallel(n_jobs=28)]: Done 100 out of 100 | elapsed:    1.3s finished


In [None]:
# define search space
space = {"randomforestclassifier__n_estimators":[100, 150, 200, 250]}

# define evaluation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=1)

# define search
search3 = GridSearchCV(pipe3, space, scoring='accuracy', n_jobs=-1)

# execute search
result3 = search3.fit(training_data.text,training_data.dialect_transformed)

In [None]:
# summarize result
print(f'Best Score: {result3.best_score_}')
print(f'Best Hyperparameters: {result3.best_params_}')

#Let's check the accuracy improvement on test dataset
pred=search3.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

### Voting Classifier:

Now Let's merge all the models with their best parameters, from the hyperparameter tuning, in a voting classifier to improve the performance.

In [13]:
clf1 = LogisticRegression(random_state=7,penalty= "l1",C =7.0,verbose=True,n_jobs=-1,solver='liblinear')

clf2=ComplementNB(fit_prior=True, alpha=0.25)

clf3 = RandomForestClassifier(n_estimators=250, verbose=True, n_jobs=-1, random_state=77)

clf = VotingClassifier(estimators=[('lr', clf1), ('nb', clf2),('rf', clf3)], voting='soft')
pipe=make_pipeline(vectorizer,clf)
pipe.fit(training_data.text,training_data.dialect_transformed)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


[LibLinear]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 25.9min finished


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4),
                                 stop_words=['،', 'ء', 'ءَ', 'آ', 'آب', 'آذار',
                                             'آض', 'آل', 'آمينَ', 'آناء',
                                             'آنفا', 'آه', 'آهاً', 'آهٍ', 'آهِ',
                                             'أ', 'أبدا', 'أبريل', 'أبو', 'أبٌ',
                                             'أجل', 'أجمع', 'أحد', 'أخبر',
                                             'أخذ', 'أخو', 'أخٌ', 'أربع',
                                             'أربعاء', 'أربعة', ...])),
                ('votingclassifier',
                 VotingClassifier(estimators=[('lr',
                                               LogisticRegression(C=7.0,
                                                                  n_jobs=-1,
                                                                  penalty='l1',
                                  

In [14]:
#Score on Validation dataset
pred=pipe.predict(validation_data.text)
print(classification_report(validation_data.dialect_transformed,pred))
print('Score:',accuracy_score(validation_data.dialect_transformed,pred))

[Parallel(n_jobs=36)]: Using backend ThreadingBackend with 36 concurrent workers.
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    2.1s


              precision    recall  f1-score   support

           0       0.46      0.40      0.43      3156
           1       0.39      0.33      0.36      3155
           2       0.66      0.52      0.58      1942
           3       0.69      0.88      0.77      6916
           4       0.66      0.54      0.59      1860
           5       0.42      0.31      0.36      3351
           6       0.47      0.62      0.54      5053
           7       0.64      0.67      0.66      3314
           8       0.65      0.73      0.68      4380
           9       0.78      0.57      0.66      1385
          10       0.46      0.37      0.41      2294
          11       0.47      0.59      0.52      5249
          12       0.49      0.49      0.49      3728
          13       0.44      0.44      0.44      3220
          14       0.75      0.54      0.63      1732
          15       0.52      0.31      0.38      1949
          16       0.74      0.45      0.56      1108
          17       0.44    

[Parallel(n_jobs=36)]: Done 250 out of 250 | elapsed:    3.6s finished


In [15]:
#Final results on the testing dataset
pred=pipe.predict(testing_data.text)
print(classification_report(testing_data.dialect_transformed,pred))
print('Score:',accuracy_score(testing_data.dialect_transformed,pred))

[Parallel(n_jobs=36)]: Using backend ThreadingBackend with 36 concurrent workers.
[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    3.6s
[Parallel(n_jobs=36)]: Done 250 out of 250 | elapsed:    6.7s finished


              precision    recall  f1-score   support

           0       0.45      0.42      0.43      5259
           1       0.40      0.33      0.36      5259
           2       0.67      0.53      0.60      3237
           3       0.69      0.87      0.77     11527
           4       0.68      0.54      0.60      3099
           5       0.44      0.34      0.38      5584
           6       0.47      0.62      0.54      8422
           7       0.65      0.69      0.67      5524
           8       0.66      0.72      0.69      7300
           9       0.78      0.60      0.68      2308
          10       0.47      0.38      0.42      3823
          11       0.47      0.59      0.53      8749
          12       0.48      0.50      0.49      6214
          13       0.44      0.44      0.44      5366
          14       0.75      0.57      0.64      2887
          15       0.50      0.30      0.38      3248
          16       0.73      0.44      0.55      1847
          17       0.43    

In [16]:
# save the final model to disk
filename = 'model/finalized_model.sav'
pickle.dump(pipe, open(filename, 'wb'))

It seems that the Voting Classifier with tuned hyper-parameters is the best one till now.