In [1]:
import pandas as pd
from data_shuffling_split import *
from features_extraction import *
from data_preprocess import *
from ml_modeling import *
from configs import *

In [2]:
def train_val_test_score(model, x_train, y_train, x_val, y_val, x_test, y_test):
    """Report the F1 score of ``model`` on the training, validation and test splits.

    For each split a header line is printed and ``f1_score_result`` is called
    with the model and that split's features/labels. ``f1_score_result`` is
    not defined in this notebook (it arrives through one of the star imports);
    presumably it prints the score itself -- TODO confirm.

    Parameters
    ----------
    model : fitted estimator passed straight through to ``f1_score_result``.
    x_train, y_train : features and labels of the training split.
    x_val, y_val : features and labels of the validation split.
    x_test, y_test : features and labels of the held-out test split.

    Returns
    -------
    bool
        Always ``True``; callers discard it.
    """
    print("On Training set\n")
    f1_score_result(model, x_train, y_train)
    print("="*50)
    print("On Validation set \n")
    f1_score_result(model, x_val, y_val)
    print("="*50)
    # Fix: this header used to read "On Training", which mislabeled the
    # test-split score as a second training score in every report.
    print("On Testing set \n")
    f1_score_result(model, x_test, y_test)

    return True

# Tokenize All data

In [3]:
# Train and Validation data
# NOTE(review): the training split is read with a bare `read_csv` and a path
# without the "dataset/" prefix, while the test split below uses `pd.read_csv`
# with the prefix. `read_csv` presumably comes from one of the star imports
# and resolves the directory itself -- confirm before changing either call.
strat_train_set = read_csv("train/strat_train_set.csv")
x_train_text, x_val_text, y_train, y_val = prepare_data(strat_train_set)

# Test
strat_test_set = pd.read_csv("dataset/test/strat_test_set.csv")
x_test_text, y_test = list(strat_test_set['text']), strat_test_set['dialect_l_encoded'].values


def _tokenize_and_preview(texts, preview_count=3):
    """Tokenize ``texts`` and print the first few items before and after.

    Replaces three copy-pasted tokenize/print sequences so that every split
    gets the same preview, including the closing separator line that the
    validation copy was missing.

    Returns whatever ``tokenize_using_nltk_TreebankWordTokenizer`` returns
    for ``texts``.
    """
    tokenized = tokenize_using_nltk_TreebankWordTokenizer(texts)
    print("Before Tokenization : \n", texts[:preview_count])
    print("="*50)
    print("After Tokenization : \n", tokenized[:preview_count])
    print("="*50)
    return tokenized


# Same order as before: train, then validation, then test.
x_train_text_tokenized = _tokenize_and_preview(x_train_text)
x_val_text_tokenized = _tokenize_and_preview(x_val_text)
x_test_text_tokenized = _tokenize_and_preview(x_test_text)

Number of instances in the file are:  449033
The number of instances in the training data after StratifiedShuffleSplit are:  440052
The number of instances in the testing data after StratifiedShuffleSplit are:   8981
The number of trainin instances:  440052
The number of validation instances:  8981
The number of trainin labels :  440052
The number of validation labels :  8981
Before Tokenization : 
 ['حسابشخصي صعب لان الشعب لو ماكلو مسيس ماوصلنا لهون', 'حسابشخصي شي غير ايجابي وصلنا 2018 ولا زالت الافكار الغريبه وغير ايجابيه موجوده اذ تفكرون في الايجابيه والسعاده خلوا الموظفين والموظفات كل واحد يجيب اطفاله معه وتكون احلي سعاده 🤚 ', 'وياهم انزاح تراء ماعاد فيني احتمال بشرا قلوبهم من حجر قساه مايرحمون يحسسوك انه مايفرق حالك حال نعال السخريه من وضعك وفيه حاجتك يذلون حسبي الله ونعم الوكيل']
After Tokenization : 
 [['حسابشخصي', 'صعب', 'لان', 'الشعب', 'لو', 'ماكلو', 'مسيس', 'ماوصلنا', 'لهون'], ['حسابشخصي', 'شي', 'غير', 'ايجابي', 'وصلنا', '2018', 'ولا', 'زالت', 'الافكار', 'الغريبه', 'وغير', 'ا

# Abo Bakr Word2vec

In [4]:
# Embedding settings for this section. `number_of_features` is not referenced
# again in the visible cells; only `max_len_str` is handed to the embedding
# helper below.
number_of_features = 100
max_len_str = 64

word_to_vec_model = load_word2vec_model("models/word2vec/bakrianoo_unigram_cbow_model/full_uni_cbow_100_twitter.mdl")

# Embed the three tokenized splits with the same model and length setting.
# The generator is consumed in order, so the helper still runs for train,
# then validation, then test.
X_train_embed_matrix, X_val_embed_matrix, x_test_embed_matrix = (
    text_to_matrix_using_word2vec(word_to_vec_model, tokenized_split, max_len_str)
    for tokenized_split in (
        x_train_text_tokenized,
        x_val_text_tokenized,
        x_test_text_tokenized,
    )
)



(440052, 6400)
[-1.892   -0.7603   1.813   -1.012    1.148   -2.803    0.7295   2.883
  0.653    2.656   -0.05063 -1.826   -0.656   -0.3872  -2.8     -1.694
 -1.081    2.838   -1.684   -1.89     0.03732 -1.621   -0.5024   3.002
  1.024   -2.393   -1.351    2.922    0.6934  -0.8276  -4.355    0.896
  2.215   -0.6357   1.226   -5.832   -0.4255   4.285   -2.77    -2.832
  3.697    2.02     0.6646   0.0755   5.61    -2.03    -0.8774   0.2825
 -1.219    0.10815]
(8981, 6400)
[-0.395  -1.198   0.4067 -0.3384 -0.4414  1.207  -0.5366  0.0782  0.333
  0.4075 -1.254   0.2238 -0.5186  0.624   1.36    0.8545  0.34   -1.218
 -3.293  -0.1758 -0.2307  1.958  -1.204   1.968  -0.842  -1.233   0.7466
 -1.993  -0.4653 -2.162   0.2644 -2.395   2.201   1.48    0.1986  2.328
  2.422  -1.893  -0.66   -0.1448  0.5537 -0.3228 -0.4375 -0.5117 -3.07
 -0.4277  2.225   1.252   0.7383 -1.787 ]
(9164, 6400)
[-0.2822  -0.1575   3.271   -5.48    -0.6846   1.536    0.1155   0.1368
 -2.66    -2.887    0.4595   0.6196  -

In [5]:
# Score the saved AdaBoostClassifier on all three splits.
# NOTE(review): `pickle_load_model` presumably unpickles the file -- only
# load model artifacts from a trusted source.

model_path = "bakr/AdaBoostClassifier__f1_0.325_ml.sav"
model = pickle_load_model("models/ml_models/" + model_path)

_ = train_val_test_score(
    model=model,
    x_train=X_train_embed_matrix,
    y_train=y_train,
    x_val=X_val_embed_matrix,
    y_val=y_val,
    x_test=x_test_embed_matrix,
    y_test=y_test,
)

On Training set

F1 score is:  0.3366397607555471
On Validation set 

F1 score is:  0.33971718071484247
On Training 

F1 score is:  0.32802269751200347


In [6]:
# Score the saved Logistic Regression model on all three splits.
# NOTE(review): `pickle_load_model` presumably unpickles the file -- only
# load model artifacts from a trusted source.
model_path = "bakr/unigram_100d_lg_cls_model_f1_0.366.sav"
model = pickle_load_model("models/ml_models/" + model_path)

_ = train_val_test_score(
    model=model,
    x_train=X_train_embed_matrix,
    y_train=y_train,
    x_val=X_val_embed_matrix,
    y_val=y_val,
    x_test=x_test_embed_matrix,
    y_test=y_test,
)

On Training set

F1 score is:  0.4001640715188205
On Validation set 

F1 score is:  0.40285046208662734
On Training 

F1 score is:  0.3644696639022261


#  Rezk Word2vec

In [7]:
# Embedding settings for this section. `number_of_features` is not referenced
# again in the visible cells; only `max_len_str` is handed to the embedding
# helper below.
number_of_features = 300
max_len_str = 64

word2vec_path = "rezk_unigram_CBOW_model/train_word2vec_cbow__window_3_min_count_300"
word_to_vec_model = load_word2vec_model("models/word2vec/" + word2vec_path)

# Re-embed the three tokenized splits with this model, overwriting the
# matrices built in the previous section. The generator is consumed in
# order: train, then validation, then test.
X_train_embed_matrix, X_val_embed_matrix, x_test_embed_matrix = (
    text_to_matrix_using_word2vec(word_to_vec_model, tokenized_split, max_len_str)
    for tokenized_split in (
        x_train_text_tokenized,
        x_val_text_tokenized,
        x_test_text_tokenized,
    )
)

(440052, 19200)
[-0.1262   0.2761   0.2466  -0.3464  -0.5044   0.216    0.2651   0.05423
 -0.3276  -0.2793   0.328    0.1699  -0.05267  0.1941   0.292    0.1654
 -0.01619 -0.428    0.411    0.0927   0.271    0.6206  -0.04764  0.04465
  0.0863   0.06042  0.08374 -0.0927   0.05176 -0.1616  -0.4875   0.4932
  0.1333   0.4666   0.0387  -0.19     0.05563 -0.1526   0.549    0.2966
 -0.0969  -0.345   -0.2896  -0.0667   0.12146  0.2126   0.1146  -0.4404
 -0.1198   0.2651 ]
(8981, 19200)
[-0.1262   0.2761   0.2466  -0.3464  -0.5044   0.216    0.2651   0.05423
 -0.3276  -0.2793   0.328    0.1699  -0.05267  0.1941   0.292    0.1654
 -0.01619 -0.428    0.411    0.0927   0.271    0.6206  -0.04764  0.04465
  0.0863   0.06042  0.08374 -0.0927   0.05176 -0.1616  -0.4875   0.4932
  0.1333   0.4666   0.0387  -0.19     0.05563 -0.1526   0.549    0.2966
 -0.0969  -0.345   -0.2896  -0.0667   0.12146  0.2126   0.1146  -0.4404
 -0.1198   0.2651 ]
(9164, 19200)
[ 0.629     1.157    -0.6753    0.2048    0.2107

In [8]:
# Score the saved LogisticRegression model on all three splits.
# NOTE(review): `pickle_load_model` presumably unpickles the file -- only
# load model artifacts from a trusted source.

model_path = "rezk/LogisticRegression__f1_0.41_ml.sav"
model = pickle_load_model("models/ml_models/" + model_path)

_ = train_val_test_score(
    model=model,
    x_train=X_train_embed_matrix,
    y_train=y_train,
    x_val=X_val_embed_matrix,
    y_val=y_val,
    x_test=x_test_embed_matrix,
    y_test=y_test,
)

On Training set

F1 score is:  0.5131757155972476
On Validation set 

F1 score is:  0.5205433693352634
On Training 

F1 score is:  0.4143387167175906


In [9]:
# Score the saved GradientBoostingClassifier on all three splits.
# NOTE(review): `pickle_load_model` presumably unpickles the file -- only
# load model artifacts from a trusted source.

model_path = "rezk/GradientBoostingClassifier__f1_0.22_ml.sav"
model = pickle_load_model("models/ml_models/" + model_path)

_ = train_val_test_score(
    model=model,
    x_train=X_train_embed_matrix,
    y_train=y_train,
    x_val=X_val_embed_matrix,
    y_val=y_val,
    x_test=x_test_embed_matrix,
    y_test=y_test,
)

On Training set

F1 score is:  0.24181233126994084
On Validation set 

F1 score is:  0.24562966262108896
On Training 

F1 score is:  0.22315582714971627


In [10]:
# Score the saved LinearSVC model on all three splits.
# NOTE(review): `pickle_load_model` presumably unpickles the file -- only
# load model artifacts from a trusted source.

model_path = "rezk/LinearSVC__f1_0.279_ml.sav"
model = pickle_load_model("models/ml_models/" + model_path)

_ = train_val_test_score(
    model=model,
    x_train=X_train_embed_matrix,
    y_train=y_train,
    x_val=X_val_embed_matrix,
    y_val=y_val,
    x_test=x_test_embed_matrix,
    y_test=y_test,
)

On Training set

F1 score is:  0.3368874587548744
On Validation set 

F1 score is:  0.34650929740563413
On Training 

F1 score is:  0.2752073330423396


# Conclusion

If we compare the model's predictions with human predictions, we may conclude that predicting the dialect is a fairly difficult task even for a human, so imagine how difficult it is for the model!

In the reference paper, the SVC model scores over **50%**, but we would need more resources to train and test multiple models, using either grid search or the "zoom out and zoom in" idea.

Finally, I would like to use the AdaBoostClassifier trained with AraVec for the API. Compared against human prediction of the text, it is the best option: it has a very small gap between training and validation scores, and it behaves the same way on the test data, which the model has never seen.