In [1]:
import pandas as pd
from data_shuffling_split import *
from features_extraction import *
from data_preprocess import *
from ml_modeling import *
from configs import *

In [2]:
def train_val_test_score(model, x_train, y_train, x_val, y_val, x_test, y_test):
    """Report the F1 score of ``model`` on the training, validation and test splits.

    For each split a header line is printed and then
    ``f1_score_result(model, x, y)`` is called (that helper is expected to be
    in scope already; it is not defined in this cell). Consecutive reports
    are separated by a 50-character ``=`` rule.

    Parameters
    ----------
    model
        Model object passed through unchanged to ``f1_score_result``.
    x_train, y_train
        Features and labels of the training split.
    x_val, y_val
        Features and labels of the validation split.
    x_test, y_test
        Features and labels of the test split.

    Returns
    -------
    bool
        Always ``True``.
    """
    # The third header used to read "On Training", which mislabelled the
    # test-set score in the printed report; it now names the test split.
    reports = (
        ("On Training set\n", x_train, y_train),
        ("On Validation set \n", x_val, y_val),
        ("On Testing set \n", x_test, y_test),
    )

    for position, (header, features, labels) in enumerate(reports):
        if position:
            # Rule between reports only (none before the first, none after the last).
            print("="*50)
        print(header)
        f1_score_result(model, features, labels)

    return True

# Tokenize All data

In [3]:
def _preview_tokenization(raw_texts, tokenized_texts, closing_rule=True):
    """Print the first three raw texts next to their tokenized form."""
    print("Before Tokenization : \n", raw_texts[:3])
    print("="*50)
    print("After Tokenization : \n", tokenized_texts[:3])
    if closing_rule:
        print("="*50)


# Training + validation splits.
# NOTE(review): `read_csv` is not `pd.read_csv`; it is presumably a project
# helper brought in by one of the star imports -- confirm which module.
strat_train_set = read_csv("train/strat_train_set.csv")
x_train_text, x_val_text, y_train, y_val = prepare_data(strat_train_set)

# Test split: read directly with pandas, then pull out texts and encoded labels.
strat_test_set = pd.read_csv("dataset/test/strat_test_set.csv")
x_test_text = list(strat_test_set['text'])
y_test = strat_test_set['dialect_l_encoded'].values


# Tokenize each split and show a three-item before/after sample.
x_train_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_train_text)
_preview_tokenization(x_train_text, x_train_text_tokenized)

x_val_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_val_text)
# The validation preview is the only one printed without a closing rule.
_preview_tokenization(x_val_text, x_val_text_tokenized, closing_rule=False)

x_test_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_test_text)
_preview_tokenization(x_test_text, x_test_text_tokenized)

Number of instances in the file are:  449033
The number of instances in the training data after StratifiedShuffleSplit are:  440052
The number of instances in the testing data after StratifiedShuffleSplit are:   8981
The number of trainin instances:  440052
The number of validation instances:  8981
The number of trainin labels :  440052
The number of validation labels :  8981
Before Tokenization : 
 ['حسابشخصي وشن دخل اهالي طرابلس يا جاهل', 'رساله توعويه تحذيريه من شرطه ابوظبي للجميع بتوخي الحذر في ظل سوء الاحوال الجويه شكرا شرطه ابوظبي رابطويب', 'يسعدوو الدكتوور قلي انا حتكفل بالمعمل تتعبيش نفسك 😭 ❤❤']
After Tokenization : 
 [['حسابشخصي', 'وشن', 'دخل', 'اهالي', 'طرابلس', 'يا', 'جاهل'], ['رساله', 'توعويه', 'تحذيريه', 'من', 'شرطه', 'ابوظبي', 'للجميع', 'بتوخي', 'الحذر', 'في', 'ظل', 'سوء', 'الاحوال', 'الجويه', 'شكرا', 'شرطه', 'ابوظبي', 'رابطويب'], ['يسعدوو', 'الدكتوور', 'قلي', 'انا', 'حتكفل', 'بالمعمل', 'تتعبيش', 'نفسك', '😭', '❤❤']]
Before Tokenization : 
 ['حسابشخصي ااشر علي القمر والاخ 

# Abo Bakr Word2vec

In [4]:
# Embedding settings for this section.
number_of_features, max_len_str = 100, 64

# Load the word2vec model used for this section.
word_to_vec_model = load_word2vec_model(
    "models/word2vec/bakrianoo_unigram_cbow_model/full_uni_cbow_100_twitter.mdl"
)

# Convert each tokenized split into its embedding matrix, one split at a time.
X_train_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_train_text_tokenized, max_len_str
)
X_val_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_val_text_tokenized, max_len_str
)
x_test_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_test_text_tokenized, max_len_str
)


(440052, 6400)
[-0.05905 -0.0757   0.4565  -0.6665  -0.597   -0.595    0.823   -1.165
  0.8506   1.134   -0.2147  -1.182    1.329   -1.651   -2.314    1.583
 -0.129    0.3357   0.399   -0.605    1.994    1.381   -0.942    1.259
  2.197   -0.19    -1.18    -0.93    -1.755   -0.8086   0.646   -0.9424
  1.502    0.7266   1.449    1.867    0.3801   1.539   -0.05664  0.728
 -0.69    -0.884   -0.7812   0.823   -1.819    0.06198  0.145   -0.08875
  1.484    0.3823 ]
(8981, 6400)
[ 0.3884   0.5273  -0.6177   0.6475  -1.007   -1.808    0.02069 -0.5015
  0.6987   0.0622   0.2275  -0.10034 -0.9834   0.8125  -0.3647   0.1355
 -0.4746  -0.5894   0.1296   0.8384  -0.3652   0.8203  -1.062    0.7207
 -1.255   -1.192   -0.1323   0.03262  0.8306   0.2048   0.738    0.00873
 -0.3853   1.633   -0.4583  -0.8047  -1.7     -0.05905 -1.256    0.6904
 -0.4019  -0.7856   1.004    1.708    0.5195  -1.478    0.9395   0.4226
 -0.0227  -0.2229 ]
(9164, 6400)
[-0.2822  -0.1575   3.271   -5.48    -0.6846   1.536    0

In [5]:
# Score the saved AdaBoostClassifier on the train / validation / test splits.
model_path = "bakr/AdaBoostClassifier__f1_0.325_ml.sav"
model = pickle_load_model("models/ml_models/" + model_path)

# The return value is bound to `_` only so the cell does not echo it.
_ = train_val_test_score(
    model,
    X_train_embed_matrix, y_train,
    X_val_embed_matrix, y_val,
    x_test_embed_matrix, y_test,
)

On Training set

F1 score is:  0.3367897430303691
On Validation set 

F1 score is:  0.3323683331477564
On Training 

F1 score is:  0.32802269751200347


In [7]:
# Score the saved Logistic Regression model on the train / validation / test splits.
model_path = "bakr/unigram_100d_lg_cls_model_f1_0.366.sav"
model = pickle_load_model("models/ml_models/" + model_path)

# The return value is bound to `_` only so the cell does not echo it.
_ = train_val_test_score(
    model,
    X_train_embed_matrix, y_train,
    X_val_embed_matrix, y_val,
    x_test_embed_matrix, y_test,
)

On Training set

F1 score is:  0.4002072482342996
On Validation set 

F1 score is:  0.4007348847567086
On Training 

F1 score is:  0.3644696639022261


#  Rezk Word2vec

In [8]:
# Embedding settings for this section.
# NOTE(review): the output recorded for this cell shows matrices with 19200
# columns (64 * 300), so 100 may not be this model's vector size -- confirm.
# `number_of_features` is not passed to any call in this cell.
number_of_features, max_len_str = 100, 64

# Load the word2vec model used for this section.
word2vec_path = "rezk_unigram_CBOW_model/train_word2vec_cbow__window_3_min_count_300"
word_to_vec_model = load_word2vec_model("models/word2vec/" + word2vec_path)

# Rebuild the embedding matrices with this model, one split at a time; each
# assignment replaces the matrix of the same name from the previous section.
X_train_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_train_text_tokenized, max_len_str
)
X_val_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_val_text_tokenized, max_len_str
)
x_test_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_test_text_tokenized, max_len_str
)

(440052, 19200)
[-0.1262   0.2761   0.2466  -0.3464  -0.5044   0.216    0.2651   0.05423
 -0.3276  -0.2793   0.328    0.1699  -0.05267  0.1941   0.292    0.1654
 -0.01619 -0.428    0.411    0.0927   0.271    0.6206  -0.04764  0.04465
  0.0863   0.06042  0.08374 -0.0927   0.05176 -0.1616  -0.4875   0.4932
  0.1333   0.4666   0.0387  -0.19     0.05563 -0.1526   0.549    0.2966
 -0.0969  -0.345   -0.2896  -0.0667   0.12146  0.2126   0.1146  -0.4404
 -0.1198   0.2651 ]
(8981, 19200)
[-0.1262   0.2761   0.2466  -0.3464  -0.5044   0.216    0.2651   0.05423
 -0.3276  -0.2793   0.328    0.1699  -0.05267  0.1941   0.292    0.1654
 -0.01619 -0.428    0.411    0.0927   0.271    0.6206  -0.04764  0.04465
  0.0863   0.06042  0.08374 -0.0927   0.05176 -0.1616  -0.4875   0.4932
  0.1333   0.4666   0.0387  -0.19     0.05563 -0.1526   0.549    0.2966
 -0.0969  -0.345   -0.2896  -0.0667   0.12146  0.2126   0.1146  -0.4404
 -0.1198   0.2651 ]
(9164, 19200)
[ 0.629     1.157    -0.6753    0.2048    0.2107

In [10]:
# Score the saved LogisticRegression model on the train / validation / test splits.
model_path = "rezk/LogisticRegression__f1_0.41_ml.sav"
model = pickle_load_model("models/ml_models/" + model_path)

# The return value is bound to `_` only so the cell does not echo it.
_ = train_val_test_score(
    model,
    X_train_embed_matrix, y_train,
    X_val_embed_matrix, y_val,
    x_test_embed_matrix, y_test,
)

On Training set

F1 score is:  0.5133302427894885
On Validation set 

F1 score is:  0.5129718294176595
On Training 

F1 score is:  0.4143387167175906


# Conclusion

If we compare the model's predictions to human predictions, we may conclude that predicting the dialect from text alone is a difficult task even for a human, so it is worth asking how the model compares. The task might be simpler with voice recognition, because speech adds new layers of information (the voice itself, how people speak, the phonemes, and so on) that text does not carry.

As in the reference paper, the SVC model scores over **50%**, but training it requires more resources and a long wait for the model to finish.

Also, I would like to use the AdaBoostClassifier in the final API: compared with human prediction on the text, it is the best choice, because it has a very small gap between its training and validation scores, and it performs similarly on the test data, which the model has never seen.