In [1]:
import glob
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from configs import *
from fetch_data import *
from features_extraction import *
from data_shuffling_split import *
from data_preprocess import *
from ml_modeling import *

In [2]:
# Read the training CSV and keep only its first 5000 rows for this experiment
# (presumably to keep the run short — TODO confirm the subset size is intended).
strat_train_set = read_csv("train/strat_train_set.csv").iloc[:5000]
strat_train_set.head()

Number of instances in the file are:  449033


Unnamed: 0,id,dialect,dialect_l_encoded,text
0,1056552188082716800,LY,8,توا دوشه الكلاسيكو شن بيتمها وشن بيسكتهم وشن ب...
1,891734969202114560,SY,15,حسابشخصي في احلي من الشحاطه 😂
2,1110565179257954432,SD,14,حسابشخصي موهبه والله 😂 اوع تحاول تطورها تقوم م...
3,1172817955270340608,LB,7,حسابشخصي حسابشخصي 😂 انا صرلي عشر سنين مش مجدده...
4,293253217821790208,QA,12,احلي شعور تكون باجازه وتقوم من الصبح وتمر ع ال...


In [3]:
# Split the loaded frame into training/validation texts and their labels.
# NOTE(review): prepare_data arrives through a star import; how it splits
# (ratio, stratification, seed) is not visible here — confirm in the helper.
x_train_text, x_val_text, y_train, y_val = prepare_data(strat_train_set)

The number of instances in the training data after StratifiedShuffleSplit are:  4900
The number of instances in the testing data after StratifiedShuffleSplit are:   100
The number of trainin instances:  4900
The number of validation instances:  100
The number of trainin labels :  4900
The number of validation labels :  100


In [4]:
# Tokenize the training texts, then show the first three before/after as a
# quick visual sanity check.
x_train_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_train_text)

print("Before Tokenization : \n", x_train_text[:3])
print("="*50)
print("After Tokenization : \n", x_train_text_tokenized[:3])
print("="*50)

# Same tokenization and preview for the validation texts.
x_val_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_val_text)

print("Before Tokenization : \n", x_val_text[:3])
print("="*50)
print("After Tokenization : \n", x_val_text_tokenized[:3])

Before Tokenization : 
 ['ولما طلبت الوصل منها تمنعت 🎶 ', 'حسابشخصي لو الجد عاوز يعمل كده بدون علم الوالد يعمل باسمه و يوصي لها بعد عمر طويل للحفيده ، و لما عمر الحفيده يكمل ٢١ سنه تفتح حساب و يحول لها', 'حسابشخصي دايم تفيد الناس الله يكثر من امثالك 🌹 ']
After Tokenization : 
 [['ولما', 'طلبت', 'الوصل', 'منها', 'تمنعت', '🎶'], ['حسابشخصي', 'لو', 'الجد', 'عاوز', 'يعمل', 'كده', 'بدون', 'علم', 'الوالد', 'يعمل', 'باسمه', 'و', 'يوصي', 'لها', 'بعد', 'عمر', 'طويل', 'للحفيده', '،', 'و', 'لما', 'عمر', 'الحفيده', 'يكمل', '٢١', 'سنه', 'تفتح', 'حساب', 'و', 'يحول', 'لها'], ['حسابشخصي', 'دايم', 'تفيد', 'الناس', 'الله', 'يكثر', 'من', 'امثالك', '🌹']]
Before Tokenization : 
 ['شنو الاحلام التافهه الي توني حلمتها ! ! 🙂 ', 'كانت اوكي بعدين عملتوها اوك بعدين عملتوها اكك بعدين عملتوها كك 😂 #نتفتوها_الله_ينتفكم 💔 ', 'بالرهوط المسيبه هذي و يحبوا النصره و فيبالهم بش يمشيوا الناس علي السراط . . #É liav _ Cohen ماله الا تونسي و حر في معتقداته و دينه بينه و بين ربي ✌ كان تلهيتو في الصهاينه المتسترين وسطكم كان خير

In [5]:
# Load the pre-trained word2vec model from a path relative to the notebook.
# NOTE(review): the file name suggests CBOW, window=3, min_count=300 — confirm
# against the training script; the path is hard-coded and must exist locally.
word_to_vec_model = load_word2vec_model("models/word2vec/rezk_unigram_CBOW_model/train_word2vec_cbow__window_3_min_count_300")

In [6]:
# words = list(word_to_vec_model.wv.key_to_index)
# words[:10]

In [7]:
# Sanity check: display the embedding vector the model stores for one word.
word_to_vec_model.wv['الله']

array([ 0.5681784 , -0.30010086, -0.2583801 , -0.37423003, -0.48989734,
       -0.11130093,  0.477914  , -0.7084362 ,  0.4492829 ,  0.13346606,
        0.50121063,  0.7314992 , -1.1720065 ,  1.005286  , -0.10987772,
        0.1091288 ,  0.54772073,  0.01527023, -0.2394439 , -0.52070594,
        1.1548085 , -1.0749179 , -1.7650245 , -0.32195583,  1.6003461 ,
        0.83306354, -0.16221358,  0.02520426,  0.24955502,  0.14337762,
       -0.36821768,  0.15932007, -0.29922467, -0.25858533,  0.08653131,
       -0.2937703 ,  0.13829572, -0.34190527, -0.45977148,  0.679429  ,
        0.95369464, -0.09211156,  0.3770238 , -0.53584146, -0.5314214 ,
       -0.30582327, -0.06015404, -0.6080289 ,  0.0697981 , -0.8562191 ,
        0.5494267 ,  0.20494674,  0.19669822, -0.07874821, -0.11959118,
       -0.6960639 , -0.10837762, -0.6692657 , -0.11070859, -0.4817535 ,
       -0.40251568, -0.3512062 , -0.81904745, -0.2538155 ,  2.383865  ,
       -0.71738064, -0.27610055,  0.64741206, -0.52924365,  0.56

In [8]:
# Token count of every tokenized training text; the largest count is kept in
# max_len (later used as the padding length).
num_of_words_in_each_text = list(map(len, x_train_text_tokenized))
max_len = max(num_of_words_in_each_text)
print("The max length is: ", max_len)
num_of_words_in_each_text[:10]

The max length is:  71


[6, 31, 9, 8, 8, 6, 8, 11, 9, 7]

In [9]:
# Display the tokens of the first training text.
x_train_text_tokenized[0]

['ولما', 'طلبت', 'الوصل', 'منها', 'تمنعت', '🎶']

In [10]:
# Hyper-parameters read by the model-definition and training cells below.
# number_of_features is the per-token feature width given to the RNN input
# shape — presumably it must equal the word2vec vector size; TODO confirm.
number_of_features = 300
# Sequence length used for padding: the longest tokenized training text.
max_len_str = max_len
batch_size = 32
epochs = 5

# Convert each tokenized text into its per-token embedding representation.
# NOTE(review): how out-of-vocabulary tokens are handled is decided inside
# DL_text_to_matrix_using_word2vec and is not visible here.
X_train_embed_matrix = DL_text_to_matrix_using_word2vec(word_to_vec_model, x_train_text_tokenized)

X_val_embed_matrix = DL_text_to_matrix_using_word2vec(word_to_vec_model, x_val_text_tokenized)

In [11]:
# Inspect the embedded form of the first training text (before padding).
X_train_embed_matrix[0]

[array([ 5.54423511e-01, -5.50999165e-01,  4.32995819e-02,  3.48497063e-01,
         8.27402115e-01, -1.30287498e-01, -5.74544311e-01,  3.77178729e-01,
        -1.84698373e-01,  7.02642083e-01, -7.38945544e-01,  5.50884306e-02,
         1.09201825e+00, -1.56139180e-01, -8.20909977e-01,  8.31583500e-01,
        -3.20366472e-01,  5.40679812e-01, -9.54797417e-02,  1.69852495e+00,
         5.71068823e-01, -7.87636817e-01, -6.21873200e-01, -8.11908543e-01,
         4.41219062e-01, -5.83797932e-01, -5.20038784e-01,  6.24094307e-01,
        -2.69220322e-01,  2.51252323e-01,  6.64607763e-01,  1.40739530e-01,
        -2.74770468e-01,  5.80416679e-01, -4.23231810e-01,  5.72161674e-02,
        -6.18225485e-02, -2.45041743e-01, -1.97540205e-02,  2.22363636e-01,
         6.06619477e-01, -4.87465769e-01,  3.46281588e-01,  1.64026177e+00,
        -1.76995844e-01, -5.48049174e-02,  7.78256595e-01,  6.35309100e-01,
         4.90735918e-01,  8.55933011e-01,  9.16530378e-04, -1.32755250e-01,
         3.6

In [12]:
# NOTE(review): this repeats the previous cell's inspection verbatim and
# produces the same output; it is a candidate for removal.
X_train_embed_matrix[0]

[array([ 5.54423511e-01, -5.50999165e-01,  4.32995819e-02,  3.48497063e-01,
         8.27402115e-01, -1.30287498e-01, -5.74544311e-01,  3.77178729e-01,
        -1.84698373e-01,  7.02642083e-01, -7.38945544e-01,  5.50884306e-02,
         1.09201825e+00, -1.56139180e-01, -8.20909977e-01,  8.31583500e-01,
        -3.20366472e-01,  5.40679812e-01, -9.54797417e-02,  1.69852495e+00,
         5.71068823e-01, -7.87636817e-01, -6.21873200e-01, -8.11908543e-01,
         4.41219062e-01, -5.83797932e-01, -5.20038784e-01,  6.24094307e-01,
        -2.69220322e-01,  2.51252323e-01,  6.64607763e-01,  1.40739530e-01,
        -2.74770468e-01,  5.80416679e-01, -4.23231810e-01,  5.72161674e-02,
        -6.18225485e-02, -2.45041743e-01, -1.97540205e-02,  2.22363636e-01,
         6.06619477e-01, -4.87465769e-01,  3.46281588e-01,  1.64026177e+00,
        -1.76995844e-01, -5.48049174e-02,  7.78256595e-01,  6.35309100e-01,
         4.90735918e-01,  8.55933011e-01,  9.16530378e-04, -1.32755250e-01,
         3.6

In [13]:
from keras.preprocessing.sequence import pad_sequences

In [14]:
# Pad (after the last token) every training sequence to max_len_str steps.
# dtype must be given explicitly: pad_sequences defaults to 'int32', which
# would silently truncate the float embedding values to integers.
X_train_embed_matrix = pad_sequences(
    X_train_embed_matrix,
    maxlen=max_len_str,
    padding='post',
    dtype='float32',
)
X_train_embed_matrix.shape

(4900, 71, 300)

In [15]:
# Pad (after the last token) every validation sequence to max_len_str steps.
# Sequences longer than max_len_str are truncated by pad_sequences.
# dtype must be given explicitly: pad_sequences defaults to 'int32', which
# would silently truncate the float embedding values to integers.
X_val_embed_matrix = pad_sequences(
    X_val_embed_matrix,
    maxlen=max_len_str,
    padding='post',
    dtype='float32',
)
X_val_embed_matrix.shape

(100, 71, 300)

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN

In [17]:
# Number of units in the recurrent layer (name kept as-is: the next cell
# reads `num_nurons`).
num_nurons = 50
# Empty Sequential container; layers are added in the following cell.
model = Sequential()

In [18]:
# Build the network from scratch inside this cell so that re-running it does
# not append a second copy of every layer to an already-populated model.
model = Sequential()

# Recurrent layer over the padded (time steps, features) embedding sequence;
# return_sequences=True keeps the output of every time step.
model.add(SimpleRNN(
    num_nurons,
    return_sequences=True,
    input_shape=(max_len_str, number_of_features)))

model.add(Dropout(.2))
# Flatten the per-step outputs into one vector for the classifier head.
model.add(Flatten())
# 18 output units with softmax — presumably one per dialect label; TODO
# confirm the class count against the label encoder.
model.add(Dense(18, activation="softmax"))
model.compile(
    optimizer="rmsprop",
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 71, 50)            17550     
_________________________________________________________________
dropout (Dropout)            (None, 71, 50)            0         
_________________________________________________________________
flatten (Flatten)            (None, 3550)              0         
_________________________________________________________________
dense (Dense)                (None, 18)                63918     
Total params: 81,468
Trainable params: 81,468
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Train on the full training set and evaluate each epoch on the hold-out
# validation set prepared earlier, instead of carving an extra slice off the
# end of the training data with validation_split. The returned History is
# kept so the per-epoch metrics can be inspected afterwards.
history = model.fit(
    X_train_embed_matrix,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f72e59f89e8>