In [1]:
import glob
import os
from collections import Counter

import keras
import matplotlib.pyplot as plt
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from keras.models import Sequential
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC

from configs import *
from fetch_data import *
from features_extraction import *
from data_shuffling_split import *
from data_preprocess import *
from ml_modeling import *
from keras_models import *

In [2]:
# Read the stratified training split and keep only its first 5000 rows
# (presumably to keep these experiments quick -- raise the slice for a full run).
strat_train_set = read_csv("train/strat_train_set.csv").iloc[:5000]
strat_train_set.head()

Number of instances in the file are:  449033


Unnamed: 0,id,dialect,dialect_l_encoded,text
0,1056552188082716800,LY,8,توا دوشه الكلاسيكو شن بيتمها وشن بيسكتهم وشن ب...
1,891734969202114560,SY,15,حسابشخصي في احلي من الشحاطه 😂
2,1110565179257954432,SD,14,حسابشخصي موهبه والله 😂 اوع تحاول تطورها تقوم م...
3,1172817955270340608,LB,7,حسابشخصي حسابشخصي 😂 انا صرلي عشر سنين مش مجدده...
4,293253217821790208,QA,12,احلي شعور تكون باجازه وتقوم من الصبح وتمر ع ال...


In [3]:
# Split the sampled frame into training/validation texts and their labels.
x_train_text, x_val_text, y_train, y_val = prepare_data(strat_train_set)

The number of instances in the training data after StratifiedShuffleSplit are:  9800
The number of instances in the testing data after StratifiedShuffleSplit are:   200
The number of trainin instances:  9800
The number of validation instances:  200
The number of trainin labels :  9800
The number of validation labels :  200


In [4]:
# Tokenize both splits up front, then preview the first three samples of each
# split before and after tokenization.
x_train_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_train_text)
x_val_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_val_text)

# Training split preview.
print("Before Tokenization : \n", x_train_text[:3])
print("=" * 50)
print("After Tokenization : \n", x_train_text_tokenized[:3])
print("=" * 50)

# Validation split preview.
print("Before Tokenization : \n", x_val_text[:3])
print("=" * 50)
print("After Tokenization : \n", x_val_text_tokenized[:3])

Before Tokenization : 
 ['حسابشخصي يا الله البنات عاطين سمعه عنا زباله 🤦 🏼 🤦 🏼 😭 احنا مش هيك والله 💔 ', 'حسابشخصي والله الزول ده كتر المحلبيه', 'حسابشخصي لين ادشين فيها تمي لعبي بالتلفون 🌚 ']
After Tokenization : 
 [['حسابشخصي', 'يا', 'الله', 'البنات', 'عاطين', 'سمعه', 'عنا', 'زباله', '🤦', '🏼', '🤦', '🏼', '😭', 'احنا', 'مش', 'هيك', 'والله', '💔'], ['حسابشخصي', 'والله', 'الزول', 'ده', 'كتر', 'المحلبيه'], ['حسابشخصي', 'لين', 'ادشين', 'فيها', 'تمي', 'لعبي', 'بالتلفون', '🌚']]
Before Tokenization : 
 ['لمحت زولك او جابوا بالحكي طاريك ،، #يرفرف_قلبي_اذا', 'حسابشخصي مهما عملت روسيا هتغطي علي الكل 😂 انا قصدي ماتشات المنتخب محدش يفهمني غلط مقولتش المشجعات الروسيات', '#طرابلس . الشعب الوحيد اللي تغم عليه نفسيته لما يشوفو حدوده']
After Tokenization : 
 [['لمحت', 'زولك', 'او', 'جابوا', 'بالحكي', 'طاريك', '،،', '#', 'يرفرف_قلبي_اذا'], ['حسابشخصي', 'مهما', 'عملت', 'روسيا', 'هتغطي', 'علي', 'الكل', '😂', 'انا', 'قصدي', 'ماتشات', 'المنتخب', 'محدش', 'يفهمني', 'غلط', 'مقولتش', 'المشجعات', 'الروسيات'], ['#', 

# LSTM Model with Bakr Word2vec

In [5]:
# Load the Bakr word2vec model (path below) that is used to embed the tweets
# for the first group of LSTM experiments.
word_to_vec_model = load_word2vec_model("models/word2vec/bakrianoo_unigram_cbow_model/full_uni_cbow_100_twitter.mdl")

In [6]:
# Hyper-parameters shared by the Bakr word2vec experiments.
max_len_str = 64          # second axis of the embedded input (see reshape below)
number_of_features = 100  # third axis of the embedded input (see reshape below)
hid_num_neurons = 50
learning_rate = .1
epochs = 30
word2vec_path = "bakr/"

# Optimizers under comparison; Adam is left at its default learning rate.
SGD_optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
Adam_optimizer = keras.optimizers.Adam(beta_1=0.9, beta_2=0.999)
RMSprop_optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9)

# Project callbacks plus a scheduler that halves the learning rate once the
# monitored metric has stopped improving for 5 epochs.
performance_lr = keras.callbacks.ReduceLROnPlateau(factor=.5, patience=5)
callbacks_ = keras_callbacks(word2vec_type="bakr_word2vec", model_type="lstm_no_batch", learning_rate=learning_rate)
callbacks_.append(performance_lr)

# Embed the tokenized tweets with the loaded word2vec model.
X_train_embed_matrix = text_to_matrix_using_word2vec(word_to_vec_model, x_train_text_tokenized, max_len_str)
X_val_embed_matrix = text_to_matrix_using_word2vec(word_to_vec_model, x_val_text_tokenized, max_len_str)

# The models below take 3-D input: (samples, max_len_str, number_of_features).
X_train_embed_matrix = X_train_embed_matrix.reshape(
    (len(X_train_embed_matrix), max_len_str, number_of_features)
)
X_val_embed_matrix = X_val_embed_matrix.reshape(
    (len(X_val_embed_matrix), max_len_str, number_of_features)
)

(9800, 64, 100)
(9800, 6400)
[ 0.2341   1.27    -1.722   -7.742   -0.05646 -1.704    1.6      0.5825
 -0.4714  -4.09    -0.7295   0.6294   0.194    2.477   -1.136    2.854
 -0.6875   1.646    1.055   -9.82     0.2559  -0.919   -5.27    -1.97
  1.414   -0.7124   1.286    1.83     1.561   -4.965   -2.19    -3.455
  2.053    0.1661  -2.848   -2.291   -0.836    1.756    2.492   -1.278
 -1.596    1.31     3.03     2.994   -2.713    0.943    1.218    2.777
  4.227   -0.533  ]
(200, 64, 100)
(200, 6400)
[ 1.7686e+00 -6.7578e-01  6.1523e+00 -1.2939e+00 -2.3877e-01 -1.9150e+00
  2.3379e+00  3.9160e+00  3.7266e+00  3.6172e+00  2.5801e+00 -6.0498e-01
 -1.4756e+00  2.8594e+00  1.3076e+00 -1.2031e+00 -9.7229e-02 -4.2065e-01
 -1.6821e-01 -8.4778e-02  6.3416e-02 -5.5029e-01  1.5127e+00 -6.1401e-02
 -8.5107e-01 -2.4688e+00 -2.7656e+00 -3.7695e+00  5.7297e-03  9.8682e-01
  7.7305e+00 -1.6689e+00 -1.4238e+00  2.3633e+00  3.9038e-01  5.9424e-01
 -1.4375e+00  1.9690e-01 -2.0840e+00 -4.8086e+00 -2.0410e+00

# With  SGD and No Batch Normalization

In [7]:
# Build a new LSTM (no-batch-normalization variant) and compile it with SGD.
# A fresh optimizer is created here instead of passing the shared
# `SGD_optimizer` instance: ReduceLROnPlateau lowers the learning rate on the
# optimizer object itself, so a shared instance would hand an already-reduced
# rate (and its step counter/state) to every later model compiled with it.
model = lstm_no_batch_seqential_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.SGD(learning_rate=learning_rate))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64, 50)            30200     
_________________________________________________________________
dropout (Dropout)            (None, 64, 50)            0         
_________________________________________________________________
flatten (Flatten)            (None, 3200)              0         
_________________________________________________________________
dense (Dense)                (None, 18)                57618     
Total params: 87,818
Trainable params: 87,818
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# With  Adam and No Batch Normalization

In [None]:
# Build a new LSTM (no-batch-normalization variant) and compile it with Adam.
# A fresh optimizer is created here instead of passing the shared
# `Adam_optimizer` instance, so this run does not inherit moment estimates,
# the step counter, or a learning rate already lowered by ReduceLROnPlateau
# from another experiment.
model = lstm_no_batch_seqential_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.Adam(beta_1=0.9, beta_2=0.999))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# With RMSprop and No Batch Normalization

In [None]:
# Build a new LSTM (no-batch-normalization variant) and compile it with RMSprop.
# A fresh optimizer is created here instead of passing the shared
# `RMSprop_optimizer` instance, so this run does not inherit accumulated
# state, the step counter, or a learning rate already lowered by
# ReduceLROnPlateau from another experiment.
model = lstm_no_batch_seqential_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# With  SGD and Batch Normalization

In [None]:
# Rebuild the callback list for the "lstm_with_batch" runs and re-attach the
# ReduceLROnPlateau scheduler.
# NOTE(review): this one list is reused by the SGD, Adam and RMSprop runs that
# follow; if keras_callbacks derives checkpoint/log paths from these arguments,
# those runs would write to the same location -- confirm.
callbacks_ = keras_callbacks(word2vec_type="bakr_word2vec", model_type="lstm_with_batch", learning_rate=learning_rate)
callbacks_.append(performance_lr)

In [None]:
# Build a new model with the with-batch variant and compile it with SGD.
# A fresh optimizer is created here instead of passing the shared
# `SGD_optimizer` instance: ReduceLROnPlateau lowers the learning rate on the
# optimizer object itself, so reusing the instance from the earlier SGD run
# would start this experiment at an already-reduced rate.
model = lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.SGD(learning_rate=learning_rate))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# With Adam and Batch Normalization

In [None]:
# Build a new model with the with-batch variant and compile it with Adam.
# A fresh optimizer is created here instead of passing the shared
# `Adam_optimizer` instance, so this run does not inherit moment estimates,
# the step counter, or a learning rate already lowered by ReduceLROnPlateau
# from the earlier Adam experiment.
model = lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.Adam(beta_1=0.9, beta_2=0.999))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# With RMSprop and Batch Normalization

In [None]:
# Build a new model with the with-batch variant and compile it with RMSprop.
# A fresh optimizer is created here instead of passing the shared
# `RMSprop_optimizer` instance, so this run does not inherit accumulated
# state, the step counter, or a learning rate already lowered by
# ReduceLROnPlateau from the earlier RMSprop experiment.
model = lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# LSTM Model with Rezk Word2vec

In [None]:
# Replace the Bakr model with the Rezk word2vec model (path below) for the
# second group of LSTM experiments.
word_to_vec_model = load_word2vec_model("models/word2vec/rezk_unigram_CBOW_model/train_word2vec_cbow__window_3_min_count_300")

In [None]:
# Settings for the Rezk word2vec experiments.
# NOTE(review): 300 is presumably the Rezk vector size -- confirm; the reshape
# below fails if it does not match the embedded matrix width.
number_of_features = 300
word2vec_path = "rezk/"

# Project callbacks for the Rezk / no-batch runs plus the shared
# ReduceLROnPlateau scheduler.
callbacks_ = keras_callbacks(word2vec_type="rezk_word2vec", model_type="lstm_no_batch", learning_rate=learning_rate)
callbacks_.append(performance_lr)

# Re-embed the tokenized tweets with the newly loaded word2vec model.
X_train_embed_matrix = text_to_matrix_using_word2vec(word_to_vec_model, x_train_text_tokenized, max_len_str)
X_val_embed_matrix = text_to_matrix_using_word2vec(word_to_vec_model, x_val_text_tokenized, max_len_str)

# The models below take 3-D input: (samples, max_len_str, number_of_features).
X_train_embed_matrix = X_train_embed_matrix.reshape(
    (len(X_train_embed_matrix), max_len_str, number_of_features)
)
X_val_embed_matrix = X_val_embed_matrix.reshape(
    (len(X_val_embed_matrix), max_len_str, number_of_features)
)

# With  SGD and No Batch Normalization

In [None]:
# Build a new LSTM (no-batch-normalization variant) and compile it with SGD.
# A fresh optimizer is created here instead of passing the shared
# `SGD_optimizer` instance: ReduceLROnPlateau lowers the learning rate on the
# optimizer object itself, so reusing the instance from earlier SGD runs
# would start this experiment at an already-reduced rate.
model = lstm_no_batch_seqential_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.SGD(learning_rate=learning_rate))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# With  Adam and No Batch Normalization

In [None]:
# Build a new LSTM (no-batch-normalization variant) and compile it with Adam.
# A fresh optimizer is created here instead of passing the shared
# `Adam_optimizer` instance, so this run does not inherit moment estimates,
# the step counter, or a learning rate already lowered by ReduceLROnPlateau
# from earlier Adam experiments.
model = lstm_no_batch_seqential_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.Adam(beta_1=0.9, beta_2=0.999))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# With RMSprop and No Batch Normalization

In [None]:
# Build a new LSTM (no-batch-normalization variant) and compile it with RMSprop.
# A fresh optimizer is created here instead of passing the shared
# `RMSprop_optimizer` instance, so this run does not inherit accumulated
# state, the step counter, or a learning rate already lowered by
# ReduceLROnPlateau from earlier RMSprop experiments.
model = lstm_no_batch_seqential_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

# With  SGD and  Batch Normalization

In [None]:
# Rebuild the callback list for the Rezk "lstm_with_batch" runs and re-attach
# the ReduceLROnPlateau scheduler.
# NOTE(review): this one list is reused by the SGD, Adam and RMSprop runs that
# follow; if keras_callbacks derives checkpoint/log paths from these arguments,
# those runs would write to the same location -- confirm.
callbacks_ = keras_callbacks(word2vec_type="rezk_word2vec", model_type="lstm_with_batch", learning_rate=learning_rate)
callbacks_.append(performance_lr)

In [None]:
# Build a new model with the with-batch variant and compile it with SGD.
# A fresh optimizer is created here instead of passing the shared
# `SGD_optimizer` instance: ReduceLROnPlateau lowers the learning rate on the
# optimizer object itself, so reusing the instance from earlier SGD runs
# would start this experiment at an already-reduced rate.
model = lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.SGD(learning_rate=learning_rate))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

In [None]:
# Build a new model with the with-batch variant and compile it with Adam.
# A fresh optimizer is created here instead of passing the shared
# `Adam_optimizer` instance, so this run does not inherit moment estimates,
# the step counter, or a learning rate already lowered by ReduceLROnPlateau
# from earlier Adam experiments.
model = lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.Adam(beta_1=0.9, beta_2=0.999))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

In [None]:
# Build a new model with the with-batch variant and compile it with RMSprop.
# A fresh optimizer is created here instead of passing the shared
# `RMSprop_optimizer` instance, so this run does not inherit accumulated
# state, the step counter, or a learning rate already lowered by
# ReduceLROnPlateau from earlier RMSprop experiments.
model = lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2)
model = seqential_model_compile(model, keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9))
model.summary()

In [None]:
# Train for `epochs` epochs in mini-batches of 32, validating on the held-out split.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)