In [1]:
import glob
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from configs import *
from fetch_data import *
from features_extraction import *
from data_shuffling_split import *
from data_preprocess import *
from ml_modeling import *

In [2]:
# Load the stratified training split and keep only its first 5000 rows,
# then preview the top of the resulting frame.
strat_train_set = read_csv("train/strat_train_set.csv").iloc[:5000]
strat_train_set.head()

Number of instances in the file are:  449033


Unnamed: 0,id,dialect,dialect_l_encoded,text
0,1056552188082716800,LY,8,توا دوشه الكلاسيكو شن بيتمها وشن بيسكتهم وشن ب...
1,891734969202114560,SY,15,حسابشخصي في احلي من الشحاطه 😂
2,1110565179257954432,SD,14,حسابشخصي موهبه والله 😂 اوع تحاول تطورها تقوم م...
3,1172817955270340608,LB,7,حسابشخصي حسابشخصي 😂 انا صرلي عشر سنين مش مجدده...
4,293253217821790208,QA,12,احلي شعور تكون باجازه وتقوم من الصبح وتمر ع ال...


In [3]:
# Load the pre-trained word2vec model from the local models directory.
word_to_vec_model = load_word2vec_model(
    "models/word2vec/bakrianoo_unigram_cbow_100_twitter/full_uni_cbow_100_twitter.mdl"
)

In [4]:
# Split the subset into training / validation texts and their labels.
(
    x_train_text,
    x_val_text,
    y_train,
    y_val,
) = prepare_data(strat_train_set)

The number of instances in the training data after StratifiedShuffleSplit are:  4900
The number of instances in the testing data after StratifiedShuffleSplit are:   100
The number of trainin instances:  4900
The number of validation instances:  100
The number of trainin labels :  4900
The number of validation labels :  100


In [5]:
# Tokenize the training texts and show the first three samples before/after.
x_train_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_train_text)

print("Before Tokenization : \n", x_train_text[:3])
print(50 * "=")
print("After Tokenization : \n", x_train_text_tokenized[:3])
print(50 * "=")

# Same preview for the validation texts (no trailing separator line).
x_val_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_val_text)

print("Before Tokenization : \n", x_val_text[:3])
print(50 * "=")
print("After Tokenization : \n", x_val_text_tokenized[:3])

Before Tokenization : 
 ['تلفونك هاد عشان تقلب فيه ؟ ! رابطويب', 'حسابشخصي وسنين وانا دايب شوق وحنين عاوز اعرف بس طريقو منيين', '‘محمد حماقي - رسمك في خيالي - من البوم كل يوم من ده 2019 ’ on # SoundCloud # np رابطويب']
After Tokenization : 
 [['تلفونك', 'هاد', 'عشان', 'تقلب', 'فيه', '؟', '!', 'رابطويب'], ['حسابشخصي', 'وسنين', 'وانا', 'دايب', 'شوق', 'وحنين', 'عاوز', 'اعرف', 'بس', 'طريقو', 'منيين'], ['‘محمد', 'حماقي', '-', 'رسمك', 'في', 'خيالي', '-', 'من', 'البوم', 'كل', 'يوم', 'من', 'ده', '2019', '’', 'on', '#', 'SoundCloud', '#', 'np', 'رابطويب']]
Before Tokenization : 
 ['بيعجبني اللي بيعيد اختراع العجله، يجي يقولك علي فكره كل طرف بيدور علي مصلحته في السياسه ! ، لا ياواد يانبيه، يا خلاصه زمانك ! ! ، يا عصاره ذكاء جينات عيلتك ! ! . . فاجئتني وعايش في هول الصدمه، الناس الوحشه بتدور علي مصلحتها في السياسه', 'حسابشخصي ياريت يتحقق هذا المطلب وكل حد ياخذ حقه ويصرف فيه', 'خساره فيك اي كلمه سمعانا منك يا منافق بتجري ورا الفلوس خبيث رابطويب']
After Tokenization : 
 [['بيعجبني', 'اللي', 'بيعيد'

In [6]:
# Settings shared by the embedding and modelling cells.
# number_of_features is used as the last axis when the matrices are reshaped
# later; max_len_str is passed to text_to_matrix_using_word2vec
# (presumably the maximum number of tokens kept per text -- TODO confirm).
number_of_features, max_len_str = 100, 64
word2vec_path, model_path_to_save = "rezk/", "models/ml_models/"
estimators = voting_models()

# Turn the tokenized texts into embedding matrices for both splits.
X_train_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_train_text_tokenized, max_len_str
)
X_val_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model, x_val_text_tokenized, max_len_str
)

(4900, 6400)
[-0.802   2.176  -2.914  -1.148   2.05   -2.33    1.976   1.35   -2.018
  3.11   -4.49   -1.446  -0.4963  1.45    3.727   6.41    0.308   2.55
 -1.509   1.952  -1.707   0.3433 -1.031   3.2    -0.786   0.662   1.399
  4.195  -2.088   2.129  -3.22   -0.838   0.628   1.369  -1.197  -1.782
  0.4106 -0.4167 -4.215  -3.604  -3.838   1.367   4.188   0.0954  3.195
  0.9663  2.469  -1.191  -2.746   2.016 ]
(100, 6400)
[ 1.6875e+00 -6.6943e-01  8.7451e-01 -5.8936e-01  2.1484e+00  2.8594e+00
  6.5332e-01 -4.9512e-01  2.9395e+00  1.2568e+00  1.4512e+00  1.1553e+00
  3.0801e+00 -6.3916e-01 -3.1914e+00 -1.0459e+00 -3.2363e+00  6.2744e-01
  1.1836e+00 -1.2139e+00  2.3535e+00 -2.0098e+00 -9.1064e-01 -9.7803e-01
 -9.3701e-01  1.5361e+00  1.4443e+00  1.4902e+00  5.0586e-01 -3.6074e+00
  1.0488e+00  2.2520e+00 -1.6611e+00 -1.4854e+00  1.4746e+00 -1.5986e+00
  2.6512e-03  3.1074e+00 -1.1035e+00 -4.5703e-01 -4.8096e-01  3.5000e+00
 -1.5557e+00 -4.2109e+00 -7.6611e-01 -3.0664e+00  4.7424e-02 -1

In [7]:
# Inspect the shape of the training embedding matrix before it is reshaped.
X_train_embed_matrix.shape

(4900, 6400)

In [10]:
# Reshape both matrices to (samples, max_len_str, number_of_features),
# keeping the number of samples on the first axis unchanged.
X_train_embed_matrix = X_train_embed_matrix.reshape(
    (X_train_embed_matrix.shape[0], max_len_str, number_of_features)
)
X_val_embed_matrix = X_val_embed_matrix.reshape(
    (X_val_embed_matrix.shape[0], max_len_str, number_of_features)
)

In [11]:
# Confirm the training matrix shape after the reshape.
X_train_embed_matrix.shape

(4900, 64, 100)

In [None]:
def tokenize_and_vectorize(train_text, word_to_vec_model):
    """Tokenize each text and look up a vector for every known token.

    Each text in ``train_text`` is split with ``TreebankWordTokenizer`` and
    every token is looked up in ``word_to_vec_model.wv``. Tokens whose lookup
    raises ``KeyError`` are skipped, so a sample may end up with fewer vectors
    than tokens (or none at all).

    Returns a list with one list of vectors per input text, in input order.
    """
    tokenizer = TreebankWordTokenizer()

    def embed(text):
        # Collect the vectors of the tokens the model knows about.
        vectors = []
        for word in tokenizer.tokenize(text):
            try:
                vector = word_to_vec_model.wv[word]
            except KeyError:
                # Token is missing from the model: leave it out.
                continue
            vectors.append(vector)
        return vectors

    return [embed(text) for text in train_text]

In [None]:
# Build per-sample lists of token vectors for both splits.
X_train = tokenize_and_vectorize(
    x_train_text,
    word_to_vec_model,
)
X_val = tokenize_and_vectorize(
    x_val_text,
    word_to_vec_model,
)

In [None]:
# Number of vectorized training samples.
len(X_train)

In [None]:
# Number of token vectors kept for the first training sample.
len(X_train[0])

In [None]:
def pad_trunc(data, maxlen, embedding_dim=None):
    """Pad or truncate every sample to exactly ``maxlen`` token vectors.

    Parameters
    ----------
    data : sequence of sequences of vectors
        One entry per sample; each sample is a sequence of token vectors of
        equal length. Samples may be empty.
    maxlen : int
        Number of token vectors every returned sample must contain.
    embedding_dim : int, optional
        Length of the zero vectors used for padding. When omitted it is taken
        from the first token vector found in ``data``.

    Returns
    -------
    list of list
        A new list holding one new list per sample. Longer samples are cut to
        their first ``maxlen`` vectors, shorter ones are extended with zero
        vectors. The input samples are never modified.

    Raises
    ------
    ValueError
        If padding is required but ``data`` contains no token vector at all
        and ``embedding_dim`` was not given.
    """
    if embedding_dim is None:
        # Do not rely on data[0][0]: the first sample can be empty when none
        # of its tokens had a vector. Use the first vector found anywhere.
        embedding_dim = next(
            (len(sample[0]) for sample in data if len(sample) > 0),
            None,
        )

    new_data = []
    for sample in data:
        # Slicing + list() copies the sample, so the caller's list is never
        # extended in place.
        padded = list(sample[:maxlen])
        missing = maxlen - len(padded)
        if missing > 0:
            if embedding_dim is None:
                raise ValueError(
                    "cannot infer the padding vector size: data contains no "
                    "token vectors; pass embedding_dim explicitly"
                )
            # A fresh zero vector per slot avoids aliasing one shared list.
            padded.extend([0.0] * embedding_dim for _ in range(missing))
        new_data.append(padded)
    return new_data

In [None]:
# Pad/truncate both splits to the sequence length configured in max_len_str
# instead of repeating the literal 64, so the two paths cannot drift apart.
X_train = pad_trunc(X_train, max_len_str)
X_val = pad_trunc(X_val, max_len_str)

In [19]:
# Start an empty Keras Sequential model; layers are added in the next cells.
model = Sequential()

In [20]:
# Number of units in the LSTM layer.
num_nuros = 50
# The input shape must match the reshaped embedding matrices, so it is built
# from the same constants used for the reshape rather than literals.
# return_sequences=True keeps one output per time step for the Flatten layer.
model.add(
    LSTM(
        num_nuros,
        return_sequences=True,
        input_shape=(max_len_str, number_of_features),
    )
)

In [21]:
# Classification head: dropout, flatten the per-step LSTM outputs, then a
# softmax over 18 units (presumably one per dialect label -- TODO confirm).
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(18, activation="softmax"))
# sparse_categorical_crossentropy expects integer class labels.
# `metrics` is documented as a list, so pass a list rather than a bare string.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="sgd",
    metrics=["accuracy"],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 64, 50)            30200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64, 50)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 18)                57618     
Total params: 87,818
Trainable params: 87,818
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the reshaped embedding matrices and evaluate on the validation
# split after every epoch; the returned history object is kept for later use.
history = model.fit(
    X_train_embed_matrix,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val_embed_matrix, y_val),
)

Epoch 1/10

In [18]:
# Inspect the shape of the validation embedding matrix.
# NOTE(review): this cell's recorded execution count (18) is out of order with
# its position in the notebook -- re-run top to bottom to confirm.
X_val_embed_matrix.shape

(100, 64, 100)