In [1]:
import sys

# Extra module search directory; presumably where the project helper modules
# imported in the following cell live (NOTE(review): confirm). The path is
# relative, so it is resolved against the kernel's working directory.
PREPROCESS_ASSETS_DIR = '../../preprocess_assets/'

# Guarded append: re-running this cell must not stack duplicate sys.path entries.
if PREPROCESS_ASSETS_DIR not in sys.path:
    sys.path.append(PREPROCESS_ASSETS_DIR)

In [2]:
import glob
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
import tensorflow as tf
from features_extraction import *
from data_shuffling_split import *
from ara_vec_preprocess_configs import *
from ml_modeling import *
from keras_models import *



In [3]:
# Load the stratified training split from its CSV file (path is relative to the notebook).
train_csv_path = "train/strat_train_set.csv"
strat_train_set = read_file(train_csv_path)

# Show the first rows as a quick sanity check of the loaded columns.
strat_train_set.head()

Unnamed: 0,text,label,classes
0,راجعت الردود فوجدت المتزمتين دينيا هم الاكثر ا...,0,NEG
1,#شاهد_سكاي : #عيد_العمال تراجع اعدد العاطلين,2,OBJ
2,يقوم د / ابو الفتوح التواصل مع مختلف التيارات ...,2,OBJ
3,هل الشرطه والجيش سيطبق عليهم حدود الاجور دي ؟,2,OBJ
4,1 -انسحاب الوفد المصري المشارك في مهرجان مالم...,2,OBJ


In [4]:
# Split the training frame into train/validation texts and labels.
# prepare_data reports the resulting sizes itself (see the cell output).
train_val_parts = prepare_data(strat_train_set)
x_train_text, x_val_text, y_train, y_val = train_val_parts

The number of instances in the training data after StratifiedShuffleSplit are:  9608
The number of instances in the testing data after StratifiedShuffleSplit are:   197
The number of trainin instances:  9608
The number of validation instances:  197
The number of trainin labels :  9608
The number of validation labels :  197


In [5]:
def _show_sample(caption, sequences, closing_rule=True):
    """Print the first three items of ``sequences`` under ``caption``.

    When ``closing_rule`` is true, a 50-character ``=`` rule is printed afterwards.
    """
    print(caption, sequences[:3])
    if closing_rule:
        print("="*50)


# --- Training split: word-level tokens ---
x_train_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_train_text)
_show_sample("Before Tokenization : \n", x_train_text)
_show_sample("After Tokenization : \n", x_train_text_tokenized)

# --- Validation split: word-level tokens ---
x_val_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_val_text)
_show_sample("Before Tokenization : \n", x_val_text)
# No rule is printed after this preview, matching the original output.
_show_sample("After Tokenization : \n", x_val_text_tokenized, closing_rule=False)

# --- n-gram expansion of both splits (these are the inputs to the embedding step) ---
fullgram_x_train_text_tokenized = get_all_ngrams(x_train_text_tokenized)
_show_sample("full gram tokenization : \n", fullgram_x_train_text_tokenized)
fullgram_x_val_text_tokenized = get_all_ngrams(x_val_text_tokenized)
_show_sample("full gram tokenization : \n", fullgram_x_val_text_tokenized)

Before Tokenization : 
 ['عرض الاسبوع 40 الف متابع #شوارعنا #فن_تتقنه_النساء #ذلك_الشخص #شي_ودك_تجربه #بوح #درر #عجبني #حلو #مكه [ 2059853 ] ', 'نودع اليوم فقيد الصحافه المصريه صاحب الكلمه الحره الاستاذ سلامه احمد سلامه الي مثواه الاخير وهذا اقل ما يمكن ان نقدمه لقلمه المحترم', 'انا رفضت وقف فيلم السبكي ورافضه لالغاء مشهد رفع مبارك العلم في سينا انتوا فاكرنها خطوبه وبتقطعوا الصور ده ايه القرف ده #مرار_طافح']
After Tokenization : 
 [['عرض', 'الاسبوع', '40', 'الف', 'متابع', '#', 'شوارعنا', '#', 'فن_تتقنه_النساء', '#', 'ذلك_الشخص', '#', 'شي_ودك_تجربه', '#', 'بوح', '#', 'درر', '#', 'عجبني', '#', 'حلو', '#', 'مكه', '[', '2059853', ']'], ['نودع', 'اليوم', 'فقيد', 'الصحافه', 'المصريه', 'صاحب', 'الكلمه', 'الحره', 'الاستاذ', 'سلامه', 'احمد', 'سلامه', 'الي', 'مثواه', 'الاخير', 'وهذا', 'اقل', 'ما', 'يمكن', 'ان', 'نقدمه', 'لقلمه', 'المحترم'], ['انا', 'رفضت', 'وقف', 'فيلم', 'السبكي', 'ورافضه', 'لالغاء', 'مشهد', 'رفع', 'مبارك', 'العلم', 'في', 'سينا', 'انتوا', 'فاكرنها', 'خطوبه', 'وبتقطعوا', 'الصور',

# Our CBOW Word2Vec Model

In [6]:
# Our own CBOW word2vec model (300-d according to its file name).
our_word2vec_file = "../word2vec_models/rezk/cbow/continuous_bow_fullgram_vec_size_300-d_min_count_100"
our_word2vec_model = load_word2vec_model(our_word2vec_file)

In [7]:
# --- Run configuration for the rezk CBOW embeddings ---
number_of_features = 300   # embedding width; the reshape below relies on this matching the word2vec model
max_len_str = 132          # sequence length used when laying each text out as a matrix
hid_num_neurons = 25
learning_rate = .00005
epochs = 10

# NOTE(review): the three assignments below are not referenced by any visible cell.
# They are kept in case later cells rely on them; confirm before removing.
word2vec_path = "rezk/cbow/"
model_path_to_save = "../ml_models_saved/"
estimators = voting_models()

# Referenced through tf.keras (tensorflow is imported explicitly as `tf`) rather than a
# bare `keras` name, which the notebook's import cell never binds directly.
# factor=.5 / patience=5: halve the learning rate after 5 epochs without improvement
# of the monitored metric (Keras default monitor).
performance_lr = tf.keras.callbacks.ReduceLROnPlateau(factor=.5, patience=5)
RMSprop_optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9)


# Embed the n-gram tokens of both splits with the rezk word2vec model.
X_train_embed_matrix = text_to_matrix_using_word2vec(our_word2vec_model, fullgram_x_train_text_tokenized, max_len_str)
X_val_embed_matrix = text_to_matrix_using_word2vec(our_word2vec_model, fullgram_x_val_text_tokenized, max_len_str)
# The helper's result is reshaped to (samples, max_len_str, number_of_features),
# the 3-D layout the LSTM model takes as input.
X_train_embed_matrix = X_train_embed_matrix.reshape(X_train_embed_matrix.shape[0], max_len_str, number_of_features)
X_val_embed_matrix = X_val_embed_matrix.reshape(X_val_embed_matrix.shape[0], max_len_str, number_of_features)

(9608, 132, 300)
(9608, 39600)
[ 0.1214    0.3682   -0.1448   -0.03687   0.05566  -0.2715    0.7036
  0.354    -0.2155    0.1714    0.3108   -0.08636   0.0853   -0.0822
  0.733    -0.609    -0.3848    0.1864   -0.1465    0.1383    0.1486
  0.2345    0.2715   -0.555    -0.0826    0.0421   -0.1056   -0.06616
 -0.3127   -0.2988    0.2344    0.0658    0.01614   0.009705 -0.01072
 -0.03845   0.266    -0.3828    0.0698   -0.4275   -0.1929    0.12
 -0.1859   -0.03053   0.1207    0.248     0.1338    0.292     0.36
 -0.133   ]
(197, 132, 300)
(197, 39600)
[ 0.195    0.1139   0.11273  0.10834  0.1918  -0.272   -0.0904   0.3499
  0.11255 -0.1373   0.1769  -0.01694 -0.331   -0.2861   0.3823  -0.2349
 -0.2189   0.07245  0.047   -0.03537  0.05942 -0.1351   0.01404 -0.0586
  0.237   -0.03625 -0.1224  -0.101    0.2336   0.263   -0.05103 -0.11957
  0.3762   0.1641  -0.00931 -0.1451   0.0516  -0.04178 -0.3274  -0.3633
  0.0931   0.03833 -0.194   -0.05487 -0.0223   0.3647   0.1729   0.02745
  0.2876   0.

# Our CBOW Word2Vec Model

# With RMSprop and Batch Normalization

In [8]:
# Callbacks for this run (named after embedding + model type), plus the LR scheduler.
callbacks_ = keras_callbacks(
    word2vec_type="rezk_cbow_word2vec",
    model_type="Rmsprob_lstm_with_batch",
    learning_rate=learning_rate,
)
callbacks_.append(performance_lr)

# Create the LSTM-with-batch-normalization network and compile it with the RMSprop optimizer.
model = seqential_model_compile(
    lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2),
    RMSprop_optimizer,
)
model.summary()

rezk_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05_
../test_models/ml_models_saved/dl_models/tensor_logs/run_2022_05_10_05_37_58_rezk_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05_


2022-05-10 05:37:58.764831: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 05:37:58.774460: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 05:37:58.776263: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 05:37:59.193692: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 05:37:59.195461: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from S

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (BatchN  (None, 132, 300)         1200      
 ormalization)                                                   
                                                                 
 lstm (LSTM)                 (None, 132, 25)           32600     
                                                                 
 batch_normalization_1 (Batc  (None, 132, 25)          100       
 hNormalization)                                                 
                                                                 
 dropout (Dropout)           (None, 132, 25)           0         
                                                                 
 flatten (Flatten)           (None, 3300)              0         
                                                                 
 dense (Dense)               (None, 18)                5

In [9]:
# Train on the rezk-embedded training split, validating on the held-out split each epoch.
history = model.fit(
    X_train_embed_matrix,
    y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

Epoch 1/10


2022-05-10 05:38:11.034982: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8100


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Bakr CBOW Word2Vec Model

In [10]:
# Bakr's CBOW word2vec model (300-d according to its file name).
bakr_word2vec_file = "../word2vec_models//bakr/cbow/full_grams_cbow_300_twitter.mdl"
bakr_word2vec_model = load_word2vec_model(bakr_word2vec_file)

In [11]:
# --- Run configuration for the bakr CBOW embeddings ---
number_of_features = 300   # embedding width; the reshape below relies on this matching the word2vec model
max_len_str = 132          # sequence length used when laying each text out as a matrix
hid_num_neurons = 25
learning_rate = .00005
epochs = 10

# NOTE(review): the three assignments below are not referenced by any visible cell.
# They are kept in case later cells rely on them; confirm before removing.
word2vec_path = "bakr/cbow/"
model_path_to_save = "../ml_models_saved/"
estimators = voting_models()

# Referenced through tf.keras (tensorflow is imported explicitly as `tf`) rather than a
# bare `keras` name, which the notebook's import cell never binds directly.
# factor=.5 / patience=5: halve the learning rate after 5 epochs without improvement
# of the monitored metric (Keras default monitor).
performance_lr = tf.keras.callbacks.ReduceLROnPlateau(factor=.5, patience=5)
# A fresh optimizer instance is created for this run's model.
RMSprop_optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9)


# Embed the n-gram tokens of both splits with the bakr word2vec model.
X_train_embed_matrix = text_to_matrix_using_word2vec(bakr_word2vec_model, fullgram_x_train_text_tokenized, max_len_str)
X_val_embed_matrix = text_to_matrix_using_word2vec(bakr_word2vec_model, fullgram_x_val_text_tokenized, max_len_str)
# The helper's result is reshaped to (samples, max_len_str, number_of_features),
# the 3-D layout the LSTM model takes as input.
X_train_embed_matrix = X_train_embed_matrix.reshape(X_train_embed_matrix.shape[0], max_len_str, number_of_features)
X_val_embed_matrix = X_val_embed_matrix.reshape(X_val_embed_matrix.shape[0], max_len_str, number_of_features)

(9608, 132, 300)
(9608, 39600)
[ 0.0971  -1.876   -1.263    1.349   -0.8433   0.879    2.324   -1.0205
  0.9077  -0.93     1.103    0.01787  0.6445  -1.017   -1.765   -0.539
 -0.456    0.6274   0.7246   3.027   -1.861    0.1412  -0.2025   1.914
  1.376   -0.1142  -0.01277 -1.559    0.872    0.3884   1.963    0.3481
  0.558   -2.217   -0.2201   1.38    -0.5503   0.714    1.045   -2.162
  0.8477   0.1963  -0.5225   0.439    0.409    1.058    2.438   -1.232
  0.08734 -1.635  ]
(197, 132, 300)
(197, 39600)
[-0.2115  -0.2556   0.473    1.086    0.11786 -0.5483  -1.174    0.7393
  2.07    -1.959    0.2927  -0.335   -0.8047  -0.6226   0.558    0.772
 -0.1536   0.925    1.451    3.363   -0.6655  -0.6377  -0.2278   0.64
 -0.491    1.162    0.565   -0.1466  -0.3965  -1.302    1.644    1.453
 -0.2742   0.3464   0.583    1.733   -0.0814   0.0537  -0.3413  -0.1697
 -1.409    1.622    1.419    3.764    1.141   -0.5483   3.453   -2.21
  0.4814  -1.508  ]


# With RMSprop and Batch Normalization

In [12]:
# Callbacks for this run (named after embedding + model type), plus the LR scheduler.
callbacks_ = keras_callbacks(
    word2vec_type="bakr_cbow_word2vec",
    model_type="Rmsprob_lstm_with_batch",
    learning_rate=learning_rate,
)
callbacks_.append(performance_lr)

# Create the LSTM-with-batch-normalization network and compile it with the RMSprop optimizer.
model = seqential_model_compile(
    lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2),
    RMSprop_optimizer,
)
model.summary()

bakr_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05_
../test_models/ml_models_saved/dl_models/tensor_logs/run_2022_05_10_05_40_55_bakr_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05_
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_2 (Batc  (None, 132, 300)         1200      
 hNormalization)                                                 
                                                                 
 lstm_1 (LSTM)               (None, 132, 25)           32600     
                                                                 
 batch_normalization_3 (Batc  (None, 132, 25)          100       
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 132, 25)           0         
                                               

In [13]:
# Train on the bakr-embedded training split, validating on the held-out split each epoch.
history = model.fit(
    X_train_embed_matrix,
    y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Muhammed CBOW Word2Vec Model

In [14]:
# Muhammed's CBOW word2vec model (300-d according to its file name).
muhammed_word2vec_file = "../word2vec_models/muhammed/cbow/w2v_CBOW_300_3_400_10.model"
muhammed_word2vec_model = load_word2vec_model(muhammed_word2vec_file)

In [15]:
# --- Run configuration for the muhammed CBOW embeddings ---
number_of_features = 300   # embedding width; the reshape below relies on this matching the word2vec model
max_len_str = 132          # sequence length used when laying each text out as a matrix
hid_num_neurons = 25
learning_rate = .00005
epochs = 10

# NOTE(review): the three assignments below are not referenced by any visible cell.
# They are kept in case later cells rely on them; confirm before removing.
word2vec_path = "muhammed/cbow/"
model_path_to_save = "../ml_models_saved/"
estimators = voting_models()

# Referenced through tf.keras (tensorflow is imported explicitly as `tf`) rather than a
# bare `keras` name, which the notebook's import cell never binds directly.
# factor=.5 / patience=5: halve the learning rate after 5 epochs without improvement
# of the monitored metric (Keras default monitor).
performance_lr = tf.keras.callbacks.ReduceLROnPlateau(factor=.5, patience=5)
# A fresh optimizer instance is created for this run's model.
RMSprop_optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate, rho=.9)


# Embed the n-gram tokens of both splits with the muhammed word2vec model.
X_train_embed_matrix = text_to_matrix_using_word2vec(muhammed_word2vec_model, fullgram_x_train_text_tokenized, max_len_str)
X_val_embed_matrix = text_to_matrix_using_word2vec(muhammed_word2vec_model, fullgram_x_val_text_tokenized, max_len_str)
# The helper's result is reshaped to (samples, max_len_str, number_of_features),
# the 3-D layout the LSTM model takes as input.
X_train_embed_matrix = X_train_embed_matrix.reshape(X_train_embed_matrix.shape[0], max_len_str, number_of_features)
X_val_embed_matrix = X_val_embed_matrix.reshape(X_val_embed_matrix.shape[0], max_len_str, number_of_features)

(9608, 132, 300)
(9608, 39600)
[ 0.818    0.3699  -0.4402  -1.145   -0.305   -1.197   -0.04993  2.256
  2.035    0.6685   0.6113  -0.3662   0.7954  -0.3528   0.4214  -1.98
  0.2295   0.9023  -0.9624   0.3145  -0.9424   0.602   -0.3047   0.2708
 -1.545    2.154   -1.758    0.4424  -1.366    1.809    0.347    2.451
  1.658   -0.9473   2.766   -1.251    0.164   -0.1345   0.751   -0.5083
 -1.247    0.3098  -0.793    0.2264  -1.852   -1.138   -0.05344  0.739
  0.11304  0.619  ]
(197, 132, 300)
(197, 39600)
[-0.7144 -1.232   0.49   -0.2788  0.3442  0.5703 -1.1045  0.5747  0.646
  1.317  -0.3972  0.357   3.158  -0.2544 -0.259  -0.135   1.456   4.97
 -1.786   2.35    0.232  -1.206   0.846   1.296  -0.7773 -0.0883 -0.5806
 -0.3147  1.126   1.328   0.33    1.735   1.116  -1.099   0.0929 -0.2737
  2.146   1.369  -0.7153 -0.1835  0.332   0.8726  0.2756 -0.4575  2.412
 -4.758   0.7764  0.3591 -0.806   0.7935]


# With RMSprop and Batch Normalization

In [16]:
# Callbacks for this run (named after embedding + model type), plus the LR scheduler.
callbacks_ = keras_callbacks(
    word2vec_type="muhammed_cbow_word2vec",
    model_type="Rmsprob_lstm_with_batch",
    learning_rate=learning_rate,
)
callbacks_.append(performance_lr)

# Create the LSTM-with-batch-normalization network and compile it with the RMSprop optimizer.
model = seqential_model_compile(
    lstm_with_batch_model_create(hid_num_neurons, max_len_str, number_of_features, dropout=.2),
    RMSprop_optimizer,
)
model.summary()

muhammed_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05_
../test_models/ml_models_saved/dl_models/tensor_logs/run_2022_05_10_05_42_31_muhammed_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05_
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_4 (Batc  (None, 132, 300)         1200      
 hNormalization)                                                 
                                                                 
 lstm_2 (LSTM)               (None, 132, 25)           32600     
                                                                 
 batch_normalization_5 (Batc  (None, 132, 25)          100       
 hNormalization)                                                 
                                                                 
 dropout_2 (Dropout)         (None, 132, 25)           0         
                                       

In [17]:
# Train on the muhammed-embedded training split, validating on the held-out split each epoch.
history = model.fit(
    X_train_embed_matrix,
    y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(X_val_embed_matrix, y_val),
    callbacks=callbacks_,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Load best model & predict test set

In [18]:
# Load the stratified test split from its CSV file (path is relative to the notebook).
test_csv_path = "test/strat_test_set.csv"
strat_test_set = read_file(test_csv_path)

# Show the first rows as a quick sanity check of the loaded columns.
strat_test_set.head()

Unnamed: 0,text,label,classes
0,نرجوا عدم متابعه وحظر حسابات : المباحث تابعني ...,2,OBJ
1,المسلماني اغلي متحدث اعلامي للرئيس واشهر كذاب ...,0,NEG
2,الاحد 3 يوليو : ابو الفتوح في ندوه بالمؤتمر ال...,2,OBJ
3,#كبسوله_صحيه #صحه #طفل #طفلي #نوم ##نوبه #نصيحه,2,OBJ
4,#اللي_رافضين_السيسي_رئيس_بيعملوا_فولوا_لبعض #م...,1,NEU


In [19]:
# Class labels as a NumPy array, and the raw texts as a plain Python list
# (the tokenizer in the next cell is fed this list).
y_test = strat_test_set['label'].values
X_test_text = strat_test_set['text'].tolist()

In [20]:
divider = "="*50

# Word-level tokens for the test texts, previewed next to the raw input.
X_test_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(X_test_text)
for caption, sample in (
    ("Before Tokenization : \n", X_test_text),
    ("After Tokenization : \n", X_test_text_tokenized),
):
    print(caption, sample[:3])
    print(divider)

# n-gram expansion of the test tokens (these are the inputs to the embedding step).
fullgram_X_test_text_tokenized = get_all_ngrams(X_test_text_tokenized)
print("full gram tokenization : \n", fullgram_X_test_text_tokenized[:3])
print(divider)

Before Tokenization : 
 ['نرجوا عدم متابعه وحظر حسابات : المباحث تابعني واتابعك زياده المتابعين الاخبار المحليه #الشعب_يقول_كلمته #جماعه_انصار_بيت_طنيطر #الرياض #جده', 'المسلماني اغلي متحدث اعلامي للرئيس واشهر كذاب ومنافق ومضلل ومحرض تحول الي قليل الادب وضع معارضي الجيش في سله القمامه كلام زباله', 'الاحد 3 يوليو : ابو الفتوح في ندوه بالمؤتمر السنوي لهندسه عين شمس 5 مساء']
After Tokenization : 
 [['نرجوا', 'عدم', 'متابعه', 'وحظر', 'حسابات', ':', 'المباحث', 'تابعني', 'واتابعك', 'زياده', 'المتابعين', 'الاخبار', 'المحليه', '#', 'الشعب_يقول_كلمته', '#', 'جماعه_انصار_بيت_طنيطر', '#', 'الرياض', '#', 'جده'], ['المسلماني', 'اغلي', 'متحدث', 'اعلامي', 'للرئيس', 'واشهر', 'كذاب', 'ومنافق', 'ومضلل', 'ومحرض', 'تحول', 'الي', 'قليل', 'الادب', 'وضع', 'معارضي', 'الجيش', 'في', 'سله', 'القمامه', 'كلام', 'زباله'], ['الاحد', '3', 'يوليو', ':', 'ابو', 'الفتوح', 'في', 'ندوه', 'بالمؤتمر', 'السنوي', 'لهندسه', 'عين', 'شمس', '5', 'مساء']]
full gram tokenization : 
 [['نرجوا', 'عدم', 'متابعه', 'وحظر', 'حسابات', ':'

In [21]:
# Load the saved model of each run.
# NOTE(review): presumably these .h5 files were written by the training callbacks — confirm.
saved_models_dir = "../ml_models_saved/dl_models/"
rezk_model = keras_load_model(saved_models_dir + "run_with_rezk_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05__model.h5")
bakr_model = keras_load_model(saved_models_dir + "run_with_bakr_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05__model.h5")
muhammed_model = keras_load_model(saved_models_dir + "run_with_muhammed_cbow_word2vec_Rmsprob_lstm_with_batch_learning_rate=5e-05__model.h5")

# Each classifier is scored on test features built with the word2vec model of the same name.
evaluation_pairs = (
    (rezk_model, our_word2vec_model),
    (bakr_model, bakr_word2vec_model),
    (muhammed_model, muhammed_word2vec_model),
)

for position, (classifier, embedding) in enumerate(evaluation_pairs):
    if position:
        # Rule between consecutive reports (none after the last one).
        print("="*50)
    flat_test_matrix = text_to_matrix_using_word2vec(embedding, fullgram_X_test_text_tokenized, max_len_str)
    # Same 3-D layout the models were trained on: (samples, max_len_str, number_of_features).
    X_test_embed_matrix = flat_test_matrix.reshape(flat_test_matrix.shape[0], max_len_str, number_of_features)
    last_f1_result = keras_f1_score_result(classifier, X_test_embed_matrix, y_test)

# Keep the final evaluation's return value as the cell's displayed result.
last_f1_result

(201, 132, 300)
(201, 39600)
[ 0.2988  -0.2693   0.10986 -0.641   -0.2272  -0.833    0.5947   0.2289
  1.029    1.69     0.1128  -0.513    0.5083  -0.5146   0.455    1.061
  0.623    0.8276  -0.07904  1.13     0.2983   0.584   -0.336    0.817
 -0.747    0.2122  -0.772   -1.026   -0.213    0.78     0.2832   0.05463
  0.2041   0.4233  -1.4795   0.4827  -0.4382   0.2524  -0.6396   0.5166
 -0.0176  -0.1283  -0.4321  -0.4316   0.2345   0.10675  0.08984  0.03406
 -0.349    0.0258 ]
F1 score is:  0.6567164179104478
(201, 132, 300)
(201, 39600)
[ 0.1403   2.791   -0.1461  -1.685   -0.5776   0.8594   0.1614  -0.643
  1.949    1.234    0.5005  -1.628    0.01268  0.6436  -1.111   -0.3813
  1.564   -0.1381   0.2703   1.786   -0.4143   0.3918  -0.94     0.798
  1.602    1.107   -1.866    1.3955   2.795   -0.5894   2.129    2.266
  0.918   -0.12256  0.03775 -0.8667  -1.592   -1.315   -0.4163   1.871
 -0.1918  -1.6455  -1.846   -0.6865   1.72     1.369   -0.808    0.01648
  1.045    0.5747 ]
F1 score

0.612