In [1]:
import sys
sys.path.append('../../preprocess_assets/')

In [2]:
# Main libraries 
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt

# Our files
from data_shuffling_split import *
from features_extraction import *
from ara_vec_preprocess_configs import *
from ml_modeling import *

In [None]:
# Read the stratified training split through the project's read_file helper
# and show its first rows as the cell output.
train_split_csv = "train/strat_train_set.csv"
strat_train_set = read_file(train_split_csv)
strat_train_set.head()

In [None]:
# prepare_data returns four values; judging by the names they are the
# train/validation texts and labels — confirm against the helper.
train_val_parts = prepare_data(strat_train_set)
x_train_text, x_val_text, y_train, y_val = train_val_parts

In [None]:
# Tokenize both splits, echoing a three-item sample before and after each step,
# then expand the tokens with get_all_ngrams. Statement order is significant
# for the printed output and is kept as: train, validation, n-grams.
divider = "=" * 50

# --- training split ---
x_train_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_train_text)
print("Before Tokenization : \n", x_train_text[:3])
print(divider)
print("After Tokenization : \n", x_train_text_tokenized[:3])
print(divider)

# --- validation split (no divider after its "After" sample) ---
x_val_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_val_text)
print("Before Tokenization : \n", x_val_text[:3])
print(divider)
print("After Tokenization : \n", x_val_text_tokenized[:3])

# --- n-gram expansion: train first, then validation ---
fullgram_x_train_text_tokenized = get_all_ngrams(x_train_text_tokenized)
print("full gram tokenization : \n", fullgram_x_train_text_tokenized[:3])
print(divider)
fullgram_x_val_text_tokenized = get_all_ngrams(x_val_text_tokenized)
print("full gram tokenization : \n", fullgram_x_val_text_tokenized[:3])
print(divider)

In [None]:
# Length (number of items) of every training text after the n-gram expansion.
num_of_words_in_each_text = list(map(len, fullgram_x_train_text_tokenized))

# Largest of those lengths.
max_len = max(num_of_words_in_each_text)
print("The max length is: ", max_len)

# Cell output: the first ten lengths.
num_of_words_in_each_text[:10]

In [None]:
# Summarise the text lengths with get_keys_that_val_gr_than_num (called with
# the threshold 1000 — presumably it keeps lengths occurring more often than
# that; confirm against the helper) and draw one bar per returned key.
new_dicts = get_keys_that_val_gr_than_num(num_of_words_in_each_text, 1000)
keys, values = list(new_dicts.keys()), list(new_dicts.values())

# Dark theme, wide figure.
plt.style.use('dark_background')
fig = plt.gcf()
fig.set_size_inches(18.5, 6)

# Bars sit at positions 0..n-1 and are labelled with the keys.
bar_positions = range(len(new_dicts))
plt.bar(bar_positions, values, tick_label=keys)
plt.show()

In [None]:
# Load the saved word2vec model via the project's loader. The file name
# suggests a CBOW model trained with window 3 and min_count 300.
word2vec_model_file = "../word2vec_models/rezk_unigram_CBOW_model/train_word2vec_cbow__window_3_min_count_300"
word2vec_model = load_word2vec_model(word2vec_model_file)

In [None]:
# Settings shared by the modelling cells below.
number_of_features = 300
max_len_str = 128
word2vec_path = "rezk/"
model_path_to_save = "../ml_models_saved/"
estimators = voting_models()

# Turn the tokenized texts into matrices with the loaded word2vec model:
# training split first, validation second, both with the same max_len_str.
X_train_embed_matrix, X_val_embed_matrix = (
    text_to_matrix_using_word2vec(word2vec_model, tokenized_texts, max_len_str)
    for tokenized_texts in (fullgram_x_train_text_tokenized, fullgram_x_val_text_tokenized)
)

# Print each matrix shape followed by a divider line.
for embed_matrix in (X_train_embed_matrix, X_val_embed_matrix):
    print(embed_matrix.shape)
    print("=" * 50)

In [None]:
# Train Logistic Regression

In [None]:
# Multinomial logistic regression: L2 penalty, C=1, lbfgs solver, verbose output.
model = LogisticRegression(
    penalty='l2',
    C=1,
    multi_class='multinomial',
    solver='lbfgs',
    verbose=1,
)

# Hand the estimator and both splits to the project pipeline; `model` is
# rebound to whatever the pipeline returns.
model = ml_classifer_pipeline(
    model,
    X_train_embed_matrix, y_train,
    X_val_embed_matrix, y_val,
    word2vec_path, model_path_to_save,
)

In [None]:
# Train SVC

In [None]:
# Linear SVM with C=0.5; the solver is capped at 50 iterations.
model = LinearSVC(
    C=0.5,
    max_iter=50,
    verbose=1,
)

# Same pipeline call as the other classifiers; `model` is rebound to its result.
model = ml_classifer_pipeline(
    model,
    X_train_embed_matrix, y_train,
    X_val_embed_matrix, y_val,
    word2vec_path, model_path_to_save,
)

In [None]:
# Hard Voting

In [None]:
# VotingClassifier is the only scikit-learn estimator used in this notebook
# that the import cell does not name explicitly. Import it here so the cell
# works on a fresh kernel without depending on a star import re-exporting it.
from sklearn.ensemble import VotingClassifier

# Hard (majority-vote) ensemble over the estimators returned by voting_models().
model = VotingClassifier(estimators, voting="hard")

# Same pipeline call as the other classifiers; `model` is rebound to its result.
model = ml_classifer_pipeline(model, X_train_embed_matrix, y_train, X_val_embed_matrix, y_val, word2vec_path, model_path_to_save)

In [None]:
# AdaBoost 

In [None]:
# Base learner for boosting: a linear SVM with C=0.5.
model = LinearSVC(
    C=0.5,
    verbose=1,
)

# Wrap it in AdaBoost (SAMME variant, five boosting rounds).
model = AdaBoostClassifier(
    model,
    algorithm="SAMME",
    n_estimators=5,
)

# Same pipeline call as the other classifiers; `model` is rebound to its result.
model = ml_classifer_pipeline(
    model,
    X_train_embed_matrix, y_train,
    X_val_embed_matrix, y_val,
    word2vec_path, model_path_to_save,
)

In [None]:
#  Gradient Boosting

In [None]:
# Gradient boosting: 10 trees of depth 5, each fit on a 10% subsample,
# learning rate 0.5.
model = GradientBoostingClassifier(
    n_estimators=10,
    subsample=.1,
    learning_rate=.5,
    max_depth=5,
    verbose=1,
)

# Same pipeline call as the other classifiers; `model` is rebound to its result.
model = ml_classifer_pipeline(
    model,
    X_train_embed_matrix, y_train,
    X_val_embed_matrix, y_val,
    word2vec_path, model_path_to_save,
)

In [None]:
# Load best model & predict test set

In [None]:
# Read the stratified test split through the project's read_file helper
# and show its first rows as the cell output.
test_split_csv = "test/strat_test_set.csv"
strat_test_set = read_file(test_split_csv)
strat_test_set.head()

In [None]:
# Take the 'text' and 'label' columns of the test set: texts become a plain
# list, labels are kept as the column's underlying values.
text_column, label_column = strat_test_set['text'], strat_test_set['label']
X_test_text = list(text_column)
y_test = label_column.values

In [None]:
# Tokenize the test texts, echoing a three-item sample before and after,
# then expand the tokens with get_all_ngrams.
divider = "=" * 50

X_test_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(X_test_text)
print("Before Tokenization : \n", X_test_text[:3])
print(divider)
print("After Tokenization : \n", X_test_text_tokenized[:3])
print(divider)

# --- n-gram expansion ---
fullgram_X_test_text_tokenized = get_all_ngrams(X_test_text_tokenized)
print("full gram tokenization : \n", fullgram_X_test_text_tokenized[:3])
print(divider)

In [None]:
# Build the test matrix with the same word2vec model and max_len_str that
# were used for the train/validation matrices.
X_test_embed_matrix = text_to_matrix_using_word2vec(
    word2vec_model,
    fullgram_X_test_text_tokenized,
    max_len_str,
)

# Shape on one line, divider on the next.
print(X_test_embed_matrix.shape, "=" * 50, sep="\n")

In [None]:
# Load the best saved classifier and score it on the held-out test set.
#
# The original cell could not be parsed: the path expression ended in a
# dangling `+` (the file name was never filled in) and the scoring call had a
# stray trailing colon.
#
# TODO: set this to the file name that ml_classifer_pipeline wrote for the
# best-scoring model.
best_model_file_name = ""
if not best_model_file_name:
    raise ValueError(
        "best_model_file_name is empty: set it to the saved model file "
        "you want to evaluate on the test set."
    )

# NOTE(review): the models were saved with model_path_to_save="../ml_models_saved/"
# and word2vec_path="rezk/", while this prefix has no leading "../". How
# pickle_load_model resolves paths is not visible here — confirm that both
# point at the same directory.
model = pickle_load_model("ml_models_saved/rezk/" + best_model_file_name)
f1_score_result(model, X_test_embed_matrix, y_test)

# Qualitative Evaluation

In [None]:
# Vocabulary of the loaded embedding, in the order stored in index_to_key.
vocabs = list(word2vec_model.wv.index_to_key)

# Report the vocabulary size, then show the first hundred entries.
vocab_size = len(vocabs)
print("The number of words the model learn from your dataset are:", vocab_size)
print(vocabs[:100])

In [None]:
# Probe: tokens the embedding ranks as most similar to 'قابس'
# (reviewer's gloss: Gabès, a Tunisian city; the word can also mean "plug").
word2vec_model.wv.most_similar('قابس')

In [None]:
# Probe: tokens most similar to 'طنطا' (reviewer's gloss: Tanta, an Egyptian city).
word2vec_model.wv.most_similar('طنطا')

In [None]:
# Probe: tokens most similar to 'فندق' (reviewer's gloss: "hotel").
word2vec_model.wv.most_similar('فندق')

In [None]:
# Probe: tokens most similar to an emoji token (rolling-on-the-floor-laughing face).
word2vec_model.wv.most_similar('🤣')

In [None]:
# Probe: tokens most similar to a country-flag emoji token (Kuwait flag).
word2vec_model.wv.most_similar('🇰🇼')

In [None]:
# Probe: tokens most similar to the Latin-script token 'IT'.
word2vec_model.wv.most_similar('IT')

In [None]:
# Probe: tokens most similar to the Arabic question mark '؟'.
word2vec_model.wv.most_similar('؟')

In [None]:
# Probe: tokens most similar to an emoji token (cherry blossom).
word2vec_model.wv.most_similar('🌸')

In [None]:
# Probe: tokens most similar to 'كورونا' (reviewer's gloss: "corona").
word2vec_model.wv.most_similar('كورونا')

In [None]:
# Probe: tokens most similar to 'كرونا', a spelling variant of the
# 'كورونا' token queried in the neighbouring cell.
word2vec_model.wv.most_similar('كرونا')

In [None]:
# Probe: tokens most similar to the Latin-script token 'covid'.
word2vec_model.wv.most_similar('covid')

In [None]:
# Probe: tokens most similar to the underscore-joined token 'covid_19'.
word2vec_model.wv.most_similar('covid_19')

In [None]:
# Probe: tokens most similar to 'بايدن' (reviewer's gloss: "Biden").
word2vec_model.wv.most_similar('بايدن')

In [None]:
# Probe: tokens most similar to the underscore-joined multi-word token
# 'عمر_بن_الخطاب' (reviewer's gloss: Omar ibn al-Khattab).
word2vec_model.wv.most_similar('عمر_بن_الخطاب')

In [None]:
# Probe: tokens most similar to the underscore-joined token 'ابو_تريكه'
# (reviewer's gloss: Abou Treika, a personal name).
word2vec_model.wv.most_similar('ابو_تريكه')

In [None]:
# Probe: tokens most similar to the underscore-joined token 'بن_خلدون'
# (reviewer's gloss: (Ibn) Khaldun, a personal name).
word2vec_model.wv.most_similar('بن_خلدون')

In [None]:
# Reduce the dimension of SENTIMENT_WORDS
# tsne_graph receives the loaded word2vec model, the SENTIMENT_WORDS list and
# two numeric settings (500 and .03).
# NOTE(review): what those two numbers control is not visible here — confirm
# against the helper's signature.
tsne_df_scale = tsne_graph(word2vec_model, SENTIMENT_WORDS, 500, .03)

# Assigning to `_` keeps the helpers' return values out of the cell output.
_ = init_graph_style()

# Draw the words from the reduced frame; the last argument is a PNG file name
# (presumably where the figure is saved — confirm in word_display).
_ = word_display(tsne_df_scale, SENTIMENT_WORDS, "SENTIMENT_WORDS.png")

In [None]:
# Reduce the dimension of NER_WORDS
# Uses word2vec_model, the embedding loaded earlier in this notebook and used
# by the SENTIMENT_WORDS cell. The previous argument, rezk_model, is never
# defined in this notebook and raised NameError on a fresh kernel.
# NOTE(review): the meaning of the numeric settings (1400 and .06) is not
# visible here — confirm against tsne_graph's signature.
tsne_df_scale = tsne_graph(word2vec_model, NER_WORDS, 1400, .06)

# Assigning to `_` keeps the helpers' return values out of the cell output.
_ = init_graph_style(figsize=(16, 10))

# Draw the words from the reduced frame; the last argument is a PNG file name
# (presumably where the figure is saved — confirm in word_display).
_ = word_display(tsne_df_scale, NER_WORDS, "NER_WORDS.png")