In [None]:
import glob
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from configs import *
from fetch_data import *
from features_extraction import *
from data_shuffling_split import *
from data_preprocess import *
from ml_modeling import *

In [None]:
# Load the stratified training split and keep only its first 5000 rows,
# then preview the result as the cell output.
strat_train_set = read_csv("train/strat_train_set.csv").iloc[:5000]
strat_train_set.head()

In [None]:
# Load the word2vec model from disk; it is used for every embedding lookup
# in the cells below.
word_to_vec_model = load_word2vec_model(
    "models/word2vec/bakrianoo_unigram_cbow_100_twitter/full_uni_cbow_100_twitter.mdl"
)

In [None]:
# prepare_data is a project helper (defined outside this notebook); its four
# return values are unpacked as train/validation texts and their labels.
(
    x_train_text,
    x_val_text,
    y_train,
    y_val,
) = prepare_data(strat_train_set)

In [None]:
def _show_before_after(raw_texts, tokenized_texts):
    """Print the first three samples before and after tokenization."""
    print("Before Tokenization : \n", raw_texts[:3])
    print("="*50)
    print("After Tokenization : \n", tokenized_texts[:3])


# Training split: tokenize, then preview.
x_train_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_train_text)
_show_before_after(x_train_text, x_train_text_tokenized)

# Separator between the training preview and the validation preview.
print("="*50)

# Validation split: tokenize, then preview.
x_val_text_tokenized = tokenize_using_nltk_TreebankWordTokenizer(x_val_text)
_show_before_after(x_val_text, x_val_text_tokenized)

In [None]:
# Sequence/embedding dimensions used by the reshape and model cells below.
# NOTE(review): 100 presumably matches the word2vec vector size (the model
# file name contains "_100_") -- confirm.
number_of_features, max_len_str = 100, 64

# Filesystem locations.
word2vec_path = "rezk/"
model_path_to_save = "models/ml_models/"

# Estimators produced by the project helper voting_models().
estimators = voting_models()

# Turn the tokenized texts into embedding matrices via the word2vec model.
X_train_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model,
    x_train_text_tokenized,
    max_len_str,
)
X_val_embed_matrix = text_to_matrix_using_word2vec(
    word_to_vec_model,
    x_val_text_tokenized,
    max_len_str,
)

In [None]:
# Shape of the training matrix as returned by text_to_matrix_using_word2vec,
# i.e. before the reshape performed in the next cell.
X_train_embed_matrix.shape

In [None]:
# Reshape both matrices to (n_samples, max_len_str, number_of_features),
# keeping the sample count of each split unchanged.
X_train_embed_matrix = X_train_embed_matrix.reshape(
    (X_train_embed_matrix.shape[0], max_len_str, number_of_features)
)
X_val_embed_matrix = X_val_embed_matrix.reshape(
    (X_val_embed_matrix.shape[0], max_len_str, number_of_features)
)

In [None]:
# Shape after the reshape above: (n_samples, max_len_str, number_of_features).
X_train_embed_matrix.shape

In [None]:
def tokenize_and_vectorize(train_text, word_to_vec_model):
    """Convert text samples into per-sample lists of word vectors.

    Every sample is split with a ``TreebankWordTokenizer`` and each token is
    looked up in ``word_to_vec_model.wv``. Tokens whose lookup raises
    ``KeyError`` are skipped, so a sample may yield fewer vectors than it has
    tokens (possibly none).

    Args:
        train_text: Iterable of text samples.
        word_to_vec_model: Object exposing a ``wv`` attribute that maps a
            token to its vector via ``wv[token]``.

    Returns:
        list: One entry per sample, in input order; each entry is the list of
        vectors found for that sample's tokens, in token order.
    """
    splitter = TreebankWordTokenizer()

    def embed(text):
        # Collect the vectors of all tokens that the model knows about.
        vectors = []
        for word in splitter.tokenize(text):
            try:
                vector = word_to_vec_model.wv[word]
            except KeyError:
                # Lookup failed for this token: deliberately drop it.
                continue
            vectors.append(vector)
        return vectors

    return [embed(text) for text in train_text]

In [None]:
# Vectorize the raw (untokenized) texts of both splits with the helper
# defined in the previous cell.
X_train = tokenize_and_vectorize(
    train_text=x_train_text,
    word_to_vec_model=word_to_vec_model,
)
X_val = tokenize_and_vectorize(
    train_text=x_val_text,
    word_to_vec_model=word_to_vec_model,
)

In [None]:
# Number of vectorized training samples (tokenize_and_vectorize returns one
# list per input text).
len(X_train)

In [None]:
# Number of token vectors kept for the first training sample; tokens whose
# word2vec lookup raised KeyError were skipped, so this can be smaller than
# the sample's token count.
len(X_train[0])

In [None]:
# Pad/truncate every sample to the configured sequence length. Use the shared
# max_len_str constant (64) instead of repeating the literal, so this step
# cannot drift from the embedding-matrix and model cells if the length changes.
# NOTE: this cell overwrites X_train / X_val in place; re-run the
# tokenize_and_vectorize cell first if the unpadded lists are needed again.
X_train = pad_trunc(X_train, max_len_str)
X_val = pad_trunc(X_val, max_len_str)

In [None]:
# Start from an empty Sequential model; layers are added in the cells below.
# Re-running this cell discards any layers added so far.
model = Sequential()

In [None]:
# Number of LSTM units.
num_nuros = 50

# The input shape is derived from the shared config constants (max_len_str=64,
# number_of_features=100) so it always matches the reshape applied to the
# embedding matrices instead of duplicating the literals.
# return_sequences=True keeps one output per timestep; those outputs are
# flattened before the dense layer in the next cell.
# NOTE: re-running this cell without re-creating `model` appends another LSTM.
model.add(
    LSTM(
        num_nuros,
        return_sequences=True,
        input_shape=(max_len_str, number_of_features),
    )
)

In [None]:
# Classification head: dropout, flatten the per-timestep LSTM outputs, then
# an 18-unit softmax layer.
model.add(Dropout(.2))
model.add(Flatten())
model.add(Dense(18, activation="softmax"))

# sparse_categorical_crossentropy expects integer class labels in y.
# `metrics` must be a list/tuple/dict: a bare string ("accuracy") is rejected
# with a ValueError by Keras 3, while the list form works on all versions.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])
model.summary()

In [None]:
# Train on the reshaped embedding tensors for 10 epochs (batch size 32),
# evaluating on the validation tensors after each epoch.
history = model.fit(
    x=X_train_embed_matrix,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val_embed_matrix, y_val),
)

In [None]:
# Shape of the validation tensor that was passed to model.fit as
# validation_data (reshaped earlier to (n_samples, max_len_str, number_of_features)).
X_val_embed_matrix.shape