In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import sklearn

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences



# Read data

In [2]:
# Load the cleaned tweets. The CSV is read without a header row and the first
# two rows are dropped (presumably header/junk rows — TODO confirm).
dataframe = pd.read_csv("../DATA/cleaned_data1.csv", names=["tweet", "class"]).iloc[2:,:]
# Binary target: 1 for the label "Positive", 0 for every other label.
dataframe["class"] = np.where(dataframe["class"] == "Positive",1,0)
# Shuffle with a fixed seed: without it the row order (and therefore the
# content of the seeded train/test split below) changes on every run.
dataframe = dataframe.sample(frac=1, random_state=42)
# Whitespace-tokenize every tweet; str() keeps non-string cells from crashing
# the split (a missing value becomes the single token "nan").
x = dataframe['tweet'].apply(lambda tweet: str(tweet).split()).to_list()
y = dataframe['class'].to_list()


In [3]:
# Stratified hold-out split (25% test) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)


# Load word embedding models

In [4]:
from gensim.models import Word2Vec
from gensim.models import FastText

# Load the four previously trained embedding models from disk.
# The "sg"/"cbow" suffixes presumably stand for skip-gram and continuous
# bag-of-words training — confirm against the notebook that trained them.

# FastText :
ft_model_sg   = FastText.load("../CODE/EmbeddingModels/ft_model_sg.model")
ft_model_cbow  = FastText.load("../CODE/EmbeddingModels/ft_model_cbow.model")

# Word2vec :
w2v_model_sg   = Word2Vec.load("../CODE/EmbeddingModels/w2v_model_sg.model")
w2v_model_cbow = Word2Vec.load("../CODE/EmbeddingModels/w2v_model_cbow.model")



In [52]:
# Vocabulary cap handed to the Keras Tokenizer (num_words).
MAX_NB_WORDS = 100000
# Width of every embedding matrix / Embedding layer built below.
# NOTE(review): must equal the vector size of the loaded gensim models — confirm.
embedding_dim = 300
# Number of training epochs passed to fit() for each model.
epoch = 1

In [53]:
def get_max_length(data):
    """Return the length of the longest item in ``data``.

    Args:
        data: Sequence of sized items (here: tweets as lists of tokens).

    Returns:
        The largest ``len(item)`` found, or 0 when ``data`` is empty.
    """
    return max((len(sentence) for sentence in data), default=0)

In [21]:
# Longest tokenized tweet in the corpus. Kept under its own name for
# inspection only; it is NOT used as the padding length.
longest_tweet_len = get_max_length(x)

# Fixed sequence length used for padding/truncation and by the Embedding
# layers. Hard-coded: the data-derived value above is not used.
max_len_data = 500

print(f"max_len_data = {max_len_data}")


In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer

# create the tokenizer, capped at MAX_NB_WORDS
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# fit the tokenizer on the documents
# NOTE(review): `x` is the full corpus, so the vocabulary is built from the
# test tweets as well as the training tweets — confirm this is intended.
tokenizer.fit_on_texts(x)


In [23]:
# Number of rows for the embedding matrices: one per vocabulary entry plus one.
# The +1 is presumably for index 0, which Keras word indices do not use — confirm.
total_words = len(tokenizer.word_index) +1
print(f" Total words : {total_words}")

 Total words : 174770


In [24]:
def get_embedding_matrix(model,data,embedding_dim, max_nb_words, total_words,tokenizer):
    """Build an embedding weight matrix aligned with the tokenizer's word indices.

    Row ``i`` holds the vector that ``model.wv`` returns for the word whose
    tokenizer index is ``i``. Words for which the lookup raises ``KeyError``
    keep an all-zero row.

    Args:
        model: Object exposing a ``wv`` mapping from word to vector.
        data: Unused; kept so existing call sites keep working.
        embedding_dim: Length of each vector (number of matrix columns).
        max_nb_words: Unused; kept so existing call sites keep working.
        total_words: Number of matrix rows.
        tokenizer: Object exposing ``word_index``, a dict of word -> int index.

    Returns:
        A ``(total_words, embedding_dim)`` NumPy array.
    """
    embedding_matrix = np.zeros((total_words, embedding_dim))
    for word, index in tokenizer.word_index.items():
        # Indices that do not fit in the matrix have no row to fill.
        if index >= total_words:
            continue
        try:
            embedding_vector = model.wv[word]
        except KeyError:
            # Unknown word: leave its row at zero. Skipping here matters — the
            # vector variable must never carry over from the previous word.
            continue
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    return embedding_matrix

In [25]:
# One embedding matrix per loaded embedding model, all indexed by the same tokenizer.
embedding_matrix_ft_sg    = get_embedding_matrix(ft_model_sg,   x,embedding_dim,MAX_NB_WORDS,total_words,tokenizer)
embedding_matrix_ft_cbow  = get_embedding_matrix(ft_model_cbow, x,embedding_dim,MAX_NB_WORDS,total_words,tokenizer)
embedding_matrix_w2v_sg   = get_embedding_matrix(w2v_model_sg,  x,embedding_dim,MAX_NB_WORDS,total_words,tokenizer)
embedding_matrix_w2v_cbow = get_embedding_matrix(w2v_model_cbow,x,embedding_dim,MAX_NB_WORDS,total_words,tokenizer)


In [26]:
# Convert the token lists to integer index sequences and pad/truncate each
# one to max_len_data (pad_sequences is called with its default settings).

padding_X_train = pad_sequences(tokenizer.texts_to_sequences(X_train),maxlen = max_len_data)
padding_X_test  = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen = max_len_data)


In [27]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM , Embedding , Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import *


In [14]:
#dtype=object
#X_train = np.array(X_train)

In [15]:
#X_train = np.reshape(X_train, newshape = range(X_train.shape[0], X_train.shape[1], 1))

In [16]:
#X_train = np.reshape(X_train,(len(X_train),max_len_data,1))

In [28]:
def model(embedding_layer):
    """Assemble and compile a binary LSTM classifier on top of ``embedding_layer``.

    Layer stack, in order: the given embedding layer, ``LSTM(64, dropout=0.1)``,
    ``Flatten``, and a single sigmoid ``Dense`` unit. The network is compiled
    with Adam (learning rate 3e-4), binary cross-entropy loss and the accuracy
    metric.

    Args:
        embedding_layer: Layer instance placed first in the network.

    Returns:
        The compiled ``Sequential`` model.
    """
    network = Sequential()
    for layer in (
        embedding_layer,
        LSTM(64, dropout=0.1),
        Flatten(),
        Dense(1, activation='sigmoid'),
    ):
        network.add(layer)

    network.compile(
        optimizer=Adam(learning_rate=3e-4),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [29]:
def evaluate_model(y_test,predictions, file ,info ):
    """Write a plain-text evaluation report for one model to an open file.

    The report contains, in order: a title line built from ``info``, the
    confusion matrix, the classification report, accuracy, recall and F1
    (``zero_division=1``).

    Args:
        y_test: Ground-truth labels.
        predictions: Predicted labels, aligned with ``y_test``.
        file: Object with a ``write`` method; it is neither flushed nor closed here.
        info: Description of the evaluated model, used in the title line.
    """
    # (heading, metric) pairs in output order. Each metric is computed lazily,
    # right before its own section is written.
    sections = (
        ("Confusion Matrix : \n", lambda: confusion_matrix(y_test, predictions)),
        ("Classification Report : \n", lambda: classification_report(y_test, predictions)),
        ("Accuracy score : \n", lambda: accuracy_score(y_test, predictions)),
        ("Recall Score : \n", lambda: recall_score(y_test, predictions)),
        ("F1-score : \n", lambda: f1_score(y_test, predictions, zero_division=1)),
    )

    file.write(f"\n Le model : {info!s}\n")
    for heading, compute in sections:
        file.write(f"{heading}{compute()!s}\n")

In [30]:
# One Embedding layer per pretrained matrix, each initialised from that matrix
# and sized for sequences of max_len_data indices.
# NOTE(review): no `trainable=False` is passed, so these weights are presumably
# updated during training — confirm that fine-tuning is intended.
embedding_layer_ft_sg    = Embedding(total_words, embedding_dim, weights=[embedding_matrix_ft_sg], input_length   =max_len_data)
embedding_layer_ft_cbow  = Embedding(total_words, embedding_dim, weights=[embedding_matrix_ft_cbow], input_length =max_len_data)
embedding_layer_w2v_sg   = Embedding(total_words, embedding_dim, weights=[embedding_matrix_w2v_sg], input_length  =max_len_data)
embedding_layer_w2v_cbow = Embedding(total_words, embedding_dim, weights=[embedding_matrix_w2v_cbow], input_length=max_len_data)


In [20]:
#pip install -U numpy==1.18.5


^C
Note: you may need to restart the kernel to use updated packages.


In [31]:
# Build one compiled LSTM classifier per embedding layer.
# NOTE(review): the training loop further down rebinds the name `model`, so
# this cell fails if it is re-run after that loop without re-defining model().
model_ft_sg    = model(embedding_layer_ft_sg)
model_ft_cbow  = model(embedding_layer_ft_cbow)
model_w2v_sg   = model(embedding_layer_w2v_sg)
model_w2v_cbow = model(embedding_layer_w2v_cbow)


In [32]:
def predict(model,x_test,y_test):
    """Return rounded class predictions for ``x_test``.

    Args:
        model: Object whose ``predict(x_test)`` returns one row per sample,
            with the score in the row's first position.
        x_test: Inputs forwarded unchanged to ``model.predict``.
        y_test: Unused; kept so existing call sites keep working.

    Returns:
        A list with ``round(score)`` for every sample, in input order.
    """
    scores = model.predict(x_test)
    return [round(row[0]) for row in scores]

In [33]:
# From here on the train/test feature names refer to the padded index matrices;
# the label lists are used as they are.
X_train, X_test = padding_X_train, padding_X_test

# Display names and models, paired position by position.
models_names = ["FastText_SG","FastText_CBOW","Word2vec_SG","Word2vec_CBOW"]
models = [model_ft_sg,model_ft_cbow,model_w2v_sg,model_w2v_cbow]

In [34]:
# Open the results file in write mode (any existing content is replaced).
# The handle stays open across the following cells and is only released by
# the explicit results.close() cell after the training loop.
# NOTE(review): if training raises, the file is left open — consider a `with` block.
results = open("../CODE/Results/Result_LSTM1E1.txt",'w')

In [35]:
import os.path

In [36]:
# Train, save and evaluate each model in turn.
# The loop variable is intentionally not called `model`: that name is the
# model-building function, and rebinding it here would make the
# model-construction cell fail on any later re-run.
for lstm_model, name in zip(models, models_names) :
    info = "Train the LSTM using : "+name
    print(len(X_train),len(y_train),len(X_test),len(y_test))
    lstm_model.fit(X_train, np.array(y_train), epochs = epoch, verbose=1)
    lstm_model.save("../Models/LSTM1E1_model_"+name)
    evaluate_model(y_test,predict(lstm_model,X_test,y_test), results ,info )
    # Push this model's report to disk now so it survives a failure in a later iteration.
    results.flush()
    print("- finish -------------------------------------------------------------------- "+info)


37398 37398 12466 12466
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ../Models/LSTM1E1_model_FastText_SG\assets
- finish -------------------------------------------------------------------- Train the LSTM using : FastText_SG
37398 37398 12466 12466
INFO:tensorflow:Assets written to: ../Models/LSTM1E1_model_FastText_CBOW\assets
- finish -------------------------------------------------------------------- Train the LSTM using : FastText_CBOW
37398 37398 12466 12466
INFO:tensorflow:Assets written to: ../Models/LSTM1E1_model_Word2vec_SG\assets
- finish -------------------------------------------------------------------- Train the LSTM using : Word2vec_SG
37398 37398 12466 12466
INFO:tensorflow:Assets written to: ../Models/LSTM1E1_model_Word2vec_CBOW\assets
- finish ----

In [37]:
# Close the results file opened before the training loop; this also flushes
# any report text still buffered.
results.close()

# To be reviewed

In [45]:
from tensorflow.keras.models import load_model
# Reload the FastText-CBOW LSTM from the directory written by the training loop.
LSTM_model_FastText_CBOW = load_model('../Models/LSTM1E1_model_FastText_CBOW')

In [51]:

# Manual smoke test on a single raw tweet. The commented-out lines are
# alternative sample inputs.
#text = ["ياخي حالة ياخي"]
#text = ["نتي ناس ملاح و نحبوك"]
text = ["مليحة "]
# Same preprocessing as the training data: index sequence, then pad to max_len_data.
# NOTE(review): `text` is rebound from a list of strings to the padded array.
text = pad_sequences(tokenizer.texts_to_sequences(text),maxlen = max_len_data)

# The third argument is a placeholder: predict() does not use its y_test parameter.
predict(LSTM_model_FastText_CBOW,text,[0])

[0]