In [2]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Read Data: 

In [3]:
# Load the cleaned tweets. The file is read with explicit column names (so no
# header row is inferred) and the first two rows are dropped -- presumably
# header/index residue in the CSV; TODO confirm against the file.
# (A previous, redundant read of the same file whose result was immediately
# overwritten has been removed.)
data = pd.read_csv("../DATA/cleaned_data1.csv", names=["tweet", "class"]).iloc[2:,:]

# Binary target: 1 for the label "Positive", 0 for every other label.
data["class"] = np.where(data["class"] == "Positive",1,0)

# Shuffle all rows. The seed is fixed because the later train/test split depends
# on row order: an unseeded shuffle makes the split differ on every run.
data = data.sample(frac = 1, random_state=42)

# Each tweet becomes a list of whitespace-separated tokens.
x = data['tweet'].apply(lambda tweet : str(tweet).split()).to_list()
y = data['class'].to_list()


In [4]:
#data['len'] = [len(tweet) for tweet in x]

In [29]:
#len(data['len']) - cpt

In [26]:
#cpt = 0
#for i in range(len(data)) :
    #tweet = data.iloc[i]
    #if tweet['len'] < 500 :
        #cpt += 1
        
#print(cpt)

In [5]:
# Hold out 25% of the samples for testing, stratified on the label; the fixed
# random_state keeps the split itself repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)


# Load Word Embedding Models :

In [6]:
from gensim.models import Word2Vec
from gensim.models import FastText

# FastText :
# Two saved gensim FastText models; the file names suggest a skip-gram ("sg")
# and a CBOW variant -- confirm against the script that trained them.
ft_model_sg   = FastText.load("../CODE/EmbeddingModels/ft_model_sg.model")
ft_model_cbow  = FastText.load("../CODE/EmbeddingModels/ft_model_cbow.model")

# Word2vec :
# Same pair of variants for Word2Vec, loaded from the same directory.
w2v_model_sg   = Word2Vec.load("../CODE/EmbeddingModels/w2v_model_sg.model")
w2v_model_cbow = Word2Vec.load("../CODE/EmbeddingModels/w2v_model_cbow.model")



### Parameters :  

In [7]:
# Vocabulary cap passed to the Keras Tokenizer as num_words.
MAX_NB_WORDS = 100_000

# Width of every row of the embedding matrices and of the Embedding layers;
# presumably equal to the vector size of the loaded word-embedding models.
embedding_dim = 300

# Number of training epochs used for each CNN.
epoch = 100

### Functions :  

Get the max length of sentences :

In [8]:
def get_max_length(data):
    """Return the length of the longest element of ``data``.

    ``data`` is accessed positionally (``data[0] .. data[len(data) - 1]``) and
    each element only needs to support ``len``. An empty ``data`` yields 0.
    """
    return max((len(data[position]) for position in range(len(data))), default=0)

In [9]:
# Longest tweet (in tokens) actually present in the corpus. Kept under its own
# name for reference; it is NOT used as the padding length.
observed_max_len = get_max_length(x)

# Fixed sequence length used for padding and for the Embedding layers.
# The observed maximum is deliberately not used here: the original cell
# overwrote it with this constant (twice); it is now assigned once.
max_len_data = 500

print(f"max_len_data = {max_len_data}")


Tokenize the data : 

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

# create the tokenizer (vocabulary capped through num_words=MAX_NB_WORDS)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# fit the tokenizer on the documents
# NOTE(review): x is the full corpus (train and test rows together), so the
# vocabulary is built from test tweets as well -- confirm this is intended.
tokenizer.fit_on_texts(x)


In [13]:
# Number of rows needed by the embedding matrix: one per word in the
# tokenizer's word_index plus one extra row (presumably for the reserved
# index 0 used by padding -- TODO confirm).
vocabulary_size = len(tokenizer.word_index)
total_words = vocabulary_size + 1
print(f" Total words : {total_words}")

 Total words : 174770


Get embedding matrix :

In [14]:
def get_embedding_matrix(model,data,embedding_dim, max_nb_words, total_words,tokenizer):
    """Build an embedding weight matrix from a word-embedding model.

    Row ``i`` of the result holds ``model.wv[word]`` for the word whose index in
    ``tokenizer.word_index`` is ``i``. Words the model cannot provide a vector
    for keep an all-zero row.

    Parameters
    ----------
    model : object exposing ``wv[word]`` lookups (e.g. a gensim model).
    data : unused; kept so existing call sites stay valid.
    embedding_dim : int, number of columns of the matrix.
    max_nb_words : unused; kept so existing call sites stay valid.
    total_words : int, number of rows of the matrix.
    tokenizer : object exposing a ``word_index`` mapping of word -> row index.

    Returns
    -------
    numpy.ndarray of shape ``(total_words, embedding_dim)``.
    """
    embedding_matrix = np.zeros((total_words, embedding_dim))
    for word, index in tokenizer.word_index.items():
        try:
            embedding_vector = model.wv[word]
        except KeyError:
            # Out-of-vocabulary word: leave its row at zero. Previously the
            # vector of the *previous* word leaked into this row (or a
            # NameError was raised when the very first word was unknown).
            continue
        if embedding_vector is not None :
            embedding_matrix[index] = embedding_vector
    return embedding_matrix

In [15]:

def _matrix_for(embedding_model):
    """Embedding matrix for one word-embedding model, using the shared notebook settings."""
    return get_embedding_matrix(embedding_model, x, embedding_dim, MAX_NB_WORDS, total_words, tokenizer)


# One matrix per loaded embedding model, built in the same order as before.
embedding_matrix_ft_sg = _matrix_for(ft_model_sg)
embedding_matrix_ft_cbow = _matrix_for(ft_model_cbow)
embedding_matrix_w2v_sg = _matrix_for(w2v_model_sg)
embedding_matrix_w2v_cbow = _matrix_for(w2v_model_cbow)


Prepare training and testing data : 

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _encode_and_pad(token_lists):
    """Map tokens to tokenizer indices, then pad the sequences to max_len_data."""
    sequences = tokenizer.texts_to_sequences(token_lists)
    return pad_sequences(sequences, maxlen = max_len_data)


# Integer-encoded, fixed-length versions of the train and test tweets.
padding_X_train = _encode_and_pad(X_train)
padding_X_test = _encode_and_pad(X_test)


# CNN model : 

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

In [64]:
def model(embedding_layer):
    """Build and compile a 1-D CNN binary classifier on top of ``embedding_layer``.

    The network stacks four Conv1D + MaxPooling1D stages, a Dense(10) layer,
    a Flatten, and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss, the Adam optimizer and accuracy as metric.

    Returns the compiled ``Sequential`` model (not yet trained).
    """
    cnn = Sequential([
        embedding_layer,
        Conv1D(100, 5, activation="relu"),
        MaxPooling1D(2, 2),
        Conv1D(50, 2, activation="relu"),
        MaxPooling1D(2, 2),
        Conv1D(30, 2, activation="relu"),
        MaxPooling1D(2, 2),
        Conv1D(50, 2, activation="relu"),
        MaxPooling1D(2, 2),
        # NOTE(review): this Dense layer sits before Flatten, as in the
        # original architecture -- confirm that ordering is intended.
        Dense(10, activation="relu"),
        Flatten(),
        Dense(1, activation="sigmoid"),
    ])
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return cnn

In [65]:
def evaluate_model(y_test, predictions, file, info):
    """Write an evaluation report for one model to an open text file.

    The report starts with a header containing ``info``, followed by the
    confusion matrix, classification report, accuracy, recall and F1 score of
    ``predictions`` against ``y_test``. Each metric is computed immediately
    before its section is written. Nothing is returned.
    """
    file.write("\n Le model : " + str(info) + "\n")

    # (section heading, deferred metric computation), in output order
    sections = (
        ("Confusion Matrix : \n", lambda: confusion_matrix(y_test, predictions)),
        ("Classification Report : \n", lambda: classification_report(y_test, predictions)),
        ("Accuracy score : \n", lambda: accuracy_score(y_test, predictions)),
        ("Recall Score : \n", lambda: recall_score(y_test, predictions)),
        ("F1-score : \n", lambda: f1_score(y_test, predictions, zero_division=1)),
    )
    for heading, compute in sections:
        file.write(heading + str(compute()) + "\n")

Create embedding layers : 

In [66]:
def _embedding_layer_from(weight_matrix):
    """Keras Embedding layer initialised with the given pretrained weight matrix."""
    return Embedding(total_words, embedding_dim, weights=[weight_matrix], input_length=max_len_data)


# One Embedding layer per pretrained matrix, created in the same order as before.
embedding_layer_ft_sg = _embedding_layer_from(embedding_matrix_ft_sg)
embedding_layer_ft_cbow = _embedding_layer_from(embedding_matrix_ft_cbow)
embedding_layer_w2v_sg = _embedding_layer_from(embedding_matrix_w2v_sg)
embedding_layer_w2v_cbow = _embedding_layer_from(embedding_matrix_w2v_cbow)


Create CNN models :

In [67]:
# Data :
# Build one compiled CNN per embedding layer by calling the model(...) builder;
# each call returns a separate Sequential instance.
model_ft_sg    = model(embedding_layer_ft_sg)
model_ft_cbow  = model(embedding_layer_ft_cbow)
model_w2v_sg   = model(embedding_layer_w2v_sg)
model_w2v_cbow = model(embedding_layer_w2v_cbow)


In [18]:
def predict(cnn_model,x_test,y_test):
    """Return rounded predictions of ``cnn_model`` for ``x_test``.

    ``cnn_model.predict(x_test)`` is expected to yield one row per sample; the
    first value of each row is rounded with the built-in ``round`` and the
    results are returned as a list. ``y_test`` is accepted but not used.
    """
    return [round(row[0]) for row in cnn_model.predict(x_test)]

### Train CNN :

In [69]:
# From here on X_train / X_test refer to the padded integer sequences rather
# than the raw token lists produced by the split. y_train / y_test are used
# as they are (the former `y_train = y_train` / `y_test = y_test` lines were
# no-ops and have been dropped).
X_train = padding_X_train
X_test = padding_X_test

# Models and their display names, in matching order: the two lists are meant
# to be iterated together.
models = [model_ft_sg,model_ft_cbow,model_w2v_sg,model_w2v_cbow]
models_names = ["FastText_SG","FastText_CBOW","Word2vec_SG","Word2vec_CBOW"]

In [70]:
# Open file to save Results :
# Opened in 'w' mode, so an existing report at this path is overwritten.
# The handle stays open across cells; it is closed by the later results.close() cell.
results = open("../CODE/Results/Result_3CNN100.txt",'w')

In [71]:
import os.path

In [72]:
# Labels are converted to an array once; every model trains on the same data.
y_train_array = np.array(y_train)

# Train, save and evaluate each CNN in turn.
# The loop variable is deliberately NOT called `model`: that name is the
# builder function defined earlier, and rebinding it here would make the
# model-creation cell fail when re-run.
for cnn_model, name in zip(models, models_names) :
    info = "Train the CNN using : "+name
    print(len(X_train),len(y_train),len(X_test),len(y_test))
    cnn_model.fit(X_train, y_train_array, epochs = epoch, verbose=1)
    cnn_model.save("../Models/CNN_model_"+name)
    evaluate_model(y_test,predict(cnn_model,X_test,y_test), results ,info )
    # Push this model's report to disk now, so a failure while training a
    # later model does not lose the results already produced.
    results.flush()
    print("- finish -------------------------------------------------------------------- "+info)


37398 37398 12466 12466
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/1

Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
INFO:tensorflow:Assets written to: ../Models/CNN_model_FastText_CBOW\assets
- finish -------------------------------------------------------------------- Train the CNN using : FastText_CBOW
37398 37398 12466 12466
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
E

Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100


Epoch 100/100
INFO:tensorflow:Assets written to: ../Models/CNN_model_Word2vec_SG\assets
- finish -------------------------------------------------------------------- Train the CNN using : Word2vec_SG
37398 37398 12466 12466
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch

Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
INFO:tensorflow:Assets written to: ../Models/CNN_model_Word2vec_CBOW\assets
- finish -------------------------------------------------------------------- Train the CNN using : Word2vec_CBOW


In [73]:
# Close the results file opened before training, flushing any buffered report text to disk.
results.close()


In [30]:
from tensorflow.keras.models import load_model
# Reload one of the saved CNNs for a quick manual check.
# The checkpoint loaded here is the Word2vec CBOW model, so the variable is
# named accordingly.
cnn_model_w2v_cbow = load_model('../Models/CNN_model_Word2vec_CBOW')

# Backward-compatible alias: the original cell bound this (misleading) name
# to the same Word2vec CBOW model. NOTE(review): if the FastText SG model was
# actually intended, change the path above instead.
CNN_model_FastText_SG = cnn_model_w2v_cbow


# Sample inputs to try (uncomment one alternative to test it instead).
#sample_text = ["ياخي حالة ياخي"]
sample_text = [" قلب "]
#sample_text = ["مليحة "]

# Encode and pad the sample exactly like the training data.
text = pad_sequences(tokenizer.texts_to_sequences(sample_text),maxlen = max_len_data)

prediction = cnn_model_w2v_cbow.predict(text)



In [31]:
prediction

array([[0.37465876]], dtype=float32)

In [27]:
print(prediction)

[[0.37465876]]
