# **IMPORTING THE LIBRARIES**

In [15]:
import numpy as np
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import urllib.request
import zipfile
import os
from keras.models import Sequential
from keras.layers import Embedding,Bidirectional,LSTM,GRU,Dense
import warnings
import tensorflow as tf

# **DATA DISPLAY: TRAIN, TEST AND VALIDATION**

**1) DATA TRAIN**

In [16]:
# Preview the raw training file. Only one column name is supplied and pandas'
# default comma separator is used, so each "sentence;emotion" line is kept
# whole inside the "Text" column (visible in the output below).
column_data = ["Text"]
dff = pd.read_csv(
    "/content/sample_data/train.txt",
    names=column_data,
)
dff.head()

Unnamed: 0,Text
0,i didnt feel humiliated;sadness
1,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wro...
3,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy;anger


**2) DATA TEST**

In [17]:
# Preview the raw test file. As with the other previews, the ";emotion" suffix
# stays inside the single "Text" column because the separator is left at the
# pandas default. Note that `dff` is rebound here to the test data.
column_data = ["Text"]
dff = pd.read_csv(
    "/content/sample_data/test.txt",
    names=column_data,
)
dff.head()

Unnamed: 0,Text
0,im feeling rather rotten so im not very ambiti...
1,im updating my blog because i feel shitty;sadness
2,i never make her separate from me because i do...
3,i left with my bouquet of red and yellow tulip...
4,i was feeling a little vain when i did this on...


**3) DATA VALIDATION**

In [18]:
# Preview the raw validation file. `dff` is rebound once more, so after this
# cell it holds the validation lines (still unsplit "sentence;emotion" text).
column_data = ["Text"]
dff = pd.read_csv(
    "/content/sample_data/val.txt",
    names=column_data,
)
dff.head()

Unnamed: 0,Text
0,im feeling quite sad and sorry for myself but ...
1,i feel like i am still looking at a blank canv...
2,i feel like a faithful servant;love
3,i am just feeling cranky and blue;anger
4,i can have for a treat or if i am feeling fest...


**Now that the packages and data are imported, we need to extract the sentences and their respective emotions and insert them into data frames: one for training and one for testing (the validation sentences are added to the test data frame).**

In [19]:
def _read_labeled_lines(path):
    """Read a ``sentence;emotion`` file and return ``(sentences, emotions)``.

    Each line is split on its LAST ``;`` so a sentence that itself contains a
    semicolon is kept intact. The emotion is stripped of surrounding
    whitespace (including the newline); the sentence is stored as-is.
    Lines without any ``;`` (for example blank lines) are skipped instead of
    raising ``IndexError``.
    """
    sentences, emotions = [], []
    # The context manager guarantees the handle is closed, even on error.
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            sentence, separator, emotion = raw_line.rpartition(';')
            if not separator:
                continue
            sentences.append(sentence)
            emotions.append(emotion.strip())
    return sentences, emotions


x_train, y_train = _read_labeled_lines('/content/sample_data/train.txt')

# The held-out set is the test file followed by the validation file: both are
# accumulated into x_test / y_test, so no separate validation frame exists.
x_test, y_test = [], []
for held_out_path in ('/content/sample_data/test.txt',
                      '/content/sample_data/val.txt'):
    held_out_sentences, held_out_emotions = _read_labeled_lines(held_out_path)
    x_test.extend(held_out_sentences)
    y_test.extend(held_out_emotions)

data_train = pd.DataFrame({'Text': x_train, 'Emotion': y_train})
data_test = pd.DataFrame({'Text': x_test, 'Emotion': y_test})
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
data = pd.concat([data_train, data_test], ignore_index=True)

**ADDITION OF LIBRARIES** 

In [20]:
# NLTK supplies word_tokenize, which clean_text (defined in a later cell) calls.
import nltk
from nltk.tokenize import word_tokenize
# `warnings` and `tensorflow` were already imported in the first cell; importing
# them again here has no additional effect.
import warnings
import tensorflow as tf
# Download the 'punkt' package (the recorded output shows it being stored under
# /root/nltk_data); presumably this is the data word_tokenize relies on.
nltk.download('punkt')
# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**CLEANING OF NOISE: HASHTAGS AND @MENTIONS ARE REMOVED AND EACH SENTENCE IS SPLIT INTO WORD TOKENS (PREPOSITIONS, ARTICLES AND PUNCTUATION MARKS ARE NOT REMOVED BY THIS STEP)**

In [21]:
def clean_text(data):
  """Remove hashtags and @mentions from ``data`` and return its word tokens.

  Args:
    data: the sentence to clean, as a string.

  Returns:
    Whatever ``word_tokenize`` returns for the stripped sentence (the callers
    in this notebook join the result with spaces).
  """
  stripped = data
  # Drop "#tag" first, then "@user"; both patterns accept word characters,
  # digits and dots after the marker.
  for pattern in (r"(#[\d\w\.]+)", r"(@[\d\w\.]+)"):
    stripped = re.sub(pattern, '', stripped)
  return word_tokenize(stripped)
# Clean every sentence and re-join its tokens with single spaces.
# `texts` covers the combined frame, while the other two cover each split.
texts = [' '.join(tokens) for tokens in map(clean_text, data.Text)]
texts_train = [' '.join(tokens) for tokens in map(clean_text, x_train)]
texts_test = [' '.join(tokens) for tokens in map(clean_text, x_test)]

# **TOKENIZATION**

 **It tokenizes each sentence, extracts each unique word and creates a dictionary where each unique word is assigned an index.**

In [22]:
# Fit the vocabulary on `texts`, i.e. on the cleaned sentences of the combined
# train + test frame, so words that only occur in the test split also get an index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# word -> integer index mapping learned above; vocab_size is one larger than
# the number of words (presumably to leave index 0 free for padding - confirm).
index_of_words = tokenizer.word_index
vocab_size = 1 + len(index_of_words)

# Turn each split into lists of word indices.
sequence_train, sequence_test = (
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (texts_train, texts_test)
)

**ASSIGNING A CATEGORICAL VALUE FROM 0 TO 5 TO THE 6 EMOTIONS FOUND IN OUR DATA, NAMELY: anger, sadness, fear, joy, surprise, and love.**

In [23]:
from tensorflow.keras.utils import to_categorical

In [24]:
# Class labels; a label's position in this list is its integer id.
class_names=['anger','sadness','fear','joy','surprise','love']
num_classes=len(class_names)
# Width of each word vector in the embedding matrix.
embed_num_dims=300
# Every sequence is padded/truncated to this many tokens.
max_seq_len=500

X_train_pad=pad_sequences(sequence_train,maxlen=max_seq_len)
X_test_pad=pad_sequences(sequence_test,maxlen=max_seq_len)

# Derive the label -> id mapping from class_names so the two can never drift
# apart (yields anger:0, sadness:1, fear:2, joy:3, surprise:4, love:5).
encoding={name:idx for idx,name in enumerate(class_names)}

# Pass num_classes explicitly: without it the one-hot width is inferred from
# the largest id present, so a split lacking the last class would produce
# fewer columns than the model's output layer expects.
y_train=to_categorical([encoding[x] for x in data_train.Emotion],num_classes=num_classes)
y_test=to_categorical([encoding[x] for x in data_test.Emotion],num_classes=num_classes)

**USING THE WORD VECTORS TRAINED ON WIKIPEDIA TO TRAIN OUR MODEL MORE EFFICIENTLY AND MORE THOROUGHLY, THUS ACHIEVING BETTER ACCURACY.**

In [25]:
def create_embedding_matrix(filepath,word_index,embedding_dim):
  """Build an embedding matrix from a text file of word vectors.

  Each usable line of the file has the form ``word v1 v2 ... vN`` separated by
  whitespace. Row ``word_index[word]`` of the result receives the first
  ``embedding_dim`` values of that word's vector; rows for words that are not
  in the file stay all-zero.

  Lines carrying fewer than ``embedding_dim`` values are skipped. This covers
  blank lines, truncated vectors and, whenever ``embedding_dim`` is above 1,
  a leading ``<count> <dim>`` header line, which must not be read as a word
  whose single value would be broadcast across a whole row.

  Args:
    filepath: path of the vectors file, read as UTF-8.
    word_index: mapping from word to its integer row index.
    embedding_dim: number of vector components to keep per word.

  Returns:
    A ``(len(word_index) + 1, embedding_dim)`` NumPy array.
  """
  vocab_size=len(word_index)+1
  embedding_matrix=np.zeros((vocab_size,embedding_dim))
  # Explicit encoding so the result does not depend on the platform default.
  with open(filepath,encoding='utf-8') as f:
    for line in f:
      parts=line.split()
      if len(parts)<embedding_dim+1:
        continue
      idx=word_index.get(parts[0])
      if idx is None:
        continue
      embedding_matrix[idx]=np.asarray(parts[1:embedding_dim+1],dtype=np.float32)
  return embedding_matrix
# Location of the 300-dimensional word-vector file used to fill the embedding matrix.
fname = '/content/sample_data/wiki-news-300d-1M.vec'
embedd_matrix = create_embedding_matrix(
    filepath=fname,
    word_index=index_of_words,
    embedding_dim=embed_num_dims,
)

**Now, we create an architecture that will be used to train the model. For this purpose we first create an Embedding layer for which the weights are obtained from the word vectors file.**

We also add a Bidirectional GRU layer with 128 units, using a dropout and a recurrent dropout of 0.2. Finally, a Dense layer with ‘softmax’ activation is added. The Adam optimizer is used, and the loss is calculated using ‘categorical_crossentropy’.
‘model.summary()’ can be used to see the layer types, output shapes and number of parameters in the model.

In [26]:
gru_output_size = 128
# NOTE: this flag is assigned but not read anywhere in this cell.
bidirectional = True

# Embedding layer initialised from the pretrained matrix and frozen
# (trainable=False), so the word vectors are not updated during training.
embedd_layer = Embedding(
    vocab_size,
    embed_num_dims,
    input_length=max_seq_len,
    weights=[embedd_matrix],
    trainable=False,
)

# Embedding -> bidirectional GRU -> softmax over the emotion classes.
model = Sequential([
    embedd_layer,
    Bidirectional(
        GRU(units=gru_output_size, dropout=0.2, recurrent_dropout=0.2)
    ),
    Dense(num_classes, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [27]:
batch_size = 128
epochs = 8

# Train on the padded training sequences. The padded test split and its
# one-hot labels are passed as validation data.
hist = model.fit(
    X_train_pad,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


# **TESTING THE MODEL WITH SOME SENTENCES**

In [29]:
message=['I am good.']
# Run the same clean_text + join step that the training sentences went through,
# so the tokenizer sees inference text in the form it was fitted on.
cleaned_message=[' '.join(clean_text(text)) for text in message]
seq=tokenizer.texts_to_sequences(cleaned_message)
padded=pad_sequences(seq,maxlen=max_seq_len)
pred=model.predict(padded)
print('Message:'+str(message))
# argmax along axis 1 picks one class per message; a plain np.argmax(pred)
# flattens the whole batch and is only correct when there is a single message.
for class_idx in np.argmax(pred,axis=1):
  print('Emotion:',class_names[class_idx])

Message:['I am good.']
Emotion: joy
