# 1. Libraries


**Import some libraries and the Corpus (Mental Health dataset) stored in the drive**

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style(style = 'whitegrid')
%matplotlib inline
from google.colab import drive
drive.mount('/content/drive')

**Analysis of the Mental Health Dataset**

In [None]:
# Read the semicolon-separated corpus without treating any row as a header,
# so the heading line arrives as the first data row.
dataset = pd.read_csv(
    "/content/drive/MyDrive/DatasetMH_Emotions.csv",
    sep=";",
    header=None,
)

# Discard that heading row; the remaining rows keep their original index labels.
dataset = dataset.drop(index=dataset.index[:1])
dataset.head(3)


**Columns are named to organise Instagram messages with their corresponding emotion (class or category)**


In [None]:
# Name the seven columns. "Id" is the column that later cells pre-process and
# feed to the model as message text; the three "nada" columns are not
# referenced anywhere else in this notebook.
# Spanish -> English: Emoticos = Emoticons, Polaridad = Polarity,
# Emocion = Emotion, nada = nothing (unused).
dataset.columns = ["Id", "Emoticos", "Polaridad", "Emocion"] + ["nada"] * 3


In [None]:
# Check that the heading row has been removed and the new column names are in place
dataset.head(10)


**Graph to show the distribution of messages in the dataset according to emotion**

In [None]:
# Bar chart of how many messages carry each emotion label.
# The x axis shows the 'Emocion' column, so the axis label and title refer to
# emotion (not polarity).
#
# Spanish -> English labels:
#   Amor/Admiración                    = Love/admiration
#   Gratitud                           = Gratitude
#   Comprensión/Empatía/Identificación = Comprehension/empathy/identification
#   Tristeza/Pena                      = Sadness
#   Enfado/Desprecio/Burla             = Anger/contempt/mockery
#   Indeterminado                      = Neutral
plt.figure(figsize=(14, 10))
sns.countplot(x = 'Emocion', data = dataset, palette = 'rocket',
              order=['Amor/Admiración', 'Gratitud', 'Comprensión/Empatía/Identificación', 'Tristeza/Pena','Enfado/Desprecio/Burla','Indeterminado']);
plt.xlabel('Emotion')
plt.ylabel('Number of messages')
plt.title('Emotion distribution of messages')
plt.show()

**6 datasets are created, one for each class to print the number of messages of each class, in this case of each emotion**

In [None]:
# Spanish -> English labels:
#   Amor/Admiración                    = Love/admiration
#   Gratitud                           = Gratitude
#   Tristeza/Pena                      = Sadness
#   Enfado/Desprecio/Burla             = Anger/contempt/mockery
#   Comprensión/Empatía/Identificación = Comprehension/empathy/identification
#   Indeterminado                      = Neutral

# One frame per emotion, selected by exact match on the 'Emocion' column.
dataset_Love = dataset.loc[dataset['Emocion'].eq('Amor/Admiración')]
dataset_Gratitude = dataset.loc[dataset['Emocion'].eq('Gratitud')]
dataset_Empathy = dataset.loc[dataset['Emocion'].eq('Comprensión/Empatía/Identificación')]
dataset_Sadness = dataset.loc[dataset['Emocion'].eq('Tristeza/Pena')]
dataset_Anger = dataset.loc[dataset['Emocion'].eq('Enfado/Desprecio/Burla')]
dataset_Neutral = dataset.loc[dataset['Emocion'].eq('Indeterminado')]

# Print a header followed by one "label count" pair per emotion.
class_sizes = (
    ("\nLove   ", dataset_Love),
    ("\nGratitud   ", dataset_Gratitude),
    ("\nComprenhesion   ", dataset_Empathy),
    ("\nSadness   ", dataset_Sadness),
    ("\nAnger   ", dataset_Anger),
    ("\nNeutral   ", dataset_Neutral),
)
report_fields = ["Number of messages with the following emotion:\n"]
for label, subset in class_sizes:
    report_fields += [label, len(subset)]
print(*report_fields)


**Dataset of Mental Health**

In [None]:
# Rebuild the working corpus by stacking the six per-emotion frames in a fixed
# order. Rows whose 'Emocion' is none of the six labels are therefore left out.
emotion_subsets = [
    dataset_Love,
    dataset_Gratitude,
    dataset_Empathy,
    dataset_Sadness,
    dataset_Anger,
    dataset_Neutral,
]
dataset = pd.concat(emotion_subsets, axis=0)
dataset

**Import libraries**

In [None]:
import re
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# 3. Data pre-processing

**Pre-processing and tokenisation function of dataset messages**

In [None]:
import re

from nltk import TweetTokenizer
import spacy
from nltk.stem import SnowballStemmer

# The 'processing' function removes stopwords and some characters peculiar to social networks
def processing(text):
  """Clean, normalise, stem and tokenise one message; return it as one string.

  Steps, in order: cast to ``str``; blank out @mentions, "RT" markers, '#'
  signs and http(s) links; lower-case; delete digits and newlines; replace
  accented vowels with plain ones; collapse runs of three or more identical
  characters to two; blank out laughter ("jaja...", "juas", "lol"); expand a
  few chat abbreviations; stem with the Spanish Snowball stemmer; tokenise
  with the regular expression defined below; strip punctuation from every
  token; drop tokens found in the stop-word set.

  Args:
    text: The raw message. Any value is accepted; it is converted with str().

  Returns:
    The surviving tokens joined by single spaces.
  """
  # Spanish stopwords (function words: articles, pronouns, forms of
  # "estar"/"haber"/"ser"/"tener", ...). Prepositions are deliberately absent.
  stopWords_without_prepositions = {'al', 'algo', 'algunas', 'algunos', 'antes', 'como', 'cual', 'cuando', 'del', 'donde', 'durante', 'e', 'el', 'ella', 'ellas', 'ellos', 'era', 'erais', 'eran', 'eras', 'eres', 'es', 'esa', 'esas', 'ese', 'eso', 'esos', 'esta', 'estaba', 'estabais', 'estaban', 'estabas',
                               'estad', 'estada', 'estadas', 'estado', 'estados', 'estamos', 'estando', 'estar', 'estaremos', 'estará', 'estarán', 'estarás', 'estaré', 'estaréis', 'estaría', 'estaríais', 'estaríamos', 'estarían', 'estarías', 'estas', 'este', 'estemos', 'esto', 'estos', 'estoy', 'estuve',
                               'estuviera', 'estuvierais', 'estuvieran', 'estuvieras', 'estuvieron', 'estuviese', 'estuvieseis', 'estuviesen', 'estuvieses', 'estuvimos', 'estuviste', 'estuvisteis', 'estuviéramos', 'estuviésemos', 'estuvo', 'está', 'estábamos', 'estáis', 'están', 'estás', 'esté', 'estéis',
                               'estén', 'estés', 'fue', 'fuera', 'fuerais', 'fueran', 'fueras', 'fueron', 'fuese', 'fueseis', 'fuesen', 'fueses', 'fui', 'fuimos', 'fuiste', 'fuisteis', 'fuéramos', 'fuésemos', 'ha', 'habida', 'habidas', 'habido', 'habidos', 'habiendo', 'habremos', 'habrá', 'habrán', 'habrás',
                               'habré', 'habréis', 'habría', 'habríais', 'habríamos', 'habrían', 'habrías', 'habéis', 'había', 'habíais', 'habíamos', 'habían', 'habías', 'han', 'has', 'hay', 'haya', 'hayamos', 'hayan', 'hayas', 'hayáis', 'he', 'hemos', 'hube', 'hubiera', 'hubierais', 'hubieran', 'hubieras',
                               'hubieron', 'hubiese', 'hubieseis', 'hubiesen', 'hubieses', 'hubimos', 'hubiste', 'hubisteis', 'hubiéramos', 'hubiésemos', 'hubo', 'la', 'las', 'le', 'les', 'lo', 'los', 'me', 'mi', 'mis', 'mucho', 'muchos', 'muy', 'más', 'mí', 'mía', 'mías', 'mío', 'míos', 'nada', 'nos',
                               'nosotras', 'nosotros', 'nuestra', 'nuestras', 'nuestro', 'nuestros', 'o', 'os', 'otra', 'otras', 'otro', 'otros', 'pero', 'poco', 'porque', 'que', 'quien', 'quienes', 'qué', 'se', 'sea', 'seamos', 'sean', 'seas', 'sentid', 'sentida', 'sentidas', 'sentido', 'sentidos', 'seremos',
                               'será', 'serán', 'serás', 'seré', 'seréis', 'sería', 'seríais', 'seríamos', 'serían', 'serías', 'seáis', 'siente', 'sintiendo', 'sois', 'somos', 'son', 'soy', 'su', 'sus', 'suya', 'suyas', 'suyo', 'suyos', 'sí', 'también', 'tanto', 'te', 'tendremos', 'tendrá', 'tendrán', 'tendrás',
                               'tendré', 'tendréis', 'tendría', 'tendríais', 'tendríamos', 'tendrían', 'tendrías', 'tened', 'tenemos', 'tenga', 'tengamos', 'tengan', 'tengas', 'tengo', 'tengáis', 'tenida', 'tenidas', 'tenido', 'tenidos', 'teniendo', 'tenéis', 'tenía', 'teníais', 'teníamos', 'tenían', 'tenías',
                               'ti', 'tiene', 'tienen', 'tienes', 'todo', 'todos', 'tu', 'tus', 'tuve', 'tuviera', 'tuvierais', 'tuvieran', 'tuvieras', 'tuvieron', 'tuviese', 'tuvieseis', 'tuviesen', 'tuvieses', 'tuvimos', 'tuviste', 'tuvisteis', 'tuviéramos', 'tuviésemos', 'tuvo', 'tuya', 'tuyas', 'tuyo', 'tuyos',
                               'tú', 'un', 'una', 'uno', 'unos', 'vosotras', 'vosotros', 'vuestra', 'vuestras', 'vuestro', 'vuestros', 'y', 'ya', 'yo', 'él', 'éramos'}
  # NOTE(review): the filter using this set runs after diacritics are stripped
  # and after stemming (see the end of the function), so accented entries and
  # words altered by the stemmer may no longer match. Confirm this is intended.

  # Accented vowel -> plain vowel.
  DIACRITICAL_VOWELS = [('á','a'), ('é','e'), ('í','i'), ('ó','o'), ('ú','u'), ('ü','u')]

  # Chat abbreviation (regex, matched between word boundaries) -> expansion.
  # English gloss of the expansions: de = of, que = that, pero = but,
  # para = for, porque = because, es que = it is that, favor = favour,
  # por favor = please, donde = where, también = also, te quiero = I love you,
  # te quiero mucho = I love you very much, por = by/for, mas = more.
  SLANG = [('d','de'), ('[qk]','que'), ('xo','pero'), ('xa', 'para'), ('[xp]q','porque'),('es[qk]', 'es que'),
           ('fvr','favor'),('(xfa|xf|pf|plis|pls|porfa)', 'por favor'), ('dnd','donde'), ('tb', 'también'),
           ('(tq|tk)', 'te quiero'), ('(tqm|tkm)', 'te quiero mucho'), ('x','por'), (r'\+','mas')]

  # Delete mentions  @, # , links...
  text = str(text)
  text = re.sub(r'@[A-Za-z0-9]+', ' ', text)
  text = re.sub(r'RT[|\s]', ' ', text)
  text = re.sub(r'#', ' ', text)
  text = re.sub(r'https?:\/\/\S+', ' ', text)

  # Switches for the two alternative normalisations; only stemming is enabled.
  stemming = True

  lemmatization = False

  #Stemming in spanish
  _stemmer = SnowballStemmer('spanish')

  #lemmatisation in Spanish
  #nlp = spacy.load('es_core_news_sm')

  # Message tokeniser (we use this one from Twitter)
  _tokenizer = TweetTokenizer().tokenize


  # Convert to lower case
  text = text.lower()


  # Delete numbers and carriage returns
  text = re.sub(r'(\d+|\n)', '', text)

  # Replace vowels carrying diacritical marks with the plain vowel
  for s,t in DIACRITICAL_VOWELS:
    text = re.sub(r'{0}'.format(s), t, text)

  # Collapse three or more repetitions of a character down to two
  text = re.sub(r'(.)\1{2,}', r'\1\1', text)

  # Blank out laughter: words of 4+ characters made only of vowels and 'j'
  # that contain at least one 'j', plus "juas"/"lol"
  text = re.sub(r'\b(?=\w*[j])[aeiouj]{4,}\b', ' ', text)
  text = re.sub(r'\b(juas+|lol)\b', ' ', text)

  # translate slang
  for s,t in SLANG:
    text = re.sub(r'\b{0}\b'.format(s), t, text)


  # Verbose tokenisation pattern handed to nltk.regexp_tokenize below.
  # NOTE(review): the bracketed emoticon alternatives such as [:d] or [xd] are
  # character classes, so each matches one character, not the multi-character
  # emoticon; confirm whether that is intended. The pattern is kept unchanged.
  pattern = r'''(?x)                  # Set flag to allow verbose regexps
              (?:[A-Z]\.)+            # Abbreviations, e.g. U.S.A 
              | \w+(?:-\w+)*          # Words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%?    # Currency and precentages, e.g. $12.40 82% 
              | \.\.\.                # Ellipsis 
              | [][.,;"'?():-_`]      # These are separate tokens; includes 
              | [😀\😁\😂\🤣\😃\😄\😅\😆\😉\😊\😋\😎\😍\😘\😗\😙\😚\☺\🙂\🤗\🤩\😌\😛\😜\😝\🤤\🤑\😇\🤭\😺\😸\😹\😻\😽\💪\✌\🖐\✋\👌\👍\👋\👏\🙌\🙏\💋\💘\❤\💓\💔\💕\💖\💗\💙\💚\💛\🧡\💜\🖤\💝\💞\💟\❣\💌\🍺\🍻\🎉\🎊\🙋\🕺\💃] # \:d\:)\:-)\:-d\;d\;-)\=d\;)\:]\:-]\=)\=]\(:\xd\:p\:-p\8)\xp\<3
              | [:d]
              | [:)]
              #| [:-)]
              | [:-d]
              | [;d]
              #| [;-)]
              | [=d]
              | [;)]
              | [:]]
              | [:-]]
              | [=)]
              | [=]]
              | [(:]
              | [xd]
              | [:p]
              | [:-p]
              | [8)]
              | [xp]
              | [<3]
              | [🤔\🤨\😐\😑\😶\🙄\😏\😮\🤐\😯\😒\😕\🙃\😲\😼\🤷] # \:-|\:|]
              | [:-|]
              | [:|]
              | [😣\😥\😪\😫\😓\😔\☹\🙁\😖\😞\😟\😤\😢\😭\😦\😧\😨\😩\🤯\😬\😰\😱\😳\😵\😡\😠\🤬\😷\🤒\🤕\🤢\🤮\🤧\💩\🙀\😿\😾\🖕\👎\⛔\🚫\🤦] # \:-(\:(\:-<\:<\:-[\:[\>:-[\>:[\:-{\:{\:-@\:@\>:-(\>:(\:-(\:(\d:\:\\:/\:-/\:-\\dx\d8
              #| [:-(]
              #| [:(]
              | [:-<]
              | [:<]
              | [:-[]
              | [:[]
              | [>:-[]
              | [>:[]
              | [:-{]
              | [:{]
              | [:-@]
              | [:@]
              #| [>:-(]
              | [>:(]
              | [:'-(]
              | [:'(]
              | [d:]
              | [:\]
              | [:/]
              #| [:-/]
              | [:-\]
              | [dx]
              | [d8]
              '''

  if stemming:
    text = ' '.join(_stemmer.stem(w) for w in _tokenizer(text))

  if lemmatization:
    # NOTE(review): this branch is disabled. It needs the spaCy model `nlp`
    # whose loading is commented out above, and it appends the lemmas to the
    # existing text instead of replacing it. Revisit before enabling.
    text_aux=nlp(text)
    for word in text_aux:
      text+=str(word.lemma_)+" "


  words = nltk.regexp_tokenize(text, pattern)
  re_punc = re.compile('[%s]' % re.escape(string.punctuation))

  # Remove punctuation marks from every token
  stripped = [re_punc.sub('', w) for w in words]

  # Remove stopwords
  text = [w for w in stripped if  w.lower() not in stopWords_without_prepositions]


  return (" ".join(text))

**The `processing` function is applied to each Instagram comment in the corpus. We process the data and apply stemming**

In [None]:
# Run every message (held in the "Id" column) through processing(), then
# lower-case the result, and store it back in the same column.
dataset['Id'] = dataset['Id'].apply(processing).str.lower()
dataset["Id"]


**Create the word cloud**

In [None]:
#wordCloud
from wordcloud import WordCloud

#Label Emocion is Emotion

#Label Amor/Admiración is Love/admiration
#Label Gratitud is Gratitude
#Label Tristeza/Pena is Sadness
#Label Enfado/Desprecio/Burla is Anger/contempt/mockery
#Label Comprensión/Empatía/Identificación is Comprehension/empathy/identification
#Label Indeterminado is Neutral

# Per-emotion views of the processed corpus.
# NOTE(review): the suffixes do not describe the emotions they hold (for
# example dataset_J is 'Tristeza/Pena' = sadness and dataset_SAD is
# 'Enfado/Desprecio/Burla' = anger). The names are kept unchanged in case
# other cells refer to them.
dataset_A = dataset[dataset['Emocion'] == 'Amor/Admiración']
dataset_D = dataset[dataset['Emocion'] == 'Gratitud']
dataset_F = dataset[dataset['Emocion'] == 'Comprensión/Empatía/Identificación']
dataset_J = dataset[dataset['Emocion'] == 'Tristeza/Pena']
dataset_SAD = dataset[dataset['Emocion'] == 'Enfado/Desprecio/Burla']
dataset_OTHERS = dataset[dataset['Emocion'] == 'Indeterminado']

# Text for the cloud: the first 70 processed 'Tristeza/Pena' messages joined
# with spaces. Joining the messages (instead of calling str() on the array)
# keeps the array repr's brackets, quotes and line breaks out of the text.
tweets = " ".join(dataset_J["Id"].head(70).astype(str))
print(len(tweets))  # number of characters fed to the word cloud
stop_words_sp = set(stopwords.words('spanish'))

wordcloudimage = WordCloud(
                          max_words=50,
                          max_font_size=500,
                          font_step=2,
                          stopwords=stop_words_sp,
                          background_color='white',
                          width=1000,
                          height=720
                          ).generate(tweets)

plt.figure(figsize=(15,7))
plt.axis("off")
plt.imshow(wordcloudimage)
plt.show()

**Vocabulary creation with 6 classes (5 emotions + Neutral class)**

In [None]:
#Label Emocion is Emotion

#Label Amor/Admiración is Love/admiration
#Label Gratitud is Gratitude
#Label Tristeza/Pena is Sadness
#Label Enfado/Desprecio/Burla is Anger/contempt/mockery
#Label Comprensión/Empatía/Identificación is Comprehension/empathy/identification
#Label Indeterminado is Neutral

# vocab[word] is a list of six counters: vocab[word][k] is the number of times
# `word` occurs in messages whose emotion is etiquetas[k].
vocab={}
etiquetas=['Amor/Admiración','Gratitud', 'Comprensión/Empatía/Identificación', 'Tristeza/Pena', 'Enfado/Desprecio/Burla','Indeterminado']
numero_clases=len(etiquetas)
_tokenizer = TweetTokenizer().tokenize

# Emotion label -> counter slot. A lookup table replaces the if/elif chain,
# whose 'Gratitud' branch assigned to a misspelled variable ("posicion") and
# therefore counted those messages under whatever class came before.
position_by_emotion = {label: slot for slot, label in enumerate(etiquetas)}
# Any label outside the list falls into the last slot ('Indeterminado'),
# matching the `else` of the original chain.
fallback_position = numero_clases - 1

for emotion, message in zip(dataset["Emocion"], dataset["Id"]):
  position = position_by_emotion.get(emotion, fallback_position)
  for word in _tokenizer(message):
    if word not in vocab:
      vocab[word] = [0] * numero_clases  # one integer counter per class
    vocab[word][position] += 1

print(vocab)

**Entropy and information gain formulas**

In [None]:
def entropy(probs, adjust=1e-15):
  """Return sum(p * log2(p)) over the strictly positive entries of `probs`.

  The result is the NEGATIVE of the Shannon entropy in bits (there is no
  leading minus sign). The sign is kept as is because IG() and the ranking
  built from it rely on this convention.

  Args:
    probs: Iterable of probabilities. Entries <= 0 are skipped.
    adjust: Small constant added to each probability before the product and
      the logarithm.

  Returns:
    The accumulated sum (0 when no entry is positive).
  """
  # `np.math` was only an alias of the stdlib module and is gone from recent
  # NumPy releases, so use the stdlib module directly (same numeric result).
  import math

  total=0
  for prob in probs:
    if(prob>0):
      total+= (prob + adjust) * math.log(prob+adjust,2)

  return total


def IG(corpus_probs, word_weigths, word_probs):
  """Return entropy(corpus_probs) minus the weighted sum of entropy(word_probs[i]).

  Args:
    corpus_probs: Class probabilities over the whole corpus.
    word_weigths: One weight per case (the caller passes the probability of
      the word being present and of it being absent).
    word_probs: One class-probability list per case, aligned with
      `word_weigths`.

  Returns:
    corpus term - weighted term, using entropy() exactly as defined in this
    notebook.
  """
  weighted_term = sum(
      weight * entropy(word_probs[case])
      for case, weight in enumerate(word_weigths)
  )
  return entropy(corpus_probs) - weighted_term

**Calculation of the IG of each word in the corpus for the classes (6 emotions)**

In [None]:
import operator
# Number of messages per class and in total.
tweets_Love = len(dataset_Love)
tweets_Gratitude = len(dataset_Gratitude)
tweets_Empathy = len(dataset_Empathy)
tweets_Sadness = len(dataset_Sadness)
tweets_Anger = len(dataset_Anger)
tweets_Neutral = len(dataset_Neutral)
tweets_total =tweets_Love + tweets_Gratitude + tweets_Empathy + tweets_Sadness + tweets_Anger + tweets_Neutral

class_counts = [ tweets_Love, tweets_Gratitude, tweets_Empathy, tweets_Sadness, tweets_Anger,
                tweets_Neutral]
class_probs = np.array(class_counts) / tweets_total

vocab_entropy = {}
for word in vocab.keys():

    # Frequency of the word in the corpus
    wc1 = sum(vocab[word])

    # Remainder once the word's occurrences are taken away from the message total
    wc0 = tweets_total - wc1

    # Probabilities of the word to be in each of the classes
    probs_1 = [vocab[word][i] / wc1 for i in range(len(vocab[word]))]

    # Probabilities of the word not being in each of the classes.
    # When wc0 is 0 the "absent" case has weight 0 below, so use zeros and
    # avoid a division by zero.
    if wc0:
        probs_0 = [(class_counts[i] - vocab[word][i]) / wc0 for i in range(len(vocab[word]))]
    else:
        probs_0 = [0] * len(vocab[word])

    # Probabilities of the word being in a message
    p_word = wc1 / tweets_total

    # Probabilities of the word not being in a message
    p_abs_word = wc0 / tweets_total

    # Score of each word, computed with IG
    vocab_entropy[word] = IG(class_probs, [p_word, p_abs_word], [probs_1, probs_0])


# Sort by score in ascending order. entropy() returns sum(p*log2(p)) without
# the leading minus, so IG() yields the negated information gain and the
# ascending order places the most informative words first.
vocab_entropy_ord = dict(sorted(vocab_entropy.items(), key=operator.itemgetter(1)))

print(vocab_entropy_ord)

# Words only, in ranking order.
wordVocab = list(vocab_entropy_ord)


**One-hot encoding is performed on the `Emocion` column. This technique encodes a categorical feature as a set of binary indicator columns, one per category: each row has a 1 in the column of its own category and 0 in the others. Here `pd.get_dummies` is used, which takes the column of text labels and returns a DataFrame with one indicator column per emotion.**

In [None]:
# One indicator column per emotion label.
one_hot = pd.get_dummies(dataset["Emocion"])

# Swap the text label column for the indicator columns, which end up as the
# last columns of the frame.
dataset = pd.concat([dataset.drop(columns=['Emocion']), one_hot], axis=1)
dataset

#Label Emocion is Emotion

# 4. Training and Test Data

**The dataset is divided in two, 30% for testing and 70% for training**

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

# Inputs: the processed message text.
X = dataset['Id'].to_numpy()

# Targets: the last 6 columns of the dataset (the one-hot emotion columns).
y = dataset.iloc[:, -6:].to_numpy()

# 70 % training / 30 % test, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Number of training samples per class (column sums of the one-hot targets).
num_ones = np.sum(y_train, axis=0)
print("The following number of samples is taken from each class:")
# Print the real class names in column order: the targets are the last six
# columns of `dataset`, so their names label the counts below.
print(", ".join(dataset.columns[-6:]))
print(num_ones)

**Creation of the Corpus vocabulary**


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros

# oov_token is the placeholder used for any word missing from the dictionary
tokenizer = Tokenizer(oov_token='<OOV>')

# Build the dictionary from the first 2155 entries of the ranked word list
tokenizer.fit_on_texts(wordVocab[:2155])

# Alternative: build the dictionary from every word of the training texts
#tokenizer.fit_on_texts(X_train.tolist()) 

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
maxlen = 33

# Encode each split as integer sequences, then pad with trailing zeros (or
# cut) so that every input has exactly `maxlen` entries
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(split), padding='post', maxlen=maxlen)
    for split in (X_train, X_test)
)


In [None]:
# Show the encoded and padded test inputs
print(X_test)


# 5. Classification Model

The *TensorFlow* and the *Keras* library will be used

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten
from sklearn.model_selection import GridSearchCV, StratifiedKFold

For the model, the simplest layers (`Dense`) will be used, on them, the activation function *ReLU* will be applied for the first three, and for the last one, *Softmax*.

**ReLU activation function:** The *rectified linear unit* activation function (*ReLU*) is the most widely used activation function in deep learning applications and is behind many of its most successful results. The *ReLU* function is piecewise linear and therefore retains many of the properties that make linear models easy to optimise with gradient descent methods.
The activation function of *ReLU* performs a threshold operation for each input element where values less than 0 are set to 0, so the *ReLU* function is given by:

>$f\left ( x_{i} \right ) = \max\left ( 0,x_{i} \right ) = \left\{\begin{matrix}
x_{i}, & \text{if} \; \; x_{i} \geq 0 \\ 0, & \text{if} \; \; x_{i} < 0
\end{matrix}\right.$

The main advantage of using *ReLU* in the calculation is that it guarantees a faster calculation, as no exponentials or divisions are calculated, with an overall improved calculation speed.

**Softmax activation function:** Used to calculate a probability distribution from a vector of real numbers. The *Softmax* function produces outputs in the range between 0 and 1, with the sum of the probabilities being equal to 1. The Softmax function is calculated using the ratio:

>$f\left ( x_{i} \right ) = \frac{\exp\left ( x_{i} \right )}{\sum_{j} \exp\left ( x_{j} \right )}$

The Softmax function is used in multi-class models, where probabilities are returned for each class, with the target class having the highest probability. The Softmax function appears mainly in almost all output layers of deep learning architectures, where they are used.

**Application of early-stop to avoid overfitting**

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the monitored metric ('accuracy') has gone 5 epochs
# without improving.
# NOTE(review): the cross-validation cell further down creates its own
# `early_stop` inside the fold loop, which replaces this object.
early_stop = EarlyStopping(monitor='accuracy', mode='max', verbose=1, patience=5)


**Cross Validation Hybrid Model**

In this code only the training split is used for cross-validation: the folds come from `kf.split(X_train, y_train)`. In each iteration of the loop a fresh model is built and fitted on the rows selected by the `train` indices, while the rows selected by the `test` indices (the held-out fold) serve as validation data during fitting and are then passed to `model.evaluate` to measure the performance of that fold. The separate test split (`X_test`, `y_test`) is not touched here; it is used later to evaluate the final model.



In [None]:
from keras.models import Sequential
from sklearn.utils.multiclass import type_of_target
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from keras.layers import Embedding, Flatten, Dense, LSTM, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Bidirectional, GRU
from sklearn.model_selection import KFold
import statistics
from tensorflow.keras.optimizers import Adam


# Full encoded corpus (train + test). Kept for backward compatibility; the
# cross-validation below deliberately works on the training split only.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
filters = 180   # number of Conv1D filters
units = 96      # number of LSTM units


print(type_of_target(y))
acc_per_fold=[]
loss_per_fold=[]
kf=KFold(n_splits=5, shuffle=True, random_state=999)

for fold_no, (train, test) in enumerate(kf.split(X_train, y_train), start=1):
  # The fold indices are positions in X_train / y_train, so index those
  # arrays directly rather than the concatenated X / y.
  X_fit, y_fit = X_train[train], y_train[train]
  X_val, y_val = X_train[test], y_train[test]

  # Hybrid model: embedding -> 1-D convolution -> max pooling -> LSTM -> dense stack
  model = Sequential()
  embedding_layer = Embedding(vocab_size, 200, input_length=maxlen)

  model.add(embedding_layer)

  model.add(Conv1D(filters, 8, activation='relu'))

  model.add(MaxPooling1D(10))

  model.add(LSTM(units, dropout=0.2, recurrent_dropout=0.3))
  model.add(Dense(256, activation='relu'))

  # More hidden layers can be added to test different configurations of the hybrid model
  model.add(Dense(128, activation='relu'))
  model.add(Dense(64, activation='relu'))


  # If we want 6 classes (number of emotions) we will use 6 neurons in the last dense layer.
  model.add(Dense(6, activation='softmax'))

  model.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])


  model.summary()


  # Monitor accuracy on the held-out fold: training accuracy keeps rising
  # while a model overfits, so it cannot signal when to stop.
  # NOTE(review): the same fold is used for early stopping and for the score
  # reported below, which makes that score somewhat optimistic.
  early_stop = EarlyStopping(monitor = 'val_accuracy', mode = 'max', verbose = 1, patience = 5) # change the patience a bit to train the system more
  model.fit(X_fit, y_fit, epochs=100, batch_size=256, verbose=1,validation_data = (X_val, y_val), callbacks=[early_stop])

  scores = model.evaluate(X_val, y_val, verbose=1)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])
print(acc_per_fold)
print(statistics.mean(acc_per_fold))

In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros

instance = dataset["Id"][40] # the message with index label 40, chosen arbitrarily
print(instance)

# texts_to_sequences expects a list of texts. Given a bare string it iterates
# over its characters and encodes each one as a separate text, so the message
# is wrapped in a one-element list to be encoded word by word.
instance = tokenizer.texts_to_sequences([instance])

# Pad (or cut) to the input length the model was trained with.
instance = pad_sequences(instance, padding='post', maxlen=maxlen)
model.predict(instance) # we use the trained model to predict the instance class

In [None]:
# We create the following code to insert a sentence and predict the emotion of the sentence according to the trained model
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

#Label Amor/Admiración is Love/admiration
#Label Gratitud is Gratitude
#Label Tristeza/Pena is Sadness
#Label Enfado/Desprecio/Burla is Anger/contempt/mockery
#Label Comprensión/Empatía/Identificación is Comprehension/empathy/identification
#Label Indeterminado is Neutral

# The model was trained on the last six (one-hot) columns of `dataset`, so
# output unit k corresponds to the k-th of those column names.
classes = list(dataset.columns[-6:])

# NOTE: input() waits for the user, so this cell blocks an unattended "Run All".
new_text = input("Enter the new phrase to predict its emotion: ")

# Same preparation as the training data: processing() followed by lower-casing.
# The text stays wrapped in a list because the tokenizer expects a list of texts.
preprocessed_text = [processing(new_text).lower()]

# Convert pre-processed text into a sequence of word indices
new_text_sequence = tokenizer.texts_to_sequences(preprocessed_text)
# Apply padding so the sequence has the length the model expects;
# pad_sequences also cuts sequences longer than maxlen.
new_text_padded = pad_sequences(new_text_sequence, padding='post', maxlen=maxlen)

# Making the prediction
prediction = model.predict(new_text_padded)
class_index = np.argmax(prediction)
class_label = classes[class_index]
print("The predicted class for the sentence is:", class_label)


# 6. Evaluation of the model

**A *dataframe* is created containing the values obtained in each *epoch*.**

In [None]:
# Training history of `model` as a table, one row per epoch.
# NOTE(review): `model` is the one built in the last iteration of the
# cross-validation loop, so this presumably covers only the final fold.
df_model = pd.DataFrame(model.history.history)

In [None]:
# Number the epochs from 1 and use that numbering as the index as well
# (the 'Epoch' column is kept).
df_model['Epoch'] = range(1, len(df_model) + 1)
df_model = df_model.set_index('Epoch', drop=False)
df_model

**The accuracy (accuracy) of the model is then calculated**

In [None]:
# Evaluate `model` (the one left over from the last cross-validation fold) on
# the held-out test split.
score = model.evaluate(X_test, y_test, batch_size = 32, verbose = 1)

# score[0] is the loss and score[1] the accuracy metric given to compile()
print('\nAccuracy - Data Test:', round(score[1], 4))

**Finally, the predictions will be calculated to compute the confusion matrix of the model (`confusion_matrix`)**

In [None]:
from sklearn.metrics import confusion_matrix

# Class probabilities for every test message.
prediccion = model.predict(X_test)

# Turn one-hot targets and predicted probabilities into class indices.
true_idx = np.argmax(y_test, axis = 1)
pred_idx = np.argmax(prediccion, axis = 1)

# Rows are true classes and columns predicted classes, both labelled with the
# names of the one-hot columns.
emotion_labels = dataset.columns[-6:]
confusionMatrix = confusion_matrix(true_idx, pred_idx)
df_confusionMatrix = pd.DataFrame(confusionMatrix,
                                   index = emotion_labels,
                                   columns = emotion_labels)

plt.figure(figsize = (6, 4))
sns.heatmap(df_confusionMatrix, annot = True,fmt='g', annot_kws={"size": 14}, cmap = 'BuPu');

**A report showing the main metrics of the classification (classification_report) is also created**

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1, labelled with the one-hot column names.
emotions = dataset.columns[-6:]
report = classification_report(
    np.argmax(y_test, axis = 1),
    np.argmax(prediccion, axis = 1),
    target_names=emotions,
)
print(report)
