# **1. Libraries**


**Import some libraries and the Corpus (Mental Health dataset) stored in the drive**

In [None]:
# Import the required libraries and mount the Google Drive folder where the corpus is stored
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style(style = 'whitegrid')
%matplotlib inline
from google.colab import drive
drive.mount('/content/drive')

# **2. Analysis of the Mental Health Dataset**

In [None]:
# Read the semicolon-separated corpus without treating any line as a header,
# then discard the first row, which holds the original column titles.
raw_dataset = pd.read_csv("/content/drive/MyDrive/DatasetMH.csv", sep=";", header=None)
dataset = raw_dataset
dataset.drop(index=raw_dataset.index[:1], inplace=True)

# Check that the first row (the heading) has been removed.
dataset.head(6)


**Columns are named so that each Instagram message is stored together with its corresponding polarity (class or category)**

In [None]:
# Give the seven CSV columns readable names. In the cells visible in this notebook
# only "Tweet" (the message text) and "Rating_Polarity" (the class label) are used.
# NOTE(review): the last three columns all receive the same name "none";
# presumably they are empty padding columns in the CSV -- confirm before selecting them by name.
dataset.columns = ["Tweet", "emote", "Rating_Polarity","emocion","none","none","none"]

# Original Spanish headers: "Polaridad" -> Rating_Polarity (polarity),
# "Emoticonos" -> emote (emoticons),
# "nada" -> none (empty column)

**Graph to show the distribution of messages in the dataset according to emotion**

In [None]:
# Bar chart with the number of messages per polarity label.
polarity_order = ['Negativa', 'Positiva', 'Indeterminado']
plt.figure(figsize=(5, 3))
sns.countplot(data=dataset, x='Rating_Polarity', order=polarity_order, palette='rocket');
plt.show()

# Label meanings: 'Positiva' = Positive,
# 'Negativa' = Negative,
# 'Indeterminado' = Neutral

**3 datasets are created, one for each class to print the number of messages of each class, in this case of each polarity (Positive, Negative, Neutral)**

In [None]:
# One sub-frame per polarity label, selected with a boolean mask on the label column.
dataset_Positive = dataset.loc[dataset['Rating_Polarity'] == 'Positiva']
dataset_Negative = dataset.loc[dataset['Rating_Polarity'] == 'Negativa']
dataset_None = dataset.loc[dataset['Rating_Polarity'] == 'Indeterminado']

# Report how many messages fall in each class.
print("NUMBER OF COMMNETS:\n",
      "\nPositives   ", dataset_Positive.shape[0],
      "\nNegatives   ", dataset_Negative.shape[0],
      "\nNeutral     ", dataset_None.shape[0])

**Dataset of Mental Health**

In [None]:
# Stack the three per-class frames row-wise; rows whose label is none of the
# three polarities are therefore no longer part of `dataset`.
class_frames = [dataset_Positive, dataset_Negative, dataset_None]
dataset = pd.concat(class_frames)
dataset

**Import libraries**

In [None]:
import re
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# **3. Data pre-processing**

**Pre-processing and tokenisation function of dataset messages**

In [None]:
import re

from nltk import TweetTokenizer
import spacy
from nltk.stem import SnowballStemmer

# Resources used by `processing`. They are built once at cell level instead of on
# every call, because the function is applied to every message of the corpus.

# Spanish stopwords (written with their accents).
_STOPWORDS_ES = frozenset({
    'al', 'algo', 'algunas', 'algunos', 'antes', 'como', 'cual', 'cuando', 'del', 'donde', 'durante', 'e', 'el', 'ella', 'ellas', 'ellos', 'era', 'erais', 'eran', 'eras', 'eres', 'es', 'esa', 'esas', 'ese', 'eso', 'esos', 'esta', 'estaba', 'estabais', 'estaban', 'estabas',
    'estad', 'estada', 'estadas', 'estado', 'estados', 'estamos', 'estando', 'estar', 'estaremos', 'estará', 'estarán', 'estarás', 'estaré', 'estaréis', 'estaría', 'estaríais', 'estaríamos', 'estarían', 'estarías', 'estas', 'este', 'estemos', 'esto', 'estos', 'estoy', 'estuve',
    'estuviera', 'estuvierais', 'estuvieran', 'estuvieras', 'estuvieron', 'estuviese', 'estuvieseis', 'estuviesen', 'estuvieses', 'estuvimos', 'estuviste', 'estuvisteis', 'estuviéramos', 'estuviésemos', 'estuvo', 'está', 'estábamos', 'estáis', 'están', 'estás', 'esté', 'estéis',
    'estén', 'estés', 'fue', 'fuera', 'fuerais', 'fueran', 'fueras', 'fueron', 'fuese', 'fueseis', 'fuesen', 'fueses', 'fui', 'fuimos', 'fuiste', 'fuisteis', 'fuéramos', 'fuésemos', 'ha', 'habida', 'habidas', 'habido', 'habidos', 'habiendo', 'habremos', 'habrá', 'habrán', 'habrás',
    'habré', 'habréis', 'habría', 'habríais', 'habríamos', 'habrían', 'habrías', 'habéis', 'había', 'habíais', 'habíamos', 'habían', 'habías', 'han', 'has', 'hay', 'haya', 'hayamos', 'hayan', 'hayas', 'hayáis', 'he', 'hemos', 'hube', 'hubiera', 'hubierais', 'hubieran', 'hubieras',
    'hubieron', 'hubiese', 'hubieseis', 'hubiesen', 'hubieses', 'hubimos', 'hubiste', 'hubisteis', 'hubiéramos', 'hubiésemos', 'hubo', 'la', 'las', 'le', 'les', 'lo', 'los', 'me', 'mi', 'mis', 'mucho', 'muchos', 'muy', 'más', 'mí', 'mía', 'mías', 'mío', 'míos', 'nada', 'nos',
    'nosotras', 'nosotros', 'nuestra', 'nuestras', 'nuestro', 'nuestros', 'o', 'os', 'otra', 'otras', 'otro', 'otros', 'pero', 'poco', 'porque', 'que', 'quien', 'quienes', 'qué', 'se', 'sea', 'seamos', 'sean', 'seas', 'sentid', 'sentida', 'sentidas', 'sentido', 'sentidos', 'seremos',
    'será', 'serán', 'serás', 'seré', 'seréis', 'sería', 'seríais', 'seríamos', 'serían', 'serías', 'seáis', 'siente', 'sintiendo', 'sois', 'somos', 'son', 'soy', 'su', 'sus', 'suya', 'suyas', 'suyo', 'suyos', 'sí', 'también', 'tanto', 'te', 'tendremos', 'tendrá', 'tendrán', 'tendrás',
    'tendré', 'tendréis', 'tendría', 'tendríais', 'tendríamos', 'tendrían', 'tendrías', 'tened', 'tenemos', 'tenga', 'tengamos', 'tengan', 'tengas', 'tengo', 'tengáis', 'tenida', 'tenidas', 'tenido', 'tenidos', 'teniendo', 'tenéis', 'tenía', 'teníais', 'teníamos', 'tenían', 'tenías',
    'ti', 'tiene', 'tienen', 'tienes', 'todo', 'todos', 'tu', 'tus', 'tuve', 'tuviera', 'tuvierais', 'tuvieran', 'tuvieras', 'tuvieron', 'tuviese', 'tuvieseis', 'tuviesen', 'tuvieses', 'tuvimos', 'tuviste', 'tuvisteis', 'tuviéramos', 'tuviésemos', 'tuvo', 'tuya', 'tuyas', 'tuyo', 'tuyos',
    'tú', 'un', 'una', 'uno', 'unos', 'vosotras', 'vosotros', 'vuestra', 'vuestras', 'vuestro', 'vuestros', 'y', 'ya', 'yo', 'él', 'éramos'})

# Maps accented vowels to their plain form (á->a, é->e, í->i, ó->o, ú->u, ü->u).
_ACCENT_TABLE = str.maketrans('áéíóúü', 'aeiouu')

# The text is accent-stripped before filtering, so the stopwords must also be
# available without accents ('más' -> 'mas'), otherwise they can never match.
_STOPWORDS = _STOPWORDS_ES | frozenset(w.translate(_ACCENT_TABLE) for w in _STOPWORDS_ES)

# Chat abbreviations and their expansion (e.g. 'xq' -> 'porque' = "because",
# 'tb' -> 'también' = "also", 'tqm' -> 'te quiero mucho' = "I love you very much").
# Each abbreviation is only replaced when it stands as a whole word.
_SLANG = [(re.compile(r'\b(?:{0})\b'.format(abbreviation)), expansion) for abbreviation, expansion in (
    ('d', 'de'), ('[qk]', 'que'), ('xo', 'pero'), ('xa', 'para'), ('[xp]q', 'porque'), ('es[qk]', 'es que'),
    ('fvr', 'favor'), ('(xfa|xf|pf|plis|pls|porfa)', 'por favor'), ('dnd', 'donde'), ('tb', 'también'),
    ('(tq|tk)', 'te quiero'), ('(tqm|tkm)', 'te quiero mucho'), ('x', 'por'))]

# '+' is not a word character, so it cannot be delimited with \b like the entries
# above; a '+' surrounded by whitespace (or string boundaries) is read as "mas" (more).
_STANDALONE_PLUS = re.compile(r'(?<!\S)\+(?!\S)')

# Strips every ASCII punctuation character from a token.
_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

# Tokenisation pattern handed to nltk.regexp_tokenize; kept exactly as it was.
# NOTE(review): the emoticon alternatives below are written as character classes
# ("[:d]" matches a single ':' or 'd', not the sequence ":d"), and two of them
# ("[:\]" and "[:-\]") escape their closing bracket and so extend onto the next line.
# Text emoticons are therefore not captured as units and are later removed as
# punctuation. Left unchanged here because rewriting it would alter the tokens
# produced for the whole corpus -- confirm the intended behaviour before changing it.
_TOKEN_PATTERN = r'''(?x)                  # Set flag to allow verbose regexps
              (?:[A-Z]\.)+            # Abbreviations, e.g. U.S.A
              | \w+(?:-\w+)*          # Words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%?    # Currency and precentages, e.g. $12.40 82%
              | \.\.\.                # Ellipsis
              | [][.,;"'?():-_`]      # These are separate tokens; includes
              | [😀\😁\😂\🤣\😃\😄\😅\😆\😉\😊\😋\😎\😍\😘\😗\😙\😚\☺\🙂\🤗\🤩\😌\😛\😜\😝\🤤\🤑\😇\🤭\😺\😸\😹\😻\😽\💪\✌\🖐\✋\👌\👍\👋\👏\🙌\🙏\💋\💘\❤\💓\💔\💕\💖\💗\💙\💚\💛\🧡\💜\🖤\💝\💞\💟\❣\💌\🍺\🍻\🎉\🎊\🙋\🕺\💃] # \:d\:)\:-)\:-d\;d\;-)\=d\;)\:]\:-]\=)\=]\(:\xd\:p\:-p\8)\xp\<3
              | [:d]
              | [:)]
              #| [:-)]
              | [:-d]
              | [;d]
              #| [;-)]
              | [=d]
              | [;)]
              | [:]]
              | [:-]]
              | [=)]
              | [=]]
              | [(:]
              | [xd]
              | [:p]
              | [:-p]
              | [8)]
              | [xp]
              | [<3]
              | [🤔\🤨\😐\😑\😶\🙄\😏\😮\🤐\😯\😒\😕\🙃\😲\😼\🤷] # \:-|\:|]
              | [:-|]
              | [:|]
              | [😣\😥\😪\😫\😓\😔\☹\🙁\😖\😞\😟\😤\😢\😭\😦\😧\😨\😩\🤯\😬\😰\😱\😳\😵\😡\😠\🤬\😷\🤒\🤕\🤢\🤮\🤧\💩\🙀\😿\😾\🖕\👎\⛔\🚫\🤦] # \:-(\:(\:-<\:<\:-[\:[\>:-[\>:[\:-{\:{\:-@\:@\>:-(\>:(\:-(\:(\d:\:\\:/\:-/\:-\\dx\d8
              #| [:-(]
              #| [:(]
              | [:-<]
              | [:<]
              | [:-[]
              | [:[]
              | [>:-[]
              | [>:[]
              | [:-{]
              | [:{]
              | [:-@]
              | [:@]
              #| [>:-(]
              | [>:(]
              | [:'-(]
              | [:'(]
              | [d:]
              | [:\]
              | [:/]
              #| [:-/]
              | [:-\]
              | [dx]
              | [d8]
              '''

# NLP helpers are created on first use and then reused across calls.
_TOOL_FACTORIES = {
    'stemmer': lambda: SnowballStemmer('spanish'),          # Spanish stemmer
    'tokenize': lambda: TweetTokenizer().tokenize,          # social-media aware tokenizer
    'lemmatizer': lambda: spacy.load('es_core_news_sm'),    # Spanish spaCy pipeline
}
_TOOLS = {}


def _tool(name):
  """Return the NLP helper registered under `name`, building it the first time."""
  if name not in _TOOLS:
    _TOOLS[name] = _TOOL_FACTORIES[name]()
  return _TOOLS[name]


def processing(text, stemming=True, lemmatization=False):
  """Clean one social-media message and return it as a space-separated token string.

  Steps, in order: remove links, @mentions, the "RT" marker and '#'; lower-case;
  delete digits; turn line breaks into spaces; strip accents from vowels; squeeze
  characters repeated three or more times down to two; drop laughter ("jajaja",
  "juas", "lol"); expand chat slang; remove stopwords; optionally stem and/or
  lemmatize; tokenize; strip punctuation and drop empty tokens and stopwords.

  Args:
    text: The raw message. Any value is converted with str(); None and NaN are
      treated as an empty message.
    stemming: When True (default), reduce every word to its Spanish Snowball stem.
    lemmatization: When True, replace every word with its spaCy lemma. This needs
      the 'es_core_news_sm' model to be installed. Default False.

  Returns:
    The processed message as a single string with tokens separated by one space
    (possibly the empty string).
  """
  # Missing values would otherwise end up in the corpus as the tokens "none" / "nan".
  if text is None or (isinstance(text, float) and text != text):
    return ''
  text = str(text)

  # Links go first so that a '#' or '@' inside a URL does not leave fragments behind.
  text = re.sub(r'https?:\/\/\S+', ' ', text)
  # \w also covers '_' so user names such as @some_user are removed completely.
  text = re.sub(r'@\w+', ' ', text)
  # \b keeps the retweet marker from matching the end of a word such as "SMART ".
  text = re.sub(r'\bRT[|\s]', ' ', text)
  text = re.sub(r'#', ' ', text)

  # Convert to lower case
  text = text.lower()

  # Delete numbers; line breaks become a space so the neighbouring words stay separate.
  text = re.sub(r'\d+', '', text)
  text = re.sub(r'[\r\n]+', ' ', text)

  # Replace vowels carrying diacritical marks with the plain vowel
  text = text.translate(_ACCENT_TABLE)

  # Squeeze characters repeated three or more times down to two
  text = re.sub(r'(.)\1{2,}', r'\1\1', text)

  # Remove laughter
  text = re.sub(r'\b(?=\w*[j])[aeiouj]{4,}\b', ' ', text)
  text = re.sub(r'\b(juas+|lol)\b', ' ', text)

  # Translate slang
  for slang_re, expansion in _SLANG:
    text = slang_re.sub(expansion, text)
  text = _STANDALONE_PLUS.sub('mas', text)

  if stemming:
    # Stopwords are removed BEFORE stemming: once stemmed ('como' -> 'com') they
    # no longer match the stopword list and would stay in the text.
    stem = _tool('stemmer').stem
    text = ' '.join(stem(token) for token in _tool('tokenize')(text)
                    if token not in _STOPWORDS)

  if lemmatization:
    # Replace the text with its lemmas (rather than appending them to it).
    text = ' '.join(str(token.lemma_) for token in _tool('lemmatizer')(text))

  words = nltk.regexp_tokenize(text, _TOKEN_PATTERN)

  # Remove punctuation marks from every token
  stripped = (_PUNCTUATION.sub('', word) for word in words)

  # Drop tokens emptied by the punctuation step and any remaining stopwords
  kept = [word for word in stripped if word and word.lower() not in _STOPWORDS]

  return " ".join(kept)


The `processing` function is applied to each Instagram comment in the corpus: the text is cleaned and stemming is applied.

In [None]:
# Clean every message with the `processing` function defined above
# (the previous call used the undefined name `procesado` and raised NameError).
dataset['Tweet'] = dataset['Tweet'].apply(processing)
dataset['Tweet'] = dataset['Tweet'].str.lower()

dataset["Tweet"]


**Create the word cloud**

In [None]:
#Create wordCloud
from wordcloud import WordCloud


# Label meanings: 'Positiva' = Positive,
# 'Negativa' = Negative,
# 'Indeterminado' = Neutral

datasetP=dataset[dataset['Rating_Polarity'] == 'Positiva']
datasetN=dataset[dataset['Rating_Polarity'] == 'Negativa']
datasetNO=dataset[dataset['Rating_Polarity'] == 'Indeterminado']

# Join the first 1000 positive messages into one text. str() on the NumPy array
# would pass the array's printed representation (brackets, quotes, line wrapping)
# to the word cloud instead of the messages themselves.
tweets = " ".join(datasetP["Tweet"].head(1000).astype(str))
print(len(tweets))  # number of characters fed to the word cloud

# NLTK Spanish stopwords plus a few frequent, uninformative words
stop_words_sp = set(stopwords.words('spanish'))
stop_words_sp.update(["mas","si","dice","hoy","dia"])
wordcloudimage = WordCloud(
                          max_words=100,
                          max_font_size=500,
                          font_step=2,
                          stopwords=stop_words_sp,
                          background_color='white',
                          width=1000,
                          height=720
                          ).generate(tweets)

plt.figure(figsize=(15,7))
plt.axis("off")
plt.imshow(wordcloudimage)
plt.show()

**Vocabulary creation with 3 classes (Positive, Negative, Neutral class)**

In [None]:
# vocab maps every word to [n_negative, n_positive, n_neutral]: the number of
# messages of each class that contain the word.
vocab={}

# The labels must be spelled exactly as in the "Rating_Polarity" column
# ('Negativa' = Negative, 'Positiva' = Positive, 'Indeterminado' = Neutral).
# The previous spelling ("Negativo"/"Positivo") never matched, so every message
# was counted in the last position.
etiquetas=["Negativa","Positiva","Indeterminado"]
numero_clases=len(etiquetas)
posicion_por_etiqueta={etiqueta: posicion for posicion, etiqueta in enumerate(etiquetas)}
_tokenizer = TweetTokenizer().tokenize

for polaridad, tweet in zip(dataset["Rating_Polarity"], dataset["Tweet"]):
  # Any unexpected label is counted as neutral, as before.
  posicion=posicion_por_etiqueta.get(polaridad, posicion_por_etiqueta["Indeterminado"])
  # dict.fromkeys removes repeated words while keeping their order: each word is
  # counted once per message, because the information-gain cell divides these
  # counts by the number of messages.
  for word in dict.fromkeys(_tokenizer(tweet)):
    if word not in vocab:
      vocab[word]=[0] * numero_clases
    vocab[word][posicion] += 1

print(vocab)

**Entropy and information gain formulas**

In [None]:
def entropy(probs, adjust=1e-15):
  """Return the sum of p * log2(p) over the strictly positive entries of `probs`.

  Note the sign: there is no leading minus, so the result is the NEGATED Shannon
  entropy (<= 0 for probabilities in (0, 1]). The sign is kept as it is because the
  ranking of words later in the notebook sorts the resulting scores in ascending
  order and relies on this convention.

  Args:
    probs: Iterable of probabilities. Entries that are <= 0 are skipped.
    adjust: Small constant added to each probability before taking the logarithm.

  Returns:
    A float (or the int 0 when no entry is positive).
  """
  # np.math was only an alias of the standard `math` module and no longer exists
  # in NumPy 2.0, so the standard library is used directly.
  import math

  total=0
  for prob in probs:
    if prob > 0:
      smoothed = prob + adjust
      total += smoothed * math.log2(smoothed)

  return total


def IG(corpus_probs, word_weigths, word_probs):
  """Combine the corpus score with the weighted per-case scores of one word.

  Computes entropy(corpus_probs) - sum_i word_weigths[i] * entropy(word_probs[i]).
  Because `entropy` in this notebook sums p * log2(p) without a leading minus,
  the value returned here is the information gain with its sign flipped
  (more negative means more informative).

  Args:
    corpus_probs: Class probabilities over the whole corpus.
    word_weigths: Weight of each case (e.g. word present / word absent).
    word_probs: For each case, the class probabilities within that case.

  Returns:
    The difference described above.
  """
  weighted_entropy = sum(
      weight * entropy(word_probs[case])
      for case, weight in enumerate(word_weigths))
  return entropy(corpus_probs) - weighted_entropy

**Calculation of the IG of each word in the corpus for the classes (3 polarities: Positive, Negative, Neutral)**

---



In [None]:
import operator

# Number of messages per class and in total
tweets_positivos=len(dataset_Positive)
tweets_negativos=len(dataset_Negative)
tweets_indeterminados=len(dataset_None)

tweets_total=tweets_positivos+tweets_negativos+tweets_indeterminados

# Same class order as the per-word counts in `vocab`: negative, positive, neutral
class_counts=[tweets_negativos,tweets_positivos,tweets_indeterminados]
class_probs=np.array(class_counts) / tweets_total

vocab_entropy = {}
for word, counts in vocab.items():

    # Count of the word in the corpus (summed over the three classes)
    wc1 = sum(counts)

    # Remaining count: messages in which the word does not appear
    wc0 = tweets_total - wc1

    # Class distribution given that the word is present
    probs_1 = [count / wc1 for count in counts]

    if wc0 > 0:
        # Class distribution given that the word is absent
        probs_0 = [(class_counts[i] - counts[i]) / wc0 for i in range(len(counts))]
        # Probability of the word not being in a message
        p_abs_word = wc0 / tweets_total
    else:
        # The word accounts for every message: there is no "absent" case, and
        # dividing by wc0 would raise ZeroDivisionError. Its term contributes nothing.
        probs_0 = [0.0] * len(counts)
        p_abs_word = 0.0

    # Probability of the word being in a message
    p_word = wc1 / tweets_total

    # Score of the word (see IG)
    vocab_entropy[word] = IG(class_probs, [p_word, p_abs_word], [probs_1, probs_0])


# IG() returns the information gain with its sign flipped (entropy() has no leading
# minus), so sorting the scores in ascending order puts the most informative words first.
vocab_entropy_ord = dict(sorted(vocab_entropy.items(), key=operator.itemgetter(1)))

print(vocab_entropy_ord)

# Words only, most informative first
wordVocab = list(vocab_entropy_ord)



**One-hot encoding is applied to the "Rating_Polarity" column. This technique encodes a categorical (discrete) feature as a set of binary indicator columns. The input is a column of integers or text strings denoting the categories; `pd.get_dummies` creates one binary column per category (here, one per polarity) and returns a DataFrame in which each row has exactly one active column.**

In [None]:
# Replace the label column with one 0/1 indicator column per polarity.
# The guard makes the cell safe to re-run: once the label column has been
# replaced, a second run leaves `dataset` unchanged instead of raising KeyError.
if "Rating_Polarity" in dataset.columns:
    # dtype=int yields numeric 0/1 columns on every pandas version
    # (recent versions return booleans by default).
    one_hot = pd.get_dummies(dataset["Rating_Polarity"], dtype=int)
    dataset = pd.concat([dataset.drop(columns=["Rating_Polarity"]), one_hot], axis = 1)
dataset

# 4. **Training and Test Data**

**The dataset is divided in two, 30% for testing and 70% for training**

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Features: the processed message text. Targets: the last three columns of the
# frame, i.e. the one-hot polarity indicators appended by the previous cell.
X = dataset['Tweet'].to_numpy()
y = dataset.iloc[:, -3:].to_numpy()

# 70 % of the messages for training, 30 % for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.30)

In [None]:
# Show the label matrix taken from the last three columns of the dataset.
print(y)

**Creation of the Corpus vocabulary**


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros

# Words that are not in the fitted dictionary are mapped to the special '<OOV>' token
tokenizer = Tokenizer(oov_token='<OOV>')

# Fit the dictionary on the first 2155 entries of wordVocab, the word list ordered by
# the IG score computed earlier.
# NOTE(review): 2155 is a hard-coded cut-off; presumably tuned for this corpus -- confirm.
tokenizer.fit_on_texts(wordVocab[:2155])

# Alternative: build the dictionary from every word of the training messages
#tokenizer.fit_on_texts(X_train.tolist())


# Replace each message by its sequence of dictionary indices.
# NOTE: X_train / X_test are overwritten here, so this cell must run exactly once
# after the train/test split cell.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# +1 over the number of dictionary entries; used later as the Embedding input size
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
# Fixed sequence length used for padding and as the model input length
# NOTE(review): 33 is hard-coded; presumably the longest message in the corpus -- confirm.
maxlen = 33

# Bring every sequence to exactly maxlen entries, adding zeros at the end ('post')
# of the shorter ones
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


# **5. Classification Model**

**The TensorFlow and the Keras library will be used**

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten
from sklearn.model_selection import GridSearchCV, StratifiedKFold

The model combines an Embedding layer, a one-dimensional convolutional layer (Conv1D) with the ReLU activation function, a max-pooling layer, an LSTM layer and a final Dense layer with the Softmax activation function.

ReLU activation function: The rectified linear unit (ReLU) is the most widely used activation function in deep learning applications and has produced some of the most successful results. The ReLU function is a quasi-linear function and therefore retains the properties that make linear models easy to optimise with gradient descent methods. ReLU performs a threshold operation on each input element, where values less than 0 are set to 0, so the ReLU function is given by: $f(x) = \max(0, x)$

The main advantage of using ReLU in the calculation is that it guarantees a faster calculation, as no exponentials or divisions are calculated, with an overall improved calculation speed.

Softmax activation function: Used to calculate a probability distribution from a vector of real numbers. The Softmax function produces outputs in the range between 0 and 1, with the sum of the probabilities being equal to 1. The Softmax function is calculated using the ratio: $\mathrm{softmax}(x_i) = \dfrac{e^{x_i}}{\sum_{j} e^{x_j}}$

The Softmax function is used in multi-class models, where probabilities are returned for each class, with the target class having the highest probability. The Softmax function appears mainly in almost all output layers of deep learning architectures, where they are used.

**Application of early-stop to avoid overfitting**


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training when the accuracy on the VALIDATION data has not improved for
# 2 consecutive epochs. Training accuracy ('accuracy') keeps rising while a model
# overfits, so monitoring it cannot detect overfitting.
# This callback requires model.fit(...) to be called with validation data.
early_stop = EarlyStopping(monitor = 'val_accuracy', mode = 'max', verbose = 1, patience = 2)



**Cross Validation Hybrid Model**

In this code only the training set is used to fit the model. The model is fitted using the model.fit(X[train], y[train]) function within the for train, test in kf.split(X) loop, where X[train] and y[train] are the training sets corresponding to each iteration of the loop. The test set X[test] and y[test] is used only to evaluate the performance of the model after each iteration of the cycle, using the function model.evaluate(X[test], y[test]).


In [None]:
from keras.models import Sequential
from sklearn.utils.multiclass import type_of_target
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
from keras.layers import Embedding, Flatten, Dense, LSTM, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Bidirectional, GRU
from sklearn.model_selection import KFold
import statistics
from tensorflow.keras.optimizers import Adam

# Full padded corpus (training rows first, then test rows). The folds below are
# drawn from the training portion only.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

#y=np.argmax(y,axis=1)
print(type_of_target(y))


def build_model():
  """Build and compile the Embedding -> Conv1D -> MaxPooling1D -> LSTM -> Softmax classifier."""
  network = Sequential()
  network.add(Embedding(vocab_size, 200, input_length=maxlen))
  network.add(Conv1D(180, 8, activation='relu'))
  network.add(MaxPooling1D(10))
  network.add(LSTM(256, dropout=0.2, recurrent_dropout=0.3))
  # Three output neurons, one per polarity class
  network.add(Dense(3, activation='softmax'))
  network.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
  return network


acc_per_fold=[]
loss_per_fold=[]
cvscores=[]
kf=KFold(n_splits=5, shuffle=True, random_state=999)

# kf.split is given X_train, so the fold indices refer to rows of X_train / y_train
# and are used on those arrays directly.
for fold, (train, test) in enumerate(kf.split(X_train, y_train), start=1):
  # A fresh, untrained model for every fold
  model = build_model()
  if fold == 1:
    model.summary()  # the architecture is the same in every fold

  # Monitor the validation accuracy: training accuracy keeps improving while the
  # model overfits, so it is not a useful stopping signal.
  # NOTE(review): the validation fold is also the fold the score is reported on, so
  # the per-fold scores are slightly optimistic.
  early_stop = EarlyStopping(monitor = 'val_accuracy', mode = 'max', verbose = 1, patience = 5,
                             restore_best_weights = True)
  model.fit(X_train[train], y_train[train], epochs=100, batch_size=32, verbose=1,
            validation_data = (X_train[test], y_train[test]), callbacks=[early_stop])

  scores = model.evaluate(X_train[test], y_train[test], verbose=1)
  print(f'Score for fold {fold}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

# Accuracy of every fold and their mean. `model` keeps the network of the last fold.
print(acc_per_fold)
print(statistics.mean(acc_per_fold))

In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros

# One processed message of the corpus, selected by its index label
instance = dataset["Tweet"][40]
print(instance)

# texts_to_sequences expects a LIST of texts. Given a bare string it iterates over
# its characters and encodes each character as if it were a whole text, so the
# message is wrapped in a list to be encoded word by word.
instance = tokenizer.texts_to_sequences([instance])

# Pad (or truncate) to the input length the model was trained with
instance = pad_sequences(instance, padding='post', maxlen=maxlen)
model.predict(instance) # we use the trained model to predict the instance class

In [None]:
# Read a sentence from the user and predict its polarity
from keras.models import Model
from keras.layers import Input
from keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Class names in the order of the model outputs: the targets are the last three
# columns of `dataset` (the one-hot polarity columns), so output i belongs to the
# i-th of those columns. A hard-coded two-item list raised IndexError on the third
# output and did not follow that order.
classes = list(dataset.columns[-3:])

new_text = input("Enter the new phrase to predict its polarity: ")
# Same cleaning as the training messages (the function is named `processing`)
preprocessed_text = [processing(new_text).lower()]

# Convert pre-processed text into a sequence of word indices
new_text_sequence = tokenizer.texts_to_sequences(preprocessed_text)
# Pad / truncate the sequence to the model input length
new_text_padded = pad_sequences(new_text_sequence, padding='post', maxlen=maxlen)

# The softmax output of the model already holds one probability per class
proba = model.predict(new_text_padded)
class_probabilities = proba[0]

# Print the probability percentage of each class
for class_label, class_probability in zip(classes, class_probabilities):
    rounded_probability = round(float(class_probability) * 100, 2)  # Round to 2 decimal places
    print(f"Probability of class '{class_label}': {rounded_probability}%")

# The predicted class is the one with the highest probability
predicted_class = classes[int(np.argmax(class_probabilities))]
print(f"The predicted class for the sentence is '{predicted_class}'")


# **6. Evaluation of the model**

**A dataframe is to be created containing the values obtained by epoch**

In [None]:
# Table with the metrics that the most recent fit of `model` recorded per epoch
# (one row per epoch, one column per recorded metric).
df_modelo = pd.DataFrame(model.history.history)

In [None]:
# Number the epochs from 1 and use that numbering as the row index as well.
n_epochs = df_modelo.shape[0]
df_modelo['Epoch'] = range(1, n_epochs + 1)
df_modelo.index = df_modelo['Epoch']
df_modelo

**The accuracy (accuracy) of the model is then calculated**

In [None]:
# Evaluate on the held-out test split; score[0] is the loss, score[1] the accuracy.
score = model.evaluate(X_test, y_test, verbose=1, batch_size=512)

test_accuracy = round(score[1], 4)
print('\nAccuracy - Data Test:', test_accuracy)

**Finally, the predictions will be calculated to compute the confusion matrix of the model (confusion_matrix)**

In [None]:
from sklearn.metrics import confusion_matrix

# The one-hot polarity columns (last three of `dataset`) give the class names
class_names = dataset.columns[-3:]

prediccion = model.predict(X_test)

# argmax turns one-hot targets / softmax outputs into class indices.
# `labels` fixes the matrix at one row and column per class even when a class
# never occurs in the test split or in the predictions; without it the matrix
# would be smaller and building the labelled DataFrame would fail.
matriz_confusion = confusion_matrix(np.argmax(y_test, axis = 1), np.argmax(prediccion, axis = 1),
                                    labels = list(range(len(class_names))))
df_matriz_confusion = pd.DataFrame(matriz_confusion,
                                   index = class_names,
                                   columns = class_names)
plt.figure(figsize = (6, 4))
sns.heatmap(df_matriz_confusion, annot = True,fmt='g', annot_kws={"size": 14}, cmap = 'BuPu')
# Rows are the true classes, columns the predicted ones
plt.xlabel('Predicted class')
plt.ylabel('True class');

**A report showing the main metrics of the classification (classification_report) is also created.**

In [None]:
from sklearn.metrics import classification_report

# Name the rows of the report after the one-hot polarity columns and always report
# all of them, consistent with the confusion matrix.
class_names = [str(name) for name in dataset.columns[-3:]]

print(classification_report(np.argmax(y_test, axis = 1), np.argmax(prediccion, axis = 1),
                            labels = list(range(len(class_names))),
                            target_names = class_names))