# 1. Libraries


**Import some libraries and the stored Corpus (dataset)**

In [None]:
#Import certain libraries and mount the drive where the corpus is stored
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style(style = 'whitegrid')
%matplotlib inline
from google.colab import drive
drive.mount('/content/drive')

# 2. Analysis of the Mental Health dataset

In [None]:
# Read the semicolon-separated corpus without a header row, so the columns
# initially get positional integer names.
corpus_path = "/content/drive/MyDrive/DatasetMH_Emotions.csv"
dataset = pd.read_csv(corpus_path, sep=";", header=None)

# Discard the first row of the file; the remaining rows keep their row labels.
dataset = dataset.drop(index=dataset.index[:1])

dataset.head(3)


**Columns are renamed to organise tweets with corresponding emotion**


In [None]:
# Four labelled columns followed by three unused columns that all share the
# placeholder name "nada".
# NOTE(review): "Id" is the column later passed to the text pre-processing
# function, so it presumably holds the comment text — confirm.
column_names = ["Id", "Emoticos", "Polaridad", "Emocion"] + ["nada"] * 3
dataset.columns = column_names

# Translations: Polaridad = Polarity, Emoticos = Emoticons,
# Emocion = Emotion, nada = nothing (unused column)

In [None]:
# Preview the first ten rows of the dataset.
dataset.head(10)


**Graph to show the distribution of Instagram comments in the dataset by emotion**

In [None]:
# English translations of the labels used in the corpus
#   Polarity: Positiva = Positive, Negativa = Negative, Indeterminado = Neutral
#   Emotion:  Amor/Admiración = Love/admiration
#             Gratitud = Gratitude
#             Comprensión/Empatía/Identificación = Comprehension/empathy/identification
#             Tristeza/Pena = Sadness
#             Enfado/Desprecio/Burla = Anger/contempt/mockery
#             Indeterminado = Neutral

# Left-to-right order of the bars.
emotion_order = [
    'Amor/Admiración',
    'Gratitud',
    'Comprensión/Empatía/Identificación',
    'Tristeza/Pena',
    'Enfado/Desprecio/Burla',
    'Indeterminado',
]

# One bar per emotion, counting the rows of `dataset` with that label.
plt.figure(figsize=(14, 10))
sns.countplot(data=dataset, x='Emocion', order=emotion_order, palette='rocket');

**Six sub-datasets are created, one per class (here, one per emotion), so that the number of tweets in each class can be counted later**

In [None]:
# One sub-frame per emotion label (English translation in the trailing comment).
emotion_column = dataset['Emocion']

dataset_Love = dataset.loc[emotion_column.eq('Amor/Admiración')]                         # Love/admiration
dataset_Gratitude = dataset.loc[emotion_column.eq('Gratitud')]                           # Gratitude
dataset_Empathy = dataset.loc[emotion_column.eq('Comprensión/Empatía/Identificación')]   # Comprehension/empathy/identification
dataset_Sad = dataset.loc[emotion_column.eq('Tristeza/Pena')]                            # Sadness
dataset_Anger = dataset.loc[emotion_column.eq('Enfado/Desprecio/Burla')]                 # Anger/contempt/mockery
dataset_Neutral = dataset.loc[emotion_column.eq('Indeterminado')]                        # Neutral


**Dataset visualisation**

In [None]:
# Stack the six per-emotion frames back into a single frame, grouped by class.
# Only rows belonging to one of these six frames end up in `dataset`.
class_frames = [
    dataset_Love,
    dataset_Gratitude,
    dataset_Empathy,
    dataset_Sad,
    dataset_Anger,
    dataset_Neutral,
]
dataset = pd.concat(class_frames, axis=0)
dataset

**Import libraries**

In [None]:
import re
import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# 3. Data processing

**Text pre-processing and tokenisation function**

In [None]:
import re

from nltk import TweetTokenizer
import spacy
from nltk.stem import SnowballStemmer

# Text pre-processing for Spanish social-network comments: cleaning, slang
# normalisation, stop-word removal and stemming (or lemmatisation).
# The constants below are built once instead of on every call of `processing`.

# Spanish stop words (the list deliberately contains no prepositions).
_PROCESSING_STOPWORDS = {'al', 'algo', 'algunas', 'algunos', 'antes', 'como', 'cual', 'cuando', 'del', 'donde', 'durante', 'e', 'el', 'ella', 'ellas', 'ellos', 'era', 'erais', 'eran', 'eras', 'eres', 'es', 'esa', 'esas', 'ese', 'eso', 'esos', 'esta', 'estaba', 'estabais', 'estaban', 'estabas',
                         'estad', 'estada', 'estadas', 'estado', 'estados', 'estamos', 'estando', 'estar', 'estaremos', 'estará', 'estarán', 'estarás', 'estaré', 'estaréis', 'estaría', 'estaríais', 'estaríamos', 'estarían', 'estarías', 'estas', 'este', 'estemos', 'esto', 'estos', 'estoy', 'estuve',
                         'estuviera', 'estuvierais', 'estuvieran', 'estuvieras', 'estuvieron', 'estuviese', 'estuvieseis', 'estuviesen', 'estuvieses', 'estuvimos', 'estuviste', 'estuvisteis', 'estuviéramos', 'estuviésemos', 'estuvo', 'está', 'estábamos', 'estáis', 'están', 'estás', 'esté', 'estéis',
                         'estén', 'estés', 'fue', 'fuera', 'fuerais', 'fueran', 'fueras', 'fueron', 'fuese', 'fueseis', 'fuesen', 'fueses', 'fui', 'fuimos', 'fuiste', 'fuisteis', 'fuéramos', 'fuésemos', 'ha', 'habida', 'habidas', 'habido', 'habidos', 'habiendo', 'habremos', 'habrá', 'habrán', 'habrás',
                         'habré', 'habréis', 'habría', 'habríais', 'habríamos', 'habrían', 'habrías', 'habéis', 'había', 'habíais', 'habíamos', 'habían', 'habías', 'han', 'has', 'hay', 'haya', 'hayamos', 'hayan', 'hayas', 'hayáis', 'he', 'hemos', 'hube', 'hubiera', 'hubierais', 'hubieran', 'hubieras',
                         'hubieron', 'hubiese', 'hubieseis', 'hubiesen', 'hubieses', 'hubimos', 'hubiste', 'hubisteis', 'hubiéramos', 'hubiésemos', 'hubo', 'la', 'las', 'le', 'les', 'lo', 'los', 'me', 'mi', 'mis', 'mucho', 'muchos', 'muy', 'más', 'mí', 'mía', 'mías', 'mío', 'míos', 'nada', 'nos',
                         'nosotras', 'nosotros', 'nuestra', 'nuestras', 'nuestro', 'nuestros', 'o', 'os', 'otra', 'otras', 'otro', 'otros', 'pero', 'poco', 'porque', 'que', 'quien', 'quienes', 'qué', 'se', 'sea', 'seamos', 'sean', 'seas', 'sentid', 'sentida', 'sentidas', 'sentido', 'sentidos', 'seremos',
                         'será', 'serán', 'serás', 'seré', 'seréis', 'sería', 'seríais', 'seríamos', 'serían', 'serías', 'seáis', 'siente', 'sintiendo', 'sois', 'somos', 'son', 'soy', 'su', 'sus', 'suya', 'suyas', 'suyo', 'suyos', 'sí', 'también', 'tanto', 'te', 'tendremos', 'tendrá', 'tendrán', 'tendrás',
                         'tendré', 'tendréis', 'tendría', 'tendríais', 'tendríamos', 'tendrían', 'tendrías', 'tened', 'tenemos', 'tenga', 'tengamos', 'tengan', 'tengas', 'tengo', 'tengáis', 'tenida', 'tenidas', 'tenido', 'tenidos', 'teniendo', 'tenéis', 'tenía', 'teníais', 'teníamos', 'tenían', 'tenías',
                         'ti', 'tiene', 'tienen', 'tienes', 'todo', 'todos', 'tu', 'tus', 'tuve', 'tuviera', 'tuvierais', 'tuvieran', 'tuvieras', 'tuvieron', 'tuviese', 'tuvieseis', 'tuviesen', 'tuvieses', 'tuvimos', 'tuviste', 'tuvisteis', 'tuviéramos', 'tuviésemos', 'tuvo', 'tuya', 'tuyas', 'tuyo', 'tuyos',
                         'tú', 'un', 'una', 'uno', 'unos', 'vosotras', 'vosotros', 'vuestra', 'vuestras', 'vuestro', 'vuestros', 'y', 'ya', 'yo', 'él', 'éramos'}

# Accented vowels mapped to their plain counterparts.
_PROCESSING_DIACRITICS = str.maketrans('áéíóúü', 'aeiouu')

# The text has its accents stripped before stop words are looked up, so the
# lookup set must be accent-free too ("más" has to match "mas").
_PROCESSING_STOPWORDS_PLAIN = {word.translate(_PROCESSING_DIACRITICS) for word in _PROCESSING_STOPWORDS}

# (regex, replacement) pairs for common chat abbreviations. English gloss of the
# replacements: de=of, que=that, pero=but, para=for, porque=because,
# es que=it is that, favor=favour, por favor=please, donde=where,
# también=also, te quiero=I love you, te quiero mucho=I love you very much,
# por=by/for, mas=more.
_PROCESSING_SLANG = [('d','de'), ('[qk]','que'), ('xo','pero'), ('xa', 'para'), ('[xp]q','porque'),('es[qk]', 'es que'),
                     ('fvr','favor'),('(xfa|xf|pf|plis|pls|porfa)', 'por favor'), ('dnd','donde'), ('tb', 'también'),
                     ('(tq|tk)', 'te quiero'), ('(tqm|tkm)', 'te quiero mucho'), ('x','por'), (r'\+','mas')]

# Matches any single ASCII punctuation character.
_PROCESSING_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

# Message tokeniser (the Twitter-oriented one from NLTK) and Spanish stemmer.
_PROCESSING_TWEET_TOKENIZE = TweetTokenizer().tokenize
_PROCESSING_STEMMER = SnowballStemmer('spanish')

# spaCy pipelines are loaded on first use only (lemmatisation is off by default).
_PROCESSING_SPACY_MODELS = {}

# Token pattern handed to nltk.regexp_tokenize (unchanged).
# NOTE(review): the bracketed emoticon alternatives such as [:d] or [:)] are
# regex character classes, so each one matches a single character rather than
# the two/three-character emoticon it appears to describe.
_PROCESSING_TOKEN_PATTERN = r'''(?x)                  # Set flag to allow verbose regexps
              (?:[A-Z]\.)+            # Abbreviations, e.g. U.S.A 
              | \w+(?:-\w+)*          # Words with optional internal hyphens
              | \$?\d+(?:\.\d+)?%?    # Currency and precentages, e.g. $12.40 82% 
              | \.\.\.                # Ellipsis 
              | [][.,;"'?():-_`]      # These are separate tokens; includes 
              | [😀\😁\😂\🤣\😃\😄\😅\😆\😉\😊\😋\😎\😍\😘\😗\😙\😚\☺\🙂\🤗\🤩\😌\😛\😜\😝\🤤\🤑\😇\🤭\😺\😸\😹\😻\😽\💪\✌\🖐\✋\👌\👍\👋\👏\🙌\🙏\💋\💘\❤\💓\💔\💕\💖\💗\💙\💚\💛\🧡\💜\🖤\💝\💞\💟\❣\💌\🍺\🍻\🎉\🎊\🙋\🕺\💃] # \:d\:)\:-)\:-d\;d\;-)\=d\;)\:]\:-]\=)\=]\(:\xd\:p\:-p\8)\xp\<3
              | [:d]
              | [:)]
              #| [:-)]
              | [:-d]
              | [;d]
              #| [;-)]
              | [=d]
              | [;)]
              | [:]]
              | [:-]]
              | [=)]
              | [=]]
              | [(:]
              | [xd]
              | [:p]
              | [:-p]
              | [8)]
              | [xp]
              | [<3]
              | [🤔\🤨\😐\😑\😶\🙄\😏\😮\🤐\😯\😒\😕\🙃\😲\😼\🤷] # \:-|\:|]
              | [:-|]
              | [:|]
              | [😣\😥\😪\😫\😓\😔\☹\🙁\😖\😞\😟\😤\😢\😭\😦\😧\😨\😩\🤯\😬\😰\😱\😳\😵\😡\😠\🤬\😷\🤒\🤕\🤢\🤮\🤧\💩\🙀\😿\😾\🖕\👎\⛔\🚫\🤦] # \:-(\:(\:-<\:<\:-[\:[\>:-[\>:[\:-{\:{\:-@\:@\>:-(\>:(\:-(\:(\d:\:\\:/\:-/\:-\\dx\d8
              #| [:-(]
              #| [:(]
              | [:-<]
              | [:<]
              | [:-[]
              | [:[]
              | [>:-[]
              | [>:[]
              | [:-{]
              | [:{]
              | [:-@]
              | [:@]
              #| [>:-(]
              | [>:(]
              | [:'-(]
              | [:'(]
              | [d:]
              | [:\]
              | [:/]
              #| [:-/]
              | [:-\]
              | [dx]
              | [d8]
              '''


def _processing_is_stopword(token):
  """Return True if `token`, lower-cased and accent-stripped, is a stop word."""
  return token.lower().translate(_PROCESSING_DIACRITICS) in _PROCESSING_STOPWORDS_PLAIN


def _processing_spacy_model(name='es_core_news_sm'):
  """Return the spaCy pipeline `name`, loading it the first time it is requested."""
  if name not in _PROCESSING_SPACY_MODELS:
    _PROCESSING_SPACY_MODELS[name] = spacy.load(name)
  return _PROCESSING_SPACY_MODELS[name]


def processing(text, stemming=True, lemmatization=False):
  """Clean one comment and return it as a space-separated string of tokens.

  Steps, in order: remove mentions, the "RT" marker, "#" signs and links;
  lower-case; remove digits; turn line breaks into spaces; strip accents;
  collapse characters repeated three or more times to two; remove laughter
  ("jajaja", "juas", "lol"); expand chat slang; stem or lemmatise; tokenise
  with the module's token pattern; strip punctuation; drop stop words.

  Args:
    text: the raw comment. Any value is accepted; it is converted with str().
    stemming: apply the Spanish Snowball stemmer (default, as before).
    lemmatization: lemmatise with the spaCy model 'es_core_news_sm' instead.
      When both flags are set, lemmatisation is used and stemming is skipped.

  Returns:
    The processed text, tokens joined by single spaces.
  """
  # Delete mentions (@user), the retweet marker, hash signs and links
  text = str(text)
  text = re.sub(r'@[A-Za-z0-9]+', ' ', text)
  text = re.sub(r'RT[|\s]', ' ', text)
  text = re.sub(r'#', ' ', text)
  text = re.sub(r'https?:\/\/\S+', ' ', text)

  # Convert to lower case
  text = text.lower()

  # Delete numbers. Line breaks become a space (not an empty string) so the last
  # word of one line is not glued to the first word of the next.
  text = re.sub(r'\d+', '', text)
  text = text.replace('\n', ' ')

  # Replace accented vowels with their plain form
  text = text.translate(_PROCESSING_DIACRITICS)

  # Collapse runs of three or more identical characters to two
  text = re.sub(r'(.)\1{2,}', r'\1\1', text)

  # Remove laughter
  text = re.sub(r'\b(?=\w*[j])[aeiouj]{4,}\b', ' ', text)
  text = re.sub(r'\b(juas+|lol)\b', ' ', text)

  # Translate slang (whole-word matches only)
  for slang_regex, replacement in _PROCESSING_SLANG:
    text = re.sub(r'\b{0}\b'.format(slang_regex), replacement, text)

  if lemmatization:
    # Replace the text by its lemmas (the old code appended them to the text).
    nlp = _processing_spacy_model()
    text = ' '.join(token.lemma_.lower().translate(_PROCESSING_DIACRITICS) for token in nlp(text))
  elif stemming:
    # Stop words are dropped BEFORE stemming: once stemmed ("cuando" -> "cuand")
    # they would no longer be found in the stop-word list.
    text = ' '.join(_PROCESSING_STEMMER.stem(token)
                    for token in _PROCESSING_TWEET_TOKENIZE(text)
                    if not _processing_is_stopword(token))

  words = nltk.regexp_tokenize(text, _PROCESSING_TOKEN_PATTERN)

  # Remove punctuation marks from every token
  stripped = (_PROCESSING_PUNCTUATION.sub('', word) for word in words)

  # Drop tokens that ended up empty/blank and any remaining stop words
  kept = [word for word in stripped if word.strip() and not _processing_is_stopword(word)]

  return " ".join(kept)

**The `processing` function is applied to each Instagram comment in the corpus: the text is cleaned and stemmed**

In [None]:
# Run every entry of the "Id" column through `processing`, lower-case the
# result and store it back in the same column.
processed_comments = dataset['Id'].apply(processing)
dataset['Id'] = processed_comments.str.lower()
dataset["Id"]


**Create a WordCloud**

In [None]:
#wordCloud
from wordcloud import WordCloud

# Per-emotion subsets of the processed dataset.
# NOTE: the suffixes are historical and do not describe the contents; the
# label each subset holds is given in the trailing comment.
dataset_A = dataset[dataset['Emocion'] == 'Amor/Admiración']                        # Love/admiration
dataset_D = dataset[dataset['Emocion'] == 'Gratitud']                               # Gratitude
dataset_F = dataset[dataset['Emocion'] == 'Comprensión/Empatía/Identificación']     # Comprehension/empathy/identification
dataset_J = dataset[dataset['Emocion'] == 'Tristeza/Pena']                          # Sadness
dataset_SAD = dataset[dataset['Emocion'] == 'Enfado/Desprecio/Burla']               # Anger/contempt/mockery
dataset_OTHERS = dataset[dataset['Emocion'] == 'Indeterminado']                     # Neutral

#Label Emocion is Emotion

# Text for the cloud: the first 70 processed comments of the sadness subset,
# joined with spaces. (str() on the array would feed the array's printed
# representation — brackets, quotes and line wraps — to WordCloud.)
tweets = " ".join(dataset_J["Id"].head(70).astype(str))
print(len(tweets))  # number of characters in the joined text

stop_words_sp = set(stopwords.words('spanish'))
wordcloudimage = WordCloud(
                          max_words=50,
                          max_font_size=500,
                          font_step=2,
                          stopwords=stop_words_sp,
                          background_color='white',
                          width=1000,
                          height=720
                          ).generate(tweets)

plt.figure(figsize=(15,7))
plt.imshow(wordcloudimage)
plt.axis("off")
plt.show()

**Vocabulary building with 6 classes (emotions)**

In [None]:
#Label Amor/Admiración is Love/admiration
#Label Gratitud is Gratitude
#Label Tristeza/Pena is Sadness
#Label Enfado/Desprecio/Burla is Anger/contempt/mockery
#Label Comprensión/Empatía/Identificación is Comprehension/empathy/identification
#Label Indeterminado is Neutral

# vocab maps each word to a list of per-class occurrence counts, one slot per
# entry of `labels` (in that order).
vocab={}
labels=['Amor/Admiración','Gratitud', 'Comprensión/Empatía/Identificación', 'Tristeza/Pena', 'Enfado/Desprecio/Burla','Indeterminado']
numero_classes=len(labels)
_tokenizer = TweetTokenizer().tokenize

# Slot of each emotion in the count list. Any value that is not one of the
# listed labels is counted in the last slot, together with 'Indeterminado'.
position_by_emotion = {label: slot for slot, label in enumerate(labels)}
fallback_position = numero_classes - 1

for comment, emotion in zip(dataset["Id"], dataset["Emocion"]):
  position = position_by_emotion.get(emotion, fallback_position)
  for word in _tokenizer(comment):
    if word not in vocab:
      vocab[word] = [0] * numero_classes  # one counter per class
    vocab[word][position] += 1

print(vocab)

**Entropy and information gain formulas for selecting the optimal vocabulary for the dataset**

In [None]:
def entropy(probs, adjust=1e-15):
  """Return the sum of p * log2(p) over the strictly positive entries of `probs`.

  Note the sign: there is no leading minus, so the result is the NEGATIVE of
  the Shannon entropy (it is <= 0 for a probability distribution). `IG` and
  the ascending sort of the IG values elsewhere in this notebook rely on this
  convention, so it is kept as is.

  Args:
    probs: iterable of probabilities; zero or negative entries are skipped.
    adjust: small constant added to every used probability.

  Returns:
    The accumulated sum, or 0 when no entry is positive.
  """
  # np.math was removed in NumPy 2.0; it was only an alias of the stdlib module.
  import math

  total = 0
  for prob in probs:
    if prob > 0:
      total += (prob + adjust) * math.log(prob + adjust, 2)

  return total


def IG(corpus_probs, word_weigths, word_probs):
  """Combine the class entropy with the weighted per-case entropies of a word.

  Computes entropy(corpus_probs) - sum_i word_weigths[i] * entropy(word_probs[i]).
  Because `entropy` in this notebook returns sum(p * log2(p)) without a leading
  minus sign, the value returned here is the negative of the textbook
  information gain: more informative words get more negative values.

  Args:
    corpus_probs: class probabilities over the whole corpus.
    word_weigths: one weight per case (e.g. word present / word absent).
    word_probs: for each case, the class probabilities given that case.

  Returns:
    The difference described above.
  """
  corpus_term = entropy(corpus_probs)
  weighted_terms = [weight * entropy(word_probs[case])
                    for case, weight in enumerate(word_weigths)]
  return corpus_term - sum(weighted_terms)

**Calculation of the IG of each word in the corpus for the 6 classes that are emotions**

In [None]:
import operator

# Number of comments per emotion class
tweets_Love = len(dataset_Love)
tweets_Gratitude = len(dataset_Gratitude)
tweets_Empathy = len(dataset_Empathy)
tweets_Sad = len(dataset_Sad)
tweets_Anger = len(dataset_Anger)
tweets_Neutral = len(dataset_Neutral)

# Same class order as the per-word count lists stored in `vocab`
class_counts = [tweets_Love, tweets_Gratitude, tweets_Empathy, tweets_Sad, tweets_Anger,
                tweets_Neutral]
tweets_total = sum(class_counts)
class_probs = np.array(class_counts) / tweets_total

vocab_entropy = {}
for word, counts_present in vocab.items():

    # Per-class "word absent" counts. `vocab` stores occurrence counts, which can
    # exceed the number of comments of a class when a word is repeated inside a
    # comment; clamp at 0 so no negative count (or probability) is produced.
    counts_absent = [max(n_class - n_word, 0)
                     for n_class, n_word in zip(class_counts, counts_present)]

    # Totals for "word present" / "word absent". wc1 is at least 1 because every
    # word in `vocab` was counted at least once.
    wc1 = sum(counts_present)
    wc0 = sum(counts_absent)

    # Class distribution given that the word is present / absent.
    probs_1 = [n / wc1 for n in counts_present]
    # When the word is never absent, use all-zero probabilities instead of dividing by 0.
    probs_0 = [n / wc0 for n in counts_absent] if wc0 else [0.0] * len(counts_absent)

    # Weights of the two cases. Whenever no clamping was needed, wc1 + wc0 equals
    # tweets_total and these are the same values as before.
    p_word = wc1 / (wc1 + wc0)
    p_abs_word = wc0 / (wc1 + wc0)

    # Score of the word (see IG for the sign convention)
    vocab_entropy[word] = IG(class_probs, [p_word, p_abs_word], [probs_1, probs_0])


# IG() returns the NEGATED information gain (entropy() has no leading minus), so
# sorting in ascending order puts the most informative words first.
vocab_entropy_ord = dict(sorted(vocab_entropy.items(), key=operator.itemgetter(1)))

print(vocab_entropy_ord)

# Words only, in ranking order
wordsVocab = list(vocab_entropy_ord)


**One-hot encoding is applied to the `Emocion` column. This technique encodes a categorical feature as a set of binary indicator columns, one per category. Here it is done with `pd.get_dummies`, which returns a regular (dense) DataFrame with one column per emotion; these columns replace the original `Emocion` column in the dataset.**

In [None]:
#Label Emocion is Emotion

# One indicator column per emotion label.
one_hot = pd.get_dummies(dataset["Emocion"])

# Swap the original label column for the indicator columns, which are appended
# at the right-hand side of the frame.
dataset = pd.concat([dataset.drop(columns=['Emocion']), one_hot], axis=1)
dataset

# 4. Training and Test Data

**The dataset is divided in two, 30% for testing and 70% for training**

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Features: the processed text stored in "Id".
# Targets: the last six columns of the dataset (the one-hot emotion indicators).
feature_column = 'Id'
n_emotion_columns = 6
X = dataset[feature_column].values
y = dataset.iloc[:, -n_emotion_columns:].values

# 70 % training / 30 % test, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

In [None]:
# Column-wise sum of the one-hot targets = number of training samples per class.
num_ones = np.sum(y_train, axis=0)
print("The following number of samples is taken from each class:")
# Header built from the dataset's trailing column names (the one-hot columns
# that `y` was sliced from), so the names line up with the counts printed below.
print(", ".join(str(name) for name in dataset.columns[-len(num_ones):]))
print(num_ones)

**Creation of the Corpus vocabulary**


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences 
from numpy import array
from numpy import asarray
from numpy import zeros

# Number of top-ranked vocabulary words used to build the dictionary.
top_words = 1650

# oov_token is a special token used for words that are not in the dictionary
tokenizer = Tokenizer(oov_token='<OOV>')

# The dictionary is created from the `top_words` best-ranked words of
# `wordsVocab` (the list built by the IG ranking cell).
tokenizer.fit_on_texts(wordsVocab[:top_words])

# Alternative: build the dictionary from every word of the training texts
# instead of the best-ranked ones (tolist() is needed because X_train is an array).
#tokenizer.fit_on_texts(X_train.tolist())


# Replace every text by the sequence of integer ids of its words
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
maxlen = 33

# Zeros are appended ('post' padding) so that all sequences have length `maxlen`
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)


In [None]:
# Show the padded integer sequences of the test set.
print(X_test)


# 5. Classification Model


### Adaptation to RF models

In [None]:
import numpy as np

## This block moves y_train and y_test from a 6-column one-hot form to a 1-column label form.

#Label Amor/Admiración is Love/admiration
#Label Gratitud is Gratitude
#Label Tristeza/Pena is Sadness
#Label Enfado/Desprecio/Burla is Anger/contempt/mockery
#Label Comprensión/Empatía/Identificación is Comprehension/empathy/identification
#Label Indeterminado is Neutral

# Label assigned to each one-hot column position (hard-coded, as before).
# NOTE(review): this must be the column order of the one-hot targets; it looks
# like the sorted order that pd.get_dummies produces — confirm if labels change.
_ONE_HOT_CLASS_NAMES = [
    'Amor/Admiración',
    'Comprensión/Empatía/Identificación',
    'Enfado/Desprecio/Burla',
    'Gratitud',
    'Indeterminado',
    'Tristeza/Pena',
]


def _one_hot_to_labels(one_hot_rows, class_names=_ONE_HOT_CLASS_NAMES):
    """Turn a 2-D one-hot array into a 1-D object array of class names.

    For every row, the name of the first column whose value equals 1 is used.
    Rows without any 1 are left as None.

    Args:
        one_hot_rows: array-like of shape (n_samples, n_columns).
        class_names: name for each column position; columns beyond
            len(class_names) are ignored.

    Returns:
        numpy object array of length n_samples.
    """
    names = np.asarray(class_names, dtype=object)
    hits = np.asarray(one_hot_rows)[:, :len(names)] == 1
    decoded = np.empty(hits.shape[0], dtype=object)  # object arrays start out as None
    has_label = hits.any(axis=1)
    # argmax on booleans returns the first True of each row
    decoded[has_label] = names[hits.argmax(axis=1)[has_label]]
    return decoded


# Test labels
y_test_def = _one_hot_to_labels(y_test)
y_test_def.shape

# Training labels
y_train_def = _one_hot_to_labels(y_train)

# I check the shape of y_train_def
y_train_def.shape



### RANDOM_FOREST algorithm

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

# Hyperparameters of the forest (n_estimators and max_features can be tuned).
# A fixed random_state makes the results repeatable between runs.
rf_params = dict(max_features="sqrt", n_estimators=510, random_state=47)

# Fixed label order so every fold returns per-class scores of the same length,
# even if a class happens to be missing from a validation fold.
class_labels = np.unique(y_train_def)

precision_per_class = []
recall_per_class = []
f1_per_class = []
kf = KFold(n_splits=5, shuffle=True, random_state=47)

# 5-fold cross-validation on the TRAINING data only: each fold trains on four
# fifths of X_train and is scored on the remaining fifth. (Previously the fold
# indices were ignored, so every iteration refitted on all of X_train and was
# scored on X_test.)
for train, test in kf.split(X_train):
    fold_model = RandomForestClassifier(**rf_params)
    fold_model.fit(X_train[train], y_train_def[train])

    fold_true = y_train_def[test]
    fold_pred = fold_model.predict(X_train[test])

    precision = precision_score(fold_true, fold_pred, labels=class_labels, average=None, zero_division=0)
    recall = recall_score(fold_true, fold_pred, labels=class_labels, average=None, zero_division=0)
    f1 = f1_score(fold_true, fold_pred, labels=class_labels, average=None, zero_division=0)

    precision_per_class.append(precision)
    recall_per_class.append(recall)
    f1_per_class.append(f1)

# Per-class scores averaged over the folds
precision_avg = np.mean(precision_per_class, axis=0)
recall_avg = np.mean(recall_per_class, axis=0)
f1_avg = np.mean(f1_per_class, axis=0)

print("Classes:", class_labels)
print("Precision:", precision_avg)
print("Recall:", recall_avg)
print("F1:", f1_avg)

# Final model: trained on the whole training set and evaluated once on the
# held-out test set. `model1` and `y_pred_rf` are used by the following cells.
model1 = RandomForestClassifier(**rf_params)
model1.fit(X_train, y_train_def)
y_pred_rf = model1.predict(X_test)

# Obtain and print the classification report
classification_rep = classification_report(y_test_def, y_pred_rf)
print(classification_rep)

# Calculate the accuracy and print it with two decimals
accuracy = accuracy_score(y_test_def, y_pred_rf)
print("Accuracy: {:.2f}".format(accuracy))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Class labels: every label present in the true OR the predicted values, and
# the same list is passed to confusion_matrix so rows/columns and tick labels
# are guaranteed to line up.
labels = np.unique(np.concatenate([y_test_def, y_pred_rf]))

# Calculate the confusion matrix (rows: true class, columns: predicted class)
confusion = confusion_matrix(y_test_def, y_pred_rf, labels=labels)

# Create figure and axes
fig, ax = plt.subplots()
sns.heatmap(confusion, annot=True, cmap='Blues', fmt='d', xticklabels=labels, yticklabels=labels, ax=ax)

# Configure labels and chart title
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')

# Show the confusion matrix
plt.show()

# Per-class value: correctly predicted samples of the class divided by the
# number of true samples of the class (i.e. the class recall). A class without
# true samples gets NaN instead of triggering a division by zero.
row_totals = confusion.sum(axis=1)
accuracy_per_class = np.divide(
    np.diag(confusion),
    row_totals,
    out=np.full(len(labels), np.nan),
    where=row_totals > 0,
)
for i, label in enumerate(labels):
    print(f"Accuracy for each class {label}: {accuracy_per_class[i]}")

# Calculate the global accuracy
total_accuracy = np.trace(confusion) / confusion.sum()
print(f"Global Accuracy: {total_accuracy}")


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Text report with per-class precision, recall and F1 on the test set,
# plus the overall accuracy.
classification_rep = classification_report(y_test_def, y_pred_rf)
accuracy = accuracy_score(y_test_def, y_pred_rf)

# Print the report first, then the accuracy with two decimals
print(classification_rep)
print("Accuracy: {:.2f}".format(accuracy))
