## Criando dataset


In [3]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Carrega o léxico em um DataFrame
lexico = pd.read_csv('lexico_czar.csv')

# Remove stopwords e pontuações
stop_words = set(stopwords.words('portuguese'))
lexico['text'] = lexico['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
lexico['text'] = lexico['text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]', '', x))

# Mapeia as emoções do léxico para os rótulos utilizados no modelo
emotions_mapping = {'Felicidade': 'happy', 'Tristeza': 'sad', 'Raiva': 'angry', 'Neutralidade': 'neutral', 'Sarcasmo': 'sarcasm'}
lexico['emotion'] = lexico['emotion'].map(emotions_mapping)

# Define os dados de entrada e saída
X = lexico['text'].values
Y = pd.get_dummies(lexico['emotion']).values

# Divide os dados em conjunto de treino e teste
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

# Define o tokenizer e o tamanho máximo das sequências
tokenizer = Tokenizer(num_words=5000, oov_token=True)
tokenizer.fit_on_texts(X_train)
max_length = len(max(X_train, key=len).split())

# Transforma os textos em sequências de tokens e preenche as sequências com zeros (padding)
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_length, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_length, padding='post')

# Define o modelo da rede neural
model = Sequential()
model.add(Embedding(5000, 128, input_length=max_length))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(128, dropout=0.4, recurrent_dropout=0.4))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Treina o modelo com o conjunto de treino e valida com o conjunto de teste
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
history = model.fit(X_train, Y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Avalia o modelo com o conjunto de teste
score = model.evaluate(X_test, Y_test, verbose=1)
print("Test Score:", score[0])
print("Test Accuracy:", score[1])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Score: 0.6367136240005493
Test Accuracy: 0.7727272510528564
