In [None]:
# Library imports (data handling, text preprocessing, modelling, evaluation)

import pandas as pd
import numpy as np
np.random.seed(0)

from textblob import TextBlob


import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


import nltk
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from collections import Counter

In [None]:
# Neither read passes header=, so with names= pandas treats the first row of
# each file as data and labels the columns with the list below.
column_names = ["Tweet_ID", "Entity", "Sentiment", "Tweet_Content"]

path = "data/twitter_training.csv"
test_path = "data/twitter_validation.csv"

train_df = pd.read_csv(path, names=column_names)
test_df = pd.read_csv(test_path, names=column_names)

In [None]:
# Discard every row containing at least one missing value, in both data sets.
train_df, test_df = (frame.dropna() for frame in (train_df, test_df))

In [None]:
# (rows, columns) of the training data after rows with missing values were dropped.
train_df.shape

In [None]:
# Work on a 10% random subsample of the training data.
# random_state pins the draw: without it the sample is taken from the global
# NumPy generator, so re-running this cell alone would return different rows.
# reset_index(drop=True) gives the subsample a clean 0..n-1 index.
df = train_df.sample(frac=0.1, random_state=0).reset_index(drop=True)

In [None]:
# Column dtypes and non-null counts of the subsample.
df.info()

In [None]:
# Preview the first rows of the subsample.
df.head()

In [None]:
# Check the balance of the target class: fraction of rows per sentiment label,
# with the bars in the order the labels first appear in the data.
sentiments = df["Sentiment"].unique().tolist()
sentiment_nums = [(df["Sentiment"] == label).sum() / len(df) for label in sentiments]
plt.bar(sentiments, sentiment_nums)

In [None]:
# Index assigned to each possible sentiment label.
class_to_index = {"Neutral": 0, "Irrelevant": 1, "Negative": 2, "Positive": 3}

# Inverse mapping, from index back to label.
index_to_class = {index: name for name, index in class_to_index.items()}


def names_to_ids(n):
    """Convert an iterable of sentiment labels into a NumPy array of class indices.

    Args:
        n: iterable of label strings (list, array or pandas Series).

    Returns:
        np.ndarray with the index of each label, in input order.

    Raises:
        KeyError: if a value is not a key of ``class_to_index``. A strict lookup
            is used so an unexpected label (or a column that was already
            converted) fails here instead of silently becoming ``None``.
    """
    return np.array([class_to_index[x] for x in n])


def ids_to_names(n):
    """Convert an iterable of class indices back into a NumPy array of labels.

    Args:
        n: iterable of integer class indices.

    Returns:
        np.ndarray with the label of each index, in input order.

    Raises:
        KeyError: if a value is not a key of ``index_to_class``.
    """
    return np.array([index_to_class[x] for x in n])

In [None]:
# Sanity check: labels -> indices, then indices -> labels.
for converted in (
    names_to_ids(["Positive", "Negative", "Irrelevant", "Neutral"]),
    ids_to_names([0, 1, 2, 3]),
):
    print(converted)

In [None]:
# Convert the "Sentiment" labels to their class indices.
# The column is overwritten, so a second run of this cell would pass indices
# (not labels) to names_to_ids.
sentiment_ids = names_to_ids(df["Sentiment"])
df["Sentiment"] = sentiment_ids

In [None]:
# Preview the subsample; "Sentiment" now holds the integer class indices.
df.head()

In [None]:
def remove_stopwords(ls, stop_words=None, lemmatizer=None):
    """Drop stop words and non-alphabetic tokens, lemmatise the rest and re-join them.

    Args:
        ls: iterable of string tokens (one tokenised tweet).
        stop_words: container of lower-case stop words supporting ``in``.
            Defaults to the notebook-level ``stop_english``.
        lemmatizer: object exposing ``lemmatize(word)``. Defaults to the
            notebook-level ``lemmatiser``.

    Returns:
        A single string with the kept, lemmatised tokens separated by spaces
        (an empty string when nothing is kept).
    """
    if stop_words is None:
        stop_words = stop_english
    if lemmatizer is None:
        lemmatizer = lemmatiser

    # NLTK stop-word lists are lower-case, so the token is lower-cased for the
    # membership test only; otherwise capitalised stop words ("The", "I") would
    # slip through. The token itself is lemmatised unchanged.
    kept = [
        lemmatizer.lemmatize(word)
        for word in ls
        if word.isalpha() and word.lower() not in stop_words
    ]

    # Join the surviving tokens into one string.
    return " ".join(kept)

# Split each tweet into a list of word tokens.
df["Tweet_Content_Split"] = df["Tweet_Content"].apply(word_tokenize)

# Shared resources read by remove_stopwords.
lemmatiser = WordNetLemmatizer()
# stopwords.words() without an argument returns the lists of every language in
# the corpus; the name stop_english shows only English was intended. A set is
# sufficient for the membership tests done in remove_stopwords.
stop_english = set(stopwords.words("english"))

# Filter, lemmatise and re-join the tokens of every tweet.
df["Tweet_Content_Split"] = df["Tweet_Content_Split"].apply(remove_stopwords)

In [None]:
# Preview the subsample including the new "Tweet_Content_Split" column.
df.head()

In [None]:
# Learn the word index from the cleaned tweets. Text is lower-cased and
# num_words caps how many of the most frequent words are used on conversion.
tokeniser = Tokenizer(lower=True, num_words=10_000)
tokeniser.fit_on_texts(df["Tweet_Content_Split"])

In [None]:
# Encode every cleaned tweet as a list of integer word indices.
cleaned_tweets = df["Tweet_Content_Split"].tolist()
tweet_tokens = tokeniser.texts_to_sequences(cleaned_tweets)

In [None]:
# Bring every sequence to exactly 50 tokens: shorter ones are padded at the
# end, longer ones are cut at the end.
tweet_tokens = pad_sequences(
    tweet_tokens,
    maxlen=50,
    padding="post",
    truncating="post",
)

In [None]:
# Shape of the padded token array.
tweet_tokens.shape

In [None]:
# Keep the label column before `df` is rebound below.
y = df["Sentiment"]

# From here on `df` is the padded token array wrapped in a DataFrame,
# no longer the table of tweets.
df = pd.DataFrame(tweet_tokens)

In [None]:
# Shape of the feature table built from the padded tokens.
df.shape

In [None]:
# Hold out 20% of the subsample for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    random_state=1,
    test_size=0.2,
)

In [None]:
# Classifier: embedding -> two stacked bidirectional LSTMs -> 4-way softmax.
# Embedding(10000, 16, input_length=50) matches the tokeniser's num_words and
# the padded sequence length used earlier in the notebook.
# NOTE(review): recent Keras releases deprecate `input_length` - confirm
# against the installed version.
keras_layers = tf.keras.layers
model = tf.keras.models.Sequential([
    keras_layers.Embedding(10000, 16, input_length=50),
    keras_layers.Bidirectional(keras_layers.LSTM(20, return_sequences=True)),
    keras_layers.Bidirectional(keras_layers.LSTM(20)),
    keras_layers.Dense(4, activation="softmax"),
])

# Integer labels -> sparse categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train on the 80% split and evaluate on the held-out 20% after every epoch.
# Early stopping watches the validation accuracy: monitoring the training
# metric ('accuracy') ignores the validation data passed here and typically
# keeps improving, so the callback would rarely stop the run.
h = model.fit(
     X_train, y_train,
     validation_data=(X_test, y_test),
     epochs=15,
     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)]
)

In [None]:
# Predicted class = position of the largest output per row.
class_scores = model.predict(X_test)
y_pred = np.argmax(class_scores, axis=1)

# Back to label names for the confusion matrix.
y_pred_labels, y_test_labels = ids_to_names(y_pred), ids_to_names(y_test)

In [None]:
# Labels present in the held-out data, sorted so the rows/columns of the
# matrix appear in the same order on every run (plain set iteration order of
# strings can change between interpreter sessions).
y_unique = sorted(set(y_test_labels))

# normalize='true' scales each row by the number of true samples of that class.
cm = confusion_matrix(y_test_labels, y_pred_labels, labels = y_unique, normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=y_unique)
disp.plot()

In [None]:
# Switch to the full training set. A copy is taken because later cells assign
# columns on `df` in place; with a plain alias those writes would also change
# train_df and the cells could not be re-run from here.
df = train_df.copy()

In [None]:
# Encode the sentiment labels of the full training set and of the validation
# file as class indices (both columns are overwritten in place).
df["Sentiment"] = names_to_ids(df["Sentiment"])
test_df["Sentiment"] = names_to_ids(test_df["Sentiment"])

# Targets for training and for evaluation.
y, y_test = df["Sentiment"], test_df["Sentiment"]

In [None]:
# Shared resources read by remove_stopwords.
lemmatiser = WordNetLemmatizer()
# stopwords.words() without an argument returns the lists of every language in
# the corpus; the name stop_english shows only English was intended. A set is
# sufficient for the membership tests done in remove_stopwords.
stop_english = set(stopwords.words("english"))

# Same cleaning for both data sets: tokenise each tweet, then filter,
# lemmatise and re-join its tokens.
df["Tweet_Content_Split"] = df["Tweet_Content"].apply(word_tokenize)
df["Tweet_Content_Split"] = df["Tweet_Content_Split"].apply(remove_stopwords)

test_df["Tweet_Content_Split"] = test_df["Tweet_Content"].apply(word_tokenize)
test_df["Tweet_Content_Split"] = test_df["Tweet_Content_Split"].apply(remove_stopwords)

In [None]:
# Fit the word index on the training tweets only.
tokeniser = Tokenizer(num_words=10000, lower=True)

tokeniser.fit_on_texts(df["Tweet_Content_Split"])

# Training tweets -> integer sequences padded/truncated to 50 tokens.
tweet_tokens = tokeniser.texts_to_sequences(list(df["Tweet_Content_Split"]))
tweet_tokens = pad_sequences(tweet_tokens, truncating = 'post', padding='post', maxlen=50)

# The test tweets must receive the same encoding as the training tweets.
# texts_to_matrix returns one bag-of-words row per text rather than a sequence
# of word indices, so padding it to 50 columns fed the model unrelated input.
tweet_tokens_test = tokeniser.texts_to_sequences(list(test_df["Tweet_Content_Split"]))
tweet_tokens_test = pad_sequences(tweet_tokens_test, truncating = 'post', padding='post', maxlen=50)

In [None]:
# Wrap the padded token arrays of both data sets in DataFrames.
full_df, full_test_df = pd.DataFrame(tweet_tokens), pd.DataFrame(tweet_tokens_test)

In [None]:
# Fresh classifier for the full data set: embedding -> two stacked
# bidirectional LSTMs -> 4-way softmax.
# NOTE(review): recent Keras releases deprecate `input_length` - confirm
# against the installed version.
keras_layers = tf.keras.layers
model = tf.keras.models.Sequential([
    keras_layers.Embedding(10000, 16, input_length=50),
    keras_layers.Bidirectional(keras_layers.LSTM(20, return_sequences=True)),
    keras_layers.Bidirectional(keras_layers.LSTM(20)),
    keras_layers.Dense(4, activation="softmax"),
])

# Integer labels -> sparse categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train on the full training set and evaluate on the validation file after
# every epoch. Early stopping watches the validation accuracy: monitoring the
# training metric ('accuracy') ignores the validation data passed here and
# typically keeps improving, so the callback would rarely stop the run.
h = model.fit(
     full_df, y,
     validation_data=(full_test_df, y_test),
     epochs=30,
     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)]
)

In [None]:
# Predicted class = position of the largest output per row.
class_scores = model.predict(full_test_df)
y_pred = np.argmax(class_scores, axis=1)

# Back to label names for the confusion matrix.
y_pred_labels, y_test_labels = ids_to_names(y_pred), ids_to_names(y_test)

In [None]:
# Labels present in the test data, sorted so the rows/columns of the matrix
# appear in the same order on every run (plain set iteration order of strings
# can change between interpreter sessions).
y_unique = sorted(set(y_test_labels))

# normalize='true' scales each row by the number of true samples of that class.
cm = confusion_matrix(y_test_labels, y_pred_labels, labels = y_unique, normalize='true')

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=y_unique)
disp.plot()

In [None]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1f3cac89fa0>

In [None]:
# Fraction of test tweets whose predicted class index equals the true one.
accuracy_score(y_test, y_pred)