# Análise de Sentimento em Textos

#### Importações 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#### Funções

In [1]:
def ps():
    print('\n\n' + '#'*80+ '\n\n')

#### Coletando o conjunto de dados

In [3]:
path = "sentimentos_tweet.csv"
df = pd.read_csv(path, header=None, encoding='ISO-8859-1', names=['target','id','date','flag', 'user', 'text'])

In [4]:
df = df.sample(n=200000)
df = df.drop(["id", "date", "flag", "user"], axis=1)

In [5]:
print(df.shape)
display(df.head())
ps()
print(df.info())
ps()
print('Quantidade de valores atribuídos aos sentimentos:\n',df['target'].value_counts())

(200000, 2)


Unnamed: 0,target,text
1313205,4,"Band recommendation, I Don't Want To Die In Te..."
1028882,4,Aaaand only SEVEN days until Origin 1. HECK YES!
903079,4,FINALLY FINISHED MY FUCKING FILM FINAL!!! (We...
656881,0,tired!! I want to sleep more but I have to wor...
255348,0,Wishing I had money to take my girlfriend out ...




##############################################################


<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 1313205 to 1288408
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   target  200000 non-null  int64 
 1   text    200000 non-null  object
dtypes: int64(1), object(1)
memory usage: 4.6+ MB
None


##############################################################


Quantidade de valores atribuídos aos sentimentos:
 0    100212
4     99788
Name: target, dtype: int64


In [6]:
#nltk.download('stopwords') # Download das palavras a serem removidas ("a", "o", "em", "para")
#nltk.download('punkt') # Download dis modelos treinados para tokenização de textos

#### Preparar os dados

In [7]:
def preprocess_text(text):
    # Remover caracteres especiais e pontuações
    text = re.sub(r'\W', ' ', text)
    
    # Converter o texto para minúsculas
    text = text.lower()
    
    # Tokenização do texto (cria um array contendo cada palavra separadamente)
    words = nltk.word_tokenize(text)
    
    # Remover stopwords
    stop_words = set(stopwords.words('english')) # Carrega o conjunto de palavras consideradas stopwords em inglês
    words = [word for word in words if word not in stop_words] # Remove o stopwords da lista
    
    # Stemming - Obter a palavra na forma raiz
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    # Reunir as palavras novamente em um texto
    processed_text = ' '.join(words)
    
    return processed_text

# Aplicar a fuunção de pré-processamento aos textos
df['processed_text'] = df['text'].apply(preprocess_text)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Criar matriz de frequncia de palavras (5000 palavras)
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_text']).toarray()
y = df['target']

#### Treinar o modelo

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [13]:
y_pred = model.predict(X_test)

In [14]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Acurácia: {accuracy*100:.2f}%')
print('Matriz de confusão')
print(conf_matrix)

Acurácia: 76.28%
Matriz de confusão
[[14796  5352]
 [ 4136 15716]]


In [20]:
new_text = "I love u"
new_text_processed = preprocess_text(new_text)
new_text_processed = vectorizer.transform([new_text]).toarray()
prediction = model.predict(new_text_processed)
if prediction == 4:
    print(f'Seu comentário ({new_text}) é positivo! :D')
elif prediction == 0:
    print(f'Seu comentário ({new_text}) é negativo! :(')
else:
    print('Predição fora do padrão')

Seu comentário (I love u) é positivo! :D
