In [1]:
# Imports
import re
import pickle
import nltk
import sklearn
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [4]:
df = pd.read_csv('/content/exerccicio_aula02.csv')

In [5]:
df.shape

(5000, 2)

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,"A decent movie with some good moments, but ove...",positive
1,Loved the humor and the chemistry between the ...,negative
2,This was the worst movie I have ever seen. I r...,neutral
3,"The plot had potential, but the execution was ...",positive
4,This was the worst movie I have ever seen. I r...,negative


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     5000 non-null   object
 1   sentiment  5000 non-null   object
dtypes: object(2)
memory usage: 78.3+ KB


In [8]:
df.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
negative,2037
positive,1984
neutral,979


In [9]:
#ajusta labels para representação numerica
df.sentiment.replace('positive', 1, inplace = True)
df.sentiment.replace('neutral', 0, inplace = True)
df.sentiment.replace('negative', 0, inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.sentiment.replace('positive', 1, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.sentiment.replace('neutral', 0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setti

In [10]:
#funcao de limpeza geral de dados
def limpa_dados(texto):
	cleaned = re.compile(r'<.*?')
	return re.sub(cleaned, '', texto)

In [11]:
#testar funcao
texto_com_tags = "<p>Este é um exemplo <b>com</b> tags HTML.</p>"
texto_limpo = limpa_dados(texto_com_tags)
print(texto_limpo)

p>Este é um exemplo b>com/b> tags HTML./p>


In [12]:
#aplica funcao ao nosso dataset
df.review = df.review.apply(limpa_dados)

In [13]:
#retira caracteres alfanumerico
def limpa_caracter_especial(texto):
    rem = ''
    for i in texto:
        if i.isalnum():
            rem = rem + i
        else:
            rem = rem + ' '
    return rem

In [14]:
#testando a funcao
texto_com_caracteres_especiais = "olá, mundo! Como vai?"
texto_limpo = limpa_caracter_especial(texto_com_caracteres_especiais)
print(texto_limpo)

olá  mundo  Como vai 


In [15]:
df.review = df.review .apply(limpa_caracter_especial)

In [16]:
def converte_minusculo(texto):
    return texto.lower()
frase = "Esta é uma frasE com LETRAS MaiúscuLAs"
frase_saida = converte_minusculo(frase)
print(frase_saida)

esta é uma frase com letras maiúsculas


In [17]:
df.review = df.review.apply(converte_minusculo)

In [18]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [19]:
def remove_stopwords(texto):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(str(texto))
    return [w for w in words if w not in stop_words]

In [20]:
frase = 'They are right, as this is exactly what happedned with me.'
frase_saida = remove_stopwords(frase)

print(frase_saida)

['They', 'right', ',', 'exactly', 'happedned', '.']


In [21]:
df.review = df.review.apply(remove_stopwords)

In [22]:
def stemmer(texto):
    objeto_stemmer = SnowballStemmer('english')
    return " ".join([objeto_stemmer.stem(w) for w in texto])

In [23]:
df.review = df.review.apply(stemmer)

In [24]:
pd.set_option('display.max_colwidth', 120)

In [25]:
x = np.array(df.iloc[:,0].values)

In [26]:
y=np.array(df.sentiment.values)

In [27]:
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.2, random_state=0)

In [28]:
vetorizador = CountVectorizer(max_features=1000)

In [29]:
x_treino_final = vetorizador.fit_transform(x_treino).toarray()

In [30]:
x_teste_final = vetorizador.transform(x_teste).toarray()

In [31]:
print("x_treino_final:", x_treino_final.shape)
print("y_treino:", y_treino.shape)

x_treino_final: (4000, 66)
y_treino: (4000,)


In [32]:
print(x_treino_final)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 1]]


In [33]:
print("x_teste_final:", x_teste_final.shape)
print("y_teste:", y_teste.shape)

x_teste_final: (1000, 66)
y_teste: (1000,)


In [34]:
modelo_v1 = GaussianNB()

In [35]:
modelo_v1.fit(x_treino_final, y_treino)

In [36]:
modelo_v2 = MultinomialNB(alpha = 1.0, fit_prior = True)

In [37]:
modelo_v2.fit(x_treino_final, y_treino)

In [38]:
modelo_v3 = BernoulliNB(alpha = 1.0, fit_prior = True)

In [39]:
modelo_v3.fit(x_treino_final, y_treino)

In [40]:
ypred_v1 = modelo_v1.predict(x_teste_final)

In [41]:
ypred_v2 = modelo_v2.predict(x_teste_final)

In [42]:
ypred_v3 = modelo_v3.predict(x_teste_final)

In [43]:
  print("Acurácia do Modelo GaussianNB = ", accuracy_score(y_teste,ypred_v1)*100)
  print("Acurácia do Modelo MultinomialNB = ", accuracy_score(y_teste,ypred_v2)*100)
  print("Acurácia do Modelo BernoulliNB = ", accuracy_score(y_teste,ypred_v3)*100)

Acurácia do Modelo GaussianNB =  50.0
Acurácia do Modelo MultinomialNB =  58.4
Acurácia do Modelo BernoulliNB =  58.4


In [44]:
y_proba = modelo_v1.predict_proba(x_teste_final)[:,1]
auc = roc_auc_score(y_teste, y_proba)
print("AUC do Modelo GaussianNB=", auc)

AUC do Modelo GaussianNB= 0.47093399055041935


In [45]:
y_proba = modelo_v2.predict_proba(x_teste_final)[:,1]
auc = roc_auc_score(y_teste, y_proba)
print("AUC do Modelo MultinomialNB=", auc)

AUC do Modelo MultinomialNB= 0.4698414088465757


In [46]:
y_proba = modelo_v3.predict_proba(x_teste_final)[:,1]
auc = roc_auc_score(y_teste, y_proba)
print("AUC do Modelo BernoulliNB=", auc)

AUC do Modelo BernoulliNB= 0.4698414088465757


**Análise Crítica**

Os modelos não apresentaram um bom desempenho, pois os resultados do AUC estão baixos e isso indica que os resultados obtidos são praticamente aleatórios. Para melhorar e alcançar um desempenho adequado, será necessário testar outros modelos que apresentem, no mínimo, uma acurácia de 58,4% e um AUC de 0,70.