## Análise de sentimentos - Machine Learning 

In [1]:
#importações das bibliotecas 
import numpy as np 
import pandas as pd
import re    #expressões regulares
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score
import pickle

In [2]:
comentarios = pd.read_csv('C:/Users/Eduardo/Desktop/TCC-programa/train_gr/train.csv') #carregando dados de treino


In [3]:
print(comentarios.shape) #exibir na tela o tamanho da base de dados
comentarios.head()

(17494, 5)


Unnamed: 0,review_id,title,year,user_review,user_suggestion
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1


In [4]:
comentarios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17494 entries, 0 to 17493
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   review_id        17494 non-null  int64  
 1   title            17494 non-null  object 
 2   year             17316 non-null  float64
 3   user_review      17494 non-null  object 
 4   user_suggestion  17494 non-null  int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 683.5+ KB


In [5]:
comentarios.user_suggestion.value_counts() #exibir números de reviews positivos e negativos

1    9968
0    7526
Name: user_suggestion, dtype: int64

In [6]:
comentarios.user_review[0] #exibir o primeiro review

"I'm scared and hearing creepy voices.  So I'll pause for a moment and write a review while I wait for my heart beat to return to atleast somewhat calmer times.  This game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood (but more bubble and 'clean').  Hello 1990's.What charactes there are (that isnot trying to kill me) were likable and a bit odd.  I did do a few noob things though, such as:Oh look a class room full of ghosts from dead children, lets shine my flashlight on them and stand there staring at them..Or, hmm creepy music, I'll turn around and see if I can see what's chasing me.Never before in a game have I been this afraid of finding a locked door."

### Limpeza dos dados

In [7]:
import unidecode
def remover_caracteres(comentario):
    comentario = unidecode.unidecode(comentario)
    binary_data = bytes(comentario, encoding='utf-8')
    text = str(binary_data, encoding='utf-8')
    return re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ ]', ' ',   text )

In [8]:
comentarios.user_review = comentarios.user_review.apply(remover_caracteres)

In [9]:
comentarios.user_review[0] #exibir o primeiro review sem caracteres especiais

'I m scared and hearing creepy voices   So I ll pause for a moment and write a review while I wait for my heart beat to return to atleast somewhat calmer times   This game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood  but more bubble and  clean     Hello 1990 s What charactes there are  that isnot trying to kill me  were likable and a bit odd   I did do a few noob things though  such as Oh look a class room full of ghosts from dead children  lets shine my flashlight on them and stand there staring at them  Or  hmm creepy music  I ll turn around and see if I can see what s chasing me Never before in a game have I been this afraid of finding a locked door '

### Remover Stop Words

In [10]:
def remove_stopWord(comentario):
    stop_words = stopwords.words("english")
    text = comentario.lower()
    frase_tokenize = word_tokenize(text)
    return [w for w in frase_tokenize if w not in stop_words]

In [11]:
comentarios.user_review = comentarios.user_review.apply(remove_stopWord)

In [12]:
comentarios.user_review[0] #exibir o primeiro review normalizado, tokenizado e sem as stop-words

['scared',
 'hearing',
 'creepy',
 'voices',
 'pause',
 'moment',
 'write',
 'review',
 'wait',
 'heart',
 'beat',
 'return',
 'atleast',
 'somewhat',
 'calmer',
 'times',
 'game',
 'adorable',
 'creepy',
 'like',
 'happy',
 'tree',
 'friends',
 'graphics',
 'sceme',
 'childhood',
 'bubble',
 'clean',
 'hello',
 '1990',
 'charactes',
 'isnot',
 'trying',
 'kill',
 'likable',
 'bit',
 'odd',
 'noob',
 'things',
 'though',
 'oh',
 'look',
 'class',
 'room',
 'full',
 'ghosts',
 'dead',
 'children',
 'lets',
 'shine',
 'flashlight',
 'stand',
 'staring',
 'hmm',
 'creepy',
 'music',
 'turn',
 'around',
 'see',
 'see',
 'chasing',
 'never',
 'game',
 'afraid',
 'finding',
 'locked',
 'door']

### Stemming

In [13]:
def stemming (comentario):
    snowBall = SnowballStemmer('english')
    return " ".join([snowBall.stem(word) for word in comentario])

In [14]:
comentarios.user_review = comentarios.user_review.apply(stemming)

In [15]:
comentarios.user_review[0] #exibir o primeiro review stemming

'scare hear creepi voic paus moment write review wait heart beat return atleast somewhat calmer time game ador creepi like happi tree friend graphic sceme childhood bubbl clean hello 1990 charact isnot tri kill likabl bit odd noob thing though oh look class room full ghost dead children let shine flashlight stand stare hmm creepi music turn around see see chase never game afraid find lock door'

### Criando Modelo

In [16]:
bag_of_words = np.array(comentarios.iloc[:,0].values)
y = np.array(comentarios.user_suggestion.values)
cv = CountVectorizer(max_features = 10000)
bag_of_words = cv.fit_transform(comentarios.user_review).toarray()
print ("bag_of_words.shape ",bag_of_words.shape)
print ("y.shape ",y.shape)

bag_of_words.shape  (17494, 10000)
y.shape  (17494,)


In [17]:
treinox, testex, treinoy, testey = train_test_split(bag_of_words,y,test_size=0.2, random_state = 10)

In [18]:
gaussian, multinomial, bernoulli = GaussianNB(), MultinomialNB(alpha=1.0,fit_prior=True),BernoulliNB(alpha=1.0,fit_prior=True)
gaussian.fit(treinox,treinoy)
multinomial.fit(treinox,treinoy)
bernoulli.fit(treinox,treinoy)

BernoulliNB()

In [19]:
ypg = gaussian.predict(testex)
ypm = multinomial.predict(testex)
ypb= bernoulli.predict(testex)

In [20]:
print ("Gausse =", accuracy_score(testey,ypg))
print ("Multinomial =", accuracy_score(testey,ypm))
print ("Bernoulli =", accuracy_score(testey,ypb))

Gausse = 0.5933123749642755
Multinomial = 0.8385252929408402
Bernoulli = 0.8105172906544728


In [21]:
pickle.dump(multinomial,open('modelo1.pkl', 'wb'))
pickle.dump(bernoulli,open('modelo2.pkl', 'wb'))

### Aplicando o Modelo

In [22]:
feed_back = pd.read_csv('C:/Users/Eduardo/Desktop/TCC-programa/test_gr/test.csv') #carregando dados de test
feed_back.head()

Unnamed: 0,review_id,title,year,user_review
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...


In [23]:
feed_back["classificacao"] = feed_back.user_review
feed_back.head()

Unnamed: 0,review_id,title,year,user_review,classificacao
0,1603,Counter-Strike: Global Offensive,2015.0,"Nice graphics, new maps, weapons and models. B...","Nice graphics, new maps, weapons and models. B..."
1,1604,Counter-Strike: Global Offensive,2018.0,I would not recommend getting into this at its...,I would not recommend getting into this at its...
2,1605,Counter-Strike: Global Offensive,2018.0,Edit 11/12/18I have tried playing CS:GO recent...,Edit 11/12/18I have tried playing CS:GO recent...
3,1606,Counter-Strike: Global Offensive,2015.0,The game is great. But the community is the wo...,The game is great. But the community is the wo...
4,1607,Counter-Strike: Global Offensive,2015.0,I thank TrulyRazor for buying this for me a lo...,I thank TrulyRazor for buying this for me a lo...


In [24]:
feed_back.classificacao = feed_back.classificacao.apply(remover_caracteres)
feed_back.classificacao = feed_back.classificacao.apply(remove_stopWord)
feed_back.classificacao = feed_back.classificacao.apply(stemming)

In [25]:
vetor = list(feed_back["classificacao"])

In [27]:
vetor[0]

'nice graphic new map weapon model develop listen custom bit develop focus much thing import focus chang tick rate match make server 128 improv vac lot two custom realli want focus sticker ui hud chang skin stop mess around weapon'

In [37]:
dicionario = cv.vocabulary_
pickle.dump(dicionario,open('modelo1.pkl', 'wb'))
indice = 0
predicao = []

for indice in range(len(vetor)):
    bow, comentarios = [],word_tokenize(vetor[indice])
    for palavra in comentarios:
        bow.append(comentarios.count(palavra))

    inp=[]
    for i in dicionario:
        inp.append(vetor[indice].count(i[0]))

    predicao.append(multinomial.predict(np.array(inp).reshape(1,10000)))

In [38]:
predicao

[array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], dtype=int64),
 array([1], 