In [58]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import TweetTokenizer

In [31]:
# Load the tweet dataset from a CSV file given by a relative path; the cells
# below read its 'tweet_text' and 'sentiment' columns.
dataset = pd.read_csv('NoThemeTweets.csv')

In [46]:
# Number of tweets taken from EACH class for the train and test splits.
train_size = 6000
test_size = 4000

# Tweet texts grouped by the value of the 'sentiment' column.
negative_tweets = list(dataset.loc[dataset['sentiment'] == 'Negativo', 'tweet_text'].values)
positive_tweets = list(dataset.loc[dataset['sentiment'] == 'Positivo', 'tweet_text'].values)

# Train split: the first `train_size` tweets of each class.
train_positive_tweets = positive_tweets[:train_size]
train_negative_tweets = negative_tweets[:train_size]

# Test split: the next `test_size` tweets of each class, right after the train slice.
test_positive_tweets = positive_tweets[train_size:train_size + test_size]
test_negative_tweets = negative_tweets[train_size:train_size + test_size]

# Positives come first in every combined list; label 1 = positive, 0 = negative.
train_tweets = train_positive_tweets + train_negative_tweets
train_labels = [1] * len(train_positive_tweets) + [0] * len(train_negative_tweets)

test_tweets = test_positive_tweets + test_negative_tweets
test_labels = [1] * len(test_positive_tweets) + [0] * len(test_negative_tweets)

In [33]:
# Portuguese stopword list taken from NLTK's stopwords corpus.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
# NOTE(review): pt_stopwords is not referenced by any other code visible in this notebook.
pt_stopwords = stopwords.words('portuguese')

In [34]:
def process_tweet(tweet: str):
    """
    Tokenizes a tweet, discards non alphanumeric tokens and stems the rest
    Input
        tweet: string with a tweet
    Output
        list with the stem of every token for which str.isalnum() is true,
        in the order the tokens appear in the tweet
    """
    tokens = TweetTokenizer().tokenize(tweet)
    rslp = RSLPStemmer()

    stems = []
    for tok in tokens:
        # A token containing any non alphanumeric character is dropped entirely.
        if tok.isalnum():
            stems.append(rslp.stem(tok))

    return stems

In [37]:
def count_tweets(tweets: list, labels: list):
    """
    Builds a frequency table of processed words per class
    Input
        tweets: list containing training tweets
        labels: list with the label of each tweet, aligned by position with tweets
    Output
        dictionary mapping each (word, label) pair to the number of times the
        processed word occurs in tweets carrying that label
    """
    counts = {}

    # zip pairs tweets with labels by position and stops at the shorter list.
    for text, sentiment in zip(tweets, labels):
        for stem in process_tweet(text):
            key = (stem, sentiment)
            counts[key] = counts.get(key, 0) + 1

    return counts


In [54]:
# Build the (word, label) -> count table from the training split.
freqs = count_tweets(train_tweets, train_labels)

In [55]:
def train_naive_bayes(freqs, train_tweets, train_labels):
    """
    Calculates the log prior and the per-word log likelihood of a binary
    Naive Bayes classifier with Laplace (add-one) smoothing
    Input:
        freqs: dict with pair (word, label) as keys and frequencies as values;
            label 1 is the positive class and label 0 the negative class
        train_tweets: textual train data (only its length is used, to check
            that it is aligned with train_labels)
        train_labels: train labels (1 or 0) relative to the sentiment of each tweet
    Output:
        log_prior: log(D_pos / D_neg), where D_pos and D_neg are the number of
            positive and negative training tweets
        log_likehood: dict mapping every word of the vocabulary to
            log(P(word | positive) / P(word | negative))
    Raises:
        ValueError: if train_tweets and train_labels differ in length, or if
            one of the two classes has no training example (the log prior
            would be undefined)
    """
    if len(train_tweets) != len(train_labels):
        raise ValueError("train_tweets and train_labels must have the same length")

    log_likehood = {}

    vocabulary = {pair[0] for pair in freqs}
    vocabulary_size = len(vocabulary)

    # Total number of word occurrences observed in each class.
    N_positive = sum(count for (_, label), count in freqs.items() if label == 1)
    N_negative = sum(count for (_, label), count in freqs.items() if label == 0)

    # Number of training documents in each class.
    D_positive = sum(1 for label in train_labels if label == 1)
    D_negative = sum(1 for label in train_labels if label == 0)
    if D_positive == 0 or D_negative == 0:
        raise ValueError("both classes need at least one training example")

    log_prior = float(np.log(D_positive) - np.log(D_negative))

    for word in vocabulary:
        positive_count = freqs.get((word, 1), 0)
        negative_count = freqs.get((word, 0), 0)

        # Add-one smoothing keeps the probability of a word unseen in one
        # class strictly positive, so the log ratio is always finite.
        p_word_positive = (positive_count + 1) / (N_positive + vocabulary_size)
        p_word_negative = (negative_count + 1) / (N_negative + vocabulary_size)

        log_likehood[word] = float(np.log(p_word_positive / p_word_negative))

    return log_prior, log_likehood
    

{('14', 1): 7,
 ('par', 1): 455,
 ('eu', 1): 1454,
 ('ir', 1): 75,
 ('o', 1): 1458,
 ('meu', 1): 502,
 ('lik', 1): 17,
 ('já', 1): 352,
 ('dei', 1): 9,
 ('na', 1): 501,
 ('époc', 1): 6,
 ('só', 1): 372,
 ('quer', 1): 261,
 ('consegu', 1): 79,
 ('com', 1): 961,
 ('algum', 1): 110,
 ('cois', 1): 241,
 ('pra', 1): 617,
 ('pod', 1): 263,
 ('dorm', 1): 103,
 ('que', 1): 1959,
 ('lind', 1): 156,
 ('dia', 1): 302,
 ('pq', 1): 191,
 ('da', 1): 700,
 ('pr', 1): 4,
 ('jeit', 1): 43,
 ('é', 1): 1163,
 ('uma', 1): 630,
 ('ofert', 1): 5,
 ('ha', 1): 8,
 ('q', 1): 442,
 ('aprove', 1): 5,
 ('entend', 1): 40,
 ('mas', 1): 623,
 ('iss', 1): 329,
 ('foi', 1): 192,
 ('mais', 1): 488,
 ('porqu', 1): 96,
 ('pens', 1): 95,
 ('em', 1): 494,
 ('outr', 1): 145,
 ('carcinom', 1): 1,
 ('hepa', 1): 1,
 ('canc', 1): 1,
 ('de', 1): 1913,
 ('fig', 1): 2,
 ('deivison', 1): 1,
 ('lut', 1): 4,
 ('contr', 1): 16,
 ('a', 1): 1641,
 ('doenç', 1): 2,
 ('por', 1): 430,
 ('um', 1): 903,
 ('ano', 1): 129,
 ('e', 1): 1841,
 ('

In [57]:
# Print every (word, label) key of freqs, word on one line and label on the next.
# NOTE(review): this prints the whole key set; consider previewing a slice instead.
for word, label in freqs:
    print(word, label, sep="\n")

14
1
par
1
eu
1
ir
1
o
1
meu
1
lik
1
já
1
dei
1
na
1
époc
1
só
1
quer
1
consegu
1
com
1
algum
1
cois
1
pra
1
pod
1
dorm
1
que
1
lind
1
dia
1
pq
1
da
1
pr
1
jeit
1
é
1
uma
1
ofert
1
ha
1
q
1
aprove
1
entend
1
mas
1
iss
1
foi
1
mais
1
porqu
1
pens
1
em
1
outr
1
carcinom
1
hepa
1
canc
1
de
1
fig
1
deivison
1
lut
1
contr
1
a
1
doenç
1
por
1
um
1
ano
1
e
1
falec
1
ont
1
tard
1
esper
1
minh
1
explicaça
1
tenh
1
te
1
ajud
1
aquel
1
min
1
limp
1
tinh
1
marid
1
problem
1
me
1
adicion
1
aqu
1
no
1
fac
1
esquisit
1
princípi
1
ah
1
neg
1
velh
1
pa
1
vei
1
pergunt
1
tô
1
ela
1
dev
1
ter
1
tom
1
as
1
dor
1
viu
1
malza
1
pá
1
voc
1
melhor
1
log
1
se
1
és
1
feliz
1
trabalh
1
niss
1
ent
1
não
1
pesso
1
incomod
1
sua
1
opin
1
dão
1
ouv
1
seu
1
tem
1
grand
1
alcanc
1
vam
1
segu
1
inst
1
quis
1
sdv
1
vc
1
mt
1
obg
1
sint
1
até
1
agr
1
foo
1
400e84d5
1
1e93
1
4d95
1
d99fc5bbb12
1
qu
1
jog
1
youtub
1
seg
1
nem
1
perceb
1
hor
1
rs
1
hop
1
beaten
1
you
1
mr
1
from
1
faz
1
torresm
1
vix
1
tret
1
novelesc
1
bom

1
paté
1
imóvel
1
fun
1
fact
1
orkut
1
ss
1
cust
1
aa
1
amr
1
mark
1
tuan
1
jb
1
ilustr
1
hm
1
dai
1
netflix
1
g
1
tamanh
1
control
1
alin
1
wpp
1
mad
1
max
1
hmm
1
qm
1
gaj
1
apetec
1
nadaa
1
bgd
1
bol
1
interval
1
nolasc
1
extrem
1
medit
1
embal
1
descr
1
drogaaaaa
1
ob
1
instagr
1
refer
1
dieg
1
quiet
1
thank
1
cloverfield
1
paradox
1
tamagosh
1
renat
1
zapp
1
estre
1
golp
1
oxi
1
linda
1
pis
1
vmb
1
sff
1
crat
1
freud
1
lacan
1
usp
1
rib
1
ânim
1
bobylokk
1
lum
1
berry
1
mkt
1
cobr
1
raiv
1
irm
1
deslig
1
afet
1
porq
1
aconselh
1
declar
1
odi
1
manuel
1
ex
1
pret
1
alt
1
ju
1
bost
1
normaallll
1
sogr
1
cri
1
esqueç
1
palestr
1
eac
1
tio
1
diog
1
mic
1
mystic
1
posicion
1
radic
1
presidi
1
afin
1
talent
1
retir
1
mater
1
aquil
1
sof
1
211
1
acr
1
recíproc
1
uef
1
soc
1
alhei
1
destribu
1
ódi
1
ranc
1
quadr
1
pendur
1
bebê
1
sunshin
1
bel
1
mia
1
nct
1
reviv
1
slc
1
punk
1
formul
1
ceg
1
asiá
1
particip
1
v
1
após
1
mereç
1
jbg
1
bg
1
je
1
sel
1
reativ
1
rodryg
1
yur
1
albert
1
braz


look
1
bigod
1
bisco
1
tiquin
1
c3
1
pros
1
cozinh
1
paix
1
poes
1
urn
1
bizuuu
1
magnetud
1
inferniz
1
altur
1
nex
1
ilh
1
azurém
1
fbi
1
chup
1
lab
1
mian
1
haey
1
chuchu
1
vao
1
falenc
1
souc
1
conversei
1
viuuu
1
reafirm
1
espelh
1
portant
1
surub
1
anã
1
travest
1
andré
1
wildest
1
dre
1
mercúri
1
retrógr
1
solte
1
nmr
1
lopez
1
bronz
1
racont
1
vie
1
pedofil
1
assassinat
1
qualific
1
reduz
1
provoc
1
aproximand
1
apetit
1
great
1
cma
1
w
1
pastel
1
simpat
1
colégi
1
depil
1
ihih
1
globonew
1
algarv
1
camarat
1
laboratóri
1
set
1
desvend
1
tipograf
1
confus
1
jéss
1
sobrancelh
1
rascunh
1
tag
1
lib
1
camb
1
tú
1
monic
1
joey
1
mônic
1
abism
1
romp
1
press
1
conqu
1
oop
1
direcion
1
descar
1
crit
1
zin
1
kirish
1
arrecad
1
institu
1
bullying
1
90
1
paperpil
1
indonés
1
batalh
1
medíocr
1
brow
1
bebé
1
wtv
1
influenc
1
contribu
1
crueldad
1
zon
1
necesit
1
morirm
1
ipurpl
1
project
1
luis
1
blog
1
facul
1
fd
1
machuqu
1
anbstony
1
ban
1
térre
1
abelh
1
felixxxxxx
1
sient
1
neym
1
vá

1
mágo
1
trag
1
solid
1
layout
1
descrev
1
desfoc
1
bomd
1
troç
1
eheheh
1
somni
1
ahahahah
1
portim
1
alinh
1
trad
1
tondel
1
scp
1
slb
1
fcp
1
lógic
1
polém
1
nau
1
zentii
1
pom
1
33
1
hmmm
1
amoooo
1
pabll
1
vitt
1
anitt
1
malandr
1
iza
1
pesad
1
ging
1
oswald
1
jorel
1
comic
1
want
1
fag
1
nelson
1
hahaa
1
vít
1
nee
1
leir
1
assedi
1
assed
1
trip
1
aquec
1
zel
1
malh
1
nun
1
spop
1
sag
1
manc
1
voad
1
trocadilh
1
choc
1
medic
1
proac
1
logic
1
8ª
1
full
1
temper
1
orég
1
azeit
1
quinhent
1
exag
1
econôm
1
reconsider
1
vident
1
expliqu
1
burgu
1
tenil
1
minibi
1
denúnc
1
estré
1
fexxt
1
rebol
1
odeiovc
1
estic
1
hollaaaaaaaaaaaand
1
wing
1
demian
1
goril
1
amass
1
garimp
1
obrigadass
1
urg
1
oloc
1
obvi
1
monopoly
1
plu
1
equal
1
quick
1
garf
1
faqu
1
interpret
1
prism
1
blanc
1
nathan
1
mano
1
amoooooo
1
interromp
1
gu
1
rolet
1
indiferenç
1
pentab
1
romên
1
variant
1
perfect
1
favel
1
estend
1
boraa
1
retard
1
secalh
1
gi
1
diam
1
puls
1
elo
1
heter
1
emoj
1
enton
1
cha
1
vilon
1


0
atras
0
abandon
0
sáb
0
bj
0
inst
0
priv
0
iphon
0
3030
0
num
0
senh
0
favor
0
febbbb
0
rox
0
fur
0
infecc
0
kkkkk
0
esquec
0
diss
0
sext
0
johnny
0
verdad
0
rim
0
obg
0
tad
0
performanc
0
prettymuch
0
curt
0
pizz
0
própri
0
estômag
0
sard
0
chav
0
vdd
0
caus
0
jog
0
tapet
0
vermelh
0
desfil
0
aaah
0
confi
0
capaz
0
louc
0
viu
0
cu
0
rar
0
lad
0
paraguay
0
chuv
0
jeongguk
0
forç
0
dev
0
8honeyab
0
pao
0
trat
0
dói
0
estrut
0
jbjb
0
aq
0
donkkk
0
segu
0
algum
0
minut
0
atr
0
block
0
ao
0
sig
0
aiiii
0
ignor
0
milit
0
vocêeeee
0
agreg
0
conhec
0
empat
0
falh
0
fof
0
ist
0
maç
0
black
0
peg
0
565
0
mê
0
tard
0
renn
0
paci
0
dem
0
embaix
0
azar
0
bombom
0
aplic
0
android
0
download
0
gif
0
friend
0
ror
0
can
0
egul
0
6
0
motor
0
gui
0
ceas
0
bor
0
ofend
0
tu
0
soub
0
ep
0
sid
0
just
0
ow
0
val
0
incri
0
coment
0
pervers
0
revers
0
mald
0
ilumin
0
univers
0
int
0
borm
0
fujosh
0
vergonh
0
exam
0
medic
0
diminu
0
hemorrág
0
esva
0
enquant
0
14º
0
máx
0
daqu
0
12º
0
saraaah
0
saud
0
prai
0


centímetr
0
editor
0
ditabrand
0
sof
0
armári
0
leit
0
alcânt
0
ediç
0
fc
0
órg
0
batalh
0
21
0
apen
0
idad
0
doenç
0
z
0
muitp
0
recíproc
0
modeusooo
0
admit
0
sungja
0
minhyuk
0
peggy
0
boat
0
yunjin
0
nossaaaa
0
analis
0
6ce8ed46
0
b24
0
9401c54fabe3
0
kssjk
0
dour
0
898dccb0
0
86d0
0
4e93
0
80bf
0
601ae1c555fb
0
magin
0
mago
0
velvet
0
tdo
0
delic
0
eae
0
asa
0
fing
0
brig
0
merm
0
sign
0
rebec
0
janel
0
rinit
0
top
0
wing
0
arranc
0
lágr
0
first
0
lov
0
reflection
0
seesaw
0
18h
0
incr
0
pareç
0
anjr
0
aaaaaaaaa
0
alfi
0
estabil
0
emoc
0
capítul
0
pedac
0
decepç
0
motiv
0
cin
0
jacaraip
0
tat
0
ctz
0
carinh
0
influenc
0
joan
0
hshshshsh
0
goal
0
amulet
0
trev
0
alegr
0
ruç
0
10º
0
dde
0
tru
0
gentee
0
au
0
noç
0
kkt
0
frequ
0
felip
0
fat
0
theu
0
leo
0
ambas
0
consecu
0
paranó
0
apag
0
naquel
0
déi
0
presenç
0
nin
0
frigoríf
0
térm
0
colic
0
anosh
0
br
0
ha
0
wla
0
jbr
0
5a6r
0
kskskdk
0
shhhh
0
andré
0
cbl
0
nn
0
saúd
0
fg
0
útil
0
aqui
0
cop
0
raf
0
kokor
0
loc
0
ccpt
0
ps
0
amb

0
român
0
madonn
0
amém
0
69208e78
0
d5
0
b930
0
8286fa09b4e8
0
hauahauahau
0
rob
0
bris
0
nikol
0
emocionad
0
ksksk
0
sksksk
0
admin
0
engr
0
camz
0
friozin
0
calç
0
ginás
0
rítm
0
anhemb
0
wif
0
yeontan
0
ds
0
salg
0
direction
0
colg
0
ferid
0
kdjskskak
0
fuç
0
desgram
0
guit
0
percuss
0
flr
0
fracass
0
ahhshshdhd
0
alucin
0
louil
0
infânc
0
alyc
0
ira
0
carl
0
matu
0
botafog
0
agregam
0
lobit
0
law
0
svu
0
angel
0
nul
0
kat
0
henriqu
0
pitoc
0
necessit
0
kakskakskkskzkz
0
kon
0
retribu
0
esfreg
0
pokémon
0
tet
0
tiffany
0
ijwd
0
apic
0
pend
0
geng
0
pdl
0
dodg
0
41
0
81
0
matin
0
ôh
0
awwwww
0
kdkxkl
0
defici
0
coid
0
livestre
0
magand
0
kkm
0
junh
0
facul
0
xa
0
fací
0
momm
0
webnam
0
jyp
0
kam
0
anoni
0
tromb
0
amarg
0
nobr
0
cheeeru
0
poxan
0
ocl
0
fraq
0
ous
0
tind
0
memor
0
covard
0
novooo
0
veterinár
0
animal
0
amô
0
minha
0
overwatch
0
procrastin
0
consci
0
1x
0
jb
0
crédit
0
troux
0
nutell
0
meninaaa
0
doyoung
0
infecç
0
minecraft
0
cumpr
0
promess
0
mederrub
0
but
0
sagitar