In [116]:
import pandas as pd
import numpy as np
import re
import nltk 
from nltk import word_tokenize, FreqDist
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [117]:
df = pd.read_csv('amazon_cells_labelled.txt',sep="\t")
df.columns = ['sentence', 'label']
df.head()

Unnamed: 0,sentence,label
0,"Good case, Excellent value.",1
1,Great for the jawbone.,1
2,Tied to charger for conversations lasting more...,0
3,The mic is great.,1
4,I have to jiggle the plug to get it to line up...,0


In [16]:
negative_doc = df[df['label'] == 0].copy()
negative_doc.head()

Unnamed: 0,sentence,label
2,Tied to charger for conversations lasting more...,0
4,I have to jiggle the plug to get it to line up...,0
5,If you have several dozen or several hundred c...,0
7,"Needless to say, I wasted my money.",0
8,What a waste of money and time!.,0


In [17]:
positive_doc = df[df['label'] == 1].copy()
positive_doc.head()

Unnamed: 0,sentence,label
0,"Good case, Excellent value.",1
1,Great for the jawbone.,1
3,The mic is great.,1
6,If you are Razr owner...you must have this!,1
9,And the sound quality is great.,1


In [0]:
sentences = df['sentence'].values
negative_sentences = negative_doc['sentence'].values
positive_sentences = positive_doc['sentence'].values

In [19]:
total_documents = len(df.label)
total_pdocs = len(positive_doc.label)
total_ndocs = len(negative_doc.label)
print(total_pdocs + total_ndocs == total_documents)

True


In [20]:
#Naive Bayes
negative_prior = total_ndocs/total_documents
positive_prior = total_pdocs/total_documents
#balanced dataset :)
print(negative_prior, positive_prior)

0.4994994994994995 0.5005005005005005


In [21]:
sentences[0]

'Good case, Excellent value.'

In [0]:
tokens = []
for line in sentences: 
  tokened = nltk.word_tokenize(line)
  for token in tokened: 
    tokens.append(token)

frequency = nltk.FreqDist(tokens)

In [0]:
ptokens = []

for line in positive_sentences:
  tokened = nltk.word_tokenize(line)
  for token in tokened: 
    ptokens.append(token)
pfreq = nltk.FreqDist(ptokens)

ntokens = []
for line in negative_sentences: 
  tokened = nltk.word_tokenize(line)
  for token in tokened: 
    ntokens.append(token)
nfreq = nltk.FreqDist(ntokens)

In [0]:
for word, value in frequency.items(): 
  pfreq[word] = np.log((pfreq[word] + 1 ) / frequency[word] + 1)
  nfreq[word] = np.log((nfreq[word] + 1 ) / frequency[word] + 1)

In [0]:
class naive_bayes():
  def __init__(self, document):
    self.document = document
    #sentence col name == 'sentences'
    self.sentences = document['sentence'].values
    self.positive_sentences = document[document['label']==1].copy()
    self.negative_sentences = document[document['label']==0].copy()

    self.positive_sentences = self.positive_sentences['sentence'].values
    self.negative_sentences = self.negative_sentences['sentence'].values

    #Getting priors for positive and negative classes
    self.N_doc = len(document['label'])
    self.p_prior = np.log(len(positive_sentences) / self.N_doc)
    self.n_prior = np.log(len(negative_sentences) / self.N_doc)
    
    #computing likelihoods for words given class
    self.vocabulary, self.pfreq, self.nfreq = self.get_tokens()

  def get_tokens(self):
    list_of_tokens = []
    #for vocab
    for line in self.sentences: 
      tokenized = nltk.word_tokenize(line)
      for word in tokenized: 
        list_of_tokens.append(word)
    
    vocabulary = set(list_of_tokens)


    #for pvocab
    pvocab = []
    for line in self.positive_sentences: 
      tokenized = nltk.word_tokenize(line)
      for word in tokenized: 
        pvocab.append(word)
    pfreq = FreqDist(pvocab)

    #for nvocab
    nvocab = []
    for line in self.sentences: 
      tokenized = nltk.word_tokenize(line)
      for word in tokenized: 
        nvocab.append(word)
    nfreq = FreqDist(nvocab)



    return [vocabulary, pfreq, nfreq]
    
    
  def get_probability(self, word):
    p_l = np.log((self.pfreq[word] + 1) / (len(self.pfreq) + len(self.vocabulary)))
    n_l = np.log((self.nfreq[word] + 1) / (len(self.nfreq) + len(self.vocabulary)))

    # positive = self.p_prior * p_l
    # negative = self.n_prior * n_l

    return [p_l, n_l]



v = naive_bayes(df).vocabulary

In [207]:
naive_bayes(df).get_probability('awesome')

[-6.787844982309579, -7.023758954738443]

In [0]:
s = 'hate'
naive_classifer = naive_bayes(df)
positive = 0
negative  = 0
for word in word_tokenize(s): 
  positive += naive_classifer.get_probability(word)[0]
  negative += naive_classifer.get_probability(word)[1]
positive = positive * naive_classifer.p_prior
negative = negative * naive_classifer.n_prior

In [267]:
if positive < negative: 
  print("Positive Sentiment")
else:
  print("Negative Sentiment")

Negative Sentiment
