# Sentiment Analysis Using Naive Bayes

In [1]:
# importing the necessary libraries
import pdb
from nltk.corpus import stopwords, twitter_samples
import numpy as np
import pandas as pd
import nltk

import string
from nltk.tokenize import TweetTokenizer
from os import getcwd
from sklearn.model_selection import train_test_split


In [2]:
# Download the dataset and stopwords
nltk.download("stopwords")
nltk.download("twitter_samples")

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading twitter_samples: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


False

In [3]:
filePath = f"{getcwd()}/../tmp2/"
nltk.data.path.append(filePath)

In [4]:
# Divide into positive and negative tweets
positive_tweets=twitter_samples.strings("positive_tweets.json")
negative_tweets=twitter_samples.strings("negative_tweets.json")

In [5]:
# get the sets of positive and negative tweets
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# split the data into two pieces, one for training and one for testing (validation set)
test_pos = all_positive_tweets[4000:]
train_pos = all_positive_tweets[:4000]
test_neg = all_negative_tweets[4000:]
train_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

# avoid assumptions about the length of all_positive_tweets
train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

## Preprocessing the tweets

In [6]:
# import the preprocessing libraries
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

import re
import string

In [7]:
# A custom function to clean tweets
def clean_tweet(tweet):
    '''
    This function accepts as input, a string of unprocessed tweets and produces
    preprocessed tweets as the output. '''
    
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    # eliminate tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    clean_tweets = []
    for word in tweet_tokens:
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            stem_word = stemmer.stem(word)  # stemming word
            clean_tweets.append(stem_word)

    return clean_tweets

In [8]:
# Custom function to check if tweets are preprocessed
def lookup(function):
    occurences = {('sad', 0): 4,
             ('happy', 1): 12,
             ('oppressed', 0): 7}
    word = 'happy'
    label = 1
    if function(occurences, word, label) == 12:
        return "It works"
    return "Doesnt work!!!!!!!!!"

In [9]:
def lookup(freqs, word, label):
    n = 0

    pair = (word, label)
    if (pair in freqs):
        n = freqs[pair]

    return n

In [10]:
# UNQ_C1 (UNIQUE CELL IDENTIFIER, DO NOT EDIT)
def count_tweets(result, tweets, ys):
   
    for y, tweet in zip(ys, tweets):
        for word in clean_tweet(tweet):
            pair = (word,y)
            if pair in result:
                result[pair] += 1
            else:
                result[pair] = 1
   
    return result

In [11]:
# Testing the count_tweets() function


result = {}
tweets = ['i am joyous', 'i am tricked', 'i am sad', 'i am hungry', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_tweets(result, tweets, ys)

{('joyou', 1): 1,
 ('trick', 0): 1,
 ('sad', 0): 1,
 ('hungri', 0): 1,
 ('tire', 0): 1}

## Building the Model -Naive Bayes Classifier

In [12]:
# Build the freqs dictionary for later uses

freqs = count_tweets({}, train_x, train_y)

In [13]:
def train_naive_bayes(freqs, train_x, train_y):
    loglikelihood = {}
    logprior = 0
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos and N_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:

            # Increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs[pair]

        # else, the label is negative
        else:

            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs[pair]

    # Calculate D, the number of documents
    D = len(train_y)

    # Calculate D_pos, the number of positive documents (*hint: use sum(<np_array>))
    D_pos =np.sum(train_y)

    # Calculate D_neg, the number of negative documents (*hint: compute using D and D_pos)
    D_neg = D-D_pos

    # Calculate logprior
    logprior =np.log(D_pos) - np.log(D_neg)

    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos =lookup(freqs,word,1)
        freq_neg =lookup(freqs,word,0)

        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood


In [14]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

0.0
9089


In [15]:
# Custom function to test the accuracy of the classifier
def naive_bayes_predict(tweet, logprior, loglikelihood):
    word_l = clean_tweet(tweet)   
    p = 0   # probablity starts at zero
    p += logprior # increment probablity by the logprior

    for word in word_l:
        if word in loglikelihood:
            p += loglikelihood[word]

    return p


In [16]:
# Testing the accuracy of the classifier using synthetic tweets
tweet = ["Shit", "Awesome", "Great", "Dissapointed", "Poor"]

for item in tweet:
    p=naive_bayes_predict(item, logprior, loglikelihood)
    if p>=0:
        print(round(p,2),"--> Positive sentiment")
    elif p<=0:
        print(round(p,2),"--> Negative sentiment")
    

# print(p)

-0.81 --> Negative sentiment
1.8 --> Positive sentiment
2.14 --> Positive sentiment
0.0 --> Positive sentiment
-1.83 --> Negative sentiment


In [17]:
def test_classifier(test_x, test_y, logprior, loglikelihood):
    
    accuracy = 0 
   
    y_hats = []
    for tweet in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(tweet, logprior, loglikelihood) > 0:
            y_hat_i = 1
        else:
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.mean(np.absolute(y_hats-test_y))

    # Accuracy is 1 minus the error
    accuracy = 1-error

    

    return accuracy


### Testing the accuracy of the classifier

In [18]:
print("Naive Bayes accuracy = %0.4f" %
      (test_classifier(test_x, test_y, logprior, loglikelihood)))

Naive Bayes accuracy = 0.9940
