# NLP with Naive Bayes

**Objective:** Given a tweet, predict if it has **Positive** sentiment or **Negative** sentiment.

**TODO:**
* Train a naive bayes model on a sentiment analysis task
* Test using your model
* Compute ratios of positive words to negative words
* Do some error analysis
* Predict on your own tweet

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import twitter_samples, stopwords

In [2]:
# Load data
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')

# Split Data into train and test set
train_pos = all_positive_tweets[:4000]
test_pos = all_positive_tweets[4000:]

train_neg = all_negative_tweets[:4000]
test_neg = all_negative_tweets[:4000]

train_x = train_pos + train_neg
test_x = test_pos + test_neg

train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

#### Process Data

In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import re
import string

In [4]:
def process_tweet(tweet):
    cleaned_tokens = []
    
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True,strip_handles=True)
    
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    tweet_tokens = tokenizer.tokenize(tweet)
    for word in tweet_tokens:
        if (word not in stop_words) and (word not in string.punctuation):
            cleaned_tokens.append(stemmer.stem(word))
    return cleaned_tokens

In [5]:
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"

process_tweet(custom_tweet)

['hello', 'great', 'day', ':)', 'good', 'morn']

In [6]:
def build_freqs(tweets, labels):
    freqs = {}
    
    for tweet, label in zip(tweets, labels):
        for word in process_tweet(tweet):
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

In [7]:
tweets = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
build_freqs(tweets, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

In [8]:
# Build freqs from the training set
freqs = build_freqs(train_x, train_y)

In [60]:
('xxxibmchll', 1) in freqs.keys()

True

In [73]:
def train_naive_bayes(train_x, train_y, freqs):
    loglikelihood = {}
    
    vocab = set([word[0] for word in freqs.keys()])
    V = len(vocab)
    
    Npos = 0
    Nneg = 0
    for pair in freqs.keys():
        if pair[1] > 0:
            Npos += freqs[pair]
        else:
            Nneg += freqs[pair]
    
    D = len(train_y)
    Dpos = len(list(filter(lambda x: x > 0, train_y)))
    Dneg = len(list(filter(lambda x: x <= 0, train_y)))
    
    log_prior = np.log(Dpos/D) - np.log(Dneg/D)
    
    for word in vocab:
        pair_pos, pair_neg = (word, 1), (word, 0)
        freq_w_pos = freqs.get(pair_pos, 0)
        freq_w_neg = freqs.get(pair_neg, 0)
        
        p_w_pos = (freq_w_pos + 1) / (Npos + V)
        p_w_neg = (freq_w_neg + 1) / (Nneg + V)
        
        loglikelihood[word] = np.log(p_w_pos / p_w_neg)
    return log_prior, loglikelihood

In [74]:
log_prior, loglikelihood = train_naive_bayes(train_x, train_y, freqs)

In [75]:
print(log_prior)
print(len(loglikelihood))

0.0
9089


In [76]:
def predict_naive_bayes(tweet, log_prior, loglikelihood):
    
    cleaned_tweet = process_tweet(tweet)
    
    p = 0
    p += log_prior
    for word in cleaned_tweet:
        p += loglikelihood.get(word, 0)
    
    return p

In [77]:
my_tweet = 'She smiled.'
p = predict_naive_bayes(my_tweet, log_prior, loglikelihood)
print('The expected output is', p)

The expected output is 1.5740278623499175


In [78]:
def test_naive_bayes(test_x, test_y, log_prior, loglikelihood):
    y_hat = []
    for tweet in test_x:
        if predict_naive_bayes(tweet, log_prior, loglikelihood) > 0:
            y_hat_temp = 1
        else:
            y_hat_temp = 0
        y_hat.append(y_hat_temp)
        
    error = np.mean(np.abs(y_hat - test_y))
        
    accuracy = 1 - error
    
    return accuracy

In [80]:
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(test_x, test_y, log_prior, loglikelihood)))

Naive Bayes accuracy = 0.9966


In [83]:
for tweet in ['I am happy', 'I am bad', 'this movie should have been great.', 'great', 'great great', 'great great great', 'great great great great']:
    # print( '%s -> %f' % (tweet, naive_bayes_predict(tweet, logprior, loglikelihood)))
    p = predict_naive_bayes(tweet, log_prior, loglikelihood)
#     print(f'{tweet} -> {p:.2f} ({p_category})')
    print(f'{tweet} -> {p:.2f}')

I am happy -> 2.15
I am bad -> -1.29
this movie should have been great. -> 2.14
great -> 2.14
great great -> 4.28
great great great -> 6.41
great great great great -> 8.55
