In [60]:
import re
import nltk
import numpy as np
import pandas as pd
import string
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from os import getcwd
from sklearn.model_selection import train_test_split
from collections import  defaultdict, Counter
import math

In [47]:
data = {
    'review': [
        "I loved this movie, it was fantastic!",
        "What a waste of time, absolutely horrible.",
        "An excellent film with great performances.",
        "Terrible movie, I hated it.",
        "I enjoyed every minute of it.",
        "The movie was boring and predictable.",
        "Brilliant storytelling and amazing visuals!",
        "Not worth watching at all.",
        "One of the best movies I've seen.",
        "Worst film ever. Don't recommend."
    ],
    'sentiment': [
        "positive", "negative", "positive", "negative", "positive",
        "negative", "positive", "negative", "positive", "negative"
    ]
}

# data = {
#     'review': [
#         "I am happy because i am learning NLP",
#         "I am happy not sad",
#         "I am sad, I am not learning NLP",
#         "I am sad, not happy",
#     ],
#     'sentiment': [
#         "positive", "positive", "negative", "negative"
#     ]
# }

df = pd.DataFrame(data)
df

Unnamed: 0,review,sentiment
0,"I loved this movie, it was fantastic!",positive
1,"What a waste of time, absolutely horrible.",negative
2,An excellent film with great performances.,positive
3,"Terrible movie, I hated it.",negative
4,I enjoyed every minute of it.,positive
5,The movie was boring and predictable.,negative
6,Brilliant storytelling and amazing visuals!,positive
7,Not worth watching at all.,negative
8,One of the best movies I've seen.,positive
9,Worst film ever. Don't recommend.,negative


In [48]:
punctuations = set(string.punctuation)
stopwords = stop_words = {
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing',
    'a', 'an', 'the',
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against',
    'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    'can', 'will', 'just', 'don', 'should', 'now'
}

stemmer = PorterStemmer()
# preprocess data
def preprocess_data(x):
    x = x.lower()
    x = "".join(char for char in x if char not in punctuations)
    words = word_tokenize(x)
    words = [word for word in words if word not in stopwords]
    stemmed = [stemmer.stem(word) for word in words]

    return stemmed


df["clean_review"] = df["review"].apply(preprocess_data)
df["clean_review"]

0                   [love, movi, fantast]
1          [wast, time, absolut, horribl]
2           [excel, film, great, perform]
3                   [terribl, movi, hate]
4                   [enjoy, everi, minut]
5                   [movi, bore, predict]
6     [brilliant, storytel, amaz, visual]
7                          [worth, watch]
8            [one, best, movi, ive, seen]
9    [worst, film, ever, dont, recommend]
Name: clean_review, dtype: object

In [49]:
X_train, X_test, y_train, y_test = train_test_split(df["clean_review"], df["sentiment"], test_size = 0.3, random_state = 42)

print("Shape X_train: ", X_train.shape)
print("Shape X_test: ", X_test.shape)
print("Shape y_train: ", y_train.shape)
print("Shape y_test: ", y_test.shape)

Shape X_train:  (7,)
Shape X_test:  (3,)
Shape y_train:  (7,)
Shape y_test:  (3,)


In [32]:
X_train, y_train

(0                   [love, movi, fantast]
 7                          [worth, watch]
 2           [excel, film, great, perform]
 9    [worst, film, ever, dont, recommend]
 4                   [enjoy, everi, minut]
 3                   [terribl, movi, hate]
 6     [brilliant, storytel, amaz, visual]
 Name: clean_review, dtype: object,
 0    positive
 7    negative
 2    positive
 9    negative
 4    positive
 3    negative
 6    positive
 Name: sentiment, dtype: object)

In [52]:
# build word frequency
def build_word_freq(x, y):
    word_freq = defaultdict(lambda: {'positive': 0, "negative": 0})
    for words, review in zip(x, y):
        for word in words:
            word_freq[word][review] += 1

    n_pos = 0
    n_neg = 0
    for word in word_freq:
        n_pos += word_freq[word]["positive"]
        n_neg += word_freq[word]["negative"]
        
    return word_freq, n_pos, n_neg

word_freq, n_pos, n_neg = build_word_freq(X_train, y_train)

print(f"Word frequnecy: {word_freq}\n Total positive class occurences: {n_pos}\n Total negative class occurences: {n_neg}")

Word frequnecy: defaultdict(<function build_word_freq.<locals>.<lambda> at 0x0000022786DFE5C0>, {'love': {'positive': 1, 'negative': 0}, 'movi': {'positive': 1, 'negative': 1}, 'fantast': {'positive': 1, 'negative': 0}, 'worth': {'positive': 0, 'negative': 1}, 'watch': {'positive': 0, 'negative': 1}, 'excel': {'positive': 1, 'negative': 0}, 'film': {'positive': 1, 'negative': 1}, 'great': {'positive': 1, 'negative': 0}, 'perform': {'positive': 1, 'negative': 0}, 'worst': {'positive': 0, 'negative': 1}, 'ever': {'positive': 0, 'negative': 1}, 'dont': {'positive': 0, 'negative': 1}, 'recommend': {'positive': 0, 'negative': 1}, 'enjoy': {'positive': 1, 'negative': 0}, 'everi': {'positive': 1, 'negative': 0}, 'minut': {'positive': 1, 'negative': 0}, 'terribl': {'positive': 0, 'negative': 1}, 'hate': {'positive': 0, 'negative': 1}, 'brilliant': {'positive': 1, 'negative': 0}, 'storytel': {'positive': 1, 'negative': 0}, 'amaz': {'positive': 1, 'negative': 0}, 'visual': {'positive': 1, 'negat

In [58]:
# build conditional liklihood
def build_prob(word_freq, n_pos, n_neg):
    vocab_size = len(word_freq)
    
    for word, freqs in word_freq.items():
        word_freq[word]["p_pos"] = (freqs["positive"] + 1) / (n_pos + vocab_size)
        word_freq[word]["p_neg"] = (freqs["negative"] + 1) / (n_neg + vocab_size)
        
    return word_freq

word_freq_prob = build_prob(word_freq, n_pos, n_neg)
word_freq_prob

defaultdict(<function __main__.build_word_freq.<locals>.<lambda>()>,
            {'love': {'positive': 1,
              'negative': 0,
              'p_pos': 0.05555555555555555,
              'p_neg': 0.03125,
              'lambda': 0.5753641449035618},
             'movi': {'positive': 1,
              'negative': 1,
              'p_pos': 0.05555555555555555,
              'p_neg': 0.0625,
              'lambda': -0.11778303565638351},
             'fantast': {'positive': 1,
              'negative': 0,
              'p_pos': 0.05555555555555555,
              'p_neg': 0.03125,
              'lambda': 0.5753641449035618},
             'worth': {'positive': 0,
              'negative': 1,
              'p_pos': 0.027777777777777776,
              'p_neg': 0.0625,
              'lambda': -0.8109302162163288},
             'watch': {'positive': 0,
              'negative': 1,
              'p_pos': 0.027777777777777776,
              'p_neg': 0.0625,
              'lambda': -0.8109302

In [56]:
# get log liklihood of each word
def get_log_liklihood(word_freq_prob):
    for word in word_freq_prob:
        word_freq_prob[word]["lambda"] = math.log(word_freq_prob[word]["p_pos"] / word_freq_prob[word]["p_neg"])

    return word_freq_prob

word_freq_prob = get_log_liklihood(word_freq_prob)
word_freq_prob

defaultdict(<function __main__.build_word_freq.<locals>.<lambda>()>,
            {'love': {'positive': 1,
              'negative': 0,
              'p_pos': 0.05555555555555555,
              'p_neg': 0.03125,
              'lambda': 0.5753641449035618},
             'movi': {'positive': 1,
              'negative': 1,
              'p_pos': 0.05555555555555555,
              'p_neg': 0.0625,
              'lambda': -0.11778303565638351},
             'fantast': {'positive': 1,
              'negative': 0,
              'p_pos': 0.05555555555555555,
              'p_neg': 0.03125,
              'lambda': 0.5753641449035618},
             'worth': {'positive': 0,
              'negative': 1,
              'p_pos': 0.027777777777777776,
              'p_neg': 0.0625,
              'lambda': -0.8109302162163288},
             'watch': {'positive': 0,
              'negative': 1,
              'p_pos': 0.027777777777777776,
              'p_neg': 0.0625,
              'lambda': -0.8109302

In [73]:
# get log prior of positive and negative classes
def get_log_prior(y):
    label_counts = Counter(y)
    n_total = len(y)
    
    log_priors = {}
    for label in label_counts:
        log_priors[f"p_{label}"] = label_counts[label] / n_total

    return log_priors

log_priors = get_log_prior(y_train)
log_priors

{'p_positive': 0.5714285714285714, 'p_negative': 0.42857142857142855}

### Inference on test example

In [76]:
def infer_message(text, word_freq_prob, log_priors):
    stemmed_words = preprocess_data(text)
    log_liklihood = 0
    
    for word in stemmed_words:
        if word not in word_freq_prob.keys():
            continue
        log_liklihood += word_freq_prob[word]["lambda"]

    score = math.log(log_priors["p_positive"] / log_priors["p_negative"]) + log_liklihood

    if score > 0:
        return f"Positive Sentiment ==> with Score = {score}"
    elif score < 0:
        return f"Negative Sentiment ==> with Score = {score}"
    else:
        return f"Neutral Sentiment ==> with Score = {score}"

In [83]:
infer_message("One of the best movies I've seen.", word_freq_prob, log_priors)

'Positive Sentiment ==> with Score = 0.16989903679539733'