# Naive Bayes Sentiment Analysis
This is even non-parametric meaning we just need to prepare the data and we're good!

In [43]:
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd

In [20]:
# Example use of tokenizer
tweet_text = "Hey! @CDPR! When do we get Cyberpunk 2088? #FuckArasaka"
tokenizer = RegexpTokenizer(r"\w+")
tokenizer.tokenize(tweet_text)

['Hey', 'CDPR', 'When', 'do', 'we', 'get', 'Cyberpunk', '2088', 'FuckArasaka']

In [21]:
nltk.download("stopwords")
stopwords_set = set(stopwords.words("english"))
print(stopwords_set)

{'very', 'who', 'out', "we're", 'ourselves', "they'll", 'be', 'again', 'himself', 'some', 'during', 'own', 'when', 'your', 'having', 'over', 'should', "isn't", "we'll", "shan't", "wouldn't", 'each', "he'll", 'from', 'in', 'to', 'because', 'doing', 'off', 'these', "i'd", 'between', 'can', "should've", 'while', 'only', "they'd", 'were', 'nor', 'hasn', 'a', 'an', 'me', 'she', 'what', 'they', 't', 'there', 'with', 'yours', 'against', 'all', 'the', 'where', 'theirs', 'wouldn', "hadn't", 'or', 'i', 'once', 'his', 'weren', 'won', 'hadn', 'before', 'any', "don't", 'for', "it's", 'he', 'ma', 'shan', "we've", "you've", 'than', 'through', 'not', 'both', 'is', 'so', 've', 'this', 'here', 'whom', 'haven', "mightn't", "i've", 'him', 's', 'as', 'mightn', 'then', 'o', "you'd", 'how', "i'm", 'ours', 'why', 'that', 'down', "she'd", 'shouldn', 'does', "doesn't", "aren't", 'by', "couldn't", "i'll", 'now', 'do', 'and', "he'd", "you're", 'yourself', 'more', 'had', 'we', "you'll", 'other', 'aren', 'being', "

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cheslaff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
dataframe = pd.read_csv("sentiment_analysis.csv")
dataframe

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram
...,...,...,...,...,...,...,...
494,2015,10,18,night,"According to , a quarter of families under six...",negative,Twitter
495,2021,2,25,morning,the plan to not spend money is not going well,negative,Instagram
496,2022,5,30,noon,uploading all my bamboozle pictures of facebook,neutral,Facebook
497,2018,8,10,night,congratulations ! you guys finish a month ear...,positive,Twitter


In [29]:
# Again, remove all the other info
data = dataframe[(dataframe["sentiment"] == "positive") | (dataframe["sentiment"] == "negative")][["text", "sentiment"]]
data

Unnamed: 0,text,sentiment
0,What a great day!!! Looks like dream.,positive
1,"I feel sorry, I miss you here in the sea beach",positive
2,Don't angry me,negative
3,We attend in the class just for listening teac...,negative
4,"Those who want to go, let them go",negative
...,...,...
492,"Sorry, we`ll try to keep it down.",negative
494,"According to , a quarter of families under six...",negative
495,the plan to not spend money is not going well,negative
497,congratulations ! you guys finish a month ear...,positive


In [30]:
data["sentiment"].value_counts()

sentiment
positive    166
negative    134
Name: count, dtype: int64

In [54]:
X = np.array(data["text"])
y = np.array(data["sentiment"])

In [55]:
print(X[0])
# Tokenize sentences
for i in range(X.shape[0]):
    X[i] = tokenizer.tokenize(X[i])
print(X[0])

What a great day!!! Looks like dream.
['What', 'a', 'great', 'day', 'Looks', 'like', 'dream']


In [56]:
# Remove Stopwords
print(X[0])
for i in range(X.shape[0]):
    X[i] = [w for w in X[i] if w.lower() not in stopwords_set]
print(X[0])

['What', 'a', 'great', 'day', 'Looks', 'like', 'dream']
['great', 'day', 'Looks', 'like', 'dream']


In [57]:
# Stem words
stemmer = PorterStemmer()
print(X[0])
for i in range(X.shape[0]):
    X[i] = [stemmer.stem(w) for w in X[i]]
print(X[0])

['great', 'day', 'Looks', 'like', 'dream']
['great', 'day', 'look', 'like', 'dream']


**We're done with simple preprocessing!**
Time to form our dataset we'll be passing as input

In [58]:
y = np.array([int(lbl == "positive")for lbl in y])
y[:10]

array([1, 1, 0, 0, 0, 1, 1, 1, 0, 0])

In [61]:
vocab = set()
for sentence in X:
    for word in sentence:
        if word not in vocab:
            vocab.add(word)

In [66]:
stemmer.stem("happy")

'happi'

In [70]:
positive_counts = {}
negative_counts = {}

for word in vocab:
    for sentence, label in zip(X, y):
        if word in sentence and label == 0:
            negative_counts[word] = negative_counts.get(word, 1) + 1  # Laplacian smoothing
        elif word in sentence and label == 1:
            positive_counts[word] = positive_counts.get(word, 1) + 1  # Same here

for word in vocab:
    if word not in positive_counts:
        positive_counts[word] = 1  # And here it's also laplacian smoothing
    elif word not in negative_counts:
        negative_counts[word] = 1
    
positive_counts["cute"], negative_counts["cute"]

(3, 1)

In [98]:
positive_sum = sum([v for v in positive_counts.values()])
negative_sum = sum([v for v in negative_counts.values()])
positive_sum, negative_sum

(1925, 1680)

In [100]:
positive_probas = {}
negative_probas = {}
for word in vocab:
    positive_probas[word] = positive_counts[word] / positive_sum
    negative_probas[word] = negative_counts[word] / negative_sum

In [101]:
positive_probas["cute"], negative_probas["cute"]

(0.0015584415584415584, 0.0005952380952380953)

In [106]:
# Okay, this is it!
# No training for this one
# Let's test it out!

def predict(sentence):
    tokenized = tokenizer.tokenize(sentence)
    cleaned = [w for w in tokenized if w.lower() not in stopwords_set]
    stemmed = [stemmer.stem(w) for w in cleaned]
    log_prior = np.log(166 / 134)  # Sorry for hardcoding it, but we need it since the data is a bit skewed
    score = 0.0
    for word in stemmed:
        if word not in vocab:
            continue  # unknown word is considered neutral and adds nothing to the score
        score += np.log(positive_probas[word] / negative_probas[word])
    print(score)
    print("Positive" if score > 0.0 else "Negative")

In [110]:
predict("Hello there people! :D")

1.655627294903475
Positive


In [111]:
predict("Sad depression sad! :(")

-4.790423157647622
Negative


### The higher the score is the more positive sentiment is
### The less the score is the more negative sentiment is
### Score of 0 is neutral
But I think you know all these details if you're going through the specialization like I do.

In [114]:
predict("Deez Nuts")  # Ceritified brainrot

0.5570150062353653
Positive


# Looks Like it works as expected
(Notice: It's a very very very very very very dirty baseline)