## The aim of this project is to perform sentiment analysis with Naive Bayes, implemented both from scratch and with library (scikit-learn) code

In [171]:
# import libraries
import pandas as pd
import numpy as np
import nltk
import regex as re
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import accuracy_score
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter 
from copy import deepcopy
import math


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [83]:
# Read the labelled dataset from a path relative to the notebook's working directory.
# Later cells read its 'text' and 'sentiment' columns.

train_data=pd.read_csv('Data2/all-data.csv')
# test_data=pd.read_csv('Data1/test.csv')

In [84]:
# Display the loaded frame
train_data

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


### Data Preprocessing

In [85]:
# Count the missing values in each column
train_data.isnull().sum()

sentiment    0
text         0
dtype: int64

In [86]:
# test_data.isnull().sum()

In [87]:
# Drop rows that contain missing values (modifies train_data in place)
train_data.dropna(inplace=True)
# test_data.dropna(inplace=True)

In [88]:
def preprocessing(data, stop_words=None):
    """Return a cleaned copy of ``data`` with a normalised ``text`` column.

    Each entry of ``data['text']`` goes through these steps, in this order:

    1. lowercase every whitespace-separated token and re-join with single spaces;
    2. delete literal backslash-n sequences and real newline characters;
    3. delete URLs (``http`` followed by any run of non-whitespace characters);
    4. delete ASCII punctuation (``string.punctuation``);
    5. drop tokens that are stop words;
    6. delete the digits 0-9.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``text``. The frame passed in is
        left untouched, so re-running the calling cell gives the same result.
    stop_words : iterable of str, optional
        Tokens to drop in step 5. When omitted, NLTK's English stop-word
        list is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``text`` column holds the cleaned strings.

    Notes
    -----
    Stop words are matched after punctuation has been removed.
    NOTE(review): stop-word entries that themselves contain punctuation can
    therefore never match -- confirm whether that is intended.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)

    # Build the translation table once instead of once per row.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def clean(text):
        # Lowercasing (also collapses runs of whitespace)
        text = " ".join(token.lower() for token in text.split())
        # Remove \n, \\n
        text = text.replace('\\n', '').replace('\n', '')
        # Remove https
        text = re.sub(r"http\S+", "", text)
        # Remove punctuations
        text = text.translate(punctuation_table)
        # Remove stopwords
        text = " ".join(token for token in text.split() if token not in stop_words)
        # Remove digits
        return re.sub('[0-9]', '', text)

    cleaned = data.copy()
    cleaned['text'] = cleaned['text'].apply(clean)
    return cleaned



In [89]:
# Clean the text column; `train` is the frame every later cell works from
train=preprocessing(train_data)
# test=preprocessing(test_data)

### Sentiment Analysis using Sklearn

In [90]:
# Create Bag of Words: one row per document, one column per vocabulary term.
# The matrix is kept sparse, exactly as CountVectorizer returns it: train_test_split
# and MultinomialNB both accept sparse input, so converting it to a dense array
# only costs memory (documents x vocabulary cells) without changing the result.
count_vect = CountVectorizer()
bow = count_vect.fit_transform(train['text'])


In [93]:
# Features: the bag-of-words matrix; target: the sentiment label of each row
X = bow
y = train['sentiment']

In [103]:
# Split the Data: 80/20, stratified on the label so both parts keep the class proportions.
# A fixed random_state makes the split, and therefore the reported scores, reproducible
# under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=42)

In [104]:
# Create model: fit a MultinomialNB classifier with its default settings on the training split
model = MultinomialNB().fit(X_train, y_train)


In [105]:
# Predict on the held-out split and report accuracy and macro-averaged F1
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred, average="macro"))

Accuracy: 0.711340206185567
F1 score: 0.6540706094636626


### Sentiment Analysis from Scratch

Bag of Words and Vocabulary

In [122]:
def bag_of_words(vocab, text):
    """Build a word-count dictionary for ``text`` on top of ``vocab``.

    The result starts as a copy of ``vocab`` (same keys, same values) and is
    then overwritten with the number of occurrences of every
    whitespace-separated token found in ``text``. Tokens that are not in
    ``vocab`` are added as new keys. ``vocab`` itself is not modified.
    """
    counts = dict(vocab)
    counts.update(Counter(text.split()))
    return counts

In [112]:

def create_vocabulary(text):
    """Collect every distinct token of an iterable of strings.

    Each string in ``text`` is split on whitespace. The returned dictionary
    has one key per distinct token, in order of first appearance, and every
    value is 0.
    """
    vocabulary = {}
    for tweet in text:
        for token in tweet.split():
            # setdefault leaves tokens that were already seen untouched
            vocabulary.setdefault(token, 0)
    return vocabulary

In [153]:
def count_label_words(data,vocab,labels):
    """Count, for each label, how often every vocabulary word occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` column (whitespace-tokenisable strings) and a
        ``sentiment`` column holding each row's label.
    vocab : dict
        Vocabulary; only its keys are used. It is not modified.
    labels : sequence
        Labels to count for. The output list follows this order.

    Returns
    -------
    list of dict
        One dictionary per entry of ``labels``, mapping every vocabulary word
        to its total number of occurrences in the texts carrying that label.
        Tokens that are not in ``vocab`` are ignored. A label with no rows in
        ``data`` yields all-zero counts.
    """
    texts = data['text']
    sentiments = data['sentiment']

    class_count_list = []
    for label in labels:
        counts = dict.fromkeys(vocab, 0)
        for review in texts[sentiments == label]:
            # Touch only the words of this review instead of rebuilding the
            # whole vocabulary-sized dictionary for every review.
            for word, occurrences in Counter(review.split()).items():
                if word in counts:
                    counts[word] += occurrences
        class_count_list.append(counts)

    return class_count_list

In [193]:
def test_train_split(dataset, test_size=0.2, random_state=None, stratify=True):
    """Split ``dataset`` into train and test parts for the from-scratch model.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with ``text`` and ``sentiment`` columns.
    test_size : float, optional
        Passed to ``train_test_split``; 0.2 by default.
    random_state : int or None, optional
        Seed passed to ``train_test_split``. Pass an integer to get the same
        split on every run.
    stratify : bool, optional
        When true (default) the split is stratified on ``sentiment``, like
        the split used for the scikit-learn model in this notebook, so both
        parts keep the class proportions.

    Returns
    -------
    X_train, X_test, y_train, y_test
        ``X_*`` are frames with the ``text`` and ``sentiment`` columns (the
        label is kept because the per-class counting needs it); ``y_*`` are
        the matching ``sentiment`` series.
    """
    X = dataset[['text', 'sentiment']]
    Y = dataset['sentiment']

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y,
        test_size=test_size,
        random_state=random_state,
        stratify=Y if stratify else None,
    )
    return X_train, X_test, y_train, y_test



In [194]:
# Re-split for the from-scratch model. This rebinds X_train, X_test, y_train and y_test
# from the sklearn section: X_train / X_test are now frames with 'text' and 'sentiment' columns.
X_train, X_test, y_train, y_test=test_train_split(train)

In [197]:
def laplace_smoothing(count_word_in_class,count_class,vocabulary_len):
    """Return the natural log of an add-one smoothed word probability.

    Computes ``log((count_word_in_class + 1) / (count_class + vocabulary_len))``.
    """
    return math.log((count_word_in_class + 1) / (count_class + vocabulary_len))

In [198]:
def trainNaiveBayes(x, label_words,labels,vocabulary):
    """Estimate log-priors and smoothed log-likelihoods for Naive Bayes.

    Parameters
    ----------
    x : pandas.DataFrame
        Training frame; its ``sentiment`` column gives each row's label.
    label_words : sequence of dict
        ``label_words[i][word]`` is the count of ``word`` in the texts of
        ``labels[i]``.
    labels : sequence
        Class labels, in the same order as ``label_words``. Any number of
        classes is supported.
    vocabulary : dict
        Vocabulary; only its keys are used.

    Returns
    -------
    class_likelihood : dict
        Maps the class index ``i`` to a dictionary ``word -> log P(word | class i)``,
        smoothed with ``laplace_smoothing``.
    class_priors : list of float
        ``class_priors[i]`` is ``log(rows labelled labels[i] / total rows)``.

    Raises
    ------
    KeyError
        If a label has no rows in ``x``.
    """
    N = len(x)
    # One groupby for all classes instead of one per label.
    docs_per_class = x.groupby('sentiment').size()
    vocabulary_len = len(vocabulary)

    class_likelihood = {}
    class_priors = []
    for l, label in enumerate(labels):
        class_priors.append(math.log(int(docs_per_class[label]) / N))

        # Total word count of the class does not depend on the word, so it is
        # computed once per class rather than once per vocabulary entry.
        words_in_class = sum(label_words[l].values())

        # (count of word in class + 1) / (count of all words in class + |V|)
        class_likelihood[l] = {
            word: laplace_smoothing(label_words[l][word], words_in_class, vocabulary_len)
            for word in vocabulary
        }

    return class_likelihood,class_priors



        
        

In [199]:
# Create Vocabulary from the training texts only
vocabulary=create_vocabulary(X_train['text'].to_numpy())


In [200]:
# Create Bag of Words: one count dictionary per training text.
# This rebinds `bow` from the sklearn section; none of the later cells shown here reads it.
bow=[]

for text in X_train['text']:
    bow.append(bag_of_words(vocabulary,text))

In [201]:
# Create Word count for each label.
# The order of `labels` fixes the class index used by the counts, priors and likelihoods.
labels=['positive','negative','neutral']
label_count=count_label_words(X_train,vocabulary,labels)


Training

In [202]:
# Estimate the per-class log-priors and per-word log-likelihoods from the training split
class_likelihood,class_priors=trainNaiveBayes(X_train, label_count,labels,deepcopy(vocabulary))

Testing

In [226]:
def testNaiveBayes(test,class_likelihood,class_priors,labels,vocabulary):
    
    predictions=[]
    
    for text in test:
        # storing priors for each class
        scores=[prior for prior in class_priors]

        words = text.split()

        for word in words:
            if word in vocabulary.keys(): #ignore words that are not in test data

                for l in range(0,len(labels)):
#                     Since in class_likelihoods, we have probabilities stored for each word for each class
                    scores[l]+=class_likelihood[l][word]
        max_value = max(scores)
        value=scores.index(max_value)+1
        if value==1:
            predictions.append('positive')
        elif value==2:
            predictions.append('negative')
        else:
            predictions.append('neutral')

#         predictions.append(scores.index(max_value)+1)
        
    return predictions


In [227]:
# Classify the held-out texts with the from-scratch model
predictions=testNaiveBayes(X_test['text'],class_likelihood,class_priors,labels,deepcopy(vocabulary))

In [229]:
# Report accuracy and macro-averaged F1 of the from-scratch predictions
print('Accuracy:', accuracy_score(y_test, predictions))
print('F1 score:', f1_score(y_test, predictions, average="macro"))

Accuracy: 0.7010309278350515
F1 score: 0.6280812669577336


Dataset used: https://www.kaggle.com/datasets/ankurzing/sentiment-analysis-for-financial-news