Sentiment Analysis using **Bag of Words Vectorization-Based Models**

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

data = pd.read_csv('Finance_data.csv')
data.head()

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral


In [3]:
data.Sentiment.value_counts()

neutral     3130
positive    1852
negative     860
Name: Sentiment, dtype: int64

In [4]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts = cv.fit_transform(data['Sentence'])

In [5]:
#Splitting the data into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['Sentiment'], test_size = 0.20, random_state = 6, stratify = data['Sentiment']) # VS stratify = data['Sentiment']

In [6]:
#Training the model
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

In [7]:
#Caluclating the accuracy score of the model
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print("Accuracy Score: ",accuracy_score)

Accuracy Score:  0.6663815226689478


In [8]:
#For Model Testing
test_data = pd.read_csv('sampled2.csv')

In [9]:
test_data.Sentiment.value_counts()

positive    52
negative    35
neutral     12
Name: Sentiment, dtype: int64

In [10]:
test_label = test_data['Sentiment'].tolist()
test_sentence = test_data['Sentence'].tolist()

test_counts = cv.transform(test_data['Sentence'])

test_predicted = MNB.predict(test_counts)
score = metrics.accuracy_score(test_predicted, test_label)

print("Accuracy Score: ", score)

Accuracy Score:  0.5151515151515151
