In [2]:
### Class Definitions

import random

class Sentiment:
    """String labels for the three sentiment classes used throughout the notebook."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """One review: its raw text, its numeric score and the sentiment derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # The label is computed once, at construction time, from the score stored above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Translate ``self.score`` into a ``Sentiment`` label.

        A score of 2 or lower is NEGATIVE, a score equal to 3 is NEUTRAL and
        every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE


class ReviewContainer:
    """Wraps a list of reviews and can rebalance it by sentiment."""

    def __init__(self, reviews):
        self.reviews = reviews

    def evenly_distribute(self, seed=None):
        """Keep equal numbers of NEGATIVE and POSITIVE reviews, shuffled.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Both
        classes are truncated to the size of the smaller one, so the result is
        balanced regardless of which class is more frequent.

        Args:
            seed: Optional seed for the shuffle. When ``None`` (the default)
                the module-level ``random`` generator is used, exactly as
                before; otherwise a private ``random.Random(seed)`` is used so
                the resulting order is reproducible.

        Returns:
            The new balanced list, which is also stored in ``self.reviews``.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Truncate BOTH classes: trimming only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        per_class = min(len(negative), len(positive))
        self.reviews = negative[:per_class] + positive[:per_class]

        if seed is None:
            random.shuffle(self.reviews)
        else:
            random.Random(seed).shuffle(self.reviews)
        return self.reviews

In [10]:
### Load Data

import json

# Reviews file, parsed below as one JSON object per line.
file_name = './data/Books_small_10000.json'

reviews = []

# Decode as UTF-8 explicitly so reading does not depend on the platform's locale default.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # skip blank lines instead of letting json.loads raise on them
        review = json.loads(line)
        # Only the review text and the overall score are kept from each record.
        reviews.append(Review(review['reviewText'], review['overall']))

# Sanity check: every loaded entry should be a Review instance.
print(type(reviews[5]))

<class '__main__.Review'>


In [12]:
from sklearn.model_selection import train_test_split 

# Split the reviews 67% / 33% into training and test sets; the fixed random_state
# makes the shuffled split repeatable.
training, test = train_test_split(
    reviews,
    train_size=0.67,
    test_size=0.33,
    shuffle=True,
    random_state=42,
)

print(len(training))
print(len(test))

# Rebalance each split on its own. evenly_distribute() returns a shuffled list holding
# the NEGATIVE reviews plus at most the same number of POSITIVE ones.
train_container = ReviewContainer(training).evenly_distribute()
test_container = ReviewContainer(test).evenly_distribute()

print(len(train_container))


## Training Data
# Inputs are the review texts; targets are the matching sentiment labels.
x_train = [review.text for review in train_container]
y_train = [review.sentiment for review in train_container]

print(len(x_train))


## Test Data
# Same text/label pairing for the held-out reviews.
x_test = [review.text for review in test_container]
y_test = [review.sentiment for review in test_container]



6700
3300
872
872


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Word-count features: one row per review, one column per word seen in the training
# texts, each value being how often that word occurs in the review.
vectorizer = CountVectorizer()

# The vocabulary is learned from the training texts only; the test texts are then
# encoded with that same vocabulary.
train_x_vectors = vectorizer.fit_transform(x_train)
test_x_vectors = vectorizer.transform(x_test)

# Shapes of both matrices, one per line.
print(train_x_vectors.shape, test_x_vectors.shape, sep='\n')

# Word counts of the first training review and of the first test review, one per line.
print(train_x_vectors[0].toarray(), test_x_vectors[0].toarray(), sep='\n')

# Analyzer callable built from the vectorizer; kept for the optional inspection below.
analyze = vectorizer.build_analyzer()

# Optional inspection helpers (uncomment while exploring):
#   print(vectorizer.get_feature_names_out())   # the words found in the training data
#   print(x_train[0])                           # raw text of the first training review
#   print(analyze(x_train[0]))                  # the analyzer applied to that text
#   print(train_x_vectors.toarray())            # the full dense training matrix


(872, 8906)
(416, 8906)
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]


In [14]:
### Classification -- Support Vector Machine

from sklearn import svm

# Linear-kernel support vector classifier, fitted on the training vectors and labels.
clf_svm = svm.SVC(kernel="linear").fit(train_x_vectors, y_train)

# Predicted sentiment for the first test review (row 0 of the test matrix).
clf_svm.predict(test_x_vectors[0])



array(['POSITIVE'], dtype='<U8')

In [17]:
### Classification -- Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Decision-tree classifier on the same training vectors. Tree construction involves
# randomness, so the seed is fixed (same value as the train/test split) to make the
# fitted tree, and therefore its predictions and scores, repeatable across runs.
clf_dec = DecisionTreeClassifier(random_state=42)

clf_dec.fit(train_x_vectors, y_train)

# Predicted sentiment for the first test review (row 0 of the test matrix).
clf_dec.predict(test_x_vectors[0])


array(['NEGATIVE'], dtype='<U8')

In [16]:
### Classification -- Naive Bayes

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes; the sparse count matrices are converted to dense arrays
# before being handed to the model, both for fitting and for prediction.
clf_gnb = GaussianNB().fit(train_x_vectors.toarray(), y_train)

# Predicted sentiment for the first test review (row 0 of the test matrix).
clf_gnb.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

In [18]:
### Classification -- Logistic Regression

from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training vectors and labels.
clf_log = LogisticRegression().fit(train_x_vectors, y_train)

# Predicted sentiment for the first test review (row 0 of the test matrix).
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

In [22]:
### Evaluation - Mean Accuracy

#print(clf_svm.score(test_x_vectors, y_test))

#print(clf_dec.score(test_x_vectors, y_test))

#print(clf_gnb.score(test_x_vectors.toarray(), y_test))

#print(clf_log.score(test_x_vectors, y_test))

### F1 Scores

from sklearn.metrics import f1_score

# Per-class F1 score of every classifier on the test set, printed in the order
# SVM, decision tree, naive Bayes, logistic regression. Each printed array holds
# the scores in the order [POSITIVE, NEGATIVE].
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]
for fitted_model, test_features in (
    (clf_svm, test_x_vectors),
    (clf_dec, test_x_vectors),
    (clf_gnb, test_x_vectors.toarray()),  # this model is given the dense array
    (clf_log, test_x_vectors),
):
    print(f1_score(y_test, fitted_model.predict(test_features), average=None, labels=f1_labels))

# Class balance of the training labels: POSITIVE is printed, NEGATIVE is the cell's value.
print(y_train.count(Sentiment.POSITIVE))
y_train.count(Sentiment.NEGATIVE)


[0.8028169  0.79310345]
[0.63461538 0.63461538]
[0.59574468 0.66666667]
[0.82325581 0.81094527]
436


436

In [20]:
# Class balance of the test labels: POSITIVE count first, then NEGATIVE count.
print(y_test.count(Sentiment.POSITIVE))
print(y_test.count(Sentiment.NEGATIVE))

208
208
