## Data Class

In [80]:
import random

class Sentiment:
    NEGATIVE = "NEVATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or greater
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

## Load Data

In [81]:
import json
import os

file_name = os.path.join('data', 'sentiment', 'Books_small_10000.json')

reviews = []

with open(file_name) as file:
    for line in file:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'],review['overall']))
        
reviews[5].sentiment

'POSITIVE'

## Prep Data

In [82]:
from sklearn.model_selection import train_test_split
# This will split our data into a training set and a testing set.
training, test = train_test_split(reviews, test_size=0.33, random_state = 42)

train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [83]:
# This will split the training and test variables into the review text and its sentiment value

train_container.evenly_distribute()
training_x = train_container.get_text()
training_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

### Bag of Words Vectorization

In [84]:
from sklearn.feature_extraction.text import CountVectorizer #Use Tfidf instead next time

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(training_x)
test_x_vectors = vectorizer.transform(test_x)

train_x_vectors

<872x8906 sparse matrix of type '<class 'numpy.int64'>'
	with 53647 stored elements in Compressed Sparse Row format>

## Classification

#### Linear SVM

In [85]:
from sklearn import svm #support vector machine

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, training_y)

test_x[0]

clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [86]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, training_y)

clf_dec.predict(test_x_vectors[0])

array(['NEVATIVE'], dtype='<U8')

#### Logistic Regression

In [87]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, training_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Evaluation

In [88]:
# Check to see how accurate these classifcations are at predicting the correct values

# Mean accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.7980769230769231
0.6610576923076923
0.8149038461538461


In [89]:
# F1 Scores
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])

array([0.8028169 , 0.79310345])

### Testing With New Data

In [95]:
test_set = ['I thoroughly enjoyed this, 5 stars', 'bad book do not buy', 'horrible waste of time']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEVATIVE', 'NEVATIVE'], dtype='<U8')

### Ways to Improve