In [29]:
import json
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB


In [5]:
class Review:
    """A single review: its text, its numeric score and a derived sentiment label.

    Attributes:
        text: The review body as passed to the constructor.
        score: The numeric rating as passed to the constructor.
        sentiment: 'NEGATIVE', 'NEUTRAL' or 'POSITIVE', computed once from
            ``score`` when the instance is created.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived eagerly so every instance carries its label from creation.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a sentiment label.

        Returns:
            'NEGATIVE' for a score of 2 or lower, 'NEUTRAL' for a score equal
            to 3, and 'POSITIVE' for every other score (e.g. 4 or 5).
        """
        rating = self.score
        if rating <= 2:
            return 'NEGATIVE'
        if rating == 3:
            return 'NEUTRAL'
        return 'POSITIVE'

### Load the Dataset

In [19]:
# Read the dataset: one JSON object per line, each providing the review text
# ('reviewText') and its rating ('overall').
reviews = []

dataset_path = 'data/books_small.json'
# JSON text is UTF-8; name the encoding explicitly so loading does not depend
# on the platform's default locale encoding.
with open(dataset_path, encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines, which json.loads would otherwise reject.
        if not line.strip():
            continue
        review = json.loads(line)

        reviews.append(Review(review['reviewText'], review['overall']))
        

#### <center>Split data into training and testing sets</center>

In [11]:
# Hold out 30% of the reviews for testing; the fixed seed keeps the split
# identical from run to run.
training, testing = train_test_split(reviews, test_size=0.3, random_state=42)

# Separate each subset into model inputs (raw text) and targets (sentiment label).
train_x, train_y = (
    [sample.text for sample in training],
    [sample.sentiment for sample in training],
)
test_x, test_y = (
    [sample.text for sample in testing],
    [sample.sentiment for sample in testing],
)


#### <center>Bag of words vectorization (convert text to numerical vectors)</center>

In [17]:
vectorizer = CountVectorizer()

# The vocabulary is learned from the training text only (fit_transform); the
# test text is then encoded with that same fitted vocabulary (transform).
train_x_vectors, test_x_vectors = (
    vectorizer.fit_transform(train_x),
    vectorizer.transform(test_x),
)


<h1><center>Supervised Classifiers</center></h1>

##### Logistic Regression:

In [30]:
# fit() returns the estimator itself, so construction and training can be chained.
clf_log = LogisticRegression().fit(train_x_vectors, train_y)

# Sanity check: predict the sentiment of the first test review.
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

##### Decision Tree:

In [23]:
# Seed the tree so repeated runs build the same model: scikit-learn permutes
# the candidate features at each split, so an unseeded tree can differ
# between runs even on identical training data.
clf_decision = DecisionTreeClassifier(random_state=42)
clf_decision.fit(train_x_vectors, train_y)

# Sanity check: predict the sentiment of the first test review.
clf_decision.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

##### Linear SVM:

In [32]:
# Support vector classifier with a linear kernel; fit() returns the estimator,
# so construction and training are chained.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Sanity check: predict the sentiment of the first test review.
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

##### Naive Bayes:

In [40]:
# GaussianNB needs dense arrays rather than sparse matrices, so the vectors
# are converted with .toarray() for both training and prediction.
dense_train_vectors = train_x_vectors.toarray()
clf_gnb = GaussianNB().fit(dense_train_vectors, train_y)

# Sanity check: predict the sentiment of the first test review.
clf_gnb.predict(test_x_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')