In [11]:
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import json


In [4]:
class Review:
    """A single review: its text, its numeric score, and a derived sentiment label.

    Attributes (all set in ``__init__``):
        text: the review body, stored as passed in.
        score: the numeric rating, stored as passed in.
        sentiment: "NEGATIVE", "NEUTRAL" or "POSITIVE", computed from ``score``.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed once up front so callers can read it as a plain attribute.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the sentiment label for ``self.score``.

        The bands are contiguous so every score gets a sensible label:
        ``score <= 2`` is "NEGATIVE", ``2 < score <= 3`` is "NEUTRAL",
        and anything above 3 is "POSITIVE". Whole-number scores map exactly
        as they did with the previous ``== 3`` check; fractional scores
        between 2 and 3 no longer fall through to "POSITIVE".
        """
        if self.score <= 2:
            return "NEGATIVE"
        if self.score <= 3:
            return "NEUTRAL"
        return "POSITIVE"

Load dataset

In [14]:
dataset_path = 'datasets/bookReviews.json'

reviews = []

# The file is parsed line by line, one JSON object per line.
# An explicit encoding keeps decoding independent of the platform default
# (UTF-8 is the standard encoding for JSON text).
with open(dataset_path, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line); json.loads rejects them.
        if not line.strip():
            continue

        review = json.loads(line)

        text = review['reviewText']
        score = review['overall']

        reviews.append(Review(text, score))

# Spot-check one parsed review.
reviews[5].sentiment

'POSITIVE'

In [6]:
# Hold out 33% of the reviews as the test partition; random_state is pinned
# so the same split is produced on every run.
training, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

print(training[0].sentiment)

POSITIVE


In [7]:
# Turn each partition into two parallel lists: the review text (model input)
# and the sentiment label (prediction target).
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

# Only the final expression of a cell is displayed, so the earlier bare
# `train_x[0]` was a silent no-op and has been dropped.
train_y[0]

'POSITIVE'

<br>

**Bag-of-words (BoW) vectorization**

In [8]:
vectorizer = CountVectorizer()

# Fit on the training text only, then apply the same fitted vectorizer to the
# test text so both matrices are built from one vocabulary.
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

# Show the first encoded training document. The bare `train_x[0]` that used to
# precede this line was never displayed (only the last expression is shown).
train_x_vectors[0]


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 17 stored elements and shape (1, 7372)>

***Classification***
<br>

#### Linear SVM classifier

In [9]:
# Support-vector classifier with a linear kernel, trained on the BoW vectors.
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)

# Sanity check: predict the label of the first test review.
# (The standalone `test_x_vectors[0]` expression that preceded this call
# produced no output and has been removed.)
clf_svm.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

#### Decision Tree Classifier

In [10]:
# Pin random_state so refitting produces the same tree on every run; the seed
# value matches the one already used for the train/test split.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Sanity check: predict the label of the first test review.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')