Data Class

class Sentiment:
    """String labels for the three sentiment categories assigned to reviews."""

    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"


class Review:
    """A review's text, its numeric score, and the sentiment derived from it.

    ``sentiment`` is computed from ``score`` once, at construction time; it
    is not refreshed if ``score`` is reassigned afterwards.
    """

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and derive ``sentiment`` from the score."""
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the ``Sentiment`` label that corresponds to ``self.score``.

        Scores of 2 or less are NEGATIVE, scores of 4 or more are POSITIVE,
        and anything strictly in between is NEUTRAL.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score < 4:
            # Covers 3 as well as fractional scores such as 2.5 or 3.5, which
            # an exact `== 3` comparison would let fall through to POSITIVE.
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE  # score of 4 or more

Load Data

In [7]:
import json

file_name = './data/sentiment/books_small.json'

# The file is read as JSON Lines: every non-blank line is parsed as its own
# JSON object, from which the 'reviewText' and 'overall' keys are taken.
reviews = []
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which differs between operating systems.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank (e.g. trailing) line is not valid JSON; skip it.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Notebook-style spot check: display the sentiment of one loaded review.
reviews[5].sentiment

'POSITIVE'

Prep Data

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews as the test set; the fixed random_state keeps
# the split the same from run to run.
training, test = train_test_split(
    reviews,
    random_state=42,
    test_size=0.33,
)

In [26]:
# Model inputs: the raw text of every review in each split.
train_x = [review.text for review in training]
test_x = [review.text for review in test]

# Targets: the sentiment label of every review, in the same order as the
# corresponding inputs above.
train_y = [review.sentiment for review in training]
test_y = [review.sentiment for review in test]

# Bag-of-words vectorization (TF-IDF weighted)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Example sentences of the kind being vectorized:
#   This book is great !
#   This book was so bad

vectorizer = TfidfVectorizer()

# Fit on the training text and vectorize it in a single call; this replaces
# a separate vectorizer.fit(train_x) followed by vectorizer.transform(train_x).
train_x_vectors = vectorizer.fit_transform(train_x)

# The test text goes through the already-fitted vectorizer (transform only).
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review followed by its vector as a dense array.
print(train_x[0], train_x_vectors[0].toarray(), sep="\n")

  (0, 2007)	0.20155461003908146
  (0, 3545)	0.09152360145738062
  (0, 5197)	0.20786775381390304
  (0, 1515)	0.24081563602839037
  (0, 539)	0.2795844985810354
  (0, 7353)	0.19986187253464202
  (0, 2895)	0.36946021882113367
  (0, 6593)	0.11098821327231712
  (0, 6475)	0.29281825356337265
  (0, 1558)	0.348387812231681
  (0, 3054)	0.1552116262620815
  (0, 562)	0.17938877649953142
  (0, 6595)	0.07218827779667952
  (0, 1800)	0.3400250782774752
  (0, 350)	0.16011186063775978
  (0, 1148)	0.1612216085384897
  (0, 7086)	0.3834351360039767
  (1, 1494)	0.15591333181975053
  (1, 873)	0.15784295329054357
  (1, 3662)	0.08789184786305634
  (1, 2545)	0.14763819739564865
  (1, 3874)	0.18779758605854352
  (1, 2722)	0.08236785186367496
  (1, 4595)	0.15784295329054357
  (1, 6060)	0.07780690207478286
  :	:
  (669, 5899)	0.1386555682825347
  (669, 4841)	0.06080823508144671
  (669, 7133)	0.02507626035207097
  (669, 6592)	0.06282617307554751
  (669, 4612)	0.030916375427241714
  (669, 6709)	0.055385413390901965
