In [19]:
import random

class Sentiment:
    """String labels used to tag a review's sentiment class."""

    POSITIVE = "Positive"
    NEUTRAL = "Neutral"
    NEGATIVE = "Negative"

class Review:
    """A single review: its text, its numeric score and the derived sentiment label."""

    def __init__(self, text, score):
        """Store ``text`` and ``score`` and compute ``sentiment`` from the score."""
        self.text, self.score = text, score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to one of the ``Sentiment`` labels.

        Scores of 2 or below are negative, a score of exactly 3 is neutral,
        and every other score is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
    
class ReviewContainer:
    """Holds a list of reviews and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        """Keep a reference to ``reviews`` (objects with ``text`` and ``sentiment``)."""
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every held review, in order."""
        return [i.text for i in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` label of every held review, in order."""
        return [i.sentiment for i in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal numbers of negative and positive reviews.

        Reviews with any other sentiment (e.g. neutral) are dropped. Whichever
        of the two classes is larger is truncated to the size of the smaller
        one, and the result is shuffled with the module-level ``random``
        generator. ``self.reviews`` is replaced; nothing is returned.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: trimming only the positives leaves the set
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)
        

In [2]:
import json

file_name = 'Books_small_10000.json'

# The file is read as JSON Lines: each non-empty line is parsed on its own and
# turned into a Review from its 'reviewText' and 'overall' fields.
reviews = []
# Pin the encoding instead of relying on the platform default, which is not
# UTF-8 everywhere.  NOTE(review): assumes the data file is UTF-8 -- confirm.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # A blank line (e.g. a trailing newline at EOF) is not valid JSON; skip it.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

In [3]:
# Sanity check: show the sentiment label derived for the first loaded review.
print(reviews[0].sentiment)

Positive


### Data Preparation

In [20]:
from sklearn.model_selection import train_test_split

# Seed Python's global RNG: evenly_distribute() shuffles with random.shuffle,
# so without this the review order (and any "sample at index k" shown later)
# changes on every run even though train_test_split is pinned by random_state.
random.seed(42)

training, test = train_test_split(reviews, test_size = 0.33, random_state = 42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# Balance negative/positive reviews in each split (mutates the containers).
train_container.evenly_distribute()
test_container.evenly_distribute()

len(test_container.reviews)

416

In [21]:
# Pull the raw texts (x) and sentiment labels (y) out of each balanced container.
train_x, train_y = train_container.get_text(), train_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

# Number of positive labels in the training split.
train_y.count(Sentiment.POSITIVE)

436

### Bag of words representation (TF-IDF weighted)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fit the TF-IDF vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the test texts.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
# Displayed as the cell output: shape of the training matrix.
train_x_vectors.shape

(872, 8906)

### Classification step

#### Linear SVM

In [32]:
from sklearn import svm

# Build and train the linear-kernel SVM in one step (fit returns the estimator).
clf_svm = svm.SVC(kernel = 'linear').fit(train_x_vectors, train_y)

# Spot check: print one test review, then show the label predicted for it.
print(test_x[3])
clf_svm.predict(test_x_vectors[3])

Unlike some heroines not to be named here, I can see why the hero is drawn to this one.  A fun, quick read with interesting and well-drawn imagery and a different premise.  Enjoyed it so much that I sought out the other title in the growing series and look forward to more.


array(['Positive'], dtype='<U8')

#### Decision Tree

In [33]:
from sklearn import tree

# Fix random_state so the fitted tree, and therefore the scores reported
# below, are the same on every run.
clf_dt = tree.DecisionTreeClassifier(random_state = 42)

clf_dt = clf_dt.fit(train_x_vectors, train_y)

# Spot check: print one test review, then show the label predicted for it.
print(test_x[3])

clf_dt.predict(test_x_vectors[3])

Unlike some heroines not to be named here, I can see why the hero is drawn to this one.  A fun, quick read with interesting and well-drawn imagery and a different premise.  Enjoyed it so much that I sought out the other title in the growing series and look forward to more.


array(['Positive'], dtype='<U8')

#### Naive Bayes

In [34]:
from sklearn.naive_bayes import GaussianNB

# Build and train Gaussian Naive Bayes in one step; the vectors are converted
# to dense arrays with .toarray() before being handed to the model.
clf_gnb = GaussianNB().fit(train_x_vectors.toarray(), train_y)

# Spot check: print one test review, then show the label predicted for it.
print(test_x[3])
clf_gnb.predict(test_x_vectors[3].toarray())

Unlike some heroines not to be named here, I can see why the hero is drawn to this one.  A fun, quick read with interesting and well-drawn imagery and a different premise.  Enjoyed it so much that I sought out the other title in the growing series and look forward to more.


array(['Negative'], dtype='<U8')

### Evaluation


In [35]:
# Score the linear SVM on the held-out test vectors.
clf_svm.score(test_x_vectors, test_y)

0.8076923076923077

In [36]:
# Score the decision tree on the held-out test vectors.
clf_dt.score(test_x_vectors, test_y)

0.6442307692307693

In [37]:
# Score Gaussian Naive Bayes on the held-out test vectors (dense input, as in training).
clf_gnb.score(test_x_vectors.toarray(), test_y)

0.6610576923076923

In [38]:
from sklearn.metrics import f1_score

# Per-class F1 (positive first, then negative) for each model, printed in the
# order: Naive Bayes, decision tree, linear SVM.  Naive Bayes is given the
# dense form of the test vectors, matching how it was trained.
for estimator, test_features in (
    (clf_gnb, test_x_vectors.toarray()),
    (clf_dt, test_x_vectors),
    (clf_svm, test_x_vectors),
):
    print(f1_score(test_y, estimator.predict(test_features), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))

[0.65693431 0.66508314]
[0.6407767  0.64761905]
[0.80582524 0.80952381]


### Tuning with Grid Search

In [39]:
from sklearn.model_selection import GridSearchCV

In [40]:
# Search space: every combination of kernel and C is tried.
parameters = dict(
    kernel=('linear', 'poly', 'rbf'),
    C=(1, 4, 8, 16, 32),
)

svc = svm.SVC()

# 5-fold cross-validated grid search over the SVC hyper-parameters.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv = 5)

clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32),
                         'kernel': ('linear', 'poly', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [41]:
# Tuned model on the test split: overall score first, then per-class F1
# (positive, negative) on the following line.
print(
    clf.score(test_x_vectors, test_y),
    f1_score(test_y, clf.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]),
    sep="\n",
)

0.8197115384615384
[0.82269504 0.81662592]


### Saving the Model

In [42]:
import pickle
# Persist the tuned classifier (the fitted GridSearchCV object).
with open('Sentiment_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

# The classifier was trained on the output of the fitted TF-IDF vectorizer,
# so raw text cannot be classified later without that same vectorizer. Save it
# alongside the model; the model file above is unchanged.
# NOTE: only unpickle these files from a trusted source -- pickle.load can
# execute arbitrary code.
with open('Sentiment_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
    