In [0]:
import pandas as pd
import json

## Load Data

In [0]:
# Preferred approach: read the JSON Lines file straight into a DataFrame.

url_books = 'https://raw.githubusercontent.com/KeithGalli/sklearn/master/data/category/Books_small.json'
# lines=True parses one JSON record per line; only the review text and the
# 'overall' rating column are kept.
df_books = pd.read_json(url_books, lines=True)[['reviewText', 'overall']]
df_books

Unnamed: 0,reviewText,overall
0,Da Silva takes the divine by storm with this u...,4
1,For me personally it's the most disappointing ...,2
2,"Very simple book, but leaves you feeling good....",4
3,I read a library copy of this exceptionally we...,5
4,With the government knowing this could happen ...,5
...,...,...
995,I thoroughly enjoyed this book. I've read the ...,5
996,I was impressed with not only the characters o...,5
997,I like the characters. I had read the short s...,5
998,"She got way she wanted,but can she pick the ri...",5


In [0]:
#The way from tutorial
import random

class Sentiment:
    """String labels used to tag reviews; each value equals its attribute name."""
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'


class Review:
    """One review: its text, its numeric score and the label derived from the score."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once, at construction time, from the score stored above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label for this review's score.

        Scores of 2 or less are NEGATIVE, a score of exactly 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE


class ReviewContainer:
    """Holds a list of reviews and exposes their texts and sentiment labels.

    Every item in ``reviews`` must provide ``text`` and ``sentiment``
    attributes (for example ``Review`` instances).
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in container order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in container order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self):
        """Balance the container in place to equal NEGATIVE/POSITIVE counts.

        Prints how many negative and positive reviews were found, keeps the
        first ``min(#negative, #positive)`` reviews of each class, drops every
        other review (including NEUTRAL ones) and shuffles the result with the
        module-level ``random`` generator. ``self.reviews`` is rebound to a
        new list; the list originally passed to the constructor is not
        modified.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        print("negative ",  len(negative))
        print("positive ", len(positive))

        # Truncate BOTH classes to the smaller count so the result is balanced
        # whichever class is the majority (previously only positives were cut).
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)


In [0]:
def get_sentiment(score):
    """Return the Sentiment label for a numeric score.

    Scores of 2 or less are NEGATIVE, a score of exactly 3 is NEUTRAL,
    and every other score is POSITIVE.
    """
    if score <= 2:
        return Sentiment.NEGATIVE
    if score == 3:
        return Sentiment.NEUTRAL
    return Sentiment.POSITIVE

# Add a label column by applying get_sentiment to each 'overall' rating.
df_books['Sentiment'] = df_books.overall.apply(get_sentiment)
df_books

Unnamed: 0,reviewText,overall,Sentiment
0,Da Silva takes the divine by storm with this u...,4,POSITIVE
1,For me personally it's the most disappointing ...,2,NEGATIVE
2,"Very simple book, but leaves you feeling good....",4,POSITIVE
3,I read a library copy of this exceptionally we...,5,POSITIVE
4,With the government knowing this could happen ...,5,POSITIVE
...,...,...,...
995,I thoroughly enjoyed this book. I've read the ...,5,POSITIVE
996,I was impressed with not only the characters o...,5,POSITIVE
997,I like the characters. I had read the short s...,5,POSITIVE
998,"She got way she wanted,but can she pick the ri...",5,POSITIVE


In [0]:
#file = 'Books_small.json'
file = 'Books_small_10000.json'


reviews = []

# The file holds one JSON object per line. Decode it explicitly as UTF-8 so the
# result does not depend on the platform's default text encoding.
with open(file, encoding='utf-8') as f:
    for record in f:
        if not record.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            continue
        review = json.loads(record)
        reviews.append(Review(review['reviewText'], review['overall']))

In [0]:
# Spot-check: the sentiment label derived for the review at index 5.
reviews[5].sentiment

'POSITIVE'

## Prep data

In [0]:
 #import numpy as np
from sklearn.model_selection import train_test_split

In [0]:
# Hold out 33% of the reviews for testing; random_state pins the split.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Wrap each split so it can be balanced and unpacked into texts and labels.
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

In [0]:
# Balance both splits before extracting texts and labels (the unbalanced
# alternative would read the texts/labels straight from `train` and `test`).
# evenly_distribute() shuffles with the module-level `random` generator, so
# seed it first: the sample order -- and every score computed from it -- is
# then reproducible after a kernel restart.
random.seed(42)

train_container.evenly_distribute()
test_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()


# Sanity check: both classes should now have the same number of training samples.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

negative  436
positive  5611
negative  208
positive  2767
436
436


## Bag of words

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [0]:
# Alternative (disabled): plain term counts instead of TF-IDF weights.
#vectorizer = CountVectorizer()

vectorizer = TfidfVectorizer()
# Fit on the training texts only, then reuse the fitted vectorizer to
# transform the test texts.
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

In [0]:
# Optional inspection (left disabled): vocabulary terms and matrix shape.
# NOTE(review): newer scikit-learn releases expose the vocabulary via
# get_feature_names_out() -- confirm against the installed version before enabling.
#print(vectorizer.get_feature_names())
#print(train_x_vectors.shape)
# Show the vectorized row for the training review at index 1, then its raw text.
print(train_x_vectors[1])
print(train_x[1])

  (0, 8826)	0.35232758843467893
  (0, 6394)	0.23798774394466
  (0, 3625)	0.29330147444557725
  (0, 8797)	0.24864994293082324
  (0, 392)	0.2583657337513091
  (0, 8408)	0.1725247986812799
  (0, 4439)	0.23479278125501726
  (0, 991)	0.1893024668390034
  (0, 5361)	0.22884222655230535
  (0, 2828)	0.32205063939427275
  (0, 8497)	0.47804063970394606
  (0, 387)	0.21150535007934668
  (0, 7929)	0.15402035435218872
  (0, 7976)	0.08635978706150498
  (0, 3177)	0.11124633018543452
  (0, 4264)	0.1048572784626126
I am very excited for the next book! Keep up the amazing work:) very very happy reader......THIS IS A WOW BOOK


In [0]:
# Look up the column index the fitted vectorizer assigned to a word
# (.get returns None when the word is not in the vocabulary).
vectorizer.vocabulary_.get('awful')

687

## Classification

#### Linear SVM

In [0]:
# Support vector classifier with a linear kernel.
from sklearn import svm


clf_svm = svm.SVC(kernel = 'linear')
clf_svm.fit(train_x_vectors, train_y)

# Print the first test review, then show the label predicted for it.
print(test_x[0])
clf_svm.predict(test_x_vectors[0])

This book is published by Amazon - so I was surprised at the clunky writing. The manuscript cried out for an editor who could have removed some of the cliches and the profusion of needless adjectives and adverbs. Of course, no editor could have breathed much life into the wooden characters and stilted dialogue - but that's another issue.The quality of the writing shows itself early when we learn that one character has &#34;dread etched on his face,&#34; and another &#34;burned with curiosity&#34; while a third has a mouth &#34;frozen in a crooked half smile&#34; and for a fourth, &#34;anger coursed through his body.&#34; A young girl writes in her diary that &#34;Virginie respects that which is haram or forbidden.&#34; Kind of the diarist to explain.I have to admit I did not get through this long slog set in Egypt in 1919 and 1940. The world is too full of good books to waste time on mediocre ones.


array(['NEGATIVE'], dtype='<U8')

#### DecisionTree

In [0]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed (same value as the train/test split) so the fitted tree, and
# therefore its predictions and scores, are the same on every run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Print the first test review, then show the label predicted for it.
print(test_x[0])
clf_dec.predict(test_x_vectors[0])

This book is published by Amazon - so I was surprised at the clunky writing. The manuscript cried out for an editor who could have removed some of the cliches and the profusion of needless adjectives and adverbs. Of course, no editor could have breathed much life into the wooden characters and stilted dialogue - but that's another issue.The quality of the writing shows itself early when we learn that one character has &#34;dread etched on his face,&#34; and another &#34;burned with curiosity&#34; while a third has a mouth &#34;frozen in a crooked half smile&#34; and for a fourth, &#34;anger coursed through his body.&#34; A young girl writes in her diary that &#34;Virginie respects that which is haram or forbidden.&#34; Kind of the diarist to explain.I have to admit I did not get through this long slog set in Egypt in 1919 and 1940. The world is too full of good books to waste time on mediocre ones.


array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings.
clf_log = LogisticRegression()

clf_log.fit(train_x_vectors, train_y)

# Print the first test review, then show the label predicted for it.
print(test_x[0])
clf_log.predict(test_x_vectors[0])

This book is published by Amazon - so I was surprised at the clunky writing. The manuscript cried out for an editor who could have removed some of the cliches and the profusion of needless adjectives and adverbs. Of course, no editor could have breathed much life into the wooden characters and stilted dialogue - but that's another issue.The quality of the writing shows itself early when we learn that one character has &#34;dread etched on his face,&#34; and another &#34;burned with curiosity&#34; while a third has a mouth &#34;frozen in a crooked half smile&#34; and for a fourth, &#34;anger coursed through his body.&#34; A young girl writes in her diary that &#34;Virginie respects that which is haram or forbidden.&#34; Kind of the diarist to explain.I have to admit I did not get through this long slog set in Egypt in 1919 and 1940. The world is too full of good books to waste time on mediocre ones.


array(['NEGATIVE'], dtype='<U8')

## Evaluation

Check how well each classifier predicts the labels of the held-out test set.

In [0]:
#Mean Accuracy

In [0]:
# Mean accuracy of the linear SVM on the test vectors.
clf_svm.score(test_x_vectors, test_y)

0.8076923076923077

In [0]:
# Mean accuracy of the decision tree on the test vectors.
clf_dec.score(test_x_vectors, test_y)

0.6129807692307693

In [0]:
# Mean accuracy of the logistic regression on the test vectors.
clf_log.score(test_x_vectors, test_y)

0.8052884615384616

In [0]:
# F1 scores, reported separately for each label listed in `labels`.

from sklearn.metrics  import f1_score

# SVM. NEUTRAL is requested here although the balanced test labels contain no
# NEUTRAL reviews, so its entry comes back as 0.0.
f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels = [Sentiment.NEGATIVE, Sentiment.NEUTRAL, Sentiment.POSITIVE])

  average, "true nor predicted", 'F-score is', len(true_sum)


array([0.80952381, 0.        , 0.80582524])

In [0]:
# SVM: positive and negative both score ~0.81; neutral is 0.0 only because the balanced sets contain no NEUTRAL reviews

In [0]:
# Per-label F1 scores for the decision tree (same label order as above).
f1_score(test_y, clf_dec.predict(test_x_vectors), average = None, labels = [Sentiment.NEGATIVE, Sentiment.NEUTRAL, Sentiment.POSITIVE])

array([0.61575179, 0.        , 0.61016949])

In [0]:
# Decision tree: clearly weaker, ~0.61 for both positive and negative; neutral is again 0.0 (no NEUTRAL reviews in the balanced sets)

In [0]:
# Per-label F1 scores for the logistic regression (same label order as above).
f1_score(test_y, clf_log.predict(test_x_vectors), average = None, labels = [Sentiment.NEGATIVE, Sentiment.NEUTRAL, Sentiment.POSITIVE])

array([0.80760095, 0.        , 0.80291971])

In [0]:
# Logistic regression: on par with the SVM, ~0.80 for both positive and negative; neutral is again 0.0 (no NEUTRAL reviews in the balanced sets)

## Improving our model

In [0]:
# Label counts in the training labels. After evenly_distribute() only
# NEGATIVE and POSITIVE reviews remain, so the NEUTRAL count is 0.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEUTRAL))
print(train_y.count(Sentiment.NEGATIVE))

436
0
436


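There are many more examples of positive reviews than negative ones (5611 vs. 436 in the raw training split), which is why a model trained on the unbalanced data is good on positive test reviews but bad on neutral and negative ones. We need more examples of negative and neutral reviews, or a training set balanced between the classes.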

In [0]:
# Build a fresh container from the raw training split and balance it;
# evenly_distribute() prints the class counts it found before trimming.
cont = ReviewContainer(train)
cont.evenly_distribute()

negative  436
positive  5611


In [0]:
# Number of reviews left in the container after balancing.
len(cont.reviews)

872

## Test time!

In [0]:
# Hand-written reviews to try the trained SVM on unseen text.
test_set = ['I dont like this book. Too silly', 'well, i do not think this book is bad. It is great, recommend ', 'horrible waste of time']

# Reuse the already-fitted vectorizer (transform, not fit_transform).
test_set_vectors = vectorizer.transform(test_set)

clf_svm.predict(test_set_vectors)

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

## Grid Search

In [0]:
from  sklearn.model_selection import GridSearchCV

# Candidate settings: every kernel is combined with every C value.
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 2, 4, 8, 16]}
svc = svm.SVC()

# GridSearchCV evaluates each combination with cross-validation (cv).
clf = GridSearchCV(svc, parameters)

clf.fit(train_x_vectors, train_y)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 2, 4, 8, 16], 'kernel': ('linear', 'rbf')})

In [0]:
# Parameter combination selected by the grid search.
clf.best_params_

{'C': 2, 'kernel': 'rbf'}

In [0]:
# Report the winning parameters, then one line per tried combination.
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
# Each line shows the mean test score, twice its standard deviation as the
# +/- band, and the parameter dict that produced it.
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
print()

Best parameters set found on development set:

{'C': 2, 'kernel': 'rbf'}

Grid scores on development set:

0.827 (+/-0.047) for {'C': 1, 'kernel': 'linear'}
0.829 (+/-0.035) for {'C': 1, 'kernel': 'rbf'}
0.820 (+/-0.029) for {'C': 2, 'kernel': 'linear'}
0.830 (+/-0.031) for {'C': 2, 'kernel': 'rbf'}
0.825 (+/-0.047) for {'C': 4, 'kernel': 'linear'}
0.830 (+/-0.031) for {'C': 4, 'kernel': 'rbf'}
0.825 (+/-0.047) for {'C': 8, 'kernel': 'linear'}
0.830 (+/-0.031) for {'C': 8, 'kernel': 'rbf'}
0.825 (+/-0.047) for {'C': 16, 'kernel': 'linear'}
0.830 (+/-0.031) for {'C': 16, 'kernel': 'rbf'}



## Saving Model

In [0]:
import pickle 

# Persist the fitted grid-search object to disk.
# NOTE(review): the fitted `vectorizer` is not saved here, so raw text cannot be
# classified from this file alone -- consider persisting the vectorizer as well.
with open('sentiment_clf.pkl', 'wb') as f:
    pickle.dump(clf, f)

## Loading Model

In [0]:
# Reload the classifier written by the save step above.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that you created yourself or otherwise trust.
with open('sentiment_clf.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

# Check the restored model by predicting the first two test vectors.
loaded_clf.predict(test_x_vectors[:2])    

array(['NEGATIVE', 'POSITIVE'], dtype='<U8')