### Data Class

In [1]:
import random

class Sentiment:
    """String constants used as the sentiment labels of a review."""
    # The values are plain strings, so labels compare equal to their literal text.
    POSITIVE = 'POSITIVE'
    NEUTRAL = 'NEUTRAL'
    NEGATIVE = 'NEGATIVE'

class Review:
    """One review: its text, its numeric score, and the label derived from the score."""

    def __init__(self, comment, score):
        """Store the text and score, then derive the sentiment label once.

        Args:
            comment: the review text.
            score: numeric rating, compared against 3 to pick the label.
        """
        self.comment = comment
        self.score = score
        # Computed at construction time; later changes to `score` do not update it.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the ``Sentiment`` label that corresponds to ``self.score``.

        A score above 3 is positive, a score equal to 3 is neutral, and any
        other score is negative.
        """
        rating = self.score
        if rating > 3:
            return Sentiment.POSITIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.NEGATIVE

class ReviewContainer:
    """Wraps a list of review objects and exposes their texts and labels.

    Every element of ``reviews`` is read through its ``comment`` and
    ``sentiment`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_comment(self):
        """Return the ``comment`` of every review, in container order."""
        return [x.comment for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` label of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance ``self.reviews`` to equal numbers of negative and positive reviews.

        Reviews with any other label (e.g. neutral) are dropped. Both classes
        are trimmed to the size of the smaller one, keeping the earliest
        reviews of each class, and the result is shuffled. ``self.reviews`` is
        rebound to the new list; the list passed to the constructor is not
        modified.

        Args:
            seed: optional seed for a reproducible shuffle. When ``None``
                (the default) the module-level ``random`` state is used.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes: truncating only the positives leaves the set
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        balanced = negative[:keep] + positive[:keep]

        if seed is None:
            random.shuffle(balanced)
        else:
            # A private generator avoids disturbing the global random state.
            random.Random(seed).shuffle(balanced)
        self.reviews = balanced

### Load Data

In [2]:
import json

# Build one Review per line of the file; each line is parsed as its own JSON object.
reviews = []
# Read as UTF-8 explicitly (the standard JSON encoding) instead of relying on
# the platform's locale-dependent default.
with open('./data/Books_small_10000.json', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # a blank or trailing empty line would make json.loads raise
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
# Spot-check the label derived for the fifth review.
reviews[4].sentiment

'NEUTRAL'

### Prep Data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state makes the split repeatable.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Wrap each split so it can be balanced and unpacked into texts and labels.
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

In [4]:
# Balance the training set: keep negative and positive reviews only, so that
# accuracy is not dominated by the majority class. The result is shuffled.
train_container.evenly_distribute()
train_X = train_container.get_comment()
train_y = train_container.get_sentiment()

# The test set is balanced the same way.
test_container.evenly_distribute()
test_X = test_container.get_comment()
test_y = test_container.get_sentiment()

# Sanity check: the two counts should be equal after balancing.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


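#### Bag-of-words vectorization

Sample sentences<br>
1: This product is good<br>
2: This product was bad<br>
#### Bag of words: [this, product, is, good, was, bad] <br>
Sample count vectors (one entry per vocabulary word)<br>
1: [1,1,1,1,0,0]<br>
2: [1,1,0,0,1,1]<br>
Note: the code below uses `TfidfVectorizer`, so the actual vectors hold TF-IDF weights rather than raw counts.<br>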

In [21]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF weighting is used here; CountVectorizer() (raw term counts) is the alternative imported above.
vectorizer = TfidfVectorizer()
# Learn the vocabulary from the training text only, then reuse it for the test text.
train_X_vectors = vectorizer.fit_transform(train_X)
test_X_vectors = vectorizer.transform(test_X)

# Show the first training review next to its vector.
print(train_X[0])
print(train_X_vectors[0].toarray())


This book didn't capture my niece's interest.  There wasn't a great variety of textures to offer her any variety.  I was disappointed in this purchase.
[[0. 0. 0. ... 0. 0. 0.]]


## Classification

#### Linear SVM

In [22]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the vectorized training reviews.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_X_vectors, train_y)

# Predict the label of the first test review (a one-row slice of the test matrix).
# The bare `test_X[0]` expression that used to precede this line was removed:
# only a cell's last expression is displayed, so its value was silently discarded.
clf_svm.predict(test_X_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, trained on the same vectors as the SVM.
# NOTE(review): no random_state is passed, so the fitted tree (and its score) may differ between runs - confirm whether that is acceptable.
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_X_vectors, train_y)

# Predict the label of the first test review.
clf_dec.predict(test_X_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes is given dense arrays, hence the .toarray() calls on the vectorizer output.
clf_naiv = GaussianNB()
clf_naiv.fit(train_X_vectors.toarray(), train_y)

# Predict the label of the first test review.
clf_naiv.predict(test_X_vectors[0].toarray())

array(['POSITIVE'], dtype='<U8')

#### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the same vectors as the other models.
clf_lgc = LogisticRegression()
clf_lgc.fit(train_X_vectors, train_y)

# Predict the label of the first test review.
clf_lgc.predict(test_X_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

#### Mean Accuracy

In [26]:
# Mean accuracy on the test set, printed in this order:
# linear SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(clf_svm.score(test_X_vectors, test_y))
print(clf_dec.score(test_X_vectors, test_y))
print(clf_naiv.score(test_X_vectors.toarray(), test_y))  # dense input, as in training
print(clf_lgc.score(test_X_vectors, test_y))

0.8076923076923077
0.6394230769230769
0.6610576923076923
0.8052884615384616


#### F1 Score

In [27]:
from sklearn.metrics import f1_score

# average=None yields one F1 score per label, in the order given by `labels` (positive, then negative).
print(f1_score(test_y, clf_svm.predict(test_X_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE]))

[0.80582524 0.80952381]


In [28]:
# Number of positive labels in the balanced test set.
print(test_y.count(Sentiment.POSITIVE))


208


In [29]:
# Hand-written examples to probe the SVM qualitatively.
test_set = ['I thoroughly enjoyed this, 5 stars', 'bad book do not buy', 'horrible waste of time', 'what a great book', 'what a great book, I can not finish!']
# Use transform (not fit_transform) so the examples are mapped onto the vocabulary already fitted on the training data.
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE'],
      dtype='<U8')

### Tuning our model (with GridSearchCV)

In [33]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: two kernels crossed with five values of C.
parameters = {'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32)}

svc = svm.SVC()
# Evaluate each combination with 5-fold cross-validation.
clf_grid_search = GridSearchCV(svc, parameters, cv=5)

# The search is fitted on the training vectors only.
clf_grid_search.fit(train_X_vectors, train_y)



GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [36]:
# Score of the fitted grid search on the held-out test vectors.
print(clf_grid_search.score(test_X_vectors, test_y))

0.8076923076923077


### Saving Model

In [40]:
import pickle

# Persist the fitted grid-search object to disk.
# NOTE(review): only the classifier is pickled here; the fitted `vectorizer` is not saved in this cell,
# so raw text cannot be turned into features from this file alone.
# NOTE(review): assumes the ./models directory already exists - confirm.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf_grid_search, f)

### Loading Model

In [41]:
# Security: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

# Check the reloaded model on the first test review, using the vectors computed earlier in the notebook.
print(test_X[0])
loaded_clf.predict(test_X_vectors[0])

I choose this book because I enjoy books about the 2nd world war.  Great reading and another &#34;can't put the book down.&#34; I reccommend it to all that is into this subject.


array(['POSITIVE'], dtype='<U8')

In [42]:
# Thanks to Keith Galli: https://www.youtube.com/watch?v=M9Itm95JzL0