### Data Class

In [131]:
import random

class Sentiment:
    """String labels for the three sentiment classes a review can carry."""

    # Each label's value is identical to its attribute name.
    NEGATIVE, NEUTRAL, POSITIVE = 'NEGATIVE', 'NEUTRAL', 'POSITIVE'

class Review:
    """One review: its text, its numeric rating and the sentiment derived from it."""

    def __init__(self, comment, score):
        """Store the text and rating, then derive the sentiment label.

        Args:
            comment: The review text.
            score: Numeric rating; compared against 3 to pick the label.
        """
        self.comment = comment
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the Sentiment label that matches ``self.score``.

        A score above 3 is POSITIVE, a score of exactly 3 is NEUTRAL and
        every other score is NEGATIVE.
        """
        rating = self.score
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE if rating > 3 else Sentiment.NEGATIVE

class ReviewContainer:
    """Holds a list of reviews and exposes their texts and labels as parallel lists."""

    def __init__(self, reviews):
        """Keep a reference to ``reviews``.

        Args:
            reviews: List of objects exposing ``comment`` and ``sentiment``
                attributes.
        """
        self.reviews = reviews

    def get_comment(self):
        """Return the ``comment`` of every held review, in order."""
        return [x.comment for x in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every held review, in order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal NEGATIVE and POSITIVE counts.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. The
        larger of the two classes is truncated to the size of the smaller
        one, so the result is balanced regardless of which class dominates.
        The balanced list is then shuffled in place with the module-level
        ``random`` generator and replaces ``self.reviews``; the list object
        originally passed in is left untouched.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Trim BOTH classes to the smaller count; trimming only the positives
        # leaves the data unbalanced whenever negatives are the majority.
        keep = min(len(negative), len(positive))
        balanced = negative[:keep] + positive[:keep]
        random.shuffle(balanced)
        self.reviews = balanced

### Load Data

In [132]:
import json

# Parse the JSON-lines dataset: one review object per line, of which only the
# 'reviewText' and 'overall' fields are used.
reviews = []
# Pin the encoding instead of relying on the platform's locale default, so the
# file decodes the same way on every machine.
with open('./data/Books_small_10000.json', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, which json.loads would reject
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
# Spot-check: echo the derived label of the fifth review.
reviews[4].sentiment

'NEUTRAL'

### Prep Data

In [133]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state keeps the
# partition repeatable between runs.
train, test = train_test_split(
    reviews,
    test_size=0.33,
    random_state=42,
)

# Wrap each partition so its comments and labels can be pulled out as lists.
train_container, test_container = (ReviewContainer(part) for part in (train, test))

In [148]:
# evenly_distribute() shuffles with the module-level `random` generator; seed
# it so the sample order (and every result derived from it) is repeatable.
random.seed(42)

# Rebalance each split between NEGATIVE and POSITIVE reviews so that mean
# accuracy is not dominated by the majority class.
train_container.evenly_distribute()
train_X = train_container.get_comment()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_X = test_container.get_comment()
test_y = test_container.get_sentiment()

# Sanity check: the two training class counts should match.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))

436
436


#### Bag-of-words vectorization

Sample sentences<br>
1: This product is good<br>
2: This product was bad<br>
#### Bag of words (vocabulary): [this, product, is, good, was, bad]<br>
Sample count vectors<br>
1: [1,1,1,1,0,0]<br>
2: [1,1,0,0,1,1]<br>

In [149]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training comments only, then encode the test
# comments with that same fitted vectorizer.
vectorizer = CountVectorizer()
train_X_vectors = vectorizer.fit_transform(train_X)
test_X_vectors = vectorizer.transform(test_X)

# Show the first training comment followed by its count vector.
print(train_X[0], train_X_vectors[0].toarray(), sep='\n')


I've been drinking ACV for a while. The book has a lot of information about different ways to use it. Every topic is written in a short section. The book is a very quick and easy read. Never had any idea how many uses/benefits you can get from apple cider vinegar. I now take a dose with honey every day. A good reference book.
[[0 0 0 ... 0 0 0]]


## Classification

#### Linear SVM

In [150]:
from sklearn import svm

# Train a linear-kernel support vector classifier on the count vectors
# (fit() returns the estimator itself, so the calls can be chained).
clf_svm = svm.SVC(kernel='linear').fit(train_X_vectors, train_y)

# Bare look-up of the first test comment; its value is discarded because only
# the final expression of a cell is echoed.
test_X[0]

# Predicted label for the first test comment.
clf_svm.predict(test_X_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [151]:
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so the fitted tree, and therefore its predictions and
# scores, are the same on every run (ties between equally good splits are
# otherwise broken randomly).
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_X_vectors, train_y)

# Predicted label for the first test comment.
clf_dec.predict(test_X_vectors[0])

array(['NEGATIVE'], dtype='<U8')

#### Naive Bayes

In [152]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB is fed dense arrays here, so the sparse count matrices are
# converted with .toarray() for both training and prediction.
clf_naiv = GaussianNB().fit(train_X_vectors.toarray(), train_y)

# Predicted label for the first test comment.
clf_naiv.predict(test_X_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [153]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, trained on the count vectors
# (fit() returns the estimator itself, so the calls can be chained).
clf_lgc = LogisticRegression().fit(train_X_vectors, train_y)

# Predicted label for the first test comment.
clf_lgc.predict(test_X_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

#### Mean Accuracy

In [154]:
# Mean accuracy on the test split, one line per model, in the order:
# linear SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(
    clf_svm.score(test_X_vectors, test_y),
    clf_dec.score(test_X_vectors, test_y),
    clf_naiv.score(test_X_vectors.toarray(), test_y),  # dense input, as in training
    clf_lgc.score(test_X_vectors, test_y),
    sep='\n',
)

0.7980769230769231
0.6370192307692307
0.6346153846153846
0.8149038461538461


#### F1 Score

In [155]:
from sklearn.metrics import f1_score

# Per-class F1 of the linear SVM on the test split; average=None returns one
# score per entry of `labels`, in that order (POSITIVE first, then NEGATIVE).
print(
    f1_score(
        test_y,
        clf_svm.predict(test_X_vectors),
        average=None,
        labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    )
)

[0.8028169  0.79310345]


In [156]:
# Number of POSITIVE labels in the test split.
print(sum(label == Sentiment.POSITIVE for label in test_y))


208
