In [82]:
import random

class Sentiment:
    """String labels for the three sentiment classes used in this notebook."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"


class Review:
    """A single review: its text, its numeric score and the derived sentiment."""

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Derived once, at construction time, from the score stored above.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to one of the ``Sentiment`` labels.

        A score of 2 or less is negative, exactly 3 is neutral, and
        anything else (score 4 or 5) is positive.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE  # score 4 or 5
        

class ReviewContainer:
    """Wraps a list of reviews and exposes their texts and sentiment labels."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [x.text for x in self.reviews]

    def get_sentient(self):
        """Return the sentiment label of every review, in the current order."""
        return [x.sentiment for x in self.reviews]

    def evely_distribute(self):
        """Rebalance to equal numbers of negative and positive reviews.

        Reviews that are neither negative nor positive are dropped. Both
        classes are truncated to the size of the smaller one, then
        ``self.reviews`` is replaced by the shuffled result. The shuffle
        uses the module-level ``random`` state, so seed it beforehand for a
        reproducible order.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH classes: shrinking only the positives leaves the set
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)

    # Correctly spelled aliases; the original names stay for existing callers.
    get_sentiment = get_sentient
    evenly_distribute = evely_distribute
        

## Load Data

In [65]:
import json

# Location of the input file; each line is parsed as one JSON document.
file_name = "./data/sentiment/Books_small_10000.json"

reviews = []

# Explicit UTF-8: without it open() decodes with the platform's locale
# encoding, which can fail or corrupt non-ASCII review text.
with open(file_name, encoding="utf-8") as f:
    for line in f:
        review = json.loads(line)
        # Only the review body and its overall rating are kept.
        reviews.append(Review(review['reviewText'], review['overall']))

In [66]:
# Spot check: sentiment label derived for the review at index 5.
reviews[5].sentiment

'POSITIVE'

## Prep Data

In [87]:
from sklearn.model_selection import train_test_split


# Hold out 33% of the reviews for testing; random_state pins the split.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

# Wrap each split so texts and labels can be extracted from it.
train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

In [88]:
# evely_distribute() shuffles with the global `random` state. Seeding it here
# makes the sample order (and therefore test_x[0] further down) reproducible
# every time this cell is run.
random.seed(42)

train_container.evely_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentient()

# The test split is balanced the same way as the training split.
test_container.evely_distribute()

test_x = test_container.get_text()
test_y = test_container.get_sentient()


## Bag of words vectorization

In [113]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Turn the raw review texts into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
# Fit on the training texts only ...
train_x_vectors = vectorizer.fit_transform(train_x)
# ... and reuse the fitted vectorizer (transform, not fit_transform) on the
# test texts so both matrices share the same feature columns.
test_x_vectors = vectorizer.transform(test_x)


## Classification

#### Linear SVM

In [114]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
clf_svc = SVC(kernel='linear')

clf_svc.fit(train_x_vectors, train_y)
# Sanity check: predict the label of the first test vector.
clf_svc.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Decision Tree

In [115]:
from sklearn.tree import DecisionTreeClassifier

# random_state pins the tree's internal randomness so the fitted tree, and the
# scores reported for it below, are the same on every run.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(train_x_vectors, train_y)

# Sanity check: predict the label of the first test vector.
clf_dec.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

#### Naive Bayes

In [116]:
from sklearn.naive_bayes import GaussianNB
# GaussianNB is given dense input here, so the sparse TF-IDF matrix is
# densified. Use .toarray() (plain ndarray) rather than .todense(), which
# returns np.matrix and is rejected by current scikit-learn releases.
clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors.toarray(), train_y)

# Slice the sparse matrix first so only the single row is densified.
clf_gnb.predict(test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

#### Logistic Regression

In [117]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier with default settings.
clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

# Sanity check: predict the label of the first test vector.
clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

In [118]:
# Mean accuracy of each classifier on the test set.
print(f"Model SVC Score: {clf_svc.score(test_x_vectors, test_y)}")
print(f"Model Decision Tree Score: {clf_dec.score(test_x_vectors, test_y)}")
# The Naive Bayes model was fitted on dense input; .toarray() yields a plain
# ndarray (np.matrix from .todense() is rejected by current scikit-learn).
print(f"Model Naive Bayes Score: {clf_gnb.score(test_x_vectors.toarray(), test_y)}")
# clf_log is a LogisticRegression, so it is labelled as such.
print(f"Model Logistic Regression Score: {clf_log.score(test_x_vectors, test_y)}")



Model SVC Score: 0.8076923076923077
Model Decision Tree Score: 0.6490384615384616
Model Naive Bayes Score: 0.6610576923076923
Model Linear Regression Score: 0.8052884615384616


In [119]:
# F1 Scores
from sklearn.metrics import f1_score

# Per-class F1, reported in the order [POSITIVE, NEGATIVE].
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]

# (display name, fitted model, test features in the form that model expects).
# The Naive Bayes model was fitted on dense input, hence .toarray() — a plain
# ndarray, unlike the np.matrix that .todense() returns.
f1_models = [
    ("SVC", clf_svc, test_x_vectors),
    ("Decision Tree", clf_dec, test_x_vectors),
    ("Naive Bayes", clf_gnb, test_x_vectors.toarray()),
    # clf_log is a LogisticRegression, so it is labelled as such.
    ("Logistic Regression", clf_log, test_x_vectors),
]

for model_name, model, model_inputs in f1_models:
    model_f1 = f1_score(test_y, model.predict(model_inputs), average=None, labels=f1_labels)
    print(f"{model_name} F1 Score: {model_f1}")

SVC F1 Score: [0.80582524 0.80952381]
Decision Tree F1 Score: [0.635      0.66203704]
Naive Bayes F1 Score: [0.65693431 0.66508314]
Linear Regression F1 Score: [0.80291971 0.80760095]


In [120]:
# Hand-written examples to try the classifier on unseen text.
test_set = ['I thoroughly enjoyed this, 5 stars', "nice", "horrible waste of time"]
# Raw text must go through the same fitted vectorizer before prediction.
new_test = vectorizer.transform(test_set)

clf_svc.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

## Tuning our model (with Grid Search)

In [122]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: two kernels crossed with five values of C.
parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = SVC()
# Grid search over `parameters` using 5-fold cross-validation (cv=5),
# fitted on the training vectors only.
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [123]:
# Score of the fitted grid-search object on the test set.
print(f"Grid SeachCV SVC Score: {clf.score(test_x_vectors, test_y)}")


Grid SeachCV SVC Score: 0.8197115384615384


## Saving Model

In [124]:
import pickle

# Persist the fitted grid-search object `clf` to disk.
# NOTE(review): the fitted `vectorizer` is not saved here; anyone loading this
# file needs that same vectorizer to turn raw text into features — confirm it
# is persisted somewhere.
# NOTE(review): open(..., 'wb') does not create directories, so ./models must
# already exist.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)


#### Load Model

In [125]:
# Read the pickled classifier back from disk.
# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [127]:
# Show the first test review next to the reloaded model's prediction for it.
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])

I thoroughly enjoyed this story and the whole series.  I recommend reading it.  I liked the way the author made a toned down version too.  Unsung herself in the series was really clever too!


array(['POSITIVE'], dtype='<U8')