In [1]:
#importing all packages
import numpy as np
import pandas as pd
import random 
import matplotlib .pyplot as plt 
%matplotlib inline 

### Data class

In [2]:
class Sentiment:
    """String labels for the three sentiment classes used in this notebook."""

    # Each constant's value is its own name, so labels read naturally in outputs.
    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"
    
    
class Review:
    """A single review: its text, its numeric score, and the derived sentiment label."""

    def __init__(self, text, score):
        """Store the text and score, and compute the sentiment label once up front."""
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map the score to a Sentiment label.

        Scores of 2 or less are NEGATIVE, exactly 3 is NEUTRAL, and every
        other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        # Anything else (4 or 5 in the expected data) counts as positive.
        return Sentiment.POSITIVE
        
        
class ReviewContainer:
    """Holds a list of reviews and exposes parallel text / sentiment lists."""

    def __init__(self, reviews):
        """Keep a reference to the given list of review objects."""
        self.reviews = reviews

    def get_text(self):
        """Return the text of every held review, in order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every held review, in order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance to equal numbers of NEGATIVE and POSITIVE reviews.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Both
        classes are truncated to the size of the smaller one, so the result is
        balanced regardless of which class is in the majority. The kept
        reviews are then shuffled, and ``self.reviews`` is rebound to the new
        list (the list originally passed in is not modified).

        Args:
            seed: Optional seed for a private random generator, making the
                shuffle repeatable. With the default ``None`` the module-level
                ``random`` generator is used.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Truncate BOTH classes: cutting only the positives leaves the data
        # unbalanced whenever negatives outnumber positives.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]

        if seed is None:
            random.shuffle(self.reviews)
        else:
            random.Random(seed).shuffle(self.reviews)

### Load data

In [3]:
import json

file_name = 'books_small_10000.json'

# The file is read line by line, with one JSON object per line; each object
# supplies the review text ('reviewText') and its score ('overall').
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

### Prep Data for BOW

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for evaluation; the fixed seed keeps the split repeatable.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# Balance the training split, then extract parallel text / label lists.
train_container.evenly_distribute()
train_x_bow, train_y_bow = train_container.get_text(), train_container.get_sentiment()

# The held-out split is balanced the same way before its text / labels are extracted.
test_container.evenly_distribute()
test_x_bow, test_y_bow = test_container.get_text(), test_container.get_sentiment()

# Optional sanity check of the class balance (uncomment to inspect):
# print(train_y_bow.count(Sentiment.POSITIVE))
# print(train_y_bow.count(Sentiment.NEGATIVE))

### Prep Data for TFIDF

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for evaluation.
# NOTE(review): this split uses seed 17 while the bag-of-words split uses 42, so the
# BOW and TF-IDF models are scored on different test sets — confirm this is intended.
training2, test2 = train_test_split(reviews, test_size=0.33, random_state=17)

train_container_2 = ReviewContainer(training2)
test_container_2 = ReviewContainer(test2)

# Balance the training split, then extract parallel text / label lists.
train_container_2.evenly_distribute()
train_x_tfidf, train_y_tfidf = train_container_2.get_text(), train_container_2.get_sentiment()

# The held-out split is balanced the same way before its text / labels are extracted.
test_container_2.evenly_distribute()
test_x_tfidf, test_y_tfidf = test_container_2.get_text(), test_container_2.get_sentiment()

# Optional sanity check of the class balance (uncomment to inspect):
# print(train_y_tfidf.count(Sentiment.POSITIVE))
# print(train_y_tfidf.count(Sentiment.NEGATIVE))

### Bag of words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer()

# Fit the vectorizer on the training text only and vectorize it in the same call.
train_x_vectors_bow = bow_vectorizer.fit_transform(train_x_bow)

# Reuse the already-fitted vectorizer for the test text (transform only, no refit).
test_x_vectors_bow = bow_vectorizer.transform(test_x_bow)



### TFIDF vectorizer

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

# Fit the vectorizer on the training text only and vectorize it in the same call.
train_x_vectors_tfidf = tfidf.fit_transform(train_x_tfidf)

# Reuse the already-fitted vectorizer for the test text (transform only, no refit).
test_x_vectors_tfidf = tfidf.transform(test_x_tfidf)


### Classification Models

#### 1) Logistic Regression


In [8]:
from sklearn.linear_model import LogisticRegression

# One estimator instance is created here and re-fitted by the later cells,
# first on bag-of-words features and then on TF-IDF features.
Log_Reg = LogisticRegression(solver="lbfgs", random_state=0)

##### Bag of Words Feature

In [9]:
# Train on the bag-of-words vectors, then print the score on the held-out BOW set.
Log_Reg.fit(X=train_x_vectors_bow, y=train_y_bow)
print(Log_Reg.score(X=test_x_vectors_bow, y=test_y_bow))

0.8149038461538461


#### F1 score BOW LR

In [10]:
from sklearn.metrics import f1_score

# average=None yields one F1 value per label, ordered as [POSITIVE, NEGATIVE].
lr_bow_f1 = f1_score(
    y_true=test_y_bow,
    y_pred=Log_Reg.predict(test_x_vectors_bow),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)
lr_bow_f1

array([0.82051282, 0.808933  ])

#### TFIDF Feature

In [11]:
# Re-fit the same Log_Reg instance on TF-IDF features; this replaces its earlier
# bag-of-words fit. fit() returns the estimator it was called on, so final_model
# and Log_Reg refer to the same object.
final_model = Log_Reg.fit(X=train_x_vectors_tfidf, y=train_y_tfidf)
print(Log_Reg.score(X=test_x_vectors_tfidf, y=test_y_tfidf))

0.8469387755102041


#### F1 Score TFIDF LR

In [12]:
from sklearn.metrics import f1_score

# average=None yields one F1 value per label, ordered as [POSITIVE, NEGATIVE].
lr_tfidf_f1 = f1_score(
    y_true=test_y_tfidf,
    y_pred=Log_Reg.predict(test_x_vectors_tfidf),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)
lr_tfidf_f1

array([0.84210526, 0.85148515])

#### 2) Linear SVM 

In [13]:
from sklearn import svm

# Support-vector classifier with a linear kernel; the later cells fit it first on
# bag-of-words features and then on TF-IDF features.
clf_svm = svm.SVC(kernel="linear")

#### Bag of Words Feature

In [14]:
# Train on the bag-of-words vectors, then print the score on the held-out BOW set.
clf_svm.fit(X=train_x_vectors_bow, y=train_y_bow)
print(clf_svm.score(X=test_x_vectors_bow, y=test_y_bow))

0.7980769230769231


#### F1 Score BOW SVM 

In [15]:
from sklearn.metrics import f1_score

# average=None yields one F1 value per label, ordered as [POSITIVE, NEGATIVE].
svm_bow_f1 = f1_score(
    y_true=test_y_bow,
    y_pred=clf_svm.predict(test_x_vectors_bow),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)
svm_bow_f1

array([0.8028169 , 0.79310345])

#### TFIDF Feature

In [16]:
# Re-fit the same clf_svm instance on TF-IDF features (replacing its bag-of-words fit),
# then print the score on the held-out TF-IDF set.
clf_svm.fit(X=train_x_vectors_tfidf, y=train_y_tfidf)
print(clf_svm.score(X=test_x_vectors_tfidf, y=test_y_tfidf))

0.8443877551020408


#### F1 Score TFIDF SVM

In [17]:
from sklearn.metrics import f1_score

# average=None yields one F1 value per label, ordered as [POSITIVE, NEGATIVE].
svm_tfidf_f1 = f1_score(
    y_true=test_y_tfidf,
    y_pred=clf_svm.predict(test_x_vectors_tfidf),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)
svm_tfidf_f1

array([0.83905013, 0.84938272])

#### 3) Decision Trees

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated fits on the same data give the same model; without a
# fixed random_state, tree construction is randomized. The value 0 matches the
# seed already used for Log_Reg.
clf_dec = DecisionTreeClassifier(random_state=0)

#### Bag of Words Feature

In [19]:
# Train on the bag-of-words vectors, then print the score on the held-out BOW set.
clf_dec.fit(X=train_x_vectors_bow, y=train_y_bow)
print(clf_dec.score(X=test_x_vectors_bow, y=test_y_bow))

0.6322115384615384


#### F1 Score BOW DT

In [20]:
from sklearn.metrics import f1_score

# average=None yields one F1 value per label, ordered as [POSITIVE, NEGATIVE].
dt_bow_f1 = f1_score(
    y_true=test_y_bow,
    y_pred=clf_dec.predict(test_x_vectors_bow),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)
dt_bow_f1

array([0.6313253 , 0.63309353])

#### TFIDF feature

In [21]:
# Re-fit the same clf_dec instance on TF-IDF features (replacing its bag-of-words fit),
# then print the score on the held-out TF-IDF set.
clf_dec.fit(X=train_x_vectors_tfidf, y=train_y_tfidf)
print(clf_dec.score(X=test_x_vectors_tfidf, y=test_y_tfidf))

0.6811224489795918


#### F1 Score TFIDF DT 

In [22]:
from sklearn.metrics import f1_score

# average=None yields one F1 value per label, ordered as [POSITIVE, NEGATIVE].
dt_tfidf_f1 = f1_score(
    y_true=test_y_tfidf,
    y_pred=clf_dec.predict(test_x_vectors_tfidf),
    labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    average=None,
)
dt_tfidf_f1

array([0.68193384, 0.68030691])

### Model Comparison

#### BOW Comparison

In [23]:
# Model names and their per-class F1 arrays ([POSITIVE, NEGATIVE]), in matching order.
Algo_1 = [
    'LogisticRegression(Bag-of-Words)',
    'LinearSVM(Bag-of-Words)',
    'DecisionTree(Bag-of-Words)',
]

f1_score_1 = [lr_bow_f1, svm_bow_f1, dt_bow_f1]

# Rows are numbered from 1; the transpose below shows one model per column.
comparision_1 = pd.DataFrame(
    {'Model': Algo_1, 'F1_Score': f1_score_1},
    index=list(range(1, len(Algo_1) + 1)),
)

comparision_1.T

Unnamed: 0,1,2,3
Model,LogisticRegression(Bag-of-Words),LinearSVM(Bag-of-Words),DecisionTree(Bag-of-Words)
F1_Score,"[0.8205128205128205, 0.8089330024813896]","[0.8028169014084507, 0.7931034482758621]","[0.6313253012048193, 0.6330935251798561]"


#### TFIDF Comparison

In [24]:
# Model names and their per-class F1 arrays ([POSITIVE, NEGATIVE]), in matching order.
Algo_2 = [
    'LogisticRegression(TF-IDF)',
    'LinearSVM(TF-IDF)',
    'DecisionTree(TF-IDF)',
]

f1_score_2 = [lr_tfidf_f1, svm_tfidf_f1, dt_tfidf_f1]

# Rows are numbered from 1; the transpose below shows one model per column.
comparision_2 = pd.DataFrame(
    {'Model': Algo_2, 'F1_Score': f1_score_2},
    index=list(range(1, len(Algo_2) + 1)),
)

comparision_2.T

Unnamed: 0,1,2,3
Model,LogisticRegression(TF-IDF),LinearSVM(TF-IDF),DecisionTree(TF-IDF)
F1_Score,"[0.8421052631578948, 0.8514851485148514]","[0.8390501319261213, 0.8493827160493828]","[0.6819338422391857, 0.680306905370844]"


### Final model

In [25]:
# Logistic Regression on TF-IDF features has the best F1 scores, so it is used as the final model.

### Saving Result 

In [26]:
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./models', exist_ok=True)

# NOTE: only the classifier is saved. It was trained on vectors produced by the
# fitted `tfidf` vectorizer, so raw text must be transformed with that same
# vectorizer before calling predict on the loaded model.
with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(final_model, f)

### Load Model

In [27]:
# Read back the classifier pickled by the previous cell.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [31]:
# Show the first held-out review's raw text, then the reloaded classifier's prediction
# for that review's already-vectorized TF-IDF row.
print(test_x_tfidf[0])
loaded_clf.predict(test_x_vectors_tfidf[0])

The plot outline itself is interesting, but the 'hero' is a unconvincing Mary Sue. The challenges take too long to read, but not long enough book time to be reasonable.


array(['NEGATIVE'], dtype='<U8')