In [38]:
import random
class Sentiment:
    NE='NEGATIVE'
    NL='NEUTRAL'
    PE='POSITIVE'
    
class Review:
    def __init__(self,text,score):
        self.text=text
        self.score=score
        self.sentiment=self.get_sentiment()
        
    def get_sentiment(self):
        if self.score<=2:
            return Sentiment.NE
        elif self.score==3:
            return Sentiment.NL
        else:
            return Sentiment.PE
        
class ReviewContainer:
    def __init__(self,reviews):
        self.reviews=reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
        
    def evenly_distribute(self):
        negative=list(filter(lambda x:x.sentiment==Sentiment.NE,self.reviews))
        positive=list(filter(lambda x:x.sentiment==Sentiment.PE,self.reviews))
        positive_shrunk=positive[:len(negative)]
        self.reviews=negative+positive_shrunk
        random.shuffle(self.reviews)

In [39]:
import json

reviews=[]
with open('books_small_10000.json') as f:
    for line in f:
        review=json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
        
print(reviews[5].score)
print(reviews[5].sentiment)

5.0
POSITIVE


## Prep Data

In [55]:
from sklearn.model_selection import train_test_split

training,test=train_test_split(reviews,test_size=0.33, random_state=42)
train_container = ReviewContainer(training)
test_container=ReviewContainer(test)

train_container.evenly_distribute()


In [56]:
train_x= train_container.get_text()
train_y= train_container.get_sentiment()


test_container.evenly_distribute()  #it is important to do it because our training model contains balanced data and our test data doesnt, which impacts the overall accuracy!
test_x= train_container.get_text()
test_y= train_container.get_sentiment()

# print(train_y.count(Sentiment.PE))
# print(train_y.count(Sentiment.NE))


### Bag of words vectorization

In [31]:
#from sklearn.feature_extraction.text import CountVectorizer

#vectorizer=CountVectorizer()
#vectorizer.fit_transform(train_x)

In [57]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer=CountVectorizer()
train_x_vectors=vectorizer.fit_transform(train_x)
#print(vectorizer.get_feature_names())

test_x_vectors=vectorizer.transform(test_x)

print(train_x[0])
print(train_x_vectors[0].toarray())

Scholarly, Enlightening, Educational, Mysticism, Profound and Intriguing are some of the words that best describe Author Stanislaw Kapuscinski&#8217;s book, &#8220;The Key to Immortality.&#8221;I have always been an avid reader of The Gospel of Thomas and am constantly in search of books that capture my attention and teach me in the process, especially as they pertain to different Gospels. The Key to Immortality accomplishes both those avenues in a compelling manner that by far outweighs other books discussing the same Gospel.While I am no theologian by any means, I do consider myself to be very profuse and knowledgeable with topics surrounding the bible, especially the New Testament. Most books I have read are cumbersome and not easy to read which don&#8217;t flow well. Perhaps it&#8217;s because I am a layperson thatis why I have not had much luck finding a book in this genre that truly captivates me from the start. That was not the case with &#8220;The Key to Immortality.&#8221; In 

### Classification

In [58]:
from sklearn import svm

clf_svm=svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors,train_y)

test_x[0]
clf_svm.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [59]:
from sklearn.tree import DecisionTreeClassifier

clf_dec= DecisionTreeClassifier()
clf_dec.fit(train_x_vectors,train_y)

clf_dec.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [60]:
from sklearn.naive_bayes import GaussianNB

clf_gnb= DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors,train_y)

clf_gnb.predict(test_x_vectors[0])


array(['POSITIVE'], dtype='<U8')

### Logistic Regression

In [61]:
from sklearn.linear_model import LogisticRegression

clf_log=LogisticRegression()
clf_log.fit(train_x_vectors,train_y)

clf_log.predict(test_x_vectors[0])

array(['POSITIVE'], dtype='<U8')

## Evaluation

In [62]:
print(clf_svm.score(test_x_vectors,test_y))
print(clf_dec.score(test_x_vectors,test_y))
print(clf_gnb.score(test_x_vectors,test_y))
print(clf_log.score(test_x_vectors,test_y))

1.0
1.0
1.0
0.9988532110091743


### F1 Scores

In [63]:
from sklearn.metrics import f1_score

f1_score(test_y,clf_svm.predict(test_x_vectors),average=None, labels=[Sentiment.PE, Sentiment.NL, Sentiment.NE])

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


array([1., 0., 1.])

## Testing our Model(Qualitative Analysis)

In [64]:
test_set=['I thoroughly enjoyed this, 5 stars',"bad book do not buy","horrible waste of time"]
new_test=vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'NEGATIVE'], dtype='<U8')

### Tuning our model (with Grid Search)

In [None]:
# the below procedure can only be performed on training set!
from sklearn.model_selection import GridSearchCV

params={'kernel':('linear','rbf'),'C':(1,4,8,16,32)}

svc=svm.SVC()
clf=GridSearchCV(svc,params,cv=5)
clf.fit(train_x_vectors,train_y)