In [1]:
import random

class Sentiment:
    """String constants naming the three sentiment classes used for reviews."""

    # Each constant's value is its own name, so labels read naturally in output.
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    """One review: its text, its numeric score and the sentiment derived from the score."""

    def __init__(self, text, score):
        # Store the raw inputs, then derive the label once at construction time.
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        A score of 2 or lower is NEGATIVE, a score equal to 3 is NEUTRAL,
        and every other score is POSITIVE.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Wraps a list of ``Review`` objects and exposes them as parallel text/label lists."""

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the text of every review, in the container's current order."""
        return [x.text for x in self.reviews]

    def get_sentiment(self):
        """Return the sentiment label of every review, in the container's current order."""
        return [x.sentiment for x in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance ``self.reviews`` to equal numbers of NEGATIVE and POSITIVE reviews.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. Both
        classes are truncated to the size of the smaller one, so the result is
        balanced regardless of which class is the majority. The kept reviews
        are then shuffled and stored back on ``self.reviews``.

        Args:
            seed: Optional seed for the shuffle. When ``None`` (the default)
                the module-level ``random`` generator is used, as before.
                Any other value shuffles with a private ``random.Random(seed)``
                so the resulting order is reproducible.
        """
        negative = [x for x in self.reviews if x.sentiment == Sentiment.NEGATIVE]
        positive = [x for x in self.reviews if x.sentiment == Sentiment.POSITIVE]

        # Truncate BOTH classes: cutting only the positives leaves the data
        # unbalanced whenever there are more negatives than positives.
        keep = min(len(negative), len(positive))
        balanced = negative[:keep] + positive[:keep]

        if seed is None:
            random.shuffle(balanced)
        else:
            random.Random(seed).shuffle(balanced)

        self.reviews = balanced


LOAD DATA

In [2]:
import json

file_name="./Books_small_10000.json"

reviews=[]

# The file is read as JSON Lines: every non-empty line is parsed as one JSON object.
# An explicit encoding keeps the read independent of the platform's default
# (assumes the file is UTF-8, the standard encoding for JSON text).
with open(file_name, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
        if not line.strip():
            continue
        review=json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Show one review's text as a quick sanity check of the load.
reviews[2].text




PREP DATA

In [3]:
from sklearn.model_selection import train_test_split

# Fixed random_state so the train/test split is identical on every run.
training, test=train_test_split(reviews, test_size=0.33, random_state=42)

train_cont=ReviewContainer(training)
test_cont=ReviewContainer(test)

# evenly_distribute() shuffles with the module-level `random` generator (imported
# in the first cell). Seed it here so the order of the balanced sets, and every
# result that depends on that order, is reproducible.
random.seed(42)
train_cont.evenly_distribute()
test_cont.evenly_distribute()

In [4]:
# Parallel lists for each split: review texts (inputs) and sentiment labels (targets).
train_x, train_y = train_cont.get_text(), train_cont.get_sentiment()
test_x, test_y = test_cont.get_text(), test_cont.get_sentiment()


Bag of words vectorization

In [18]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
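# Bag-of-words alternative, kept for reference: CountVectorizer converts a
# collection of text documents to a matrix of token counts.
# vectorizer = CountVectorizer()
# train_x_vectors=vectorizer.fit_transform(train_x) # learn the vocabulary from the training text, then transform it

# test_x_vectors=vectorizer.transform(test_x) # reuse the fitted vocabulary; never refit on the test text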

TF-IDF vectorization


In [32]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to encode the test text so both share the same feature columns.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

CLASSIFICATION

Linear SVM

In [33]:
from sklearn import svm

# Support-vector classifier with a linear kernel, trained on the TF-IDF features.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X=train_x_vectors, y=train_y)

SVC(kernel='linear')

In [34]:
# True label of the first test review ...
print(test_y[0])

# ... and the SVM's prediction for that same review (shown as the cell output).
clf_svm.predict(X=test_x_vectors[0])

NEGATIVE


array(['NEGATIVE'], dtype='<U8')

Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier
# scikit-learn permutes the candidate features at random while building the
# tree, so pin random_state to get the same tree (and the same score) every run.
clf_desc=DecisionTreeClassifier(random_state=42)

clf_desc.fit(train_x_vectors, train_y)

# Prediction for the first test review, shown as the cell output.
clf_desc.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

Naive Bayes

In [36]:
from sklearn.naive_bayes import GaussianNB

# GaussianNB requires a dense numpy array as X, so the vectorizer output is
# converted with toarray() both for fitting and for predicting.
clf_nb = GaussianNB()
clf_nb.fit(X=train_x_vectors.toarray(), y=train_y)

clf_nb.predict(X=test_x_vectors[0].toarray())

array(['NEGATIVE'], dtype='<U8')

Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the TF-IDF features.
clf_lr = LogisticRegression()
clf_lr.fit(X=train_x_vectors, y=train_y)

# Prediction for the first test review, shown as the cell output.
clf_lr.predict(X=test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

EVALUATION

In [38]:
# Mean accuracy of the linear SVM on the test set.
clf_svm.score(X=test_x_vectors, y=test_y)

0.8076923076923077

In [39]:
# Mean accuracy of the decision tree on the test set.
clf_desc.score(X=test_x_vectors, y=test_y)

0.6514423076923077

In [40]:
# Mean accuracy of Gaussian Naive Bayes on the test set (dense input required).
clf_nb.score(X=test_x_vectors.toarray(), y=test_y)

0.6610576923076923

In [41]:
# Mean accuracy of logistic regression on the test set.
clf_lr.score(X=test_x_vectors, y=test_y)

0.8052884615384616

In [42]:
from sklearn.metrics import f1_score

# Per-class F1 scores (average=None), reported in the order [POSITIVE, NEGATIVE]:
# first for the linear SVM, then for Gaussian Naive Bayes (which needs dense input).
print(
    f1_score(
        test_y,
        clf_svm.predict(test_x_vectors),
        average=None,
        labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    )
)

print(
    f1_score(
        test_y,
        clf_nb.predict(test_x_vectors.toarray()),
        average=None,
        labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE],
    )
)

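# With the balanced data the two per-class F1 scores are close to each other (about 0.81 for the SVM and about 0.66 for Naive Bayes).

# NOTE: an earlier observation - good scores for positive labels but poor scores for neutral and negative labels, a common trend for all the models - does not match the output shown here and appears to predate the class balancing. It pointed to a data issue, not a model issue.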

[0.80582524 0.80952381]
[0.65693431 0.66508314]


In [49]:
# Qualitative analysis: run the SVM on a few hand-written sentences.
test_set = [
    "This is good",
    "I really enjoyed it",
    "this is bad",
    "this is trash, piece of junk, garbage, utterly disappointed",
    "definitely recomend",
]

# Encode with the already-fitted vectorizer so the features match what the model was trained on.
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['POSITIVE', 'POSITIVE', 'NEGATIVE', 'NEGATIVE', 'POSITIVE'],
      dtype='<U8')

Tuning our model with Grid Search

In [55]:
from sklearn.model_selection import GridSearchCV

# Grid search tries every kernel / C combination below with 5-fold
# cross-validation and keeps the one that works best on the given data.
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [1, 4, 8, 16, 32],
}

svc = svm.SVC()
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)

clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 4, 8, 16, 32], 'kernel': ['linear', 'rbf']})

In [56]:
# Test-set score of the grid-searched model.
print(clf.score(X=test_x_vectors, y=test_y))

0.8197115384615384


### Saving Model


In [58]:
import pickle


In [60]:
import os

# open(..., "wb") does not create missing directories, so make sure ./models exists first.
os.makedirs("./models", exist_ok=True)

with open("./models/sentiment_classifier.pkl","wb") as f:
    pickle.dump(clf,f)

# The classifier only accepts vectors produced by the fitted vectorizer, so
# persist that as well; without it the pickled model cannot be used on new
# text in a fresh session.
with open("./models/sentiment_vectorizer.pkl","wb") as f:
    pickle.dump(vectorizer,f)

### Load model



In [62]:
# Read the pickled grid-search model back from disk.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote itself.
with open("./models/sentiment_classifier.pkl", 'rb') as f:
    loaded_clf = pickle.load(f)

In [68]:
# Only the first test review is shown, so predict just that row instead of
# classifying the whole test set and discarding all but one result.
print(loaded_clf.predict(test_x_vectors[0])[0])
print(test_x[0])

I wanted to like this and I wanted to read all of them because it sounded so go. But, I won't. The characters are so immature and the writing was just not good.
