In [1]:
import random

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        

# load data

In [2]:
import json

file_name = './data/sentiment/Books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)  
        reviews.append(Review(review['reviewText'], review['overall']))
        
reviews[5].score

5.0

# prep data

In [3]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)

train_container = ReviewContainer(training)
test_container = ReviewContainer(test)


In [4]:
# train_x = [x.text for x in training]
# train_y = [x.sentiment for x in training]
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# test_x = [x.text for x in test]
# test_y = [x.sentiment for x in test]

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))


436
436


# bag of words vectorization

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

 


# Classification

# linear SVM

In [6]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vectors, train_y)

print(test_x[0])

clf_svm.predict(test_x_vectors[0])


Sorry, didn't like it from the beginning but kept picking it up to finish.  Too wordy, too juvenile, too drawn out.  I'm glad I've finished so now I can move on to one I'm actually interested in.


array(['NEGATIVE'], dtype='<U8')

# Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

clf_dec.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Naive Bayes

In [8]:
from sklearn.naive_bayes import GaussianNB

# Convert sparse matrices to dense arrays
train_x_vectors_dense = train_x_vectors.toarray()
test_x_vectors_dense = test_x_vectors.toarray()

clf_gnb = GaussianNB()
clf_gnb.fit(train_x_vectors_dense, train_y)

# Make a prediction
clf_gnb.predict(test_x_vectors_dense[0].reshape(1, -1))  # Reshaping to maintain 2D input

# clf_gnb.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

clf_lg = LogisticRegression()

clf_lg.fit(train_x_vectors, train_y)

clf_lg.predict(test_x_vectors[0])

array(['NEGATIVE'], dtype='<U8')

# Evaluation

In [10]:
# Mean Accuracy

print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors_dense, test_y))
print(clf_lg.score(test_x_vectors, test_y))

0.8076923076923077
0.6442307692307693
0.6610576923076923
0.8052884615384616


In [11]:
# F1 score
from sklearn.metrics import f1_score

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])


array([0.80582524, 0.80952381])

In [12]:
test_set =['i had a great experience', 'bad book donot buy.', 'its life transforming.']

new_test = vectorizer.transform(test_set)
clf_svm.predict(new_test)

array(['POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')

# Tuning our model (with GridSearch)

In [15]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5 )              
clf.fit(train_x_vectors, train_y)              
              
              


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [16]:
print(clf_svm.score(test_x_vectors, test_y))

0.8076923076923077


# Saving Model

In [None]:
#save model


In [17]:
import pickle

with open('./models/sdentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [None]:
#load model

In [18]:
with open('./models/sdentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [20]:
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])

Sorry, didn't like it from the beginning but kept picking it up to finish.  Too wordy, too juvenile, too drawn out.  I'm glad I've finished so now I can move on to one I'm actually interested in.


array(['NEGATIVE'], dtype='<U8')