In [None]:
import random

class Sentiment:
    """String constants naming the sentiment classes used as labels."""
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    """A single review together with the sentiment label derived from its score.

    Attributes:
        text: The review body.
        score: The numeric rating attached to the review.
        sentiment: One of the ``Sentiment`` constants, derived from ``score``
            once, at construction time.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed eagerly; later changes to ``score`` are not reflected here.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a sentiment label.

        Returns:
            ``Sentiment.NEGATIVE`` for scores up to and including 2,
            ``Sentiment.NEUTRAL`` for scores above 2 up to and including 3,
            ``Sentiment.POSITIVE`` for anything higher.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # ``<= 3`` rather than ``== 3`` so that a fractional rating such as 2.5
        # is not mislabelled as positive.
        if self.score <= 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Holds a list of review objects and exposes them as parallel text/label lists.

    Each element of ``reviews`` is expected to provide ``text`` and
    ``sentiment`` attributes.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [review.text for review in self.reviews]

    def get_sentiment(self):
        """Return the ``sentiment`` of every review, in container order."""
        return [review.sentiment for review in self.reviews]

    def evenly_distribute(self, seed=None):
        """Rebalance the container to equal numbers of negative and positive reviews.

        Neutral reviews are dropped. Both classes are truncated to the size of
        the smaller one, so the result is balanced whichever class is in the
        majority. ``self.reviews`` is replaced by a new, shuffled list; the
        list originally passed in is not modified.

        Args:
            seed: Optional seed for a private random generator, making the
                shuffle order reproducible. When ``None`` (the default) the
                module-level ``random`` state is used.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]

        # Cut both classes to the minority size; truncating only the positives
        # leaves the set unbalanced when negatives outnumber positives.
        per_class = min(len(negative), len(positive))
        balanced = negative[:per_class] + positive[:per_class]

        shuffler = random if seed is None else random.Random(seed)
        shuffler.shuffle(balanced)
        self.reviews = balanced
        
        

### Load Data

In [None]:
import json

In [None]:
file_name = "./datasets/SciLearn/books_review.json"

# The file is read as JSON Lines: one JSON object per line, each providing
# 'reviewText' and 'overall'.
reviews = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_name, encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))

# Show the first review's text as a quick sanity check.
reviews[0].text

### Data Preparation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; the fixed random_state keeps the split repeatable.
training, test = train_test_split(reviews, test_size = 0.33, random_state = 42)

# Wrap each split so text and labels can be pulled out as parallel lists.
train_container = ReviewContainer(training)

test_container = ReviewContainer(test)


In [None]:
# evenly_distribute() shuffles with the module-level `random` state; seed it so
# the balanced sets come out in the same order on every Restart & Run All.
random.seed(42)

# Balance the training set, then extract texts and labels.
train_container.evenly_distribute()

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

# The test set is balanced the same way, so accuracy is measured on equal class sizes.
test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

In [None]:
# Number of training reviews left after balancing.
len(train_x)

In [None]:
# Number of test reviews left after balancing.
len(test_x)

In [None]:
# Peek at the first training review's text.
train_x[0]

### Bag-of-words vectorization (TF-IDF weighted)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()

# Fit on the training texts only ...
train_x_vectors = vectorizer.fit_transform(train_x)

# ... and reuse the fitted vectorizer for the test texts so both share one feature space.
test_x_vectors = vectorizer.transform(test_x)

# Show the first training review next to its vector (converted to a dense array for printing).
print(train_x[0])
print(train_x_vectors[0].toarray())

# train_x_vectors
# train_y

### Classification

### Linear SVM

In [None]:
from sklearn import svm

# Support vector classifier with a linear kernel.
clf_svm = svm.SVC(kernel = 'linear')

clf_svm.fit(train_x_vectors, train_y)

# Print the review explicitly: a bare expression in the middle of a cell is
# evaluated and discarded, only the last expression is displayed.
print(test_x[0])

# Predicted sentiment for that same first test review.
clf_svm.predict(test_x_vectors[0])

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree; the fixed random_state keeps the fitted tree repeatable.
clf_dec = DecisionTreeClassifier(random_state = 0)

clf_dec.fit(train_x_vectors, train_y)

# Predicted sentiment for the first test review.
clf_dec.predict(test_x_vectors[0])

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
clf_naive = GaussianNB()

# The vectors are converted with .toarray() before fitting; presumably GaussianNB
# needs dense input - confirm. NOTE(review): this materialises the full
# document-term matrix in memory.
clf_naive.fit(train_x_vectors.toarray(), train_y)

# Predicted sentiment for the first test review (converted the same way).
clf_naive.predict(test_x_vectors[0].toarray())

### K nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings.
clf_neigh = KNeighborsClassifier()

clf_neigh.fit(train_x_vectors, train_y)

# Predicted sentiment for the first test review.
clf_neigh.predict(test_x_vectors[0])

### Evaluation

In [None]:
# Mean accuracy score for every classifier

# (message, fitted classifier, whether it is scored on a dense copy of the vectors)
_accuracy_runs = (
    ('SVM scored', clf_svm, False),
    ('Decision Tree scored', clf_dec, False),
    ('Naive bayes scored', clf_naive, True),
    ('K nearest neighbor scored', clf_neigh, False),
)

for _message, _classifier, _dense in _accuracy_runs:
    # Densify only for the classifier that was trained on dense arrays.
    _features = test_x_vectors.toarray() if _dense else test_x_vectors
    print(_message, _classifier.score(_features, test_y))

In [None]:
# F1 Scores

from sklearn.metrics import f1_score

# Per-class F1 (average=None returns one score per entry in `labels`, in that order).
# evenly_distribute() removes every neutral review, so NEUTRAL never occurs in
# test_y or in the predictions; listing it only produced a meaningless 0.0 and an
# undefined-metric warning. Score the two classes that are actually present.
f1_labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE]

print('F1 score for SVM is', f1_score(test_y, clf_svm.predict(test_x_vectors), average = None, labels = f1_labels))
print('F1 score for Decision Tree is', f1_score(test_y, clf_dec.predict(test_x_vectors), average = None, labels = f1_labels))
# Naive Bayes was trained on dense arrays, so it predicts on a dense copy as well.
print('F1 score for Naive Bayes is', f1_score(test_y, clf_naive.predict(test_x_vectors.toarray()), average = None, labels = f1_labels))
print('F1 score for K nearest Neighbor is', f1_score(test_y, clf_neigh.predict(test_x_vectors), average = None, labels = f1_labels))

In [None]:
# Class counts in the training labels. evenly_distribute() keeps only negative and
# positive reviews, so the NEUTRAL count is expected to be 0.
print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEUTRAL))
print(train_y.count(Sentiment.NEGATIVE))

In [None]:
# Hand-written examples, vectorized with the already-fitted vectorizer.
# NOTE(review): 'thorougly' is misspelled, so that token is probably not in the
# fitted vocabulary - confirm whether this is intentional.
test_set = ['I thorougly enjoyed this book, 5 star', 'Bad book, do not buy', 'horrible waste of time']
new_test = vectorizer.transform(test_set)

In [None]:
# Predictions for the hand-written examples, in order: SVM, k-NN, naive Bayes, decision tree.
print(clf_svm.predict(new_test))
print(clf_neigh.predict(new_test))
# Naive Bayes was trained on dense arrays, so it gets a dense copy here too.
print(clf_naive.predict(new_test.toarray()))
print(clf_dec.predict(new_test))

In [None]:
# A second batch of hand-written examples, vectorized with the same fitted vectorizer.
test_set1 = ['The most interesting book I have ever read', 'I did not like it, especially the negative attitude of the characters', 'Very creative author, highly recommended']
new_test1 = vectorizer.transform(test_set1)

In [None]:
# Predictions for the second batch, in order: SVM, k-NN, naive Bayes, decision tree.
print(clf_svm.predict(new_test1))
print(clf_neigh.predict(new_test1))
# Naive Bayes was trained on dense arrays, so it gets a dense copy here too.
print(clf_naive.predict(new_test1.toarray()))
print(clf_dec.predict(new_test1))

### Tuning our model (with grid search)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid: 2 kernels x 5 values of C = 10 candidate settings.
parameters = {'kernel': ('linear', 'rbf'), 'C' : (1,4,8,16,32)}
svc = svm.SVC()
# Each candidate is evaluated with 5-fold cross-validation on the training data.
clf = GridSearchCV(svc, parameters, cv = 5)
clf.fit(train_x_vectors, train_y)

In [None]:
# Test-set score of the grid-searched SVM (`clf`), for comparison with the untuned clf_svm.
print('SVM scored', clf.score(test_x_vectors, test_y))

### Saving model

In [None]:
import pickle

import os

model_path = './datasets/SciLearn/models/sentiment_classifier.pkl'

# Create the target directory first; open(..., 'wb') fails with
# FileNotFoundError when it does not exist yet.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# NOTE(review): only the grid-searched classifier is saved. The fitted
# `vectorizer` is not, so raw text cannot be classified from this file alone.
with open(model_path, 'wb') as f:
    pickle.dump(clf, f)


In [None]:
# Reload the classifier saved to this same path earlier in the notebook.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('./datasets/SciLearn/models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
    

In [None]:
# Sanity check of the reloaded model: show one test review and its predicted sentiment.
print(test_x[25])

# The input is the already-vectorized row, since the loaded object only contains the classifier.
loaded_clf.predict(test_x_vectors[25])