## Data Class


In [None]:
class Sentiment:
    """String labels for the three sentiment categories."""

    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"


class Review:
    """A single review: its text, its numeric score, and a derived label.

    Attributes:
        text: The review body.
        score: The numeric rating attached to the review.
        sentiment: One of the ``Sentiment`` labels, derived from ``score``
            when the instance is created.
    """

    def __init__(self, text, score):
        self.text = text
        self.score = score
        # Computed once here; it is not refreshed if score is reassigned later.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Map ``self.score`` to a ``Sentiment`` label.

        Returns:
            ``Sentiment.NEGATIVE`` when the score is 2 or lower,
            ``Sentiment.NEUTRAL`` when it equals 3, and
            ``Sentiment.POSITIVE`` otherwise.
        """
        # Return the shared constants rather than repeating the literals, so
        # the labels stay in sync with whatever else compares against Sentiment.
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

## Load Data

In [None]:
import json

# The file is read line by line, with one JSON object parsed per line.
file_name = r'Data/Books_small_10000.json'
reviews = []
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default text encoding.
with open(file_name, encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads raise on them.
        if not line.strip():
            continue
        review = json.loads(line)
        # Keep only the review text and its overall score.
        reviews.append(Review(review['reviewText'], review['overall']))

## Prep Data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the reviews for testing; the fixed random_state makes the
# split repeatable between runs.
training, testing = train_test_split(reviews, test_size=0.33, random_state=42)

In [None]:
# Separate each partition into model inputs (review text) and targets
# (sentiment label), kept as parallel lists.
x_train = [review.text for review in training]
y_train = [review.sentiment for review in training]

x_test = [review.text for review in testing]
y_test = [review.sentiment for review in testing]

## Bag-of-words vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words encoder with default settings.
vectorizer = CountVectorizer()
# Fit the vectorizer on the training text only and encode it in the same step.
x_train_vector = vectorizer.fit_transform(x_train)
# Encode the test text with the already-fitted vectorizer (no refitting).
x_test_vector = vectorizer.transform(x_test)

## Classification

#### Linear SVM

In [None]:
from sklearn import svm
# Support vector classifier with a linear kernel, trained on the vectorized
# training reviews.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(x_train_vector, y_train)
# Spot check: show the first test review, its true label, and the prediction.
print(x_test[0])
print(y_test[0])
print(clf_svm.predict(x_test_vector[0]))

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Seed the tree so repeated runs build the same model; an unseeded tree can
# differ from run to run when candidate splits tie. 42 matches the seed used
# for the train/test split.
clf_dec = DecisionTreeClassifier(random_state=42)
clf_dec.fit(x_train_vector, y_train)
# Spot check: predict the sentiment of the first test review.
print(clf_dec.predict(x_test_vector[0]))

#### Naive Bayes


In [None]:
from sklearn.naive_bayes import GaussianNB
clf_gnb = GaussianNB()
# The vectorized features are converted with .toarray() before being handed
# to GaussianNB, both for fitting and for prediction.
# NOTE(review): presumably GaussianNB requires dense input; the dense copy
# may be large for a big vocabulary — confirm memory use is acceptable.
clf_gnb.fit(x_train_vector.toarray(), y_train)
# Spot check: predict the sentiment of the first test review.
print(clf_gnb.predict(x_test_vector[0].toarray()))

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with all default settings.
# NOTE(review): if fitting emits a ConvergenceWarning on this data, consider
# raising max_iter — confirm by running the cell.
clf_log = LogisticRegression()
clf_log.fit(x_train_vector, y_train)
# Spot check: predict the sentiment of the first test review.
print(clf_log.predict(x_test_vector[0]))

## Evaluation

In [None]:
# Mean accuracy of each classifier on the held-out test data, printed in the
# order: SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(clf_svm.score(x_test_vector, y_test))
print(clf_dec.score(x_test_vector, y_test))
# The naive Bayes model is scored on the .toarray() form, as it was trained.
print(clf_gnb.score(x_test_vector.toarray(), y_test))
print(clf_log.score(x_test_vector, y_test))

In [None]:
# Per-class F1 score for each classifier (average=None keeps one score per
# label), printed in the order: SVM, decision tree, naive Bayes, logistic
# regression.
from sklearn.metrics import f1_score

# Scores are reported in this label order for every model.
label_order = [Sentiment.POSITIVE, Sentiment.NEGATIVE, Sentiment.NEUTRAL]

svm_predictions = clf_svm.predict(x_test_vector)
print(f1_score(y_test, svm_predictions, average=None, labels=label_order))

dec_predictions = clf_dec.predict(x_test_vector)
print(f1_score(y_test, dec_predictions, average=None, labels=label_order))

# The naive Bayes model predicts from the .toarray() form of the features.
gnb_predictions = clf_gnb.predict(x_test_vector.toarray())
print(f1_score(y_test, gnb_predictions, average=None, labels=label_order))

log_predictions = clf_log.predict(x_test_vector)
print(f1_score(y_test, log_predictions, average=None, labels=label_order))