## Data Classes

In [72]:
import random

# enum class for string consistency
class Sentiment:
    NEGATIVE = 'NEGATIVE'
    NEUTRAL = 'NEUTRAL'
    POSITIVE = 'POSITIVE'

# class for review
class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
    
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else:
            return Sentiment.POSITIVE
    
    def get_text(self):
        return self.text

# class for collection of reviews
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
     
    # get even qty of pos/neg reviews
    def evenly_distribute(self):
        # getting all neg/pos reviews
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        # shrinking positive reviews to equal qty of negative reviews
        positive_shrunk = positive[:len(negative)]
        
        # new review set w/equal qty of pos/neg reviews
        self.reviews = negative + positive_shrunk
        # shuffle reviews to avoid too many pos/neg reviews together
        random.shuffle(self.reviews)

        

## Load Data

In [77]:
import json
file_name = 'beauty_small.json'

reviews = []
with open(file_name) as f:
    for line in f:
        # loads the line as a dictionary
        review = json.loads(line)
        # appending a review object
        reviews.append(Review(review['reviewText'], review['overall']))
print(reviews[0].get_text())       

the clear scalp hair and beauty oil is a decent hair oil that many will appreciate for its light weight and decent fragrance  it is not thick like coconut oil tends to be and it also does not have the strong unpleasant smell of coconut oil  the primary ingredient as others have mentioned is indeed mineral oil so you would expect it to be a lighter weight than coconut oil  it will leave your hair and scalp with a nice shine as all oils do and also leave decent aroma that will not bother most people  for those seeking a true scalp therapy you will probably still need to turn to a heavier weight oil but for general everyday hair needs this one is  just a bit better than baby oil though at a much higher price


## Prep Data

In [78]:
# split data into train/test sets
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size = 0.33, random_state = 25)

# create review container for train set
train_container = ReviewContainer(training)
train_container.evenly_distribute()

# create review container for test set
test_container = ReviewContainer(test)
test_container.evenly_distribute()


## Parametizing

In [79]:
#separating input/output parameters
#input is text (x), output is sentiment (y)

train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_x = test_container.get_text()
test_y = test_container.get_sentiment()



## Vectorization - Bag of Words

In [80]:
# Bag of Words - convert text to a quantitative vector

# CountVectrizer - each word carries same weight (contrary to Tfid)
# TfidfVectorizer - term frequency inverse document frequency. a word is LESS important if
# it occurs in MORE documents (is, a, are, etc.)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
#fit - to learn vocabulary from the input data
#transform - to vectorize each review
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)


# Classification

### Linear SVM

In [81]:
from sklearn import svm

# initialize svm classifier
clf_svm = svm.SVC(kernel = 'linear')

#fitting the classifier to training data
clf_svm.fit(train_x_vectors, train_y)

# test
print(test_x[0])
print(test_y[0])
clf_svm.predict(test_x_vectors[0])


if youre like me you probably have a slew of hair products you bought to try used a few times then decided you were sorry you bought them so they go to the back of the cabinet until you clear it out and throw them away so its with a bit of skepticism i try new hair products any more many of them do a good initial job but the next day you have to really work with your hair to try and get it looking good again my hair is short thick and rather fine so even with a layered cut i am always trying to give it some curl definition and volume enter this productmotions naturally  you define my curls creme has been doing a phenomenal job with my hair it leaves it soft cleanfeeling and gives a definite shine i have used this 3 days in a row at times and it doesnt leave my hair oily limp or overproducted maybe thats a new word but you know what i mean i wash my hair every 23 days and its nice not to have to wash it daily to remove styling products i will also be reviewing another motions naturally 

array(['POSITIVE'], dtype='<U8')

### Decision Tree

In [82]:
from sklearn.tree import DecisionTreeClassifier

# initialize decision tree classifier
clf_dec = DecisionTreeClassifier()

# fitting training data to decision tree classifier
clf_dec.fit(train_x_vectors, train_y)

# test
print(test_x[0])
print(test_y[0])
clf_dec.predict(test_x_vectors[0])

if youre like me you probably have a slew of hair products you bought to try used a few times then decided you were sorry you bought them so they go to the back of the cabinet until you clear it out and throw them away so its with a bit of skepticism i try new hair products any more many of them do a good initial job but the next day you have to really work with your hair to try and get it looking good again my hair is short thick and rather fine so even with a layered cut i am always trying to give it some curl definition and volume enter this productmotions naturally  you define my curls creme has been doing a phenomenal job with my hair it leaves it soft cleanfeeling and gives a definite shine i have used this 3 days in a row at times and it doesnt leave my hair oily limp or overproducted maybe thats a new word but you know what i mean i wash my hair every 23 days and its nice not to have to wash it daily to remove styling products i will also be reviewing another motions naturally 

array(['POSITIVE'], dtype='<U8')

### Naive Bayes

In [85]:
from sklearn.naive_bayes import GaussianNB

# initialize naive bayes classifier
clf_gnb = GaussianNB()

# fitting training data to naive bayes classifier
clf_gnb.fit(train_x_vectors.todense(), train_y)

# test
print(test_x[0])
print(test_y[0])
clf_gnb.predict(test_x_vectors.todense()[0])

if youre like me you probably have a slew of hair products you bought to try used a few times then decided you were sorry you bought them so they go to the back of the cabinet until you clear it out and throw them away so its with a bit of skepticism i try new hair products any more many of them do a good initial job but the next day you have to really work with your hair to try and get it looking good again my hair is short thick and rather fine so even with a layered cut i am always trying to give it some curl definition and volume enter this productmotions naturally  you define my curls creme has been doing a phenomenal job with my hair it leaves it soft cleanfeeling and gives a definite shine i have used this 3 days in a row at times and it doesnt leave my hair oily limp or overproducted maybe thats a new word but you know what i mean i wash my hair every 23 days and its nice not to have to wash it daily to remove styling products i will also be reviewing another motions naturally 

array(['NEGATIVE'], dtype='<U8')

### Logistic Regresstion

In [86]:
from sklearn.linear_model import LogisticRegression

# initialize logistic regression classifier
clf_log = LogisticRegression()

# fitting training data to log reg classifier
clf_log.fit(train_x_vectors, train_y)

# test
print(test_x[0])
print(test_y[0])
clf_log.predict(test_x_vectors[0])

if youre like me you probably have a slew of hair products you bought to try used a few times then decided you were sorry you bought them so they go to the back of the cabinet until you clear it out and throw them away so its with a bit of skepticism i try new hair products any more many of them do a good initial job but the next day you have to really work with your hair to try and get it looking good again my hair is short thick and rather fine so even with a layered cut i am always trying to give it some curl definition and volume enter this productmotions naturally  you define my curls creme has been doing a phenomenal job with my hair it leaves it soft cleanfeeling and gives a definite shine i have used this 3 days in a row at times and it doesnt leave my hair oily limp or overproducted maybe thats a new word but you know what i mean i wash my hair every 23 days and its nice not to have to wash it daily to remove styling products i will also be reviewing another motions naturally 

array(['POSITIVE'], dtype='<U8')

## Evaluation

### Mean Accuracy

In [87]:
# .score function returns the mean accuracy of the classifier against the test set
print('Linear SVM:', clf_svm.score(test_x_vectors, test_y))
print('Decision Tree:',clf_dec.score(test_x_vectors, test_y))
print('Naive Bayes:',clf_gnb.score(test_x_vectors.todense(), test_y))
print('Logistic Regression:',clf_log.score(test_x_vectors, test_y))

Linear SVM: 0.821917808219178
Decision Tree: 0.5753424657534246
Naive Bayes: 0.6438356164383562
Logistic Regression: 0.8082191780821918


### F1 Score

In [88]:
from sklearn.metrics import f1_score

# f1 score - measures precision/recall
# f1 score reveals good predictor for positive, but trash for neutral and neg
# pretty much always predicting postive
f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels = [Sentiment.POSITIVE, Sentiment.NEGATIVE])




array([0.83116883, 0.8115942 ])

In [89]:
#test w/hand written data
test_set = ['horrible book', 'awesome, loved it', 'waste of time']
new_test_vector = vectorizer.transform(test_set)

clf_svm.predict(new_test_vector)

array(['NEGATIVE', 'POSITIVE', 'NEGATIVE'], dtype='<U8')

## Tuning Our Model w/Grid Search

In [90]:
# Google 'Parameter tuning sklearn' to find ways to improve your model!
# option: could strip punctuation so "good!"" and good are counted as the same

from sklearn.model_selection import GridSearchCV

# there's a lot of parameters for svm. We can use GridSearch to programmatically test many values

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}
svc = svm.SVC()
# cv = how many times do we want to split the data to cross validate and make sure things are working well
# with a specific parameter
clf = GridSearchCV(svc, parameters, cv = 5)

# fit clf to training data
clf.fit(train_x_vectors, train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

In [52]:
# seeing if grid search helped
print(clf_svm.score(test_x_vectors, test_y))


0.7901234567901234


### Saving Model

In [91]:
# save classifier so we don't have to retrain it

import pickle

with open('sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

### Load Model

In [92]:
with open('./sentiment_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [93]:
# testing saved model
print(test_x[0])
loaded_clf.predict(test_x_vectors[0])

if youre like me you probably have a slew of hair products you bought to try used a few times then decided you were sorry you bought them so they go to the back of the cabinet until you clear it out and throw them away so its with a bit of skepticism i try new hair products any more many of them do a good initial job but the next day you have to really work with your hair to try and get it looking good again my hair is short thick and rather fine so even with a layered cut i am always trying to give it some curl definition and volume enter this productmotions naturally  you define my curls creme has been doing a phenomenal job with my hair it leaves it soft cleanfeeling and gives a definite shine i have used this 3 days in a row at times and it doesnt leave my hair oily limp or overproducted maybe thats a new word but you know what i mean i wash my hair every 23 days and its nice not to have to wash it daily to remove styling products i will also be reviewing another motions naturally 

array(['POSITIVE'], dtype='<U8')