In [124]:
import random
import json

## Data Class

In [125]:
class Sentiment:
    """Namespace for the three string labels a review can carry."""

    NEGATIVE, NEUTRAL, POSITIVE = "NEGATIVE", "NEUTRAL", "POSITIVE"

class Review:
    """A single review: its text, its numeric score and the derived sentiment label."""

    def __init__( self, text, score ):
        """Store the review and label it immediately.

        Args:
            text: The review body.
            score: Numeric rating; it is compared against the thresholds 2 and 3.
        """
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment( self ):
        """Categorize ``self.score`` into a ``Sentiment`` label.

        Returns:
            ``Sentiment.NEGATIVE`` for scores <= 2, ``Sentiment.NEUTRAL`` for
            scores above 2 and up to 3, ``Sentiment.POSITIVE`` for anything above 3.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        # Range check rather than ``== 3``: a fractional score such as 2.5
        # must not fall through to POSITIVE.
        if self.score <= 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

class ReviewContainer:
    """Holds a list of reviews and exposes them as parallel text / label lists."""

    def __init__( self, reviews ):
        """Keep a reference to ``reviews`` (objects exposing ``text`` and ``sentiment``)."""
        self.reviews = reviews

    def get_text( self ):
        """Return the ``text`` of every held review, in order."""
        return [ review.text for review in self.reviews ]

    def get_sentiment( self ):
        """Return the ``sentiment`` of every held review, in order."""
        return [ review.sentiment for review in self.reviews ]

    def evenly_distribute( self ):
        """Rebalance to equal NEGATIVE and POSITIVE counts, then shuffle.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. The larger
        class is truncated to the size of the smaller one by keeping its first
        items. ``self.reviews`` is rebound to a new list, so the list passed to
        the constructor is left untouched. Shuffling uses the global ``random``
        module; seed it beforehand for a reproducible order.
        """
        negative = [ review for review in self.reviews if review.sentiment == Sentiment.NEGATIVE ]
        positive = [ review for review in self.reviews if review.sentiment == Sentiment.POSITIVE ]

        # Trim BOTH classes to the smaller one. Trimming only the positives
        # leaves the data unbalanced whenever negatives outnumber positives.
        balanced_size = min( len( negative ), len( positive ) )

        self.reviews = negative[ :balanced_size ] + positive[ :balanced_size ]
        random.shuffle( self.reviews )
        

## Load Data

In [126]:
# The file is read one JSON object per line; only the 'reviewText' and
# 'overall' fields of each object are used.
file_name = 'data/books_small_10000.txt'
reviews = []

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open( file_name, 'r', encoding = 'utf-8' ) as f:
    for line in f:
        # Skip blank lines instead of letting json.loads raise on them.
        if not line.strip():
            continue
        review = json.loads( line )
        reviews.append( Review( review['reviewText'], review['overall'] ) )


In [127]:
# Spot-check one loaded review: displays the sentiment label derived from its score.
reviews[7].sentiment

'POSITIVE'

## Prepare Data

In [128]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the reviews for testing; random_state pins the split.
training, test = train_test_split( reviews, test_size = 0.33, random_state = 42 )

training_container = ReviewContainer(training)
test_container = ReviewContainer(test)

# evenly_distribute() shuffles with the global `random` module. Seed it so the
# sample order (and every index-based example further down) is the same on
# each Restart & Run All.
random.seed( 42 )
training_container.evenly_distribute()
test_container.evenly_distribute()
len( training_container.reviews )



872

In [129]:
# Sizes of the raw train / test splits (before balancing), one per line.
print( len(training), len(test), sep = "\n" )

6700
3300


In [130]:
# Pull parallel feature (text) and label (sentiment) lists out of each container.
train_x, train_y = training_container.get_text(), training_container.get_sentiment()
test_x, test_y = test_container.get_text(), test_container.get_sentiment()

### Bag-of-words vectorization (TF-IDF)

In [131]:
# Represents each review as a TF-IDF weighted bag-of-words vector.
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

# Debug aid: uncomment to compare a raw review with its vector.
# print( train_x[0] )
# print( train_x_vectors[0] )

## Classification

#### SVM (RBF kernel)

In [132]:
from sklearn import svm

# fit() hands back the fitted estimator, so construction and training are chained.
clf_svm = svm.SVC( kernel = 'rbf' ).fit( train_x_vectors, train_y )

# One test review followed by the label predicted for it.
print( test_x[0], clf_svm.predict( test_x_vectors[0] ), sep = "\n" )

The Double is definitely not one of Pelecanos's best efforts.  The story is threadbare and makes little sense as it goes from minor burglary to intense shoot outs and killings for no real reason.  The sub plots are meaningless and the dialogue is mostly laughable.  Pelecanos really struggles with writing in the present day.. he is much better with the 70's and 80's where he can show his knowledge of music and cars from those eras.  As a DC native I do enjoy the DC setting, but even that gets old and seems mostly to be filler in this book.  Look for very early Pelecanos books to read something really exciting.
['POSITIVE']


#### Decision Tree

In [133]:
from sklearn.tree import DecisionTreeClassifier

# Pin random_state: the tree permutes features randomly while searching for
# splits, so without it the fitted tree (and its scores) differ between runs.
clf_tree = DecisionTreeClassifier( random_state = 42 )
clf_tree.fit( train_x_vectors, train_y )

# One test review followed by the label predicted for it.
print( test_x[100] )
print( clf_tree.predict( test_x_vectors[100] ) )

Again a masterful telling of a story.  My only regret is that I have to wait until Oct. 28th to find out the ending.  Well done!
['NEGATIVE']


#### Naive Bayes

In [134]:
from sklearn.naive_bayes import GaussianNB

# This model is trained and queried on dense arrays, hence the .toarray() calls.
clf_gnb = GaussianNB().fit( train_x_vectors.toarray(), train_y )

# One test review followed by the label predicted for it.
print( test_x[250], clf_gnb.predict( test_x_vectors[250].toarray() ), sep = "\n" )

I don't like this book, the story is not believable. The characters weren't develop, there was no chemistry between the hero and heroine.
['NEGATIVE']


#### Logistic Regression

In [135]:
from sklearn.linear_model import LogisticRegression

# fit() hands back the fitted estimator, so construction and training are chained.
clf_log = LogisticRegression().fit( train_x_vectors, train_y )

# One test review followed by the label predicted for it.
print( test_x[45], clf_log.predict( test_x_vectors[45] ), sep = "\n" )

I haven't read a urban novel in years. This has a good storyline, can't wait to read the sequel! I like the way how the author went into detail, about what the main characters were thinking. I have a feeling Khalil is going to go bonkers on Donte and a few other people.
['POSITIVE']


## Evaluation

#### Mean Accuracy

In [136]:
# Mean accuracy on the test split, one line per model, in the order:
# SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(
    *[
        model.score( features, test_y )
        for model, features in (
            ( clf_svm, test_x_vectors ),
            ( clf_tree, test_x_vectors ),
            ( clf_gnb, test_x_vectors.toarray() ),  # this model was fitted on dense arrays
            ( clf_log, test_x_vectors ),
        )
    ],
    sep = "\n"
)

0.8100961538461539
0.6322115384615384
0.6610576923076923
0.8052884615384616


#### F1 Score

In [137]:
from sklearn.metrics import f1_score

# Per-class F1 as [POSITIVE, NEGATIVE], one line per model, in the order:
# SVM, decision tree, Gaussian naive Bayes, logistic regression.
print(
    *[
        f1_score( test_y, predicted, average = None, labels = [ Sentiment.POSITIVE, Sentiment.NEGATIVE ] )
        for predicted in (
            clf_svm.predict( test_x_vectors ),
            clf_tree.predict( test_x_vectors ),
            clf_gnb.predict( test_x_vectors.toarray() ),  # this model was fitted on dense arrays
            clf_log.predict( test_x_vectors ),
        )
    ],
    sep = "\n"
)



[0.80871671 0.81145585]
[0.62222222 0.64168618]
[0.65693431 0.66508314]
[0.80291971 0.80760095]


#### Testing on Examples

In [116]:
# Class-balance check: total training labels with the POSITIVE / NEGATIVE counts,
# then the POSITIVE / NEGATIVE counts of the test labels.
print( len(train_y), train_y.count( Sentiment.POSITIVE ), train_y.count( Sentiment.NEGATIVE ) )
print( test_y.count( Sentiment.POSITIVE ), test_y.count( Sentiment.NEGATIVE ) )

872 436 436
208 208


In [141]:
# Hand-written sentences used as a quick sanity check of the logistic-regression model.
test_set = [
    "Bad book",
    "I think this book is not good",
    "I liked reading this book",
    "I recommend this book, nice experience!",
]

# Reuse the already-fitted vectorizer; the prediction is the cell's displayed output.
test_set_vectors = vectorizer.transform( test_set )
clf_log.predict( test_set_vectors )

array(['NEGATIVE', 'NEGATIVE', 'POSITIVE', 'POSITIVE'], dtype='<U8')

## Saving Model

#### Save Model

In [142]:
import os
import pickle

# Create the target folder first: open(..., 'wb') does not create missing directories.
os.makedirs( './models/sentiment', exist_ok = True )

# NOTE: only the classifier is saved. It was trained on vectors produced by the
# fitted `vectorizer`, which is not persisted here; save it as well if the
# model must be usable from a fresh session.
with open( './models/sentiment/log_classifier.pkl', 'wb' ) as f:
    pickle.dump( clf_log, f )

#### Load Model

In [143]:
# Read the pickled classifier back from the path the save cell wrote to.
# pickle.load can execute arbitrary code, so only load files you created or trust.
with open( './models/sentiment/log_classifier.pkl', 'rb' ) as f:
    new_clf = pickle.load( f )