## Classificações



In [41]:
import pandas as pd
import numpy as np
import json


In [138]:
import random

class Sentiment:
    Negative = 'Negative'
    Neutral = 'Neutral'
    Positive = 'Positive'

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.Negative
        elif self.score == 3:
            return Sentiment.Neutral
        else:
            return Sentiment.Positive
        
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
    
    def get_text(self):
        return [x.text for x in self.reviews]
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
    
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment== Sentiment.Negative, self.reviews))
        positive = list(filter(lambda x: x.sentiment== Sentiment.Positive, self.reviews))
        
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)

In [102]:
file_name = 'Books_small_10000.json'

reviews = []
with open(file_name) as f:
    for line in f:
        review = json.loads(line)
        reviews.append(Review(review['reviewText'], review['overall']))
reviews[4].sentiment

'Neutral'

# Preparing the data

In [139]:
from sklearn.model_selection import train_test_split

treino, test = train_test_split(reviews, test_size=0.33, random_state=42)

treino_container = ReviewContainer(treino)
test_container = ReviewContainer(test)



In [171]:
treino_container.evenly_distribute()
treino_x = treino_container.get_text()
treino_y = treino_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(treino_y.count(Sentiment.Positive))

436


## bag of world vectorization

In [180]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer()
treino_x_vectors = vectorizer.fit_transform(treino_x)

test_x_vectors = vectorizer.transform(test_x)



## Classification

### Linear SVM

In [149]:
from sklearn import svm

cficador_svm = svm.SVC(kernel='linear') 

cficador_svm.fit(treino_x_vectors, treino_y)

cficador_svm.predict(test_x_vectors[0])

array(['Positive'], dtype='<U8')

### Decision Tree

In [150]:
from sklearn.tree import DecisionTreeClassifier

cficador_dec = DecisionTreeClassifier()
cficador_dec.fit(treino_x_vectors, treino_y)
cficador_dec.predict(test_x_vectors[0])

array(['Positive'], dtype='<U8')

### Naive Bayes

In [164]:
from sklearn.naive_bayes import GaussianNB

cficador_gnb = GaussianNB()
fit_treino_y = treino_y

In [160]:

cficador_gnb.fit(treino_x_vectors, fit_treino_y)
cficador_gnb.predict(test_x_vectors[0])

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

### Logistic Regression

In [151]:
from sklearn.linear_model import LogisticRegression

cficador_log = LogisticRegression()
cficador_log.fit(treino_x_vectors, treino_y)

cficador_log.predict(test_x_vectors[0])

array(['Positive'], dtype='<U8')

## Evaluetion

In [166]:
# mean acurracy
print(cficador_svm.score(test_x_vectors, test_y))
print(cficador_dec.score(test_x_vectors, test_y))
#print(cficador_gnb.score(test_x_vectors, test_y))
print(cficador_log.score(test_x_vectors, test_y))

0.7124242424242424
0.633030303030303
0.7448484848484849


In [167]:
# f1 scores
from sklearn.metrics import f1_score

f1_score(test_y, cficador_svm.predict(test_x_vectors), average=None, labels=[Sentiment.Positive, Sentiment.Neutral, Sentiment.Negative])

array([0.85363477, 0.        , 0.28146853])

In [172]:
test_y.count(Sentiment.Positive)

208

### Testando os modelos

In [175]:
test_set = ['great, very interresing', 'horrible, not buy', 'Awesome book!! i love it']
new_test = vectorizer.transform(test_set)

cficador_svm.predict(new_test)

array(['Positive', 'Negative', 'Positive'], dtype='<U8')

## Refinando paâmetros do classificador

In [181]:
from sklearn.model_selection import GridSearchCV

parametros = {'kernel': ('linear', 'rbf'), 'C': (1, 4, 8, 16, 32)}

svc = svm.SVC()
classificador = GridSearchCV(svc, parametros, cv=5)
classificador.fit(treino_x_vectors, treino_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')})

## Salvando o modelo(Classificador)

In [183]:
import pickle

with open('./models/sentiment_classifier.pkl', 'wb') as f:
    pickle.dump(classificador, f)
    

In [184]:
### load de model

In [185]:
with open('./models/sentiment_classifier.pkl', 'rb') as f:
    loaded_clf =pickle.load(f)

In [189]:
print(loaded_clf.predict(test_x_vectors[4]))
print(test_x[4])

['Negative']
I love a dark mystery, but the construction of the plot and the weak characters kept me from enjoying this book.Obstensibly, this is the story of a detective who is investigating a series of asphyxiation murders.  The targets are beautiful young women.  However, the decective does very little investigation.  He attends a couple of autopsies, but we don't get to see any of the sleuthing that is the usual hallmark of the genre.  If he investigates anything about the women's lives of what they have in common, readers never find out.  Instead we have a bit of &#34;Keystone Kops&#34; style detecting courtesy of a computer dating magnate and his IT man (they determine that the young women were all clients and try to determine who the killer is before the news gets out and wrecks their bottom line).  The detective spends most of his time worried about his personal life and none of his interactions seem particularly authentic.  And the way the author wraps up the plotline about hi