In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score

import json

### Data class

In [2]:

class Category:
    """String labels for the product categories a review can belong to.

    Plain class attributes (not an Enum), so each label is an ordinary
    ``str`` and can be used directly as a classifier target.
    """

    # One label per product category.
    ELECTRONICS = "ELECTRONICS"
    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"
    GROCERY = "GROCERY"
    PATIO = "PATIO"
    
class Sentiment:
    """String labels for the sentiment derived from a review's score.

    Plain class attributes (not an Enum), so each label is an ordinary ``str``.
    """

    # The three possible sentiment labels.
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"

class Review:
    """A single product review plus the sentiment derived from its score.

    Attributes:
        category: Category label the review was created with.
        text: Review text.
        score: Numeric rating; only compared against the thresholds 2 and 4.
        sentiment: ``Sentiment`` label computed once, at construction time,
            from ``score``.
    """

    def __init__(self, category, text, score):
        self.category = category
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Return the ``Sentiment`` label for ``self.score``.

        * ``score <= 2``     -> ``Sentiment.NEGATIVE``
        * ``2 < score < 4``  -> ``Sentiment.NEUTRAL``
        * ``score >= 4``     -> ``Sentiment.POSITIVE``

        The neutral band is a range rather than an exact ``== 3`` test so
        that fractional scores (e.g. 2.5) are not mislabelled as positive.
        Integer scores map exactly as before.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        if self.score < 4:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE
        
class ReviewContainer:
    """Holds a list of reviews and exposes them as model inputs and labels.

    Attributes:
        reviews: The wrapped list of review objects. ``evenly_distribute``
            replaces this list.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [review.text for review in self.reviews]

    def get_x(self, vectorizer):
        """Return ``vectorizer.transform(...)`` of the review texts.

        Only ``transform`` is called here, so fitting the vectorizer is the
        caller's responsibility.
        """
        return vectorizer.transform(self.get_text())

    def get_y(self):
        """Return the ``sentiment`` of every review, in container order."""
        return [review.sentiment for review in self.reviews]

    def get_category(self):
        """Return the ``category`` of every review, in container order."""
        return [review.category for review in self.reviews]

    def evenly_distribute(self):
        """Rebalance ``self.reviews`` to equal negative and positive counts.

        Reviews whose sentiment is neither NEGATIVE nor POSITIVE are dropped.
        The larger of the two groups is truncated to the size of the smaller
        one (previously only the positive group was truncated, so the result
        stayed unbalanced when negatives were the majority). The balanced
        list is then shuffled in place with ``random.shuffle``.

        Safe to call when either group is empty: the result is then an empty
        list rather than an ``IndexError``.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        per_class = min(len(negative), len(positive))
        self.reviews = negative[:per_class] + positive[:per_class]
        random.shuffle(self.reviews)

### Load data

In [3]:
# One file per category, read line by line with one JSON object per line.
# file_names[i] is labelled with file_categories[i].
file_names = ['./data/Electronics_small.json', './data/Books_small.json', './data/Clothing_small.json', './data/Grocery_small.json', './data/Patio_small.json']
file_categories = [Category.ELECTRONICS, Category.BOOKS, Category.CLOTHING, Category.GROCERY, Category.PATIO]

reviews = []
for file_name, category in zip(file_names, file_categories):
    # JSON text is UTF-8; pass the encoding explicitly instead of relying on
    # the platform default, which is not UTF-8 on every system.
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            review_json = json.loads(line)
            reviews.append(Review(category, review_json['reviewText'], review_json['overall']))

### Data preparation

In [4]:
# Hold out 33% of the reviews for evaluation; the fixed random_state keeps
# the split identical across runs.
train, test = train_test_split(reviews, test_size=0.33, random_state=42)
train_container, test_container = ReviewContainer(train), ReviewContainer(test)

# The TF-IDF vectorizer is fitted on the training text only; the test split
# is merely transformed with it.
corpus = train_container.get_text()
vectorizer = TfidfVectorizer().fit(corpus)

# Features are the vectorized texts, targets are the product categories.
train_x, train_y = train_container.get_x(vectorizer), train_container.get_category()
test_x, test_y = test_container.get_x(vectorizer), test_container.get_category()

### Classification

In [5]:
from sklearn import svm

# Linear-kernel support vector classifier predicting the product category
# from the TF-IDF features of the training split.
clf = svm.SVC(kernel='linear', C=16, gamma='auto')
clf.fit(train_x, train_y)

SVC(C=16, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [6]:
# Hand-written sentences as a quick sanity check of the trained classifier.
test_set = [
    'great for my wedding',
    "loved it in my garden",
    'good computer',
]
# Reuse the already-fitted vectorizer so the features match the training space.
new_test = vectorizer.transform(test_set)

clf.predict(new_test)

array(['CLOTHING', 'PATIO', 'ELECTRONICS'], dtype='<U11')

In [7]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

# GaussianNB needs dense input, so the vectorized features are densified
# first. Use .toarray() (plain ndarray) rather than .todense(): the latter
# returns an np.matrix, which current scikit-learn releases reject.
gnb.fit(train_x.toarray(), train_y)
gnb.score(test_x.toarray(), test_y)

0.8109090909090909

In [8]:
y_pred = clf.predict(test_x)

# Predicted vs. actual category for every held-out review.
for predicted, actual in zip(y_pred, test_y):
    print(predicted, actual)

# Per-class F1 (average=None returns one score per class). Kept as the last
# expression of the cell so the notebook actually displays it; previously
# the value was computed mid-cell and silently discarded.
f1_score(test_y, y_pred, average=None)

BOOKS BOOKS
ELECTRONICS CLOTHING
CLOTHING CLOTHING
BOOKS BOOKS
ELECTRONICS ELECTRONICS
ELECTRONICS ELECTRONICS
ELECTRONICS ELECTRONICS
CLOTHING CLOTHING
CLOTHING CLOTHING
BOOKS BOOKS
CLOTHING CLOTHING
ELECTRONICS ELECTRONICS
ELECTRONICS ELECTRONICS
ELECTRONICS ELECTRONICS
GROCERY GROCERY
GROCERY GROCERY
BOOKS BOOKS
GROCERY GROCERY
BOOKS BOOKS
BOOKS BOOKS
CLOTHING ELECTRONICS
GROCERY GROCERY
BOOKS BOOKS
ELECTRONICS ELECTRONICS
PATIO PATIO
ELECTRONICS ELECTRONICS
PATIO PATIO
ELECTRONICS ELECTRONICS
PATIO PATIO
ELECTRONICS ELECTRONICS
BOOKS BOOKS
ELECTRONICS ELECTRONICS
PATIO PATIO
GROCERY PATIO
BOOKS BOOKS
ELECTRONICS CLOTHING
GROCERY GROCERY
PATIO PATIO
PATIO PATIO
ELECTRONICS CLOTHING
GROCERY GROCERY
PATIO PATIO
ELECTRONICS ELECTRONICS
ELECTRONICS ELECTRONICS
ELECTRONICS ELECTRONICS
GROCERY GROCERY
GROCERY GROCERY
PATIO PATIO
GROCERY GROCERY
CLOTHING CLOTHING
PATIO PATIO
PATIO PATIO
CLOTHING CLOTHING
CLOTHING CLOTHING
BOOKS BOOKS
GROCERY GROCERY
BOOKS BOOKS
ELECTRONICS ELECTRONICS
BOOK

PATIO PATIO
CLOTHING CLOTHING
PATIO PATIO
CLOTHING PATIO
PATIO CLOTHING
GROCERY GROCERY
BOOKS CLOTHING
BOOKS BOOKS
PATIO PATIO
PATIO PATIO
ELECTRONICS ELECTRONICS
GROCERY GROCERY
PATIO PATIO
CLOTHING CLOTHING
PATIO PATIO
PATIO PATIO
CLOTHING CLOTHING
CLOTHING CLOTHING
CLOTHING GROCERY
BOOKS BOOKS
GROCERY GROCERY
GROCERY GROCERY
CLOTHING CLOTHING
CLOTHING CLOTHING
ELECTRONICS ELECTRONICS
PATIO PATIO
BOOKS GROCERY
PATIO PATIO
PATIO PATIO
PATIO PATIO
ELECTRONICS ELECTRONICS
ELECTRONICS PATIO
BOOKS BOOKS
BOOKS BOOKS
CLOTHING CLOTHING
PATIO PATIO
PATIO PATIO
BOOKS BOOKS
BOOKS BOOKS
PATIO PATIO
CLOTHING CLOTHING
GROCERY GROCERY
GROCERY GROCERY
GROCERY GROCERY
ELECTRONICS ELECTRONICS
ELECTRONICS BOOKS
BOOKS BOOKS
BOOKS BOOKS
BOOKS BOOKS
CLOTHING CLOTHING
CLOTHING CLOTHING
BOOKS BOOKS
BOOKS BOOKS
BOOKS BOOKS
CLOTHING CLOTHING
ELECTRONICS ELECTRONICS
CLOTHING CLOTHING
PATIO PATIO
BOOKS BOOKS
BOOKS BOOKS
CLOTHING CLOTHING
CLOTHING CLOTHING
CLOTHING CLOTHING
CLOTHING CLOTHING
PATIO PATIO
GROCERY 

In [10]:

# Score of the fitted classifier on the held-out split (scikit-learn's
# default `score` for classifiers is mean accuracy).
clf.score(test_x, test_y)

0.9090909090909091

In [11]:
from sklearn.model_selection import GridSearchCV

# Search over kernel type and regularisation strength C.
parameters = {'kernel': ('linear', 'rbf'), 'C': [0.1, 1, 8, 16, 32]}
svc = svm.SVC()
# 2 kernels x 5 C values x 5 folds = 50 fits. n_jobs=-1 runs them on all
# available cores; the selected model is the same, it just finishes sooner.
# NOTE: this rebinds `clf`, so the cells below evaluate the grid-searched
# model rather than the single SVC fitted earlier.
clf = GridSearchCV(svc, parameters, cv=5, n_jobs=-1)
clf.fit(train_x, train_y)



KeyboardInterrupt: 

In [None]:
# Held-out score of `clf`. When the notebook is run top to bottom, `clf` is
# the GridSearchCV object at this point, so the grid-search cell must have
# finished fitting before this cell is run.
clf.score(test_x, test_y)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
# `plt` must be the pyplot interface; `import matplotlib as plt` would bind
# the top-level package, which has no subplots()/show().
import matplotlib.pyplot as plt

y_pred = clf.predict(test_x)

# Fix the label order explicitly so matrix rows/columns and the tick labels
# are guaranteed to line up.
labels = [Category.ELECTRONICS, Category.BOOKS, Category.CLOTHING, Category.GROCERY, Category.PATIO]

# confusion_matrix puts the true class on the rows and the predicted class
# on the columns.
cm = confusion_matrix(test_y, y_pred, labels=labels)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

fig, ax = plt.subplots()
sn.heatmap(df_cm, annot=True, fmt='d', ax=ax)
# Label the axes so the figure can be read on its own; the trailing `;`
# suppresses the repr of the returned artist list in the notebook.
ax.set(xlabel='Predicted category', ylabel='True category', title='Category confusion matrix (test split)');