## Product Category Classification - NLP
* We want to predict which category a product belongs to from the text of its review.


In [2]:
import json
import pickle
import random
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sklearn as sk

In [3]:
class Category:
    """Namespace of the category labels used as classification targets.

    Each attribute is a plain string whose value equals its own name.
    """

    # Labels are plain strings, so they can be compared and printed directly.
    ELECTRONICS = "ELECTRONICS"
    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"
    GROCERY = "GROCERY"
    PATIO = "PATIO"
    
class CategoryObject:
    """Pairs a piece of text with the category label it belongs to.

    Attributes are stored exactly as given; no validation or copying is done.
    """

    def __init__(self, category, text):
        """Store the label as ``category`` and the raw text as ``text``."""
        self.category, self.text = category, text

In [4]:
# Paths of the line-delimited JSON files to load, one file per category.
file_names = [
    'files/category/Electronics_small.json',
    'files/category/Books_small.json',
    'files/category/Clothing_small.json',
    'files/category/Grocery_small.json',
    'files/category/Patio_small.json',
]

# Label applied to every record of the file at the same position in `file_names`.
list_categories = [
    Category.ELECTRONICS,
    Category.BOOKS,
    Category.CLOTHING,
    Category.GROCERY,
    Category.PATIO,
]

# Fixed seed so the shuffled order (and therefore the downstream train/test
# split) is identical on every Restart & Run All.
SHUFFLE_SEED = 33

# The two lists are consumed in lockstep; fail loudly if they drift apart
# instead of silently dropping files or raising an opaque IndexError.
assert len(file_names) == len(list_categories), \
    "file_names and list_categories must have the same length"

categories = []

for file_name, category in zip(file_names, list_categories):
    # JSON text is UTF-8 by specification; be explicit so decoding does not
    # depend on the platform's default encoding.
    with open(file_name, 'r', encoding='utf-8') as reader:
        for line in reader:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            category_json = json.loads(line)
            categories.append(CategoryObject(category, category_json["reviewText"]))

# Mix the categories together using a private, seeded generator so the global
# `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(categories)

### Data Preparation

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the loaded records for testing; the fixed random_state pins
# the split performed by train_test_split.
train, test = train_test_split(
    categories,
    test_size=0.2,
    random_state=33,
)

In [7]:
# Sanity check: sizes of the training and test partitions.
(len(train), len(test))

(4000, 1000)

In [8]:
# Separate each partition into model inputs (texts) and targets (labels).
# Both lists are built from the same sequence, so they stay index-aligned.
X_train = [item.text for item in train]
y_train = [item.category for item in train]

X_test = [item.text for item in test]
y_test = [item.category for item in test]

In [9]:
# Peek at the first five training labels.
y_train[0:5]

['GROCERY', 'BOOKS', 'ELECTRONICS', 'ELECTRONICS', 'GROCERY']

### Bag-of-Words Vectorisation

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Fit on the training texts only; the test texts are merely transformed with
# the already-fitted vectorizer so nothing is learned from the held-out data.
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

In [11]:
# Labels and texts must have the same length to be usable as (X, y) pairs.
(len(y_train), len(X_train))

(4000, 4000)

### Classification of Categories
* Here I'm going to use `SVC` from `sklearn.svm`.

In [12]:
from sklearn.svm import SVC

# SVC with its default settings, trained on the vectorised training texts.
clf = SVC()
clf.fit(X=X_train_vectors, y=y_train)

SVC()

In [18]:
# Smoke test: classify a single hand-written phrase.
sent = "cell phone"
# transform expects a collection of documents, hence the one-element list.
sent_vect = vectorizer.transform([sent])
clf.predict(sent_vect)


array(['ELECTRONICS'], dtype='<U11')

In [14]:
# Score the classifier on the very data it was fitted on: this is a
# training-set score, not an estimate of held-out performance.
clf.score(X=X_train_vectors, y=y_train)

0.895

### Performance

In [17]:
from sklearn.metrics import f1_score

# Predictions for the held-out test vectors.
y_pred = clf.predict(X_test_vectors)

# average=None requests one F1 value per class instead of a single aggregate.
# NOTE(review): per the scikit-learn docs the entries follow the sorted label
# order (here BOOKS, CLOTHING, ELECTRONICS, GROCERY, PATIO) -- confirm.
f1_score(y_true=y_test, y_pred=y_pred, average=None)

array([0.94021739, 0.77218225, 0.70725995, 0.81428571, 0.72282609])

In [20]:
# Classify several hand-written phrases in one batch.
sents = [
    "cell phone",
    "shirt",
    "computer",
]
sents_vects = vectorizer.transform(sents)
clf.predict(X=sents_vects)

array(['ELECTRONICS', 'CLOTHING', 'ELECTRONICS'], dtype='<U11')

> **Conclusion:** The more training data we have, the better the **model** performs.