In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import json
import pandas as pd

# accessories = 0
# cell_phones = 1

# Labelled category files that make up the training corpus.
TRAIN_FILES = ["accessories.json", "cell_phones.json"]

# Gather every entry as a plain dict and build the DataFrame once at the end;
# creating a one-row DataFrame per entry and concatenating them all is much
# slower and yields a frame whose index is 0 on every row.
data_train = []

for train_path in TRAIN_FILES:
    with open(train_path, 'r') as file:
        data = json.load(file)

    # Each file is a mapping whose values are lists of entry dicts.
    for key in data.keys():
        data_train.extend(data[key])

df = pd.DataFrame(data_train)

# NOTE(review): the three text fields are joined with no separator, so the
# last word of one field and the first word of the next end up as a single
# token. Kept as-is because the prediction cell builds its text the same way;
# if a separator is added, change both places together.
df["sentence"] = df["title"] + df["summary"] + df["text"]

# Raw arrays handed to train_test_split.
sentence = df["sentence"].values
label = df["label"].values

# Fixed seed so the 80/20 split -- and therefore every accuracy printed
# afterwards -- is the same on each Restart & Run All.
SPLIT_SEED = 42

sentence_train, sentence_test, y_train, y_test = train_test_split(
    sentence,
    label,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Bag-of-words features. The vocabulary is learned from the training split
# only, so no information from the test sentences leaks into the features.
vectorizer = CountVectorizer()

# Dense arrays are kept on purpose: GaussianNB, which is fitted on X_train
# further down, does not accept sparse matrices.
X_train = vectorizer.fit_transform(sentence_train).toarray()
X_test = vectorizer.transform(sentence_test).toarray()

def fit_and_report(model, report_template):
    """Fit `model` on the training split, print and return its test accuracy.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
    report_template : str with one ``{}`` placeholder that receives the score.

    Returns
    -------
    The value returned by ``model.score(X_test, y_test)``.
    """
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    print(report_template.format(accuracy))
    return accuracy


# Candidate classifiers. The notebook-level names are kept because later cells
# use them (`nb` drives the prediction cell); `svc` refers to the RBF model,
# which is what it was bound to after the original cell finished.
log_reg = LogisticRegression()
svc_linear = SVC(kernel="linear", C=1)
svc = SVC(kernel="rbf", C=3)
nb = GaussianNB()
# Seeded so the network's weight initialisation (and its score) is repeatable.
mlp = MLPClassifier(random_state=42)

# Fit and score in the original order; `score` ends up holding the last result.
for report_template, classifier in [
    ('Logistic Regression: {}', log_reg),
    ('Linear kernel SVC accuracy: {}', svc_linear),
    ('Radial Basis Function kernel SVC accuracy: {}', svc),
    ('Gaussian Naive Bayes accuracy: {}', nb),
    ('Multi-layer Perceptron Classifier accuracy: {}', mlp),
]:
    score = fit_and_report(classifier, report_template)

Logistic Regression: 0.875
Linear kernel SVC accuracy: 1.0
Radial Basis Function kernel SVC accuracy: 1.0
Gaussian Naive Bayes accuracy: 1.0
Multi-layer Perceptron Classifier accuracy: 1.0


In [7]:
# Predictions with my "proof of concept" model (Gaussian Naive Bayes)

# NOTE: this cell uses `vectorizer` and `nb` from the training cell; run that
# cell first on a fresh kernel.

# Upper bound on how many entries get labelled and written out.
MAX_PREDICTIONS = 100000
# Rows vectorised and predicted per call. Features are densified for
# GaussianNB, so the batch size bounds peak memory.
PREDICT_BATCH_SIZE = 256

with open("cell_phones_and_accessories.json", 'r') as file:
    data = json.load(file)

# Flatten the per-key entry lists in file order and keep at most
# MAX_PREDICTIONS of them. The list holds the same dict objects as `data`,
# so adding "label" below also updates the entries inside `data`.
data_train = [entry for key in data.keys() for entry in data[key]][:MAX_PREDICTIONS]

# Predict in batches instead of building a one-row DataFrame and calling
# transform/predict once per entry.
for batch_start in range(0, len(data_train), PREDICT_BATCH_SIZE):
    batch = data_train[batch_start:batch_start + PREDICT_BATCH_SIZE]

    # Same field concatenation as the training text, so tokens line up with
    # the fitted vocabulary.
    batch_sentences = [
        entry["title"] + entry["summary"] + entry["text"] for entry in batch
    ]
    batch_features = vectorizer.transform(batch_sentences).toarray()

    for entry, predicted in zip(batch, nb.predict(batch_features)):
        # Stored as a string, matching the previously written output format.
        entry["label"] = str(predicted)

with open("cell_and_accessories_test_label.json", "w+") as file:
    json.dump(data_train, file, indent=2)