In [159]:
import json
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
import time

In [102]:
#Load JSON Data
json_data = None
json_data = 'trainingdata_catnumbers.json'
data = pd.read_json(json_data, orient='records')

In [104]:
#check for duplicates:
print('Shape with duplicates: ', data.shape)
data.drop_duplicates(subset=['uniq_id'], inplace=True)
print('Shape without duplicates: ', data.shape)

Shape with duplicates:  (1318, 31)
Shape without duplicates:  (1310, 31)


In [105]:
#store id of labeled data:
labeled_ids = data.uniq_id.values

In [106]:
##Convert Category Column to int
data['category'] = data['category'].astype('int')

In [107]:
data.product_name.head()

0    DIGITAL Funda Airpods Silicone 1 Case+Carabine...
1    GadgetBite Anti-Lost Magnetic Strap Silicone C...
2    Wow Imagine MonoCarbon Genuine Carbon Fiber Ul...
3    Brain Freezer Skin Soft Silicone Dual Layer Ul...
4    Case-Mate AirPods Pro Tough Case Cover Silicon...
Name: product_name, dtype: object

### First Preprocessing

In [108]:
#add stemmer: maps different forms of the same word to a common “stem” 
stemmer = SnowballStemmer('english')

#add stopwords: to remove irrelevant words from text
nltk.download("stopwords")

#add english stopwords
words = stopwords.words("english")

#Preprocessing: apply stemmer and remove stopwords
data['cleaned'] = data['product_name'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [109]:
#assing data and labels:
X = data['cleaned'] #data
y = data.category #labels

#split data into train and test data (80/20-Split) including the cleaned product names column (x) and the category column (y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [110]:
#stratified cross-validation 5-fold (results in 20% Test- and 80% Trainingdata)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [111]:
X.size

1310

### Fit Model

In [160]:
#add a pipeline to build the model: contains a list of transforms 

#CountVectorizer: Converts a collection of text to a matrix of token counts 
#TfidfTransformer: gives every word a weight depending of their frequency
#SelectPercentile: Selects defined percentile of best fitting features by using chi2-Test
#MultinomialNB: Applies Multinomial Naive Bayes


pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2), stop_words="english")),
                     ('tfid', TfidfTransformer(sublinear_tf=False)),
                     ('chi',  SelectPercentile(chi2, percentile=50)),
                     ('clf', MultinomialNB())
                    ])

In [113]:
#stop time for training:
start_training_time = time.time()

#fit model with 80/20 split:
model = pipeline.fit(X_train, y_train)

#stop time:
total_training_time = (time.time() - start_training_time)



In [115]:
print("accuracy score 80/20 split: " + str(model.score(X_test, y_test)))

#get crossvalidation scores:
scoring = ['accuracy']
cv_results = cross_validate(pipeline, X, y, scoring=scoring, cv=skf)


#print crossvalidation scores:
print("accuracy cross-val per fold: ", cv_results['test_accuracy'])
print("accuracy cross-val mean: ", np.mean(cv_results['test_accuracy']))

accuracy score 80/20 split: 0.7900763358778626
accuracy cross-val per fold:  [0.81679389 0.80534351 0.8129771  0.83587786 0.80534351]
accuracy cross-val mean:  0.815267175572519


In [117]:
#runtime for 80/20 split training: 
print("runtime for training in seconds: ", total_training_time)

#get runtime of cross-validation:
print("runtime for training in seconds mean: ", np.mean(cv_results['fit_time']))


runtime for training in seconds:  0.12810230255126953
runtime for training in seconds mean:  0.0972982406616211


### Overview categories and their corresponding numbers

In [95]:
# Electronic Accessories" -> 1
# Electronic Accessories | Smartphone Accessories -> 11
# Electronic Accessories | Smartphone Accessories | Mobile Covers -> 111
# Electronic Accessories | Smartphone Accessories | Mobile Covers | Flip Covers -> 1111
# Electronic Accessories | Smartphone Accessories | Mobile Covers | Covers with Stand -> 1112
# Electronic Accessories | Headphones -> 12
# Electronic Accessories | Headphones | Headphone Accessories -> 121
# Electronic Accessories | Tablet Accessories -> 13
# Electronic Accessories | Chargers and Adapters -> 14
# Electronic Accessories | Batteries -> 15
# Electronic Accessories | Batteries | Camera Batteries -> 151
# Electronic Accessories | Cables -> 16
# Electronic Accessories | Cables | HDMI Cables -> 161
# Electronic Accessories | Screen Protectors -> 17
# Electronic Accessories | Camera Accessories -> 18
# Electronic Accessories | Camera Accessories | Camera Filters and Lenses -> 181
# Electronic Accessories | Laptop Accessories -> 19
# Electronic Accessories | Laptop Accessories | Laptop Bags and Backpacks -> 191
# Electronic Accessories | Laptop Accessories | Laptop Sleeves -> 192
# Electronic Accessories | Memory Cards -> 2
# Electronic Accessories | Keyboards -> 3
# Electronic Accessories | Keyboards | Keyboard Accessories -> 31
# Electronic Accessories | Mice -> 4
# Electronic Accessories | Mice | Mouse Pads -> 41
# Electronic Accessories | TV Accessories -> 5
# Electronic Accessories | Power Supplies -> 6

### prediction results for 80/20 simple split

In [161]:
print(model.predict(['Peace Hand - Gel Wrist Rest Support Mouse Pad - Non-slip - Anti-skid - for Computer - PC - Laptop Black']))
#should be 41 (Electronic Accessories | Mice | Mouse Pads)

[41]


In [162]:
print(model.predict(['Aarfa Slimfit Durable Printed Hard Case for Xiaomi Redmi 4 (4X)']))
#should be 111 (Electronic Accessories | Smartphone Accessories | Mobile Covers)

[1112]


In [163]:
print(model.predict(['C&E CNE622576 (60 Feet/18.2 Meters) High Speed HDMI Cable Male to Male with Ethernet and Audio Return (Black)']))
#should be 161 (Electronic Accessories | Cables | HDMI Cables)

[161]


In [164]:
print(model.predict(['Saco Transparent Laptop Touchpad Protector for All Laptops (Clear, 158x98 mm)']))
#should be 19 (Electronic Accessories | Laptop Accessories)

[31]


In [165]:
print(model.predict(['Verbatim Bravo Wired Notebook Optical Mouse, Black (98106)']))
#should be 4 (Electronic Accessories | Mice)

[192]


In [158]:
print(model.predict(['FOSO F36 PU Leather Magnetic Flip Cover Wallet Back Cover for Honor View 20 (Brown)']))
#should be 1111 (Electronic Accessories | Smartphone Accessories | Mobile Covers | Flip Covers)

[1111]


### Fit model on all labeled data

In [127]:
#stop time for training:
start_training_time = time.time()

#fit model with data and labels:
model = pipeline.fit(X, y)

#stop time:
total_training_time = (time.time() - start_training_time)

#runtime for training: 
print("runtime for training in seconds: ", total_training_time)


runtime for training in seconds:  0.11916422843933105


### Load unlabeled dataset

In [128]:
#load json dataset 
json_file = 'Amazon_Electronics_2_cleaned.json'

#convert json string to pandas object and save it to data
data = pd.read_json(json_file, orient='records')

In [129]:
#filter for unlabeled data: 
data_unlabeled = data[~data['uniq_id'].isin(labeled_ids)].copy(deep=True)
print('size of unlabeled data: ', data_unlabeled.shape[0])

size of unlabeled data:  8690


In [130]:
#select relevant column and copy it into product_names
product_names = data_unlabeled[['product_name']].copy()

#show data
product_names.head()

Unnamed: 0,product_name
0,Cazcase Deer Pattern Smart Case Cover Flip Sta...
1,"D-kandy for Gionee A1 Lite, Fashion Series Lea..."
3,"iPhone 6 Case, LUVVITT® ULTRA ARMOR iPhone 6 C..."
4,Redgear MPR800 Soft Base Mousepad with 4 LED S...
6,Aimo Wireless IPH5PCLP002 Rubber Essentials Sl...


In [131]:
#preprocessing: apply stemming & lower case on each row of product_names. Save cleaned data in new column "cleaned"
product_names['cleaned'] = product_names['product_name'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

#show cleaned data
product_names['cleaned'].head()

0    cazcas deer pattern smart case cover flip stan...
1    d kandi gione a lite fashion seri leather flip...
3    iphon case luvvitt ultra armor iphon case best...
4     redgear mpr soft base mousepad led spectrum mode
6    aimo wireless iph pclp rubber essenti slim dur...
Name: cleaned, dtype: object

### Prediction

In [135]:
#stop time for prediction:
start_training_time = time.time()

#predict labels
predicted_labels = model.predict(product_names['cleaned'])

#stop time:
total_training_time = (time.time() - start_training_time)

#runtime for training: 
print("runtime for prediction in seconds for {} titles: {}".format(total_training_time,len(predicted_labels)))

runtime for prediction in seconds for 0.2754089832305908 titles: 8690
