In [186]:
import json
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import mutual_info_classif, SelectPercentile
from sklearn.svm import SVC
import time

In [187]:
# Read the labeled training records (records-oriented JSON) into a DataFrame.
json_data = 'trainingdata_catnumbers.json'
data = pd.read_json(json_data, orient='records')

In [188]:
# Cast the category column to integers; every other column keeps its dtype.
data = data.astype({'category': 'int'})

In [189]:
# Drop rows whose uniq_id was already seen (first occurrence is kept)
# and report the frame's shape before and after.
print('Shape with duplicates: ', data.shape)
data = data.drop_duplicates(subset=['uniq_id'])
print('Shape without duplicates: ', data.shape)

Shape with duplicates:  (1318, 31)
Shape without duplicates:  (1310, 31)


In [190]:
# Remember the uniq_ids of all labeled rows; used later to filter them out of the full dataset.
labeled_ids = data['uniq_id'].values

### First Preprocessing

In [191]:
# Snowball stemmer: maps different forms of the same word to a common stem.
stemmer = SnowballStemmer('english')

# Fetch the NLTK stop-word corpus (no-op when it is already up to date).
nltk.download("stopwords")

# English stop words, used to drop uninformative tokens.
words = stopwords.words("english")


def clean_product_name(name):
    """Return `name` as stemmed, lower-cased tokens joined by single spaces.

    Every character that is not an ASCII letter is replaced by a space,
    tokens contained in `words` are dropped and the rest are stemmed.
    """
    # NOTE(review): the stop-word test runs on the token in its original case,
    # so a capitalised stop word (e.g. "The") presumably survives - confirm
    # whether that is intended.
    tokens = re.sub("[^a-zA-Z]", " ", name).split()
    stems = [stemmer.stem(token) for token in tokens if token not in words]
    return " ".join(stems).lower()


# Store the cleaned product names in the new column 'cleaned'.
data['cleaned'] = data['product_name'].apply(clean_product_name)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [192]:
# Features are the cleaned product names, targets are the category numbers.
X, y = data['cleaned'], data['category']

# Hold out 20% of the rows as test data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [193]:
# Stratified 5-fold cross-validation with shuffling and a fixed seed
# (each fold trains on roughly 80% of the rows and tests on the remaining 20%).
skf = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)

### Fit Model

In [195]:
# Model pipeline; the steps run in this order:
#   vect - CountVectorizer: token counts for unigrams and bigrams
#   tfid - TfidfTransformer: re-weights the counts with tf-idf
#   chi  - SelectPercentile: keeps the best 50% of features, scored with the chi2 test
#   clf  - SVC: support vector classification (C=1.0, max_iter=1000, fixed seed)
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfid', TfidfTransformer()),
    ('chi', SelectPercentile(chi2, percentile=50)),
    ('clf', SVC(C=1.0, max_iter=1000, random_state=0)),
]
pipeline = Pipeline(pipeline_steps)

In [196]:
# Ensure the training labels have an integer dtype.
y_train = y_train.astype(int)

# Time the fit on the 80% training split.
start_training_time = time.time()
model = pipeline.fit(X_train, y_train)
total_training_time = time.time() - start_training_time


In [197]:
# Accuracy of the fitted model on the held-out 20% split.
holdout_accuracy = model.score(X_test, y_test)
print("accuracy score 80/20 split: " + str(holdout_accuracy))

# Accuracy per fold of the stratified cross-validation over all labeled rows.
scoring = ['accuracy']
cv_results = cross_validate(pipeline, X, y, cv=skf, scoring=scoring)

# Per-fold scores and their mean.
fold_accuracies = cv_results['test_accuracy']
print("accuracy cross-val per fold: ", fold_accuracies)
print("accuracy cross-val mean: ", np.mean(fold_accuracies))

accuracy score 80/20 split: 0.8549618320610687
accuracy cross-val per fold:  [0.84351145 0.87022901 0.88549618 0.8740458  0.85114504]
accuracy cross-val mean:  0.8648854961832061


In [198]:
# Wall-clock time of the single fit on the 80/20 split.
print("runtime for training in seconds: ", total_training_time)

# Mean fit time over the cross-validation folds.
mean_fit_time = np.mean(cv_results['fit_time'])
print("runtime for training in seconds mean: ", mean_fit_time)

runtime for training in seconds:  0.4547598361968994
runtime for training in seconds mean:  0.5098120212554932


### Overview categories and their corresponding numbers

In [199]:
# Electronic Accessories -> 1
# Electronic Accessories | Smartphone Accessories -> 11
# Electronic Accessories | Smartphone Accessories | Mobile Covers -> 111
# Electronic Accessories | Smartphone Accessories | Mobile Covers | Flip Covers -> 1111
# Electronic Accessories | Smartphone Accessories | Mobile Covers | Covers with Stand -> 1112
# Electronic Accessories | Headphones -> 12
# Electronic Accessories | Headphones | Headphone Accessories -> 121
# Electronic Accessories | Tablet Accessories -> 13
# Electronic Accessories | Chargers and Adapters -> 14
# Electronic Accessories | Batteries -> 15
# Electronic Accessories | Batteries | Camera Batteries -> 151
# Electronic Accessories | Cables -> 16
# Electronic Accessories | Cables | HDMI Cables -> 161
# Electronic Accessories | Screen Protectors -> 17
# Electronic Accessories | Camera Accessories -> 18
# Electronic Accessories | Camera Accessories | Camera Filters and Lenses -> 181
# Electronic Accessories | Laptop Accessories -> 19
# Electronic Accessories | Laptop Accessories | Laptop Bags and Backpacks -> 191
# Electronic Accessories | Laptop Accessories | Laptop Sleeves -> 192
# Electronic Accessories | Memory Cards -> 2
# Electronic Accessories | Keyboards -> 3
# Electronic Accessories | Keyboards | Keyboard Accessories -> 31
# Electronic Accessories | Mice -> 4
# Electronic Accessories | Mice | Mouse Pads -> 41
# Electronic Accessories | TV Accessories -> 5
# Electronic Accessories | Power Supplies -> 6

### Prediction results for the simple 80/20 split

In [200]:
#should be 41 (Electronic Accessories | Mice | Mouse Pads)
# The pipeline was fitted on the 'cleaned' column (letters only, stop words
# removed, stemmed, lower-cased), so the raw title gets the same cleaning
# before it is passed to the model.
title = 'Peace Hand - Gel Wrist Rest Support Mouse Pad - Non-slip - Anti-skid - for Computer - PC - Laptop Black'
title_cleaned = " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", title).split() if i not in words]).lower()
print(model.predict([title_cleaned]))

[41]


In [201]:
#should be 111 (Electronic Accessories | Smartphone Accessories | Mobile Covers)
# The pipeline was fitted on the 'cleaned' column (letters only, stop words
# removed, stemmed, lower-cased), so the raw title gets the same cleaning
# before it is passed to the model.
title = 'Aarfa Slimfit Durable Printed Hard Case for Xiaomi Redmi 4 (4X)'
title_cleaned = " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", title).split() if i not in words]).lower()
print(model.predict([title_cleaned]))

[111]


In [202]:
#should be 161 (Electronic Accessories | Cables | HDMI Cables)
# The pipeline was fitted on the 'cleaned' column (letters only, stop words
# removed, stemmed, lower-cased), so the raw title gets the same cleaning
# before it is passed to the model.
title = 'C&E CNE622576 (60 Feet/18.2 Meters) High Speed HDMI Cable Male to Male with Ethernet and Audio Return (Black)'
title_cleaned = " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", title).split() if i not in words]).lower()
print(model.predict([title_cleaned]))

[161]


In [203]:
#should be 19 (Electronic Accessories | Laptop Accessories)
# The pipeline was fitted on the 'cleaned' column (letters only, stop words
# removed, stemmed, lower-cased), so the raw title gets the same cleaning
# before it is passed to the model.
title = 'Saco Transparent Laptop Touchpad Protector for All Laptops (Clear, 158x98 mm)'
title_cleaned = " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", title).split() if i not in words]).lower()
print(model.predict([title_cleaned]))

[19]


In [204]:
#should be 4 (Electronic Accessories | Mice)
# The pipeline was fitted on the 'cleaned' column (letters only, stop words
# removed, stemmed, lower-cased), so the raw title gets the same cleaning
# before it is passed to the model.
title = 'Verbatim Bravo Wired Notebook Optical Mouse, Black (98106)'
title_cleaned = " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", title).split() if i not in words]).lower()
print(model.predict([title_cleaned]))

[1]


In [205]:
#should be 1111 (Electronic Accessories | Smartphone Accessories | Mobile Covers | Flip Covers)
# The pipeline was fitted on the 'cleaned' column (letters only, stop words
# removed, stemmed, lower-cased), so the raw title gets the same cleaning
# before it is passed to the model.
title = 'FOSO F36 PU Leather Magnetic Flip Cover Wallet Back Cover for Honor View 20 (Brown)'
title_cleaned = " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", title).split() if i not in words]).lower()
print(model.predict([title_cleaned]))

[1111]


### Fit model on all labeled data

In [206]:
# Refit the pipeline on every labeled row (X, y) and measure how long the fit takes.
start_training_time = time.time()
model = pipeline.fit(X, y)
total_training_time = time.time() - start_training_time

# Report the elapsed wall-clock time.
print("runtime for training in seconds: ", total_training_time)


runtime for training in seconds:  0.6360390186309814


### Load unlabeled dataset 

In [207]:
# Path of the product dataset (records-oriented JSON).
json_file = 'Amazon_Electronics_2_cleaned.json'

# Read the file into a DataFrame. This rebinds `data`, which until now held the labeled frame.
data = pd.read_json(json_file, orient='records')

In [174]:
# Keep only the rows whose uniq_id is not among the labeled ids.
is_labeled = data['uniq_id'].isin(labeled_ids)
data_unlabeled = data[~is_labeled].copy(deep=True)

print('size of unlabeled data: ', len(data_unlabeled))

size of unlabeled data:  8690


In [175]:
# Single-column copy that holds only the product names.
product_names = data_unlabeled.loc[:, ['product_name']].copy()

# Preview the first rows.
product_names.head()

Unnamed: 0,product_name
0,Cazcase Deer Pattern Smart Case Cover Flip Sta...
1,"D-kandy for Gionee A1 Lite, Fashion Series Lea..."
3,"iPhone 6 Case, LUVVITT® ULTRA ARMOR iPhone 6 C..."
4,Redgear MPR800 Soft Base Mousepad with 4 LED S...
6,Aimo Wireless IPH5PCLP002 Rubber Essentials Sl...


In [176]:
# Work on a full copy of the unlabeled frame (all columns), so data_unlabeled itself stays untouched.
product_names = data_unlabeled.copy(deep=True)

# Preview the first rows.
product_names.head()

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_offers,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock
0,8258a459bf720ac86b8bc2d214346c35,2020-02-06 19:30:06 +0000,B07652FT69,https://www.amazon.in/Cazcase-Pattern-Smart-Co...,Cazcase Deer Pattern Smart Case Cover Flip Sta...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375329031,E Shop Solution,...,,,,,,,,,,
1,8610899e786315b1adad872764f424b0,2020-02-07 00:20:04 +0000,B07TMCY7WN,https://www.amazon.in/D-kandy-Gionee-A1-Lite-M...,"D-kandy for Gionee A1 Lite, Fashion Series Lea...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1389409031,SHIV DURGA ENTERPRISES,...,,,,,,,,,,
3,c5499856b11d3a2796bdd362b1701fc1,2020-02-07 02:38:59 +0000,B00JYKGFWY,https://www.amazon.in/iPhone-LUVVITT%C2%AE-Scr...,"iPhone 6 Case, LUVVITT® ULTRA ARMOR iPhone 6 C...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,HelloYehlo,...,,,,,,,,,,
4,ce5ac9f85505667fb9dafce9c1b0103c,2020-02-07 13:30:48 +0000,B07LB5SGK7,https://www.amazon.in/Redgear-MPR800-Soft-Mous...,Redgear MPR800 Soft Base Mousepad with 4 LED S...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375413031,Appario Retail Private Ltd,...,,,,,,,,,,
6,7fff79ed7aacb6b010d870c293a6e0e4,2020-02-07 00:40:57 +0000,B009NSXDHW,https://www.amazon.in/Aimo-Wireless-IPH5PCLP00...,Aimo Wireless IPH5PCLP002 Rubber Essentials Sl...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,,...,,,,,,,,,,


In [177]:
# Preprocessing of each product name: replace non-letters with spaces, drop
# tokens contained in `words`, stem the remaining tokens and lower-case the
# result. The cleaned text is stored in the new column "cleaned".
product_names['cleaned'] = product_names['product_name'].apply(
    lambda title: " ".join(
        [
            stemmer.stem(token)
            for token in re.sub("[^a-zA-Z]", " ", title).split()
            if token not in words
        ]
    ).lower()
)

# Preview the cleaned names.
product_names['cleaned'].head()

0    cazcas deer pattern smart case cover flip stan...
1    d kandi gione a lite fashion seri leather flip...
3    iphon case luvvitt ultra armor iphon case best...
4     redgear mpr soft base mousepad led spectrum mode
6    aimo wireless iph pclp rubber essenti slim dur...
Name: cleaned, dtype: object

### Prediction

In [178]:
# Time the prediction over all unlabeled titles.
# (The timing variables keep their original *_training_time names, although
# they measure prediction in this cell.)
start_training_time = time.time()

# Predict a category for every cleaned title. The result is bound to
# `predicted_labels` here so that the count printed below does not depend on a
# variable left over from an earlier kernel session.
predicted_labels = model.predict(product_names['cleaned'])

# Store the predictions next to the unlabeled rows.
data_unlabeled['category_label'] = predicted_labels

# Elapsed wall-clock time of the prediction.
total_training_time = (time.time() - start_training_time)

# First placeholder is the number of titles, second is the elapsed seconds.
print("runtime for prediction in seconds for {} titles: {}".format(len(predicted_labels), total_training_time))

runtime for prediction in seconds for 2.436814069747925 titles: 8690


In [179]:
# Reload the labeled records from disk into their own frame.
json_data = 'trainingdata_catnumbers.json'
data_labeled = pd.read_json(json_data, orient='records')

In [180]:
# Give the label column the same name as in data_unlabeled, so both frames share it when concatenated.
data_labeled = data_labeled.rename(columns={'category': 'category_label'})

In [181]:
# Stack the predicted (unlabeled) rows and the hand-labeled rows into one frame.
final_data = pd.concat([data_unlabeled, data_labeled], axis='index')

In [182]:
# Look at the last five rows to confirm the labeled data was appended.
final_data.tail(n=5)

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,category_label
1313,e368d34f1661f7c9b61833bc17000bc7,2020-02-06 18:07:49 +0000,B01GIZ4DHG,https://www.amazon.in/Broadlink-Black-Smart-Re...,Broadlink 2018 New RM-Mini3 Black Bean Univers...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1388929031,global tech store,...,,,,,,,,,,1
1314,6009674ac5a780fd3229423cd8b0f121,2020-02-07 11:01:41 +0000,B07S7P8Z1D,https://www.amazon.in/Zebronics-Monster-X10-co...,Zebronics Zeb-Moving Monster X10 Trolly DJ Spe...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375442031,DRS TECH,...,,,,,,,,,,1
1315,f906d33851ee356fdf48be84ae5c5b0e,2020-02-06 18:07:33 +0000,B072LVD67Q,https://www.amazon.in/AH59-02692E-Replaced-Sou...,VINABTY New AH59-02692E Replaced Remote fit fo...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1388929031,,...,,{'AudioVideoRemoteControls': '#6287'},"{'Product_Dimensions': '14 x 5.1 x 2.5 cm', 'I...",,,,,,,1
1316,f153bb16e99e2e4ba834d10f0da73261,2020-02-06 19:08:44 +0000,B072JCXN11,https://www.amazon.in/Komshine-10-Locator-Conn...,Pen Type Red Light Source Komshine 1/10/20mw F...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1388894031,,...,,,"{'Item_Weight': '200 g', 'Item_model_number': ...",,,,,,,1
1317,97a1c02f12be2425c86315b80c8509f6,2020-02-06 20:44:28 +0000,B07HHFK1FB,https://www.amazon.in/Amateur-Antenna-430mhz-M...,HYS Amateur Dual Band So239 17 Inch Antenna wi...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389228031,,...,,,"{'Item_model_number': 'TC-M40VU', 'ASIN': 'B07...",,,,,,,1


In [183]:
# Add the column category_name, initialised with the numeric category labels.
final_data = final_data.assign(category_name=final_data['category_label'])

In [184]:
# Numeric category label -> full category path.
category_names = {
    1: "Electronic Accessories",
    11: "Electronic Accessories | Smartphone Accessories",
    111: "Electronic Accessories | Smartphone Accessories | Mobile Covers",
    1111: "Electronic Accessories | Smartphone Accessories | Mobile Covers | Flip Covers",
    1112: "Electronic Accessories | Smartphone Accessories | Mobile Covers | Covers with Stand",
    12: "Electronic Accessories | Headphones",
    121: "Electronic Accessories | Headphones | Headphone Accessories",
    13: "Electronic Accessories | Tablet Accessories",
    14: "Electronic Accessories | Chargers and Adapters",
    15: "Electronic Accessories | Batteries",
    151: "Electronic Accessories | Batteries | Camera Batteries",
    16: "Electronic Accessories | Cables",
    161: "Electronic Accessories | Cables | HDMI Cables",
    17: "Electronic Accessories | Screen Protectors",
    18: "Electronic Accessories | Camera Accessories",
    181: "Electronic Accessories | Camera Accessories | Camera Filters and Lenses",
    19: "Electronic Accessories | Laptop Accessories",
    191: "Electronic Accessories | Laptop Accessories | Laptop Bags and Backpacks",
    192: "Electronic Accessories | Laptop Accessories | Laptop Sleeves",
    2: "Electronic Accessories | Memory Cards",
    3: "Electronic Accessories | Keyboards",
    31: "Electronic Accessories | Keyboards | Keyboard Accessories",
    4: "Electronic Accessories | Mice",
    41: "Electronic Accessories | Mice | Mouse Pads",
    5: "Electronic Accessories | TV Accessories",
    6: "Electronic Accessories | Power Supplies",
}

# Replace every category label in category_name by its category name in one
# call. Values that are not keys of the mapping are left unchanged, exactly as
# with one replace() call per label.
final_data['category_name'] = final_data['category_name'].replace(category_names)



In [185]:
# Preview the first five rows, now including category_label and category_name.
final_data.head(n=5)

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,category_label,category_name
0,8258a459bf720ac86b8bc2d214346c35,2020-02-06 19:30:06 +0000,B07652FT69,https://www.amazon.in/Cazcase-Pattern-Smart-Co...,Cazcase Deer Pattern Smart Case Cover Flip Sta...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375329031,E Shop Solution,...,,,,,,,,,13,Electronic Accessories | Tablet Accessories
1,8610899e786315b1adad872764f424b0,2020-02-07 00:20:04 +0000,B07TMCY7WN,https://www.amazon.in/D-kandy-Gionee-A1-Lite-M...,"D-kandy for Gionee A1 Lite, Fashion Series Lea...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1389409031,SHIV DURGA ENTERPRISES,...,,,,,,,,,1111,Electronic Accessories | Smartphone Accessorie...
3,c5499856b11d3a2796bdd362b1701fc1,2020-02-07 02:38:59 +0000,B00JYKGFWY,https://www.amazon.in/iPhone-LUVVITT%C2%AE-Scr...,"iPhone 6 Case, LUVVITT® ULTRA ARMOR iPhone 6 C...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,HelloYehlo,...,,,,,,,,,111,Electronic Accessories | Smartphone Accessorie...
4,ce5ac9f85505667fb9dafce9c1b0103c,2020-02-07 13:30:48 +0000,B07LB5SGK7,https://www.amazon.in/Redgear-MPR800-Soft-Mous...,Redgear MPR800 Soft Base Mousepad with 4 LED S...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375413031,Appario Retail Private Ltd,...,,,,,,,,,1,Electronic Accessories
6,7fff79ed7aacb6b010d870c293a6e0e4,2020-02-07 00:40:57 +0000,B009NSXDHW,https://www.amazon.in/Aimo-Wireless-IPH5PCLP00...,Aimo Wireless IPH5PCLP002 Rubber Essentials Sl...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,,...,,,,,,,,,111,Electronic Accessories | Smartphone Accessorie...


In [208]:
# Drop rows whose uniq_id was already seen (first occurrence is kept)
# and report the frame's shape before and after.
print('Shape with duplicates: ', final_data.shape)
final_data = final_data.drop_duplicates(subset=['uniq_id'])
print('Shape without duplicates: ', final_data.shape)

Shape with duplicates:  (10008, 38)
Shape without duplicates:  (10000, 38)


In [210]:
# Turn every row into a dict without its missing values, then serialise the
# list of dicts as JSON indented by four spaces.
records = [row.dropna().to_dict() for _, row in final_data.iterrows()]
json_object = json.dumps(records, indent=4)

# Write the labeled dataset to disk.
with open('AmazonElectronics_Labeled.json', mode='w') as outfile:
    outfile.write(json_object)