In [85]:
import json
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

#### Prepare preprocessing

In [86]:
#instantiate language specific stemmer: stemmer removes morphological affixes from words, leaving only the word stem
stemmer = SnowballStemmer('english')

#download stopwords
nltk.download("stopwords")

#select langueage specific stopwords and save in "words"
words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hannah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Load Real Data

In [137]:
#load json dataset 
json_file = 'Amazon_Electronics_2_cleaned.json'

#convert json string to pandas object and save it to data
data = pd.read_json(json_file, orient='records')

In [138]:
#show head of loaded data
data.head()

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_offers,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock
0,8258a459bf720ac86b8bc2d214346c35,2020-02-06 19:30:06 +0000,B07652FT69,https://www.amazon.in/Cazcase-Pattern-Smart-Co...,Cazcase Deer Pattern Smart Case Cover Flip Sta...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375329031,E Shop Solution,...,,,,,,,,,,
1,8610899e786315b1adad872764f424b0,2020-02-07 00:20:04 +0000,B07TMCY7WN,https://www.amazon.in/D-kandy-Gionee-A1-Lite-M...,"D-kandy for Gionee A1 Lite, Fashion Series Lea...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1389409031,SHIV DURGA ENTERPRISES,...,,,,,,,,,,
2,1b0e4aa1d0cd635e2366558512ccc067,2020-02-06 19:59:39 +0000,B079VL519L,https://www.amazon.in/Heartly-Lenovo-A6600-Spi...,Heartly Kickstand Hard Dual Rugged Armor Hybri...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,Ruh International,...,,,,,,,,,,
3,c5499856b11d3a2796bdd362b1701fc1,2020-02-07 02:38:59 +0000,B00JYKGFWY,https://www.amazon.in/iPhone-LUVVITT%C2%AE-Scr...,"iPhone 6 Case, LUVVITT® ULTRA ARMOR iPhone 6 C...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,HelloYehlo,...,,,,,,,,,,
4,ce5ac9f85505667fb9dafce9c1b0103c,2020-02-07 13:30:48 +0000,B07LB5SGK7,https://www.amazon.in/Redgear-MPR800-Soft-Mous...,Redgear MPR800 Soft Base Mousepad with 4 LED S...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375413031,Appario Retail Private Ltd,...,,,,,,,,,,


In [163]:
#select relevant column and copy it into product_names
product_names = data[['product_name']].copy()

#show data
product_names.head()

Unnamed: 0,product_name
0,Cazcase Deer Pattern Smart Case Cover Flip Sta...
1,"D-kandy for Gionee A1 Lite, Fashion Series Lea..."
2,Heartly Kickstand Hard Dual Rugged Armor Hybri...
3,"iPhone 6 Case, LUVVITT® ULTRA ARMOR iPhone 6 C..."
4,Redgear MPR800 Soft Base Mousepad with 4 LED S...


#### Preprocessing

In [164]:
#preprocessing: apply stemming & lower case on each row of product_names. Save cleaned data in new column "cleaned"
product_names['cleaned'] = product_names['product_name'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

#show cleaned data
product_names['cleaned'].head()

0    cazcas deer pattern smart case cover flip stan...
1    d kandi gione a lite fashion seri leather flip...
2    heart kickstand hard dual rug armor hybrid bum...
3    iphon case luvvitt ultra armor iphon case best...
4     redgear mpr soft base mousepad led spectrum mode
Name: cleaned, dtype: object

In [165]:
#vectorize cleaned text
vectorizer = TfidfVectorizer()
text_vec = vectorizer.fit_transform(product_names['cleaned'])

#show size of cleaned text
text_vec.shape

(10000, 7598)

#### K-Means Algorithm: 1. Iteration

In [166]:
#apply KMeans: select number of clusters 
kmeans = KMeans(n_clusters=30, random_state=0).fit(text_vec)


#### Add the clusters to the data

In [167]:
#add clusters to original data
product_names['clusters'] = kmeans.labels_

In [271]:
#show product_names to see added clusters column
product_names.head()

Unnamed: 0,product_name,cleaned,clusters
0,Cazcase Deer Pattern Smart Case Cover Flip Sta...,cazcas deer pattern smart case cover flip stan...,12
1,"D-kandy for Gionee A1 Lite, Fashion Series Lea...",d kandi gione a lite fashion seri leather flip...,26
2,Heartly Kickstand Hard Dual Rugged Armor Hybri...,heart kickstand hard dual rug armor hybrid bum...,3
3,"iPhone 6 Case, LUVVITT® ULTRA ARMOR iPhone 6 C...",iphon case luvvitt ultra armor iphon case best...,10
4,Redgear MPR800 Soft Base Mousepad with 4 LED S...,redgear mpr soft base mousepad led spectrum mode,7


In [272]:
#get number of items per cluster
product_names.groupby(by=["clusters"]).size()

clusters
0      229
1      352
2      227
3      244
4      630
5      128
6      153
7     2042
8       94
9      375
10     505
11     232
12     168
13     445
14     493
15     187
16     173
17     399
18     237
19     211
20     117
21      76
22     113
23     232
24     227
25     209
26     565
27     439
28     157
29     341
dtype: int64

In [171]:
#check results for cluster 1
product_names[product_names['clusters']==1]['product_name']


19      Nainika Cute Girlish Soft Silicone Meow Cat Ea...
24      Mobyro Kitty Case Cover for- Motorola G4 Plus ...
50      ANVIKA 3D Cute Rubber Hello Kitty Back Case Co...
67      YES2GOOD Redmi 4A Kitty Case Cover | Soft TPU ...
100     YES2GOOD Soft Silicone Printed Rubber Back Cov...
                              ...                        
9914    Mobyro Galaxy C9 PRO Hello Kitty Cover Case Cu...
9961    Nainika Cute Girlish Soft Silicone Meow Cat Ea...
9983    Mobyro Cute Cartoon Hello Kitty Case Cover for...
9989    Avianna 3D Cute Hello Kitty Back Cover for Hon...
9990    ANVIKA Soft Silicone Rubber Hello Kitty Cover ...
Name: product_name, Length: 352, dtype: object

In [273]:
#add clusters to original data
result_df = pd.merge(data, product_names[['product_name','clusters']], how='inner', on=["product_name"])

In [274]:
result_df[result_df.duplicated(subset=['product_url'])]

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,clusters
11,3e46c10d86062e9f9b9a828b22cbfe57,2020-02-07 00:51:50 +0000,B07BV5K9TV,https://www.amazon.in/Aarfa-Slimfit-Durable-Pr...,Aarfa Slimfit Durable Printed Hard Case for Xi...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,arretail,...,,,,,,,,,,17
13,72de8eea88eca5d131ae48a04c82147e,2020-02-07 00:51:02 +0000,B07BTS9NSX,https://www.amazon.in/Aarfa-Slimfit-Durable-Pr...,Aarfa Slimfit Durable Printed Hard Case for Xi...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,arretail,...,,,,,,,,,,17
66,f0525fde1a95b50620339b79964b89c5,2020-02-06 21:18:36 +0000,B01MZB41X2,https://www.amazon.in/SCHOFIC-Premium-Leather-...,SCHOFIC Fancy Wallet Diary Faux Leather Mobile...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,Schofic,...,,,,,,,,,,6
68,e4e7783f08d99d348b02d13588eb0c69,2020-02-06 21:30:34 +0000,B01EDYG1J2,https://www.amazon.in/SCHOFIC-Premium-Leather-...,SCHOFIC Fancy Wallet Diary Faux Leather Mobile...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,Schofic,...,,,,,,,,,,6
159,4f8b85f170a78a25f235951b36525991,2020-02-07 09:32:04 +0000,B07MTDV44F,https://www.amazon.in/Taiaiping-Double-Layer-C...,Taiaiping Armor Series Perfectly Fits The Sams...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,SSA infotech,...,,,,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7983,261e915c38c29da72094fce678f8bee8,2020-02-07 01:48:18 +0000,B07VVNZ6KF,https://www.amazon.in/Shree-Krishnam-Enterpris...,Shree Krishnam Enterprises Back Cover for Oppo...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,KRISHNAM ENTERPRISESs,...,,,,,,,,,,2
8080,f3cfab1de7cc888e7420d91de735f19a,2020-02-06 23:30:20 +0000,B07TCK5JDB,https://www.amazon.in/BuyFeb%C2%AE-Printed-Com...,BuyFeb® Printed Soft Back Cover Case Compatibl...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1389409031,BuyFeb com,...,,,,,,,,,,15
8082,c976542cae225e4eabee192f377e5472,2020-02-06 23:24:48 +0000,B07T9HKN34,https://www.amazon.in/BuyFeb%C2%AE-Printed-Com...,BuyFeb® Printed Soft Back Cover Case Compatibl...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1389409031,BuyFeb com,...,,,,,,,,,,15
8670,80ce552ea93eb406c2c9142289a8cd1f,2020-02-07 09:05:59 +0000,B07X48F1FS,https://www.amazon.in/Shree-Krishnam-Enterpris...,"Shree Krishnam Enterprises vivo s1 Back Cover,...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1389409031,KRISHNAM ENTERPRISESs,...,,,,,,,,,,15


In [275]:
#drop duplicates
result_df.drop_duplicates(subset=['product_url'],inplace=True)
result_df.shape

(10000, 37)

In [276]:
#sort values by clusters
result_df.sort_values(by=['clusters'],inplace=True)
result_df.head()



Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,clusters
8919,9a37ef174752e4e7775c39fbd50c7521,2020-02-06 19:42:56 +0000,B079H4VBPD,https://www.amazon.in/Tech-Tempered-Screen-Pro...,M Tech Redmi Note 4 Red & Tempered Glass Scree...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389403031,M Tech enterprises,...,,,,,,,,,,0
2143,b78555440d8ed04ad8748bae96ff0ee7,2020-02-06 23:13:03 +0000,B0169EN22I,https://www.amazon.in/Skinomi-TechSkin-Surface...,Skinomi TechSkin - Surface Pro 4 Screen Protec...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375332031,,...,,,,,,,,,,0
1690,aca76bb28865832c26a25fb6cd42155c,2020-02-06 20:59:05 +0000,B01N9OL91B,https://www.amazon.in/Moshi-Anti-Glare-Protect...,Moshi iVisor Anti-Glare Screen Protector Compa...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375317031,,...,,,,,,,,,,0
707,f1157809aeb50d198cdc3e36ee320da3,2020-02-06 23:03:43 +0000,B01KZNRM0W,https://www.amazon.in/RBEIK-iPad-Screen-Protec...,RBEIK iPad Pro 12.9 Screen Protector Glass - P...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375332031,Cart2India SLP,...,,,,,,,,,,0
9795,c26dabcdc08233aa944cd5c8ce95140e,2020-02-07 09:34:56 +0000,B07GB6XTNF,https://www.amazon.in/i-Blason-Full-Body-Glitt...,"Samsung Galaxy Note 9 Case, i-Blason [Cosmo] F...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,Expo Associates,...,,,,,,,,,,0


#### Generate sample dataset from the clusters

In [277]:
#get random 20 products from each cluster
sample_dataset2=result_df.groupby("clusters").sample(n=20, random_state=1)

In [278]:
sample_dataset2.head()

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,clusters
4754,754f992887816c11dc6433ee0ac2d2f6,2020-02-06 20:57:26 +0000,B00HSGR7R8,https://www.amazon.in/LENTION-Anti-scratch-Hyd...,LENTION Clear Screen Protector for 13-inch Mac...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375317031,,...,,,,,,,,,,0
3831,fdf046641c7db85afb30fcd584b8598e,2020-02-07 01:11:51 +0000,B01AA5SFK6,https://www.amazon.in/EEEKit-Premium-Tempered-...,EEEKit HD Tempered Glass Screen Protector Film...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375332031,Cart2India SLP,...,,,,,,,,,,0
5915,590ff6c0e11619ab417f7650a8e75b8c,2020-02-06 17:16:37 +0000,B0792DNLB7,https://www.amazon.in/Nillkin-Tempered-Amazing...,Nillkin Tempered Glass Amazing H Anti-Explosio...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,HiGAR,...,,,,,,,,,,0
262,980ab756824fb4f7707e9fe3af625687,2020-02-07 01:06:03 +0000,B077JJWBN9,https://www.amazon.in/Supershieldz-Kurio-Next-...,[3-Pack] Supershieldz for Kurio Next 7 Kids T...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1375332031,Cart2India Online,...,,,,,,,,,,0
8704,4a096644c1886970d9b5f0915ff12cea,2020-02-06 19:26:37 +0000,B00SA0BAEW,https://www.amazon.in/Stony-Edge-Protector-Pol...,"Stony-Edge iPhone 6 Wallet Case, Flip & Stand ...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,,1389403031,Yuva Fashion Store,...,,,,,,,,,,0


In [50]:
#save file with all clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df.iterrows()],indent=4)

with open('products_with_cluster.json', 'w') as outfile:
    outfile.write(json_object) 


In [51]:
#save test/training data as json
json_object = json.dumps([row.dropna().to_dict() for index,row in sample_dataset2.iterrows()],indent=4)

with open('sampledata_cluster.json', 'w') as outfile:
    outfile.write(json_object) 


In [52]:
#save file with all clusters as CSV
file_name = 'products_with_cluster.csv'
result_df.to_csv(file_name,header=True)


In [53]:
#save test/training data as CSV
file_name = 'sampledata_cluster.csv'
sample_dataset2.to_csv(file_name,header=True)


In [282]:
#Assigning clusters to product types according to the cluster analysis 
screen_protectors = result_df[result_df['clusters']==0]
flip_covers = result_df[result_df['clusters']==6]
headphones = result_df[result_df['clusters']==9]
ipad_covers = result_df[result_df['clusters']==12]
cables = result_df[result_df['clusters']==23]
hdmi_cables = result_df[result_df['clusters']==24]
kickstand_covers = result_df[result_df['clusters']==3]
mouse_mousepad = result_df[result_df['clusters']==18]
mobile_covers = result_df[result_df['clusters'].isin([1,2,4,5,8,10,11,14,15,17,19,20,21,22,25,26,27,28,29])]

In [283]:
#get random 50 from each product category (change variable before .sample and name of json-file depending on category)
sampledata = screen_protectors.sample(n=50, random_state=1)

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in sampledata.iterrows()],indent=4)

with open('screen_protectors_sample3.json', 'w') as outfile:
    outfile.write(json_object) 


In [83]:
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==0].iterrows()],indent=4)

with open('screen_protectors.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==6].iterrows()],indent=4)

with open('flip_covers.json', 'w') as outfile:
    outfile.write(json_object) 

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==9].iterrows()],indent=4)

with open('headphones.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in mouse_mousepad.iterrows()],indent=4)

with open('mouse_mousepad.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==12].iterrows()],indent=4)

with open('ipad_covers.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==23].iterrows()],indent=4)

with open('cables.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==24].iterrows()],indent=4)

with open('hdmi_cables.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==3].iterrows()],indent=4)

with open('kickstand_covers.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters']==6].iterrows()],indent=4)

with open('flip_covers.json', 'w') as outfile:
    outfile.write(json_object) 
    
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in result_df[result_df['clusters'].isin([1,2,4,5,8,10,11,14,15,17,19,20,21,22,25,26,27,28,29])].iterrows()],indent=4)

with open('mobile_covers.json', 'w') as outfile:
    outfile.write(json_object) 
    
    

In [284]:
#second iteration cluster 7

#vectorize cleaned text
vectorizer2 = TfidfVectorizer()
cluster7 = product_names[product_names['clusters']==7]
text_vec2 = vectorizer2.fit_transform(cluster7['cleaned'])

#show size of cleaned text
text_vec2.shape

#apply KMeans: select number of clusters 
kmeans2 = KMeans(n_clusters=30, random_state=0).fit(text_vec2)

#add clusters to original data
cluster7['cluster'] = kmeans2.labels_

#show product_names to see added clusters column
cluster7.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster7['cluster'] = kmeans2.labels_


Unnamed: 0,product_name,cleaned,clusters,cluster
4,Redgear MPR800 Soft Base Mousepad with 4 LED S...,redgear mpr soft base mousepad led spectrum mode,7,0
5,RAYOVAC 815-60PPF Alkaline Batteries Reclosabl...,rayovac ppf alkalin batteri reclos pro pack aa pk,7,1
6,Aimo Wireless IPH5PCLP002 Rubber Essentials Sl...,aimo wireless iph pclp rubber essenti slim dur...,7,11
8,Generic MC0001 Cell Phone Case for iPhone 4 - ...,generic mc cell phone case iphon non retail pa...,7,11
22,"LuMee Two for iPhone 8/7/6s/6, The and Authent...",lume two iphon the authent patent protect self...,7,0


In [285]:
cluster7.groupby(by=["cluster"]).size()


cluster
0     497
1     100
2      95
3     120
4      83
5      62
6      37
7      29
8      24
9      19
10     67
11     68
12     16
13     31
14     94
15     47
16     94
17     15
18     50
19     24
20     97
21     36
22     67
23     80
24     42
25     44
26     13
27     41
28     26
29     24
dtype: int64

In [286]:
#add clusters to original data
result_df2 = pd.merge(data, cluster7[['product_name','cluster']], how='inner', on=["product_name"])

result_df2[result_df2.duplicated(subset=['product_url'])]

#drop duplicates
result_df2.drop_duplicates(subset=['product_url'],inplace=True)
result_df2.shape

#sort values by clusters
result_df2.sort_values(by=['cluster'],inplace=True)
result_df2.head()


Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,cluster
0,ce5ac9f85505667fb9dafce9c1b0103c,2020-02-07 13:30:48 +0000,B07LB5SGK7,https://www.amazon.in/Redgear-MPR800-Soft-Mous...,Redgear MPR800 Soft Base Mousepad with 4 LED S...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375413031,Appario Retail Private Ltd,...,,,,,,,,,,0
953,b6f226b239a591b84ac8e9306195d624,2020-02-06 18:14:09 +0000,B0078P5V2C,https://www.amazon.in/NP-BN1-Replacement-Batte...,Pack of 2 NP-BN1 Batteries for Sony Cyber-Shot...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1388983031,,...,,,,,,,,,,0
955,e22dfd0515b1d570f20f797792ea877b,2020-02-07 00:16:52 +0000,B079HCNWB6,https://www.amazon.in/Buckle-Down-Cell-Phone-S...,Buckle-Down Cell Phone Case for Samsung Galaxy...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,,...,,,,,,,,,,0
957,db0abf073e981b2e7f7435bec6d8a7e7,2020-02-07 11:35:53 +0000,B001TLMRW8,https://www.amazon.in/Krusell-Mobile-Pouch-952...,Krusell Luna Mobile Pouch 95212 Universal Larg...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389409031,Trumpcard,...,,,,,,,,,,0
959,2e54b997fa3044ddc77ebe33afbb1c53,2020-02-07 06:33:59 +0000,B074QYMFZC,https://www.amazon.in/Stinger-Select-SSPRCA6-P...,Stinger Select SSPRCA6 Performance Series 6' C...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389231031,Buy Your Wish,...,,,,,,,,,,0


In [None]:
#get random 20 products from each cluster
sample_dataset_cluster7=result_df2.groupby("cluster").sample(n=20, random_state=1)

#save test/training data as json
json_object2 = json.dumps([row.dropna().to_dict() for index,row in sample_dataset_cluster7.iterrows()],indent=4)

with open('sampledata_cluster7.json', 'w') as outfile:
    outfile.write(json_object2) 

In [288]:
#Assigning clusters to product types according to the cluster analysis 
mobile_covers2 = result_df2[result_df2['cluster'].isin([2,11,17,21,23,27])]
batteries = result_df2[result_df2['cluster']==1]
camera_accessories = result_df2[result_df2['cluster'].isin([4,19,20])]
macbook_accessories = result_df2[result_df2['cluster']==5]
tv_accessories = result_df2[result_df2['cluster'].isin([6,12])]
keyboard_cases_tablets = result_df2[result_df2['cluster']==7]
memory_cards = result_df2[result_df2['cluster'].isin([8,24])]
nikon_batteries = result_df2[result_df2['cluster']==9]
car_phone_chargers = result_df2[result_df2['cluster']==10]
tv_covers = result_df2[result_df2['cluster']==12]
car_phone_holder = result_df2[result_df2['cluster'].isin([13,14])]
airpod_cases = result_df2[result_df2['cluster']==15]
back_covers = result_df2[result_df2['cluster'].isin([16,26])]
keyboards_and_accessories = result_df2[result_df2['cluster']==18]
headphones2 = result_df2[result_df2['cluster']==22]
glasses = result_df2[result_df2['cluster']==25]
dvds = result_df2[result_df2['cluster']==28]
laptop_tables = result_df2[result_df2['cluster']==29]


In [289]:
#get random data from each product category (change variable before .sample and name of json-file depending on category)
sampledata = tv_accessories.sample(n=53, random_state=1)

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in sampledata.iterrows()],indent=4)

with open('tv_accessories_sample.json', 'w') as outfile:
    outfile.write(json_object) 


In [119]:
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in mobile_covers2.iterrows()],indent=4)

with open('mobile_covers2.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in batteries.iterrows()],indent=4)

with open('batteries.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in camera_accessories.iterrows()],indent=4)

with open('camera_accessories.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in macbook_accessories.iterrows()],indent=4)

with open('macbook_accessories.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in tv_mounts.iterrows()],indent=4)

with open('tv_mounts.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in memory_cards.iterrows()],indent=4)

with open('memory_cards.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in keyboard_cases_tablets.iterrows()],indent=4)

with open('keyboard_cases_tablets.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in nikon_batteries.iterrows()],indent=4)

with open('nikon_batteries.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in car_phone_chargers.iterrows()],indent=4)
    
with open('car_phone_chargers.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in tv_covers.iterrows()],indent=4)

with open('tv_covers.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in car_phone_holder.iterrows()],indent=4)

with open('car_phone_holder.json', 'w') as outfile:
    outfile.write(json_object)   
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in airpod_cases.iterrows()],indent=4)

with open('airpod_cases.json', 'w') as outfile:
    outfile.write(json_object)   
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in back_covers.iterrows()],indent=4)

with open('back_covers.json', 'w') as outfile:
    outfile.write(json_object)      

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in keyboards_and_accessories.iterrows()],indent=4)

with open('keyboards_and_accessories.json', 'w') as outfile:
    outfile.write(json_object)      
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in headphones2.iterrows()],indent=4)

with open('headphones2.json', 'w') as outfile:
    outfile.write(json_object)      

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in glasses.iterrows()],indent=4)

with open('glasses.json', 'w') as outfile:
    outfile.write(json_object)    
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in dvds.iterrows()],indent=4)

with open('dvds.json', 'w') as outfile:
    outfile.write(json_object)     

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in laptop_tables.iterrows()],indent=4)

with open('laptop_tables.json', 'w') as outfile:
    outfile.write(json_object)  

In [120]:
#second iteration cluster 13

#vectorize cleaned text
vectorizer2 = TfidfVectorizer()
cluster13 = product_names[product_names['clusters']==13]
text_vec2 = vectorizer2.fit_transform(cluster13['cleaned'])

#show size of cleaned text
text_vec2.shape

#apply KMeans: select number of clusters 
kmeans2 = KMeans(n_clusters=10, random_state=0).fit(text_vec2)

#add clusters to original data
cluster13['cluster'] = kmeans2.labels_


#show product_names to see added clusters column
cluster13.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster13['cluster'] = kmeans2.labels_


Unnamed: 0,product_name,cleaned,clusters,cluster
12,Saco Transparent Laptop Touchpad Protector for...,saco transpar laptop touchpad protector all la...,13,2
30,Tomtoc 360 Protective Sleeve Case for Apple 15...,tomtoc protect sleev case appl inch new macboo...,13,9
40,ProCase 14-15.6 Inch Laptop Sleeve Case Protec...,procas inch laptop sleev case protect bag macb...,13,9
69,MOCA Anti-theft Lightweight [RollTop] Design M...,moca anti theft lightweight rolltop design men...,13,7
73,LeeRooy Blue 2 Big Part Tourister Laptop Backp...,leerooy blue big part tourist laptop backpack ...,13,7


In [238]:
cluster13.groupby(by=["cluster"]).size()

cluster
0    40
1    33
2    27
3    52
4    34
5    26
6    31
7    90
8    72
9    40
dtype: int64

In [239]:
#add clusters to original data
result_df2 = pd.merge(data, cluster13[['product_name','cluster']], how='inner', on=["product_name"])

result_df2[result_df2.duplicated(subset=['product_url'])]

#drop duplicates
result_df2.drop_duplicates(subset=['product_url'],inplace=True)
result_df2.shape

#sort values by clusters
result_df2.sort_values(by=['cluster'],inplace=True)
result_df2.head()

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,cluster
198,2cbfdf2b255be387e550b128ca977609,2020-02-07 06:27:12 +0000,B0743FP5XH,https://www.amazon.in/Emartbuy%C2%AE-Resistant...,Emartbuy Laptop Neoprene Sleeve case with Zip ...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375329031,EmartbuyIndia,...,,,,,,,,,,0
155,07f75096e36cec1fe251db899b7497b3,2020-02-06 20:45:26 +0000,B07BCHJGL5,https://www.amazon.in/Emartbuy%C2%AE-Resistant...,Emartbuy Laptop Neoprene Sleeve case with Zip ...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375468031,EmartbuyIndia,...,,,,,,,,,,0
265,65844781f1b4c3275326e38cb3fb5070,2020-02-06 20:51:10 +0000,B016QJHG7S,https://www.amazon.in/Universal-Music-Premium-...,Universal Music Premium Laptop Sleeve for 13 I...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375468031,Electrical & Musical Deals,...,,,,,,,,,,0
63,99a6cca52a8a563f6c866cb7f318de77,2020-02-06 20:40:11 +0000,B01N8UW906,https://www.amazon.in/Protecta-Laptop-Sleeve-L...,Protecta Plain Jane Laptop Sleeve for Laptops ...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375468031,Protecta,...,,,,,,,,,,0
401,4e9b685739c1209633b216f3f6cd6a74,2020-02-07 02:11:15 +0000,B07CZ176QQ,https://www.amazon.in/Business-Briefcase-Beaut...,Business Briefcase Sleeve Beautiful Colorful O...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375468031,Torque Traders,...,,,,,,,,,,0


In [None]:
#get random 20 products from each cluster
sample_dataset_cluster13=result_df2.groupby("cluster").sample(n=20, random_state=1)

#save test/training data as json
json_object2 = json.dumps([row.dropna().to_dict() for index,row in sample_dataset_cluster7.iterrows()],indent=4)

with open('sampledata_cluster13.json', 'w') as outfile:
    outfile.write(json_object2) 

In [240]:
#Assigning clusters to product types according to the cluster analysis 
laptop_sleeves_cases = result_df2[result_df2['cluster'].isin([0,9])]
laptop_backpacks = result_df2[result_df2['cluster'].isin([1,7])]
keyboard_protectors = result_df2[result_df2['cluster'].isin([2,6])]
laptop_bags = result_df2[result_df2['cluster']==3]
laptop_adapters_chargers = result_df2[result_df2['cluster']==4]
laptop_batteries = result_df2[result_df2['cluster']==5]
laptop_stickers = result_df2[result_df2['cluster']==8]

In [248]:
#get random data from each product category (change variable before .sample and name of json-file depending on category)
sampledata = laptop_stickers.sample(n=50, random_state=1)

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in sampledata.iterrows()],indent=4)

with open('laptop_stickers_sample.json', 'w') as outfile:
    outfile.write(json_object) 


In [126]:
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in Laptop_sleeves_cases.iterrows()],indent=4)

with open('Laptop_sleeves_cases.json', 'w') as outfile:
    outfile.write(json_object)  

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in laptop_backpacks.iterrows()],indent=4)

with open('laptop_backpacks.json', 'w') as outfile:
    outfile.write(json_object)    

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in keyboard_protectors.iterrows()],indent=4)

with open('keyboard_protectors.json', 'w') as outfile:
    outfile.write(json_object)   
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in laptop_bags.iterrows()],indent=4)

with open('laptop_bags.json', 'w') as outfile:
    outfile.write(json_object)   
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in laptop_adapters_chargers.iterrows()],indent=4)

with open('laptop_adapters_chargers.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in laptop_batteries.iterrows()],indent=4)

with open('laptop_batteries.json', 'w') as outfile:
    outfile.write(json_object)  
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in laptop_stickers.iterrows()],indent=4)

with open('laptop_stickers.json', 'w') as outfile:
    outfile.write(json_object)  

In [127]:
#second iteration cluster 16

#vectorize cleaned text
vectorizer2 = TfidfVectorizer()
cluster16 = product_names[product_names['clusters']==16]
text_vec2 = vectorizer2.fit_transform(cluster16['cleaned'])

#show size of cleaned text
text_vec2.shape

#apply KMeans: select number of clusters 
kmeans2 = KMeans(n_clusters=10, random_state=0).fit(text_vec2)

#add clusters to original data
cluster16['cluster'] = kmeans2.labels_


#show product_names to see added clusters column
cluster16.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster16['cluster'] = kmeans2.labels_


Unnamed: 0,product_name,cleaned,clusters,cluster
56,8 PC Close Up Set : 55mm and 58mm Close-Up Fil...,pc close up set mm mm close up filter set diop...,16,1
186,JJC RR-SM 55MM Reverse Ring for 55MM Thread wi...,jjc rr sm mm revers ring mm thread soni nex ca...,16,9
340,Crazepony FPV CCD Camera Lens 2.5mm Wide Angle...,crazeponi fpv ccd camera len mm wide angl degr...,16,3
375,Fotasy MC72 72 mm Metal Screw-in Lens Cap (Black),fotasi mc mm metal screw len cap black,16,3
493,Tiffen 55mm Haze-2A Filter,tiffen mm haze a filter,16,8


In [128]:
cluster16.groupby(by=["cluster"]).size()

cluster
0    39
1    23
2    30
3    12
4    13
5     8
6     7
7    11
8    17
9    13
dtype: int64

In [129]:
#add clusters to original data
result_df2 = pd.merge(data, cluster16[['product_name','cluster']], how='inner', on=["product_name"])

result_df2[result_df2.duplicated(subset=['product_url'])]

#drop duplicates
result_df2.drop_duplicates(subset=['product_url'],inplace=True)
result_df2.shape

#sort values by clusters
result_df2.sort_values(by=['cluster'],inplace=True)
result_df2.head()

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,cluster
86,07cdf002549b1dd231a03657c7195272,2020-02-06 18:19:44 +0000,B004GYXRM4,https://www.amazon.in/Polaroid-Multi-Coated-Va...,Polaroid Optics 58mm HD Multi-Coated Variable ...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389076031,G. G.,...,4.0,{'NeutralDensityFilters': '#42'},{'Product_Dimensions': '5.8 x 5.8 x 1 cm ; 27....,,,,,,,0
130,1fe4078dc8cd861472d91a23258d8ed0,2020-02-07 09:22:43 +0000,B01F1JJ5AA,https://www.amazon.in/Gobe-ND1000-72mm-16-Laye...,Gobe ND1000 72mm MRC 16-Layer ND Filter,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389073031,Cart2India SLP,...,,,,,,,,,,0
128,5424341be5da6d37a95f187d2705a877,2020-02-07 09:22:44 +0000,B01C07F6SQ,https://www.amazon.in/Ozure-Neutral-Density-Fi...,Ozure Neutral Density Filters Kit (Set of Thre...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389073031,jumaanji,...,,,,,,,,,,0
126,8cde00e9eed2ea3d372fe311298fe972,2020-02-07 09:20:59 +0000,B07RZWP16K,https://www.amazon.in/Neewer-Compatible-Instal...,Neewer 3-Pack Filter Set Compatible with DJI O...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389073031,,...,,,,,,,,,,0
76,9bbabed23cda48cddc73a890b0562b52,2020-02-07 05:32:35 +0000,B07B26QRGW,https://www.amazon.in/SHOPEE-Branded-Neutral-D...,SHOPEE 58mm ND8 Neutral Density Filter,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1389078031,aggarwal enterprises25,...,,,,,,,,,,0


In [None]:
#get random 7 products from cluster
sample_dataset_cluster16=result_df2.groupby("cluster").sample(n=7, random_state=1)

#save test/training data as json
json_object2 = json.dumps([row.dropna().to_dict() for index,row in sample_dataset_cluster16.iterrows()],indent=4)

with open('sampledata_cluster16.json', 'w') as outfile:
    outfile.write(json_object2) 

In [130]:
#Assigning clusters to product types according to the cluster analysis 
camera_filters = result_df2[result_df2['cluster'].isin([0,1,2,3,4,6,8])]
camera_lens_accessories = result_df2[result_df2['cluster'].isin([5,9])]
camera_filter_adapters = result_df2[result_df2['cluster']==7]


In [254]:
#get random data from each product category
sampledata = camera_filter_adapters.sample(n=10, random_state=1)
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in sampledata.iterrows()],indent=4)

with open('camera_filter_adapters_sample.json', 'w') as outfile:
    outfile.write(json_object) 

In [131]:
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in camera_filters.iterrows()],indent=4)

with open('camera_filters.json', 'w') as outfile:
    outfile.write(json_object) 
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in camera_lens_accessories.iterrows()],indent=4)

with open('camera_lens_accessories.json', 'w') as outfile:
    outfile.write(json_object)  
    
#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in camera_filter_adapters.iterrows()],indent=4)

with open('camera_filter_adapters.json', 'w') as outfile:
    outfile.write(json_object)  

In [132]:
#second iteration cluster 18

#vectorize cleaned text
vectorizer2 = TfidfVectorizer()
cluster18 = product_names[product_names['clusters']==18]
text_vec2 = vectorizer2.fit_transform(cluster18['cleaned'])

#show size of cleaned text
text_vec2.shape

#apply KMeans: select number of clusters 
kmeans2 = KMeans(n_clusters=10, random_state=0).fit(text_vec2)

#add clusters to original data
cluster18['cluster'] = kmeans2.labels_


#show product_names to see added clusters column
cluster18.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster18['cluster'] = kmeans2.labels_


Unnamed: 0,product_name,cleaned,clusters,cluster
9,"Verbatim Bravo Wired Notebook Optical Mouse, B...",verbatim bravo wire notebook optic mous black,18,2
14,Peace Hand - Gel Wrist Rest Support Mouse Pad ...,peac hand gel wrist rest support mous pad non ...,18,4
66,"Non-Slip Round Mousepad, FINCIBO Unicorn Manda...",non slip round mousepad fincibo unicorn mandal...,18,1
97,Arion Rapoo M10 2.4G Wireless Mouse With Nano ...,arion rapoo m g wireless mous with nano receiv...,18,5
118,KiNGKANG watercolor vector pattern tropical ca...,kingkang watercolor vector pattern tropic cact...,18,1


In [133]:
cluster18.groupby(by=["cluster"]).size()

cluster
0     4
1    50
2    37
3    40
4    17
5    26
6    19
7    10
8    18
9    16
dtype: int64

In [134]:
#add clusters to original data
result_df2 = pd.merge(data, cluster18[['product_name','cluster']], how='inner', on=["product_name"])

result_df2[result_df2.duplicated(subset=['product_url'])]

#drop duplicates
result_df2.drop_duplicates(subset=['product_url'],inplace=True)
result_df2.shape

#sort values by clusters
result_df2.sort_values(by=['cluster'],inplace=True)
result_df2.head()

Unnamed: 0,uniq_id,crawl_timestamp,asin,product_url,product_name,image_urls__small,medium,large,browsenode,seller_name,...,no__of_sellers,sales_rank_in_child_category,product_details__k_v_pairs,Number_Of_Items,Batteries_Included,Batteries_Required,Material,Form_Factor,left_in_stock,cluster
186,566e3f13f0f0316c5ee6fb871da6a398,2020-02-06 18:22:20 +0000,B014WZN6YA,https://www.amazon.in/William-Morris-Strawberr...,William Morris Strawberry Thief Pattern - Mous...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375297031,Global Klamp,...,,,,,,,,,,0
197,bb72a9a403e4f744400f33f1a896d6f2,2020-02-06 20:26:18 +0000,B00CBH2KYO,https://www.amazon.in/3dRose-LLC-Inches-Nursin...,3dRose LLC 8 X 8 X 0.25 Inches Be Nice...Nurse...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375297031,Global Klamp,...,,,,,,,,,,0
7,fb2d010c3d561858d116fac0e9ee7509,2020-02-06 20:26:18 +0000,B00EKKIDPK,https://www.amazon.in/Medieval-Offbeat-Unusual...,"3dRose LLC 8 x 8 x 0.25 Inches Mouse Pad, Wood...",https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375297031,Cart2India Online,...,,,,,,,,,,0
172,afc57138e73e042a8abb1c2d11234508,2020-02-06 20:06:15 +0000,B00JMK6U9Y,https://www.amazon.in/3dRose-Crazy-Ferret-Mous...,3dRose Crazy Ferret Lady Mouse Pad (mp_175041_1),https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375297031,Global Klamp,...,,,,,,,,,,0
96,8531dd956e7a53ba2f234304db64d8f3,2020-02-06 17:55:01 +0000,B07DKZWN5N,https://www.amazon.in/Marvo-G15-Gaming-Mousepa...,Marvo G15 Gaming Mousepad (Blue),https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,1375420031,Appario Retail Private Ltd,...,,,,,,,,,,1


In [135]:
#get random 4 products from cluster
sample_dataset_cluster_18 = result_df2.groupby("cluster").sample(n=4, random_state=1)

#save test/training data as json
json_object2 = json.dumps([row.dropna().to_dict() for index,row in sample_dataset_cluster_18.iterrows()],indent=4)

with open('sampledata_cluster_18.json', 'w') as outfile:
    outfile.write(json_object2) 

In [191]:
#third iteration: cluster 7_0
#vectorize cleaned text
vectorizer3 = TfidfVectorizer()
cluster7_0 = cluster7[cluster7['cluster']==0]

text_vec3 = vectorizer3.fit_transform(cluster7_0['cleaned'])

#apply KMeans: select number of clusters 
kmeans3 = KMeans(n_clusters=3, random_state=0).fit(text_vec3)

#add clusters to original data
cluster7_0['cluster2'] = kmeans3.labels_


#show product_names to see added clusters column
cluster7_0.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster7_0['cluster2'] = kmeans3.labels_


Unnamed: 0,product_name,cleaned,clusters,cluster,cluster2
4,Redgear MPR800 Soft Base Mousepad with 4 LED S...,redgear mpr soft base mousepad led spectrum mode,7,0,0
22,"LuMee Two for iPhone 8/7/6s/6, The and Authent...",lume two iphon the authent patent protect self...,7,0,0
36,Incase ICON Case for iPhone 7 (Anthracite - IN...,incas icon case iphon anthracit inph ant,7,0,0
80,SellZone Speaker for Dell Vostro 3560 P/N CCKFF,sellzon speaker dell vostro p n cckff,7,0,0
101,Zalman Optimized Vent Hole Ultra Quiet Noteboo...,zalman optim vent hole ultra quiet notebook co...,7,0,0


In [192]:
cluster7_0.groupby(by=["cluster2"]).size()

cluster2
0    370
1     90
2     37
dtype: int64

In [194]:
#add clusters to original data
result_df3 = pd.merge(data, cluster7_0[['product_name','cluster2']], how='inner', on=["product_name"])

result_df3[result_df3.duplicated(subset=['product_url'])]

#drop duplicates
result_df3.drop_duplicates(subset=['product_url'],inplace=True)
result_df3.shape

#sort values by clusters
result_df3.sort_values(by=['cluster2'],inplace=True)
result_df3.head()

#get random 20 products from each cluster
sample_dataset_cluster7_0=result_df3.groupby("cluster2").sample(n=20, random_state=1)


In [195]:
#save test/training data as json
json_object3 = json.dumps([row.dropna().to_dict() for index,row in sample_dataset_cluster7_0.iterrows()],indent=4)

with open('sampledata_cluster7_0.json', 'w') as outfile:
    outfile.write(json_object3)

In [196]:
non_fitting_products = result_df3[result_df3['cluster2'].isin([0,1,2])]

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in non_fitting_products.iterrows()],indent=4)

with open('non_fitting_products.json', 'w') as outfile:
    outfile.write(json_object)  

In [255]:
#get random data from each product category
sampledata = non_fitting_products.sample(n=200, random_state=1)

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in sampledata.iterrows()],indent=4)

with open('non_fitting_products_sample.json', 'w') as outfile:
    outfile.write(json_object) 

In [197]:
#third iteration: cluster 7_3
#vectorize cleaned text
vectorizer3 = TfidfVectorizer()
cluster7_3 = cluster7[cluster7['cluster']==3]

text_vec3 = vectorizer3.fit_transform(cluster7_3['cleaned'])

#apply KMeans: select number of clusters 
kmeans3 = KMeans(n_clusters=5, random_state=0).fit(text_vec3)

#add clusters to original data
cluster7_3['cluster2'] = kmeans3.labels_


#show product_names to see added clusters column
cluster7_3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster7_3['cluster2'] = kmeans3.labels_


Unnamed: 0,product_name,cleaned,clusters,cluster,cluster2
55,TRIPP LITE 10 Gb/100 Gb Duplex Multimode 50/12...,tripp lite gb gb duplex multimod om lszh fiber...,7,3,2
95,Cable Matters 501101x7 Closed Screw Banana Plu...,cabl matter x close screw banana plug pair,7,3,2
146,Cellphonez Earphone Pouch - Multi Purpose Pock...,cellphonez earphon pouch multi purpos pocket s...,7,3,3
196,ABC Products® USB Cable Cord Lead for Sony Alp...,abc product usb cabl cord lead soni alpha d sl...,7,3,0
241,uxcell 100pcs 5A Copper Ring Terminals Lug Bat...,uxcel pcs a copper ring termin lug batteri cab...,7,3,2


In [290]:
cluster7_3.groupby(by=["cluster2"]).size()

cluster2
0    26
1    30
2    26
3    15
4    23
dtype: int64

In [291]:
#add clusters to original data
result_df3 = pd.merge(data, cluster7_3[['product_name','cluster2']], how='inner', on=["product_name"])

result_df3[result_df3.duplicated(subset=['product_url'])]

#drop duplicates
result_df3.drop_duplicates(subset=['product_url'],inplace=True)
result_df3.shape

#sort values by clusters
result_df3.sort_values(by=['cluster2'],inplace=True)
result_df3.head()

#get random 20 products from each cluster
sample_dataset_cluster7_3=result_df3.groupby("cluster2").sample(n=15, random_state=1)



In [None]:

#save test/training data as json
json_object3 = json.dumps([row.dropna().to_dict() for index,row in sample_dataset_cluster7_3.iterrows()],indent=4)

with open('sampledata_cluster7_3.json', 'w') as outfile:
    outfile.write(json_object3)

In [292]:
power_supplies = result_df3[result_df3['cluster2'].isin([0,1,2,3,4])]

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in power_supplies.iterrows()],indent=4)

with open('power_supplies.json', 'w') as outfile:
    outfile.write(json_object)  

In [295]:
#get random data from each product category
sampledata = power_supplies.sample(n=120, random_state=1)

#save file with all selected clusters as json
json_object = json.dumps([row.dropna().to_dict() for index,row in sampledata.iterrows()],indent=4)

with open('power_supplies_sample2.json', 'w') as outfile:
    outfile.write(json_object) 

In [69]:
#third iteration: cluster 7_19
#vectorize cleaned text
vectorizer3 = TfidfVectorizer()
cluster7_19 = cluster7[cluster7['cluster']==19]

text_vec3 = vectorizer3.fit_transform(cluster7_19['cleaned'])

#apply KMeans: select number of clusters 
kmeans3 = KMeans(n_clusters=3, random_state=0).fit(text_vec3)

#add clusters to original data
cluster7_19['cluster2'] = kmeans3.labels_


#show product_names to see added clusters column
cluster7_19.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cluster7_19['cluster2'] = kmeans3.labels_


Unnamed: 0,product_name,cleaned,clusters,cluster,cluster2
32,Impossible PRD3107 Color Film for Polaroid Sx-...,imposs prd color film polaroid sx camera,7,19,2
192,Kastar NB-6L Battery (3-Pack) for Canon PowerS...,kastar nb l batteri pack canon powershot d d e...,7,19,2
708,VKO Camera Shoulder Neck Strap for Fujifilm X-...,vko camera shoulder neck strap fujifilm x t x ...,7,19,2
1039,Vivitar NB-10L Ultra High Capacity 1300mAh Li-...,vivitar nb l ultra high capac mah li ion batte...,7,19,1
1058,Nexme Red & Black Flower Designer 3D Mobile Co...,nexm red black flower design d mobil cover nok...,7,19,0


In [70]:
cluster7_19.groupby(by=["cluster2"]).size()

cluster2
0    11
1     4
2     9
dtype: int64

In [71]:
#add clusters to original data
result_df3 = pd.merge(data, cluster7_19[['product_name','cluster2']], how='inner', on=["product_name"])

result_df3[result_df3.duplicated(subset=['product_url'])]

#drop duplicates
result_df3.drop_duplicates(subset=['product_url'],inplace=True)
result_df3.shape

#sort values by clusters
result_df3.sort_values(by=['cluster2'],inplace=True)
result_df3.head()

#get random 20 products from each cluster
sample_dataset_cluster7_19=result_df3.groupby("cluster2").sample(n=4, random_state=1)

#save test/training data as json
json_object3 = json.dumps([row.dropna().to_dict() for index,row in sample_dataset_cluster7_19.iterrows()],indent=4)

with open('sampledata_cluster7_19.json', 'w') as outfile:
    outfile.write(json_object3)