# Onehot Encodings

This notebook creates the one-hot encoded product data and write it to a csv for use by other notebooks. Columns with hierarchical data are excluded from the one-hot encoded vectors.

references: 

https://www.statology.org/one-hot-encoding-in-python/

In [93]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [94]:
product_path = "clean_data/cleaned_products.csv"
df_product_standard = pd.read_csv(product_path)

df_product_standard = df_product_standard.drop(columns=["Unnamed: 0"])
df_product_standard = df_product_standard.sample(frac=1, random_state=42)

In [95]:
df_product_standard.head()

Unnamed: 0,ctr_product_num,ctr_style_name,short_desc,long_desc,merch_division_nm,merch_lob_nm,merch_bus_cat_nm,merch_subcat_nm,merch_fineline_nm,corporate_status_cd,...,ctr_product_profile_cd,ctr_consumer_role_cd,package_depth_qty,package_height_qty,package_width_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt,cold_sensitive_ind,heat_sensitive_ind
574496,650092,,COAST G50 FLSHLGHT,Coast G50 Focusing LED Flashlight,FIXING,HARDWARE,HOUSEHOLD FLASHLIGHTS,Performance Flashlights & Lanterns,Performance Flashlights,FD,...,USABLE,PLAYER,21.3,17.5,21.8,0.097515,0.667,0.0,N,N
288658,62383,,LT235/85R16 E RR MS,,AUTOMOTIVE,TIRES,ALL TERRAIN TIRES,All Terrain Truck & SUV Tires,General Tire Grabber A/T 2,FD,...,JOB_JOY,EMERG_DESTINATION,31.8,9.3,31.8,5.442437,45.3,191.49,N,N
679579,6680281,,ASH CWS GARDEN FLAG,CELEBRATE WINTER SNOWMAN GARDEN FLAG,SEASONAL & GARDENING,BACKYARD LIVING,OUTDOOR LIVING ACCESSORIES,Outdoor Decor & Accessories,FMA - Outdoor Decor & Accessories,ACT,...,USABLE,PLAYER,1.0,1.0,1.0,0.000579,1.0,13.99,N,N
162285,1121723,,53469 MUFFLER QFSS,,AUTOMOTIVE,HEAVY AUTO PARTS,EXHAUST,Mufflers,Quiet Flow SS Mufflers,ACT,...,JOB_JOY,DESTINATION,32.0,6.0,7.0,0.777778,10.5,0.0,N,N
630014,467698,,BBJ MINI CARRIER,BabyBjorn Baby Carrier Mini - Cotton,AUTOMOTIVE,CAR CARE & ACCESSORIES,CHILD TRAVEL & BABY,Auto Child Safety & Accessories,"Strollers, Travel Systems & Accessories",FD,...,JOB_JOY,PLAYER,15.5,15.1,23.3,0.26299,1.983,119.99,N,N


In [96]:
#list the continuous columns and categorical columns
contcols = ["ctr_product_num", "package_depth_qty", "package_height_qty", "package_volume_qty", "package_weight_qty", "national_consumer_price_amt"]
catcols = ["corporate_status_cd", "ctr_good_better_best_cd", "ctr_product_profile_cd", "ctr_consumer_role_cd", "cold_sensitive_ind", "heat_sensitive_ind"]

df_categorical = df_product_standard[catcols]
df_continuous = df_product_standard[contcols]

In [97]:
#get one hot encodings for categorical columns
df_onehot = pd.get_dummies(df_categorical)
df_onehot

Unnamed: 0,corporate_status_cd_ACT,corporate_status_cd_DWO,corporate_status_cd_FD,corporate_status_cd_INA,corporate_status_cd_INC,corporate_status_cd_SD,corporate_status_cd_TD,ctr_good_better_best_cd_BEST,ctr_good_better_best_cd_BETTER,ctr_good_better_best_cd_GOOD,...,ctr_product_profile_cd_JOB_JOY,ctr_product_profile_cd_USABLE,ctr_consumer_role_cd_CONVENIENCE,ctr_consumer_role_cd_DESTINATION,ctr_consumer_role_cd_EMERG_DESTINATION,ctr_consumer_role_cd_PLAYER,cold_sensitive_ind_N,cold_sensitive_ind_Y,heat_sensitive_ind_N,heat_sensitive_ind_Y
574496,0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,1,1,0,1,0
288658,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,1,0
679579,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,1,0,1,0
162285,1,0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,1,0,1,0
630014,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,1,0,1,0
365838,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
131932,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
671155,1,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,1,0,1,0


In [98]:
#combine continuous column values with onehot encoding for each product
df_cont_and_onehot = df_onehot.join(df_continuous)
df_cont_and_onehot

Unnamed: 0,corporate_status_cd_ACT,corporate_status_cd_DWO,corporate_status_cd_FD,corporate_status_cd_INA,corporate_status_cd_INC,corporate_status_cd_SD,corporate_status_cd_TD,ctr_good_better_best_cd_BEST,ctr_good_better_best_cd_BETTER,ctr_good_better_best_cd_GOOD,...,cold_sensitive_ind_N,cold_sensitive_ind_Y,heat_sensitive_ind_N,heat_sensitive_ind_Y,ctr_product_num,package_depth_qty,package_height_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt
574496,0,0,1,0,0,0,0,0,1,0,...,1,0,1,0,650092,21.3,17.5,0.097515,0.667,0.000
288658,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,62383,31.8,9.3,5.442437,45.300,191.490
679579,1,0,0,0,0,0,0,0,0,1,...,1,0,1,0,6680281,1.0,1.0,0.000579,1.000,13.990
162285,1,0,0,0,0,0,0,1,0,0,...,1,0,1,0,1121723,32.0,6.0,0.777778,10.500,0.000
630014,0,0,1,0,0,0,0,1,0,0,...,1,0,1,0,467698,15.5,15.1,0.262990,1.983,119.990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,1,0,0,0,0,0,0,1,0,0,...,1,0,1,0,779497,7.9,6.8,0.153483,13.227,0.000
365838,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,962038,0.1,4.8,0.013333,0.100,5.052
131932,0,0,1,0,0,0,0,0,0,0,...,1,0,1,0,1951189,1.6,46.9,0.124262,10.970,16.410
671155,1,0,0,0,0,0,0,0,0,1,...,1,0,1,0,6511575,1.0,1.0,0.000579,1.000,5.790


In [99]:
df_cont_and_onehot.set_index("ctr_product_num", inplace=True)

In [100]:
df_cont_and_onehot.dropna(inplace=True)

In [101]:
df_cont_and_onehot

Unnamed: 0_level_0,corporate_status_cd_ACT,corporate_status_cd_DWO,corporate_status_cd_FD,corporate_status_cd_INA,corporate_status_cd_INC,corporate_status_cd_SD,corporate_status_cd_TD,ctr_good_better_best_cd_BEST,ctr_good_better_best_cd_BETTER,ctr_good_better_best_cd_GOOD,...,ctr_consumer_role_cd_PLAYER,cold_sensitive_ind_N,cold_sensitive_ind_Y,heat_sensitive_ind_N,heat_sensitive_ind_Y,package_depth_qty,package_height_qty,package_volume_qty,package_weight_qty,national_consumer_price_amt
ctr_product_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
650092,0,0,1,0,0,0,0,0,1,0,...,1,1,0,1,0,21.3,17.5,0.097515,0.667,0.000
62383,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,31.8,9.3,5.442437,45.300,191.490
6680281,1,0,0,0,0,0,0,0,0,1,...,1,1,0,1,0,1.0,1.0,0.000579,1.000,13.990
1121723,1,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,32.0,6.0,0.777778,10.500,0.000
467698,0,0,1,0,0,0,0,1,0,0,...,1,1,0,1,0,15.5,15.1,0.262990,1.983,119.990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779497,1,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,7.9,6.8,0.153483,13.227,0.000
962038,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,0.1,4.8,0.013333,0.100,5.052
1951189,0,0,1,0,0,0,0,0,0,0,...,0,1,0,1,0,1.6,46.9,0.124262,10.970,16.410
6511575,1,0,0,0,0,0,0,0,0,1,...,1,1,0,1,0,1.0,1.0,0.000579,1.000,5.790


In [102]:
df_cont_and_onehot.to_csv("embeddings/onehot.csv")

# Data Preview

These embeddings are generated from the product_standard_details.csv by first converting the categorical columns into one-hot encodings, then combining these with the numerical/continuous values. The end result is a sparse vector for each product. Note: heirarchical categorical columns were dropped since these embeddings will be used to predict product category later on.

For example, product 779760 is a fishing hook, with the following embedding:

In [103]:
df_cont_and_onehot.iloc[779760]

corporate_status_cd_ACT                    0.000000
corporate_status_cd_DWO                    0.000000
corporate_status_cd_FD                     1.000000
corporate_status_cd_INA                    0.000000
corporate_status_cd_INC                    0.000000
corporate_status_cd_SD                     0.000000
corporate_status_cd_TD                     0.000000
ctr_good_better_best_cd_BEST               0.000000
ctr_good_better_best_cd_BETTER             1.000000
ctr_good_better_best_cd_GOOD               0.000000
ctr_good_better_best_cd_OPP                0.000000
ctr_product_profile_cd_CONSUMABLE          0.000000
ctr_product_profile_cd_DISCRETIONARY       0.000000
ctr_product_profile_cd_JOB_JOY             1.000000
ctr_product_profile_cd_USABLE              0.000000
ctr_consumer_role_cd_CONVENIENCE           0.000000
ctr_consumer_role_cd_DESTINATION           0.000000
ctr_consumer_role_cd_EMERG_DESTINATION     1.000000
ctr_consumer_role_cd_PLAYER                0.000000
cold_sensiti

# One-Hot Encoded Store Embeddings

In [104]:
store_path = "clean_data/cleaned_store.csv"
df_store = pd.read_csv(store_path)

df_store = df_store.drop(columns=["Unnamed: 0"])
#df_store = df_store.sample(frac=1, random_state=42)

In [105]:
df_store

Unnamed: 0,store_num,store_nm,province_cd,latitude_qty,longitude_qty,store_size_cd,retail_square_ft_qty,ins_garden_centre_sqr_ft_qty,number_of_service_bays_qty,checkouts_count,store_concept_type_nm,onsite_propane_txt,winterized_canopy_txt,shopping_centre_nm
0,1,"ALLISTON, ON",ON,44.149236,-79.884000,C,47006,0,10,11,Smart,Yes,Not Determined,0
1,2,"Antigonish, NS",NS,45.617730,-61.984187,D,29461,0,6,7,Smart,Yes,No,Antigonish Market Square
2,3,"AMHERST, NS",NS,45.809747,-64.200389,D,31568,0,10,9,Smart,No,Yes,0
3,4,"Arnprior, ON",ON,45.425607,-76.366699,E,25371,3951,8,9,Smart,No,No,Arnprior Shopping Centre
4,5,"BANCROFT, ON",ON,45.084521,-77.860934,D,31466,0,10,7,Smart,Yes,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,928,"Place Versailles, QC",QC,45.592571,-73.541760,C,54121,0,7,9,Smart,No,No,Place Versailles
502,929,"Winnipeg Grant Park, MB",MB,49.858253,-97.164098,B,55557,0,7,9,Smart2,No,No,0
503,930,"Calgary Mount Royal, AB",AB,51.038404,-114.082194,E,24475,0,0,6,Smart,No,No,0
504,932,"NORTH SAANICH, BC",BC,48.662738,-123.419055,D,31596,0,7,9,Smart,No,No,0


In [106]:
#list the continuous columns and categorical columns (dropping store name and shopping center name)
contcols_store = ["store_num", "latitude_qty", "longitude_qty", "retail_square_ft_qty", "ins_garden_centre_sqr_ft_qty", "number_of_service_bays_qty", "checkouts_count"]
catcols_store = ["province_cd", "store_size_cd", "store_concept_type_nm", "onsite_propane_txt", "winterized_canopy_txt"]

df_categorical_store = df_store[catcols_store]
df_continuous_store = df_store[contcols_store]

In [107]:
#df with only continuous (numerical) columns
df_continuous_store

Unnamed: 0,store_num,latitude_qty,longitude_qty,retail_square_ft_qty,ins_garden_centre_sqr_ft_qty,number_of_service_bays_qty,checkouts_count
0,1,44.149236,-79.884000,47006,0,10,11
1,2,45.617730,-61.984187,29461,0,6,7
2,3,45.809747,-64.200389,31568,0,10,9
3,4,45.425607,-76.366699,25371,3951,8,9
4,5,45.084521,-77.860934,31466,0,10,7
...,...,...,...,...,...,...,...
501,928,45.592571,-73.541760,54121,0,7,9
502,929,49.858253,-97.164098,55557,0,7,9
503,930,51.038404,-114.082194,24475,0,0,6
504,932,48.662738,-123.419055,31596,0,7,9


In [108]:
#df with only categorical columns
df_categorical_store

Unnamed: 0,province_cd,store_size_cd,store_concept_type_nm,onsite_propane_txt,winterized_canopy_txt
0,ON,C,Smart,Yes,Not Determined
1,NS,D,Smart,Yes,No
2,NS,D,Smart,No,Yes
3,ON,E,Smart,No,No
4,ON,D,Smart,Yes,Yes
...,...,...,...,...,...
501,QC,C,Smart,No,No
502,MB,B,Smart2,No,No
503,AB,E,Smart,No,No
504,BC,D,Smart,No,No


In [109]:
#get one hot encodings for categorical columns
df_onehot_store = pd.get_dummies(df_categorical_store)
df_onehot_store

Unnamed: 0,province_cd_AB,province_cd_BC,province_cd_MB,province_cd_NB,province_cd_NL,province_cd_NS,province_cd_NT,province_cd_ON,province_cd_PE,province_cd_QC,...,store_concept_type_nm_NextGen (Incomplete),store_concept_type_nm_Small Market,store_concept_type_nm_Smart,store_concept_type_nm_Smart2,store_concept_type_nm_Traditional,onsite_propane_txt_No,onsite_propane_txt_Yes,winterized_canopy_txt_No,winterized_canopy_txt_Not Determined,winterized_canopy_txt_Yes
0,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,1,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,1,0,1,0,0
502,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
503,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
504,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0


In [110]:
#combine continuous column values with onehot encoding for each product
df_cont_and_onehot_store = df_continuous_store.join(df_onehot_store)
df_cont_and_onehot_store.set_index("store_num", inplace=True)
df_cont_and_onehot_store.dropna(inplace=True)
df_cont_and_onehot_store

Unnamed: 0_level_0,latitude_qty,longitude_qty,retail_square_ft_qty,ins_garden_centre_sqr_ft_qty,number_of_service_bays_qty,checkouts_count,province_cd_AB,province_cd_BC,province_cd_MB,province_cd_NB,...,store_concept_type_nm_NextGen (Incomplete),store_concept_type_nm_Small Market,store_concept_type_nm_Smart,store_concept_type_nm_Smart2,store_concept_type_nm_Traditional,onsite_propane_txt_No,onsite_propane_txt_Yes,winterized_canopy_txt_No,winterized_canopy_txt_Not Determined,winterized_canopy_txt_Yes
store_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,44.149236,-79.884000,47006,0,10,11,0,0,0,0,...,0,0,1,0,0,0,1,0,1,0
2,45.617730,-61.984187,29461,0,6,7,0,0,0,0,...,0,0,1,0,0,0,1,1,0,0
3,45.809747,-64.200389,31568,0,10,9,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
4,45.425607,-76.366699,25371,3951,8,9,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
5,45.084521,-77.860934,31466,0,10,7,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,45.592571,-73.541760,54121,0,7,9,0,0,0,0,...,0,0,1,0,0,1,0,1,0,0
929,49.858253,-97.164098,55557,0,7,9,0,0,1,0,...,0,0,0,1,0,1,0,1,0,0
930,51.038404,-114.082194,24475,0,0,6,1,0,0,0,...,0,0,1,0,0,1,0,1,0,0
932,48.662738,-123.419055,31596,0,7,9,0,1,0,0,...,0,0,1,0,0,1,0,1,0,0


In [111]:
df_cont_and_onehot_store.to_csv("embeddings/onehot_store.csv")

# Data Preview

These one-hot encoded store embeddings have a combination of the numerical continuous features, combined with the onehot encoded categorical features. The example below is for store_num=5, which is one of the CTC stores in Ontario.

In [114]:
df_cont_and_onehot_store.loc[5]

latitude_qty                                     45.084521
longitude_qty                                   -77.860934
retail_square_ft_qty                          31466.000000
ins_garden_centre_sqr_ft_qty                      0.000000
number_of_service_bays_qty                       10.000000
checkouts_count                                   7.000000
province_cd_AB                                    0.000000
province_cd_BC                                    0.000000
province_cd_MB                                    0.000000
province_cd_NB                                    0.000000
province_cd_NL                                    0.000000
province_cd_NS                                    0.000000
province_cd_NT                                    0.000000
province_cd_ON                                    1.000000
province_cd_PE                                    0.000000
province_cd_QC                                    0.000000
province_cd_SK                                    0.0000