The one_hot_encode_columns function is designed to apply one-hot encoding to several categorical columns within the master_table DataFrame.

It first creates a copy of the original table to ensure that the original data is not modified. The function uses OneHotEncoder from sklearn to encode the categorical columns (outlet_code, item_department, outlet_profile_category, outlet_area, outlet_parking_lots, and outlet_cluster_category). After encoding, the column names for the newly created one-hot encoded features are generated using get_feature_names_out. The encoded columns are then combined with the existing non-categorical data by first dropping the original categorical columns and then concatenating the encoded columns to the remaining data. Additionally, the item_category column is one-hot encoded using pd.get_dummies.

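The final result is a DataFrame in which every categorical feature is replaced by binary indicator columns. The columns produced by OneHotEncoder hold 0.0/1.0 floats, while the item_category columns produced by pd.get_dummies hold True/False values.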

In [1]:
# Run the helper notebook in this namespace. Presumably it provides get_conf,
# get_datasources, the pre_process_* helpers and create_master_table, none of
# which are defined in this notebook -- TODO confirm.
%run ./run_script.ipynb

# Load the project configuration used by every step below.
conf = get_conf()

# Pick the three raw inputs out of the datasources by key.
# NOTE(review): get_datasources(conf) is called once per key; if it is
# expensive, consider calling it once and indexing the result three times.
trans = get_datasources(conf)["trans_info"]
item = get_datasources(conf)["item_info"]
stores = get_datasources(conf)["outlets_info"]

# Pre-process each input. Note the raw outlets data is named `stores`, while
# the pre-processed result is bound to `store` (singular).
trans = pre_process_transaction_info(trans)
item = pre_process_item_info(item)
store = pre_process_stores_info(stores)
# Read the 'no_categories' parameter from the configuration.
no_categories = conf['params']["no_categories"]

# Build the master table from the pre-processed inputs.
master_table=create_master_table(trans,item,no_categories,store)

In [2]:
def one_hot_encode_columns(master_table):
    """
    One-hot encode the categorical columns of the master table.

    Six outlet/department columns are encoded with sklearn's OneHotEncoder
    (dense output) and 'item_category' is encoded with pd.get_dummies.
    The input DataFrame is not modified.

    Args:
        master_table: Pandas DataFrame
            Master Table. Must contain the columns 'outlet_code',
            'item_department', 'outlet_profile_category', 'outlet_area',
            'outlet_parking_lots', 'outlet_cluster_category' and
            'item_category'.

    Returns:
        data_encoded: Pandas DataFrame
            Master table with the categorical columns replaced by their
            one-hot encoded columns. The row index of master_table is kept.
    """

    # Columns encoded with OneHotEncoder; kept in one place so the fit,
    # the feature-name lookup and the drop cannot drift apart.
    sklearn_encoded_columns = [
        'outlet_code', 'item_department', 'outlet_profile_category',
        'outlet_area', 'outlet_parking_lots', 'outlet_cluster_category',
    ]

    # One-Hot Encoding using sklearn's OneHotEncoder (dense array output)
    onehot_encoder = OneHotEncoder(sparse_output=False)
    encoded_values = onehot_encoder.fit_transform(master_table[sklearn_encoded_columns])

    # Reuse the master table's index: pd.concat(axis=1) aligns rows by index
    # label, so a fresh RangeIndex here would misalign rows (and introduce
    # NaNs) whenever master_table does not carry a default 0..n-1 index.
    encoded_df = pd.DataFrame(
        encoded_values,
        columns=onehot_encoder.get_feature_names_out(sklearn_encoded_columns),
        index=master_table.index,
    )

    # drop() returns a new DataFrame, so master_table itself is left untouched.
    remaining = master_table.drop(columns=sklearn_encoded_columns)

    # Concatenate the encoded columns with the remaining data
    data_encoded = pd.concat([remaining, encoded_df], axis=1)

    # One-Hot Encode the 'item_category' column using pd.get_dummies
    data_encoded = pd.get_dummies(data_encoded, columns=['item_category'], prefix='item_category')

    return data_encoded

In [3]:
# Show every column when the DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Encode the master table and display the result as the cell output.
data_encoded = one_hot_encode_columns(master_table)
data_encoded

Unnamed: 0,week,total_sales_qty,sales_next_week,fe_avg_4_week_sales,fe_4_weeks_std_dev_weekly,fe_4_weeks_weekly_min_sales,fe_4_weeks_weekly_max_sales,previous_week_sales,prev_2_weeks_sales,prev_3_weeks_sales,prev_month_sales,outlet_min_sales,outlet_max_sales,fe_sales_change_vs_next_week,fe_sales_change_vs_previous_week,fe_sales_to_max_sales_ratio,fe_cumulative_sales,month,week_month,week_year,quarter_year,row,outlet_code_A,outlet_code_B,outlet_code_C,outlet_code_D,outlet_code_E,item_department_Beverages,item_department_Chilled,item_department_Grocery,outlet_profile_category_High,outlet_profile_category_Moderate,outlet_area_10000,outlet_area_10150,outlet_area_11237,outlet_area_11500,outlet_area_14425,outlet_parking_lots_12,outlet_parking_lots_41,outlet_parking_lots_50,outlet_parking_lots_52,outlet_parking_lots_68,outlet_cluster_category_Large,outlet_cluster_category_Medium,outlet_cluster_category_Small,item_category_Ambient Instant Noodles,item_category_Ambient Liquid Milk,item_category_Butter,item_category_Cheese Slices,item_category_Concentrated Liquid Drinks,item_category_Crackers,item_category_Curd,item_category_Fat Spread,item_category_Liquid Cooking Oils & Fats,item_category_Multiple Consumption RTD Beverages,item_category_Non Flavored Tea,item_category_Powdered Milk,item_category_Rice,item_category_Single Consumption RTD Beverages,"item_category_Snack-Nuts,Peas & Mixes",item_category_Stirred Yogurt,item_category_Sugar Confectionary,item_category_Sweet Biscuits & Cookies Regular,item_category_Whipping Cream
0,2022-01-17,598.0,1342.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,40.0,3516.0,744.0,0.0,0.129944,598.0,1,3,3,1,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,2022-01-17,598.0,1342.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,40.0,3516.0,744.0,0.0,0.129944,598.0,1,3,3,1,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
2,2022-01-17,598.0,1342.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,40.0,3516.0,744.0,0.0,0.129944,598.0,1,3,3,1,3.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,2022-01-17,598.0,1342.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,40.0,3516.0,744.0,0.0,0.129944,598.0,1,3,3,1,4.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
4,2022-01-17,598.0,1342.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,40.0,3516.0,744.0,0.0,0.129944,598.0,1,3,3,1,5.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2022-10-17,200.0,0.0,574.25,251.103398,200.0,737.0,737.0,937.0,1628.0,2297.0,0.0,1051.0,0.0,-537.0,0.043459,589062.0,10,3,42,4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2996,2022-10-17,200.0,0.0,574.25,251.103398,200.0,737.0,737.0,937.0,1628.0,2297.0,0.0,1051.0,0.0,-537.0,0.043459,589062.0,10,3,42,4,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2997,2022-10-17,200.0,0.0,574.25,251.103398,200.0,737.0,737.0,937.0,1628.0,2297.0,0.0,1051.0,0.0,-537.0,0.043459,589062.0,10,3,42,4,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2998,2022-10-17,200.0,0.0,574.25,251.103398,200.0,737.0,737.0,937.0,1628.0,2297.0,0.0,1051.0,0.0,-537.0,0.043459,589062.0,10,3,42,4,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
