In [21]:
import pandas as pd
import numpy as np

In [22]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

In [23]:
# Load the overall store data and the package weight data.
# - df_overall: the standard data export from Best Secret
# - df_weight:  the file that contains the package weights
# Skip this cell if you already have both pieces of information combined.

df_overall = pd.read_csv(
    '../../../data/raw/full_data_best_secret_new.csv',
    index_col=0,
)
df_weight = pd.read_csv(
    '../../../data/raw/package_weight_data.csv',
    index_col=0,
)

In [24]:
# One weight record per order: drop repeated order_code rows
# (drop_duplicates keeps the first occurrence by default)
df_unique = df_weight.drop_duplicates(subset='order_code')

In [25]:
# Keep only the columns needed for the analysis and feature engineering.
# This selection may be worth revisiting; it was picked as the most granular
# and commonly usable set.
selected_columns = [
    'order_code',
    'display_name',
    'qty_items_sold',
    'sap_main_size',
    'product_navision_detail_category',
]
df = new_df = df_overall[selected_columns]

In [26]:
# Attach the de-duplicated package weight to every order line via order_code.
# A left join keeps all rows of df; orders without a weight record get NaN.
df_merged = pd.merge(
    df,
    df_unique[['order_code', 'package_weight']],
    how='left',
    on='order_code',
)

Start here if you already have a combined dataset that contains the standard columns plus the package weight.

In [27]:
# A recorded weight of 0 is treated as missing/invalid: swap zeros for NaN
df_merged['package_weight'] = df_merged['package_weight'].replace(
    to_replace=0,
    value=np.nan,
)
df = df_merged

In [28]:
# Keep only the orders that consist of at most 15 rows.
# How to predict orders with more than 15 different items still has to be discussed.
filtered_df = (
    df
    .groupby('order_code')
    .filter(lambda order_rows: order_rows.shape[0] <= 15)
)

In [29]:
# Flatten details of an order into a single row, allowing up to `max_items` items per order
def flatten_order(group, max_items=15):
    """Collapse all rows of one order into a single wide row.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to one order. Must provide the columns
        ``package_weight``, ``qty_items_sold``, ``sap_main_size`` and
        ``product_navision_detail_category``.
    max_items : int, default 15
        Number of item slots to create. Slots beyond the number of rows in
        ``group`` are filled with ``None``; rows beyond ``max_items`` are
        ignored.

    Returns
    -------
    pandas.Series
        ``package_weight`` (taken from the first row of the group) followed by
        ``qty_items_sold_item_<k>``, ``sap_main_size_item_<k>`` and
        ``product_navision_detail_category_item_<k>`` for k = 1..max_items.
    """
    item_fields = (
        'qty_items_sold',
        'sap_main_size',
        'product_navision_detail_category',
    )
    # The weight is read from the first row only
    flattened = {'package_weight': group['package_weight'].iloc[0]}
    n_rows = len(group)
    for slot in range(1, max_items + 1):
        # Slots are 1-based in the column names, rows are 0-based in the group
        row = group.iloc[slot - 1] if slot <= n_rows else None
        for field in item_fields:
            flattened[f'{field}_item_{slot}'] = None if row is None else row[field]
    return pd.Series(flattened)

In [30]:
# Build one flattened row per order, then turn order_code back into a column
df_grouped = (
    filtered_df
    .groupby('order_code')
    .apply(flatten_order)
    .reset_index()
)

In [31]:
# Pick the first non-missing display_name of every order ...
df_display_name = (
    df.groupby('order_code')['display_name']
    .first()
    .reset_index()
)
# ... and join it onto the flattened orders (default inner join on order_code)
final_df = df_display_name.merge(df_grouped, on='order_code')

In [32]:
# Rebind `df` to the flattened table (display_name plus the per-order item
# columns built above); the cells below read `df` in this one-row-per-order form.
df = final_df

In [33]:
def calculate_item_weights(row, max_items=15):
    """
    Distribute the total package weight of one flattened order across its
    item slots, proportionally to the quantity sold in each slot.

    Parameters
    ----------
    row : mapping-like (e.g. pandas.Series)
        One flattened order. Must provide ``qty_items_sold_item_<k>`` and
        ``product_navision_detail_category_item_<k>`` for k = 1..max_items,
        and ``package_weight``.
    max_items : int, default 15
        Number of item slots to inspect.

    Returns
    -------
    dict
        Maps ``(category, slot_number)`` to the share of the package weight
        assigned to that slot. The share covers the whole slot quantity; it
        is not a per-unit weight.

    Notes
    -----
    * A slot is reported only if both its quantity and its category are
      present. The total quantity, however, sums every present quantity, so
      a slot without a category still takes up its share of the weight.
    * If the total quantity is not positive, each reported slot gets weight 0.
    * A missing (NaN) package weight propagates: the reported weights are NaN.
    """
    slots = range(1, max_items + 1)
    total_qty = sum(
        row[f'qty_items_sold_item_{slot}']
        for slot in slots
        if not pd.isna(row[f'qty_items_sold_item_{slot}'])
    )
    item_weights = {}
    for slot in slots:
        item_qty = row[f'qty_items_sold_item_{slot}']
        category = row[f'product_navision_detail_category_item_{slot}']
        if pd.isna(item_qty) or pd.isna(category):
            continue
        if total_qty > 0:
            item_weights[(category, slot)] = (item_qty / total_qty) * row['package_weight']
        else:
            # Avoid dividing by zero when no quantity was recorded
            item_weights[(category, slot)] = 0
    return item_weights



In [34]:
# Run the weight calculation once per flattened order (i.e. per row)
all_item_weights = df.apply(calculate_item_weights, axis='columns')

In [35]:
# Unroll the per-order dictionaries into one record per item: category + weight
# (the slot number stored in each key is not needed here)
flat_weights = [
    {'product_navision_detail_category': category, 'weight': weight}
    for item_dict in all_item_weights
    for (category, _slot), weight in item_dict.items()
]

In [36]:
# Turn the records into a DataFrame and average the weight per product category
weights_df = pd.DataFrame(flat_weights)
avg_weight_by_category = (
    weights_df
    .groupby('product_navision_detail_category')['weight']
    .mean()
    .reset_index()
)

In [37]:
# Build a category -> average weight lookup; values are stored as strings
# formatted with two decimal places
weight_dict = {
    category: f"{round(mean_weight, 2):.2f}"
    for category, mean_weight in zip(
        avg_weight_by_category['product_navision_detail_category'],
        avg_weight_by_category['weight'],
    )
}

In [38]:
# Persist the lookup as a small Python module (UTF-8 encoded) containing a
# single `weight_dict = {...}` assignment
with open('weight_dict.py', mode='w', encoding='utf-8') as out_file:
    out_file.write('weight_dict = ' + str(weight_dict))