In [None]:
import pandas as pd
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Import the engineered feature lookups (weight, volume, label and height)
from weight_dict import weight_dict
from volume_dict import volume_dict
from label_dict import label_dict
from height_dict import height_dict

In [34]:
# Load the raw export; the first CSV column is used as the DataFrame index.
df = pd.read_csv(
    filepath_or_buffer='../../../data/raw/full_data_best_secret_new.csv',
    index_col=0,
)

In [35]:
# Only retain the columns that the feature engineering below needs.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments in the following cells write to this frame directly instead of
# raising SettingWithCopyWarning on a slice of the raw data.
df = df[[
    'order_code',
    'qty_items_sold',
    'display_name',
    'sap_main_size',
    'product_navision_detail_category',
]].copy()

In [36]:
# Weight per row: look up the per-category weight in weight_dict, cast it to
# float, and multiply it by the quantity sold.
df.loc[:, 'weight'] = (
    df['product_navision_detail_category']
    .map(weight_dict)
    .astype(float)
    .mul(df['qty_items_sold'])
)

In [37]:
# Volume per row: look up the per-category volume in volume_dict and multiply
# it by the quantity sold.
df.loc[:, 'volume'] = (
    df['product_navision_detail_category']
    .map(volume_dict)
    .mul(df['qty_items_sold'])
)

In [38]:
# Assign each row the label stored for its category in label_dict (the label
# is described by the original author as the item's texture). Categories
# missing from label_dict end up as NaN.
df.loc[:, 'label'] = df.loc[:, 'product_navision_detail_category'].map(label_dict)

In [39]:
# Build the height lookup used to compute each row's height: it maps a
# (category, main size) pair to the `opt_height` stored for that pair.
# height_dict is iterated as a sequence of records; if the same pair occurs
# more than once, the last record wins.
height_lookup = dict(
    ((item['product_navision_detail_category'], item['sap_main_size']), item['opt_height'])
    for item in height_dict
)

def get_height(row, lookup=None):
    """Return the total height for one item row, or None if no height is known.

    The height stored for the row's (category, main size) pair is multiplied
    by the row's sold quantity.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'product_navision_detail_category', 'sap_main_size' and
        'qty_items_sold'.
    lookup : dict, optional
        Mapping from (category, main size) to a height. Defaults to the
        notebook-level `height_lookup`, which keeps `df.apply(get_height,
        axis=1)` working unchanged.

    Returns
    -------
    The stored height times the quantity, or None when the pair is not in the
    lookup or its stored height is None.
    """
    if lookup is None:
        # Resolved at call time so the function picks up the current global.
        lookup = height_lookup

    key = (row['product_navision_detail_category'], row['sap_main_size'])
    # A single lookup covers both "pair unknown" and "pair known but height
    # stored as None"; the latter used to fail with a TypeError on `None * qty`.
    unit_height = lookup.get(key)
    if unit_height is None:
        return None
    return unit_height * row['qty_items_sold']

# Compute the height row by row (axis=1 passes each row to get_height).
df['height'] = df.apply(
    get_height,
    axis=1,
)

In [40]:
# Collapse `label` into a boolean flag: True where a label was found for the
# row's category, False where it is missing.
df['label'] = ~df['label'].isna()
# Missing quantities become 0 so the column can be stored as integers.
df['qty_items_sold'] = df['qty_items_sold'].fillna(value=0).astype(int)

In [41]:
# Preview the first five rows of the engineered item-level table.
df.head()

Unnamed: 0,order_code,qty_items_sold,display_name,sap_main_size,product_navision_detail_category,weight,volume,label,height
0,2117879653,1,BS 3,XS,funktionspolo o. a.,373.34,4412.5,True,914.0
1,2117879653,1,BS 3,OS,armband,442.09,5807.2,True,914.0
2,2117879653,1,BS 3,OS,besteck,434.66,34380.675,True,914.0
3,2118084029,1,BS 2,OS,strandlaken,659.01,3661.16225,True,914.0
4,2118171889,1,BS 3,XL,t-shirt kurz,422.99,4412.5,True,914.0


In [None]:
# Keep only orders that appear in at most 15 rows of `df`:
# 1. `order_counts`  - number of rows per `order_code`.
# 2. `valid_orders`  - the order codes whose row count is <= 15.
# 3. `filtered_df`   - the rows of `df` belonging to those orders.
order_counts = df['order_code'].value_counts()
valid_orders = order_counts.loc[order_counts.le(15)].index
filtered_df = df.loc[df['order_code'].isin(valid_orders)]

In [None]:
# Flattens one order (a group of item rows) into a single wide record:
# 1. Walks over a fixed number of item slots (15 by default).
# 2. Copies the attributes of the i-th row into keys suffixed `_item_<i>`.
# 3. Fills the slots beyond the last row with `None`.
# Returns the flattened order as a Pandas Series.

def flatten_order(group, max_items=15):
    """Flatten the item rows of one order into a single wide Series.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one order. Must contain the columns 'qty_items_sold',
        'sap_main_size', 'product_navision_detail_category', 'weight',
        'volume', 'label' and 'height'.
    max_items : int, default 15
        Number of item slots to emit. Rows beyond this count are ignored;
        slots without a row are filled with None.

    Returns
    -------
    pandas.Series
        Indexed by '<field>_item_<slot>' with slots numbered from 1. Within
        each slot the fields appear in the order listed above.
    """
    item_fields = (
        'qty_items_sold',
        'sap_main_size',
        'product_navision_detail_category',
        'weight',
        'volume',
        'label',
        'height',
    )

    flattened = {}
    n_rows = len(group)
    for slot in range(1, max_items + 1):
        # Slots are 1-based in the output keys, rows are 0-based in the group.
        row = group.iloc[slot - 1] if slot <= n_rows else None
        for field in item_fields:
            flattened[f'{field}_item_{slot}'] = None if row is None else row[field]
    return pd.Series(flattened)

In [None]:
# Build the final one-row-per-order table:
# 1. `df_grouped`      - every order of `filtered_df` flattened by `flatten_order`,
#                        with `order_code` turned back into a regular column.
# 2. `df_display_name` - the first `display_name` (i.e. the package size) of each order.
# 3. `final_df`        - both joined on `order_code` (default inner join, so only
#                        orders present in both tables are kept).

df_grouped = (
    filtered_df
    .groupby('order_code')
    .apply(flatten_order)
    .reset_index()
)

df_display_name = (
    df
    .groupby('order_code')['display_name']
    .first()
    .reset_index()
)

final_df = df_display_name.merge(df_grouped, on='order_code')

In [43]:
# Rebind `df` to the flattened per-order table; from here on `df` no longer
# refers to the item-level rows.
df = final_df

In [44]:
# Write the final table to a CSV file in the working directory, index included.
df.to_csv(path_or_buf='data_for_models.csv', index=True)