In [1]:
import pandas as pd
import ipynbname
import os

In [3]:
# Work from the directory that contains the notebook's folder, so that the
# relative "./data/..." and "./data_cleaned/..." paths used in the cells below
# resolve regardless of where the Jupyter server was started.
notebook_dir = ipynbname.path().parent
project_root = str(notebook_dir.parent)
os.chdir(project_root)

In [4]:
############## CLEANING THE PURCHASE ORDERS DATA ##############
# Reads the raw purchase orders, converts pound quantities to kilograms,
# expresses the timestamp columns at a fixed UTC+02:00 offset and writes the
# result to ./data_cleaned/purchase_orders_cleaned.csv.

orders = pd.read_csv("./data/kernel/purchase_orders.csv")

# Convert the quantity of orders given in PUND (pounds) to kilograms.
# 1 PUND = 0.45359237 kilogram
is_pund = orders['unit'] == 'PUND'
orders.loc[is_pund, 'quantity'] *= 0.45359237
# The unit label itself is not rewritten to 'KG' because both unit columns are
# dropped right away.
# NOTE(review): this presumes every non-PUND row is already in KG - confirm.
orders = orders.drop(columns=['unit_id', 'unit'])

# Parse the timestamps as UTC, then express them at a fixed UTC+02:00 offset.
# 'Etc/GMT-2' uses the inverted POSIX sign convention: it is two hours AHEAD
# of UTC, i.e. what is commonly written as "GMT+2".
# NOTE(review): a fixed offset has no daylight-saving transitions, whereas
# Norwegian civil time is UTC+1 in winter - confirm the fixed offset is intended.
for column in ('delivery_date', 'created_date_time', 'modified_date_time'):
    orders[column] = pd.to_datetime(orders[column], utc=True).dt.tz_convert('Etc/GMT-2')

# Save the cleaned data to a new CSV file in the data_cleaned folder.
# The folder is created first: to_csv does not create missing directories, so
# without this the cell fails on a fresh checkout.
os.makedirs('./data_cleaned', exist_ok=True)
orders.to_csv('./data_cleaned/purchase_orders_cleaned.csv', index=False)

In [5]:
### CLEANING THE RECEIVALS DATA ###
# Reads the raw receivals, expresses the arrival timestamp at a fixed
# UTC+02:00 offset and writes the result to ./data_cleaned/receivals_cleaned.csv.
receivals = pd.read_csv("./data/kernel/receivals.csv")

# Parse date_arrival as UTC, then convert it to a fixed UTC+02:00 offset.
# 'Etc/GMT-2' uses the inverted POSIX sign convention (two hours AHEAD of UTC).
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'], utc=True).dt.tz_convert('Etc/GMT-2')

# Save the cleaned data to a new CSV file in the data_cleaned folder.
# The folder is created first: to_csv does not create missing directories, so
# without this the cell fails when run on a fresh checkout.
os.makedirs('./data_cleaned', exist_ok=True)
receivals.to_csv('./data_cleaned/receivals_cleaned.csv', index=False)

In [None]:
# Method 2: Merge orders and receivals directly, keeping one row per receival.
# Order lines with several receivals appear on several rows (the order columns
# are repeated). No aggregation is done in this cell; aggregate per order line
# downstream if a one-row-per-order-line table is needed.

# --- Load data ---
orders = pd.read_csv(
    "./data_cleaned/purchase_orders_cleaned.csv",
    parse_dates=["delivery_date", "created_date_time", "modified_date_time"]
)
receivals = pd.read_csv(
    "./data_cleaned/receivals_cleaned.csv",
    parse_dates=["date_arrival"]
)

# --- Merge orders and receivals WITHOUT aggregation ---
# Left join: every order line is kept; lines without any receival get NaN in
# the receival columns. Columns present in both frames get the given suffixes.
orders_with_receivals = orders.merge(
    receivals,
    on=["purchase_order_id", "purchase_order_item_no"],
    how="left",
    suffixes=('_order', '_receival')
)

# --- Fill missing values for orders with no receivals ---
# NOTE(review): after this fill, an unmatched order line cannot be told apart
# from a receival with a recorded net_weight of 0 via this column alone.
orders_with_receivals["net_weight"] = orders_with_receivals["net_weight"].fillna(0)
# Normally already parsed by read_csv(parse_dates=...); kept as a safeguard.
orders_with_receivals["date_arrival"] = pd.to_datetime(orders_with_receivals["date_arrival"])

# --- Derived features ---
# Share of the ordered quantity covered by this receival. A zero ordered
# quantity is masked to NaN first so the division yields NaN instead of +/-inf.
ordered_quantity = orders_with_receivals["quantity"]
orders_with_receivals["fill_fraction"] = (
    orders_with_receivals["net_weight"] / ordered_quantity.where(ordered_quantity != 0)
)
# Whole days between the requested delivery date and the actual arrival
# (negative when the goods arrived before the delivery date).
orders_with_receivals["lead_time"] = (
    orders_with_receivals["date_arrival"] - orders_with_receivals["delivery_date"]
).dt.days
# NOTE(review): rows without an arrival date get lead_time 0, which looks the
# same as an arrival on the delivery date - confirm this is intended.
orders_with_receivals["lead_time"] = orders_with_receivals["lead_time"].fillna(0)

# --- Save result ---
orders_with_receivals.to_csv("./data_cleaned/orders_with_receivals_detailed.csv", index=False)

print(orders_with_receivals.shape)

(133409, 20)
