In [22]:
import numpy as np
import pandas as pd
import sys
import os
import ipynbname

In [23]:
# Change cwd to project root to ensure relative paths work correctly.
# If moving the .ipynb file back to project root, simply remove the 2 lines below.
project_root = str(ipynbname.path().parent.parent)
os.chdir(project_root)

print(sys.version)
print(f"cwd: {os.getcwd()}")
print(f"Pandas version: {pd.__version__}")

3.14.0 (main, Oct  7 2025, 15:35:21) [Clang 20.1.4 ]
cwd: /home/christian/Schreibtisch/semester7/moderne-maskinl√¶ring-i-praksis-TDT4173/group-project/main_predict_stock
Pandas version: 2.3.3


In [24]:
receivals_df = pd.read_csv('data/kernel/receivals.csv', parse_dates=['date_arrival'])

# relevant_columns = ['rm_id']
receivals_df['date_arrival'] = receivals_df['date_arrival'].apply(lambda x: x.date())

print("type before datetime conversion: ", type(receivals_df['date_arrival'].iloc[0]))
print("value before datetime conversion: ", receivals_df['date_arrival'].iloc[0])

print(receivals_df.info())

type before datetime conversion:  <class 'datetime.date'>
value before datetime conversion:  2004-06-15
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122590 entries, 0 to 122589
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   rm_id                   122533 non-null  float64
 1   product_id              122533 non-null  float64
 2   purchase_order_id       122537 non-null  float64
 3   purchase_order_item_no  122537 non-null  float64
 4   receival_item_no        122590 non-null  int64  
 5   batch_id                64765 non-null   float64
 6   date_arrival            122590 non-null  object 
 7   receival_status         122590 non-null  object 
 8   net_weight              122522 non-null  float64
 9   supplier_id             122590 non-null  int64  
dtypes: float64(6), int64(2), object(2)
memory usage: 9.4+ MB
None


In [25]:
orders_df = pd.read_csv('data/kernel/purchase_orders.csv', parse_dates=['delivery_date', 'created_date_time', 'modified_date_time'])

orders_df['delivery_date'] = orders_df['delivery_date'].apply(lambda x: x.date())
orders_df['created_date_time'] = orders_df['created_date_time'].apply(lambda x: x.date())
orders_df['modified_date_time'] = orders_df['modified_date_time'].apply(lambda x: x.date())

orders_and_receivals = orders_df.merge(receivals_df, on=['purchase_order_id', 'purchase_order_item_no'], how='left', suffixes=('_order', '_receival'))



# print sum of net weight for receivals with purchase_order_id 257357
print("Sum of net weight for receivals with purchase_order_id 257357: ", orders_and_receivals.loc[orders_and_receivals['purchase_order_id'] == 257357, 'net_weight'].sum())

# For rows where unit is 'PUND', convert quantity and net_weight entries to kilograms
orders_and_receivals.loc[orders_and_receivals['unit'] == 'PUND', 'quantity'] *= 0.45359237
orders_and_receivals.loc[orders_and_receivals['unit'] == 'PUND', 'net_weight'] *= 0.45359237

# Drop unit and unit_id columns (assume that the 44 entries with na values in these columns are all in kilograms
orders_and_receivals = orders_and_receivals.drop(columns=['unit', 'unit_id'])

# Drop status, status_id suffices
orders_and_receivals = orders_and_receivals.drop(columns=['status'])

# Filter out rows with either rm_id or date_arrival being null
orders_and_receivals = orders_and_receivals[orders_and_receivals['rm_id'].notnull() & orders_and_receivals['date_arrival'].notnull()]

print(orders_and_receivals.info())

Sum of net weight for receivals with purchase_order_id 257357:  3000.0
<class 'pandas.core.frame.DataFrame'>
Index: 122533 entries, 10 to 133293
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   purchase_order_id       122533 non-null  int64  
 1   purchase_order_item_no  122533 non-null  int64  
 2   quantity                122533 non-null  float64
 3   delivery_date           122533 non-null  object 
 4   product_id_order        122533 non-null  int64  
 5   product_version         122533 non-null  int64  
 6   created_date_time       122533 non-null  object 
 7   modified_date_time      122124 non-null  object 
 8   status_id               122533 non-null  int64  
 9   rm_id                   122533 non-null  float64
 10  product_id_receival     122533 non-null  float64
 11  receival_item_no        122533 non-null  float64
 12  batch_id                64716 non-null   float64
 13  date_ar

In [26]:
# orders_and_receivals.to_csv('christian_orders_and_receivals.csv', index=False)

In [27]:
recv_per_day = (
    receivals_df
        .groupby(['date_arrival', 'rm_id'])
        .agg({'net_weight': 'sum'})
        .sort_values(['date_arrival', 'rm_id'])
)

recv_pivot = recv_per_day.reset_index().pivot(index='date_arrival', columns='rm_id', values='net_weight').fillna(0)
recv_cumsum = recv_pivot.cumsum()
print(recv_cumsum.head(10))

rm_id          342.0   343.0   345.0   346.0    347.0   348.0   353.0   \
date_arrival                                                             
2004-06-15        0.0     0.0     0.0     0.0      0.0     0.0     0.0   
2004-06-16        0.0     0.0     0.0     0.0      0.0     0.0     0.0   
2004-06-17        0.0     0.0     0.0     0.0  29805.0     0.0     0.0   
2004-06-18        0.0     0.0     0.0     0.0  29805.0     0.0     0.0   
2004-06-21        0.0     0.0     0.0     0.0  44725.0     0.0     0.0   
2004-06-22        0.0     0.0     0.0     0.0  44725.0     0.0     0.0   
2004-06-23    24940.0     0.0     0.0     0.0  44725.0     0.0     0.0   
2004-06-24    24940.0     0.0     0.0   820.0  44725.0     0.0     0.0   
2004-06-25    24940.0     0.0     0.0   820.0  44725.0     0.0     0.0   
2004-06-28    24940.0     0.0     0.0   820.0  44725.0     0.0     0.0   

rm_id         354.0   355.0     357.0   ...  4343.0  4381.0  4401.0  4441.0  \
date_arrival                    

In [None]:
orders_per_day = (
    orders_and_receivals
        .groupby(['created_date_time', 'rm_id'])
        .agg(num_orders=('purchase_order_id', 'count'), total_weight=('net_weight', 'sum'))
        .sort_values(['rm_id', 'created_date_time'])
        .reset_index()
)

orders_pivot_count = orders_per_day.pivot(index='created_date_time', columns='rm_id', values='num_orders').fillna(0)
orders_pivot_weight = orders_per_day.pivot(index='created_date_time', columns='rm_id', values='total_weight').fillna(0)
orders_cumsum_count = orders_pivot_count.cumsum()
orders_cumsum_weight = orders_pivot_weight.cumsum()

print("The shape of recv_cumsum: ", recv_cumsum.shape)



The shape of recv_cumsum:  (4750, 203)


In [29]:
def std_check(df, start_date, end_date, rm_id):
    if rm_id not in df.columns:
        raise ValueError(f"Raw material ID {rm_id} not found in DataFrame columns.")
    elif start_date not in df.index or end_date not in df.index:
        raise ValueError("Start date or end date not in DataFrame index.")
    return df.at[end_date, rm_id] - df.at[start_date, rm_id]

def received_weight(start_date, end_date, rm_id):
    """Calculate total received weight for a given raw material between start_date and end_date (inclusive)."""
    return std_check(recv_cumsum, start_date, end_date, rm_id)

def ordered_weight(start_date, end_date, rm_id):
    """Calculate total ordered weight for a given raw material between start_date and end_date (inclusive)."""
    return std_check(orders_cumsum_weight, start_date, end_date, rm_id)

def ordered_count(start_date, end_date, rm_id):
    """Calculate total number of orders for a given raw material between start_date and end_date (inclusive)."""
    return std_check(orders_cumsum_count, start_date, end_date, rm_id)


In [None]:
# Find number of orders in the past year from arbitrary date:

# def number_of_orders_last_year(ref_date, rm_id):
#     start_date = (ref_date - pd.DateOffset(years=1)).date()
#     mask = (orders_and_receivals['rm_id'] == rm_id) & (orders_and_receivals['created_date_time'] >= start_date) & (orders_and_receivals['created_date_time'] <= ref_date)
#     filtered_orders = orders_and_receivals.loc[mask]
#     return filtered_orders.shape[0]

# def total_weight_of_orders_last_year(ref_date, rm_id):
#     start_date = (ref_date - pd.DateOffset(years=1)).date()
#     mask = (orders_and_receivals['rm_id'] == rm_id) & (orders_and_receivals['created_date_time'] >= start_date) & (orders_and_receivals['created_date_time'] <= ref_date)
#     filtered_orders = orders_and_receivals.loc[mask]
#     return filtered_orders['net_weight'].sum()

# def received_weight(start_date, end_date, rm_id):
#     mask = (receivals_df['rm_id'] == rm_id) & (receivals_df['date_arrival'] >= start_date) & (receivals_df['date_arrival'] <= end_date)
#     filtered_receivals = receivals_df.loc[mask]
#     return filtered_receivals['net_weight'].sum()


In [30]:
intervals = []

unique_rm_ids = receivals_df['rm_id'].unique()

for year in range(2024, 2025): # only 2024 for now
    for month in range(1, 2):  # Training on January to July, as 31st July + 151 days = 29th December
        for day in range(1, 2):  # to avoid issues with different month lengths
            print(f"Processing date: {year}-{month}-{day}")
            for window_len in range(1, 152):
                window_start = pd.Timestamp(year=year, month=month, day=day).date()
                window_end = (pd.Timestamp(window_start) + pd.Timedelta(days=window_len)).date()
                for rm_id in unique_rm_ids:
                    total = received_weight(window_start, window_end, rm_id)
                    features = {
                        "rm_id": rm_id,
                        "interval_days": window_len,
                        "year": year,
                        "month": window_end.month,
                        "day_of_week": window_end.day,
                        "num_orders_last_year": ordered_count(window_start, window_end, rm_id),
                        "total_weight_orders_last_year": ordered_weight(window_start, window_end, rm_id),
                        "total_weight_same_window_one_year_ago": received_weight(window_start - pd.DateOffset(years=1), window_end - pd.DateOffset(years=1), rm_id)
                    }
                    intervals.append({**features, "target": total})

train_df = pd.DataFrame(intervals)
print(train_df.head())
train_df.to_csv("christian/train_data.csv", index=False)

Processing date: 2024-1-1


ValueError: Start date or end date not in DataFrame index.

In [14]:
print(train_df.shape)
# Find 1st of january + 151 days:
# first_jan_plus_151 = (pd.Timestamp(year=2025, month=1, day=1) + pd.Timedelta(days=151)).date()
# print("First of January plus 151 days: ", first_jan_plus_151)

(30804, 9)


In [15]:
X = train_df.drop(columns=['target'])
y = train_df['target']

import xgboost as xgb

model = xgb.XGBRegressor(
    n_estimators=3500,
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1.0,
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    objective='reg:quantileerror',
    quantile_alpha=0.2,
    device="cuda"
)

# (Optional) validation split‚Äîcritical for time series generalization:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)  # Do NOT shuffle in time series context!

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

# Evaluate
from sklearn.metrics import mean_pinball_loss
y_pred = model.predict(X_val)
print("Pinball loss (quantile loss) on validation:", mean_pinball_loss(y_val, y_pred, alpha=0.2))

[0]	validation_0-quantile:947956.54662
[1]	validation_0-quantile:947945.90411
[2]	validation_0-quantile:947935.36302
[3]	validation_0-quantile:947924.92828
[4]	validation_0-quantile:947914.65124
[5]	validation_0-quantile:947910.50307
[6]	validation_0-quantile:947906.47035
[7]	validation_0-quantile:947896.76596
[8]	validation_0-quantile:947904.51761
[9]	validation_0-quantile:947895.04833
[10]	validation_0-quantile:947926.07750
[11]	validation_0-quantile:947905.15762
[12]	validation_0-quantile:947919.46565
[13]	validation_0-quantile:947950.94036
[14]	validation_0-quantile:947975.51136
[15]	validation_0-quantile:947966.37576
[16]	validation_0-quantile:947957.60787
[17]	validation_0-quantile:947954.37016
[18]	validation_0-quantile:947934.71641
[19]	validation_0-quantile:947912.33803
[20]	validation_0-quantile:947907.38028
[21]	validation_0-quantile:947898.76868
[22]	validation_0-quantile:947895.20816
[23]	validation_0-quantile:947886.70892
[24]	validation_0-quantile:947878.32494
[25]	valid

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [16]:
prediction_map = pd.read_csv("data/prediction_mapping.csv")

rm_ids = prediction_map['rm_id'].unique()
interval_days = range(1, 151)
year = 2025

predict_df = pd.DataFrame([{
    "rm_id": rm_id,
    "interval_days": interval,
    "year": year,
    "month": (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).month,
    "day_of_week": (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).dayofweek,
    "num_orders_last_year": ordered_count(pd.Timestamp(year=year, month=1, day=1).date(), (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).date(), rm_id),
    "total_weight_orders_last_year": ordered_weight(pd.Timestamp(year=year, month=1, day=1).date(), (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).date(), rm_id),
    "total_weight_same_window_one_year_ago": received_weight(pd.Timestamp(year=year, month=1, day=1).date() - pd.DateOffset(years=1), (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).date() - pd.DateOffset(years=1), rm_id)
} for rm_id in rm_ids for interval in interval_days])

print(predict_df.head(), predict_df.shape)




   rm_id  interval_days  year  month  day_of_week  num_orders_last_year  \
0    365              1  2025      1            2                     0   
1    365              2  2025      1            3                     0   
2    365              3  2025      1            4                     0   
3    365              4  2025      1            5                     0   
4    365              5  2025      1            6                     0   

   total_weight_orders_last_year  total_weight_same_window_one_year_ago  
0                            0.0                                    0.0  
1                            0.0                                    0.0  
2                            0.0                                    0.0  
3                            0.0                                    0.0  
4                            0.0                                    0.0   (30450, 8)


In [17]:
y_pred = model.predict(predict_df)
y_pred = np.maximum(0, y_pred)  # Ensure no negative predictions

pred2025 = pd.DataFrame({
    "ID": np.arange(1, len(y_pred) + 1),
    "predicted_weight": y_pred
})

pred2025.to_csv("christian/regular_xgboost.csv", index=False)



In [20]:
prediction_mapping = pd.read_csv("data/prediction_mapping.csv")
merged = pred2025.merge(prediction_mapping, on="ID")

agg_df = merged.groupby("rm_id", as_index=False).agg({
    "predicted_weight": "max",
}).sort_values("predicted_weight", ascending=False)

print(agg_df)
# print(max(y_pred), min(y_pred), np.mean(y_pred), np.median(y_pred))

     rm_id  predicted_weight
148   3123      1.031027e+06
147   3122      1.031027e+06
149   3124      1.031027e+06
150   3125      1.031001e+06
146   3121      9.889509e+05
..     ...               ...
199   4462      1.399836e+03
200   4463      1.385422e+03
198   4461      1.363529e+03
201   4481      1.344475e+03
202   4501      1.208460e+03

[203 rows x 2 columns]
