In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import os
import ipynbname

In [2]:
# Change cwd to project root to ensure relative paths work correctly.
# If moving the .ipynb file back to project root, simply remove the 2 lines below.
project_root = os.path.abspath(os.path.join(os.path.dirname(ipynbname.path()), '..'))
os.chdir(project_root)

print(sys.version)
print(f"cwd: {os.getcwd()}")
print(f"Pandas version: {pd.__version__}")

3.14.0 (main, Oct  7 2025, 15:35:21) [Clang 20.1.4 ]
cwd: /home/christian/Schreibtisch/semester7/moderne-maskinlæring-i-praksis-TDT4173/group-project/main_predict_stock
Pandas version: 2.3.3


In [3]:
# receivals_path = "data/kernel/receivals.csv"
receivals_path = "data_cleaned/receivals_cleaned.csv"  # use cleaned data

receivals = pd.read_csv(receivals_path)
pred_map = pd.read_csv("data/prediction_mapping.csv")

# preprocess receivals
receivals['date_arrival'] = pd.to_datetime(receivals['date_arrival'])

# Make datetime naive (remove timezone) to allow comparison with naive window_start/window_end
if receivals['date_arrival'].dt.tz is not None:
    receivals['date_arrival'] = receivals['date_arrival'].dt.tz_localize(None)

receivals = receivals.dropna(subset=['rm_id', 'net_weight'])

# build historical intervals — for every year, same windows as prediction
intervals = []
for year in range(2024, 2025):   # adjust as necessary
    for window_len in range(1, 152):  # e.g., Jan 1–May 31 == 151 days
        window_start = pd.Timestamp(year=year, month=1, day=1)
        window_end = window_start + pd.Timedelta(days=window_len-1)
        for rm_id in receivals['rm_id'].unique():
            mask = (
                (receivals['rm_id'] == rm_id)
                & (receivals['date_arrival'] >= window_start)
                & (receivals['date_arrival'] <= window_end)
            )
            total = receivals.loc[mask, 'net_weight'].sum()
            # build features: e.g., total last year, mean per interval, etc.
            features = {
                "rm_id": rm_id,
                "interval_days": window_len,
                "year": year,
                "month": window_end.month,
                "day_of_week": window_end.dayofweek,
                # add more features...
                # e.g.: "prev_year_total": ...
            }
            intervals.append({**features, "target": total})

train_df = pd.DataFrame(intervals)
print(train_df.head())

   rm_id  interval_days  year  month  day_of_week  target
0  365.0              1  2024      1            0     0.0
1  379.0              1  2024      1            0     0.0
2  389.0              1  2024      1            0     0.0
3  369.0              1  2024      1            0     0.0
4  366.0              1  2024      1            0     0.0


In [4]:
print(train_df.shape)

(30653, 6)


In [5]:
X = train_df.drop(columns=['target'])
y = train_df['target']

import xgboost as xgb

model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=20,
    min_child_weight=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1.0,
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    objective='reg:quantileerror',
    quantile_alpha=0.2,
)

# (Optional) validation split—critical for time series generalization:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)  # Do NOT shuffle in time series context!

model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

# Evaluate
from sklearn.metrics import mean_absolute_error
y_pred = model.predict(X_val)
print("MAE on validation:", mean_absolute_error(y_val, y_pred))

[0]	validation_0-quantile:32892.76627
[1]	validation_0-quantile:32892.76627
[2]	validation_0-quantile:32838.90549
[3]	validation_0-quantile:32795.43740
[4]	validation_0-quantile:32742.72692
[5]	validation_0-quantile:32742.72692
[6]	validation_0-quantile:32697.92899
[7]	validation_0-quantile:32697.92899
[8]	validation_0-quantile:32658.65520
[9]	validation_0-quantile:32617.07096
[10]	validation_0-quantile:32617.07096
[11]	validation_0-quantile:32574.32151
[12]	validation_0-quantile:32574.32151
[13]	validation_0-quantile:32525.14747
[14]	validation_0-quantile:32484.21764
[15]	validation_0-quantile:32484.21764
[16]	validation_0-quantile:32484.21764
[17]	validation_0-quantile:32484.21764
[18]	validation_0-quantile:32484.21764
[19]	validation_0-quantile:32446.28613
[20]	validation_0-quantile:32404.28347
[21]	validation_0-quantile:32371.68952
[22]	validation_0-quantile:32335.44483
[23]	validation_0-quantile:32286.95885
[24]	validation_0-quantile:32286.95885
[25]	validation_0-quantile:32251.10

In [6]:
prediction_map = pd.read_csv("data/prediction_mapping.csv")

rm_ids = prediction_map['rm_id'].unique()
interval_days = range(1, 151)
year = 2025

predict_df = pd.DataFrame([
    {"rm_id": rm_id, "interval_days": interval, "year": year, "month": (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).month,
     "day_of_week": (pd.Timestamp(year=year, month=1, day=1) + pd.Timedelta(days=interval-1)).dayofweek}
    for rm_id in rm_ids
    for interval in interval_days
])

print(predict_df.head(), predict_df.shape)




   rm_id  interval_days  year  month  day_of_week
0    365              1  2025      1            2
1    365              2  2025      1            3
2    365              3  2025      1            4
3    365              4  2025      1            5
4    365              5  2025      1            6 (30450, 5)


In [7]:
y_pred = model.predict(predict_df)
y_pred = np.maximum(0, y_pred)  # Ensure no negative predictions

pred2025 = pd.DataFrame({
    "ID": np.arange(1, len(y_pred) + 1),
    "predicted_weight": y_pred
})

pred2025.to_csv("christian/basic_xgboost_predictions_jan-may_2024_day_of_week.csv", index=False)



In [8]:
prediction_mapping = pd.read_csv("data/prediction_mapping.csv")
merged = pred2025.merge(prediction_mapping, on="ID")

agg_df = merged.groupby("rm_id", as_index=False).agg({
    "predicted_weight": "max",
}).sort_values("predicted_weight", ascending=False)

print(agg_df.head(20))
# print(max(y_pred), min(y_pred), np.mean(y_pred), np.median(y_pred))

     rm_id  predicted_weight
176   3781      4.113468e+06
180   3865      3.277284e+06
150   3125      1.937006e+06
151   3126      1.919993e+06
149   3124      1.597493e+06
147   3122      1.567073e+06
148   3123      1.323810e+06
75    2130      1.133226e+06
160   3282      1.042542e+06
83    2140      5.345011e+05
159   3265      4.531361e+05
182   3901      4.051195e+05
79    2134      3.763752e+05
80    2135      3.429033e+05
174   3761      3.332516e+05
85    2142      2.413436e+05
76    2131      1.619029e+05
87    2144      1.488164e+05
77    2132      1.459334e+05
88    2145      1.433494e+05
