In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import numpy as np

In [2]:
df_train = pd.read_csv('../data/medicine_monthly_sales_train.csv')

In [3]:
df_train['Year-Month'] = pd.to_datetime(df_train['Year-Month'], format='%Y-%m')
df_train = df_train.sort_values(['Code', 'Province', 'Year-Month']).reset_index(drop=True)

In [4]:
def create_lags(group):
    group = group.copy()
    group['lag_1'] = group['Sales'].shift(1)
    group['lag_3'] = group['Sales'].shift(3)
    group['lag_6'] = group['Sales'].shift(6)
    group['lag_12'] = group['Sales'].shift(12)
    return group

In [5]:
df_train = df_train.groupby(['Code', 'Province']).apply(create_lags, include_groups=False).reset_index(drop=True)
df_train = df_train.dropna(subset=['lag_1', 'lag_3', 'lag_6', 'lag_12'])

In [6]:
df_train['month'] = df_train['Year-Month'].dt.month
df_train['year']  = df_train['Year-Month'].dt.year
df_train = pd.get_dummies(df_train, columns=['month'], prefix='m')

In [7]:
feature_cols = [c for c in df_train.columns if c.startswith('lag_') or c.startswith('m_')]
X_intermediate = df_train[feature_cols]

In [8]:
if X_intermediate.columns.has_duplicates:
    print(f"Original X_intermediate columns (may have duplicates): {X_intermediate.columns.tolist()}")
    X = X_intermediate.loc[:, ~X_intermediate.columns.duplicated(keep='first')]
    print(f"X columns after deduplication: {X.columns.tolist()}")
else:
    X = X_intermediate
    print(f"X columns (no duplicates found initially): {X.columns.tolist()}")

X columns (no duplicates found initially): ['lag_1', 'lag_3', 'lag_6', 'lag_12', 'm_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'm_10', 'm_11', 'm_12']


In [9]:
y = df_train['Sales']
tscv = TimeSeriesSplit(n_splits=10)
mae_scores = []

In [10]:
for fold, (train_idx, test_idx) in enumerate(tscv.split(X), 1):
    X_train_orig, X_test_orig = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    X_train = X_train_orig.copy()
    X_test = X_test_orig.copy()

    bool_cols = X_train.select_dtypes(include=bool).columns
    if len(bool_cols) > 0:
        print(f"Fold {fold} - Bool columns found in X_train: {list(bool_cols)}")
        X_train[bool_cols] = X_train[bool_cols].astype(np.int8)
        X_test[bool_cols] = X_test[bool_cols].astype(np.int8)
        print(f"Fold {fold} - X_train dtypes after bool conversion:\n{X_train.dtypes.value_counts()}")
    else:
        print(f"Fold {fold} - No bool columns found in X_train.")


    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest  = xgb.DMatrix(X_test,  label=y_test)
    
    params = {
        'objective':        'reg:squarederror',
        'eval_metric':      'mae',              
        'tree_method':      'hist',             
        'device':           'cuda',             
        'learning_rate':    0.001,              
        'max_depth':        5,                  
        'subsample':        0.7,                
        'colsample_bytree': 0.7,                
        'lambda':           1,                  
        'alpha':            0.1,                
        'min_child_weight': 1,                  
        'seed':             42
    }
    
    print(f"\n--- Fold {fold} ---")
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=1500, 
        evals=[(dtrain, 'train'), (dtest, 'test')],
        early_stopping_rounds=50,   
        verbose_eval=200 
    )
    
    best_iteration = bst.best_iteration
    best_mae = bst.best_score 

    print(f"Fold {fold} - Best Iteration: {best_iteration}, Best Test MAE during training: {best_mae:.4f}")
    
    preds = bst.predict(dtest, iteration_range=(0, bst.best_iteration + 1))
    mae_fold = mean_absolute_error(y_test, preds)
    mae_scores.append(mae_fold)
    print(f'Fold {fold} Final MAE (on best iteration): {mae_fold:.4f}')

print(f'\nMean MAE across folds: {np.mean(mae_scores):.4f}')
print(f'Std Dev MAE across folds: {np.std(mae_scores):.4f}')

Fold 1 - Bool columns found in X_train: ['m_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'm_10', 'm_11', 'm_12']
Fold 1 - X_train dtypes after bool conversion:
int8       12
float64     4
Name: count, dtype: int64

--- Fold 1 ---
[0]	train-mae:224.73434	test-mae:229.41003
[69]	train-mae:223.59900	test-mae:229.43551
Fold 1 - Best Iteration: 19, Best Test MAE during training: 229.4024
Fold 1 Final MAE (on best iteration): 229.4024
Fold 2 - Bool columns found in X_train: ['m_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9', 'm_10', 'm_11', 'm_12']
Fold 2 - X_train dtypes after bool conversion:
int8       12
float64     4
Name: count, dtype: int64

--- Fold 2 ---
[0]	train-mae:227.04580	test-mae:227.80126
[162]	train-mae:225.44950	test-mae:227.76679
Fold 2 - Best Iteration: 112, Best Test MAE during training: 227.7567
Fold 2 Final MAE (on best iteration): 227.7567
Fold 3 - Bool columns found in X_train: ['m_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6', 'm_7', 'm_8', 'm_9'

In [11]:
dall = xgb.DMatrix(X, label=y)
final_model = xgb.train(params, dall, num_boost_round=bst.best_iteration)

In [13]:
final_model.save_model('xgb_medicine_sales_final.json')