In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# --- Function to calculate MAPE safely ---
def calculate_mape(y_true, y_pred):
    y_true_safe = y_true.copy()
    y_true_safe[y_true_safe == 0] = 1e-6 # Avoid division by zero
    return np.mean(np.abs((y_true - y_pred) / y_true_safe)) * 100

# --- 1. Load and Preprocess the OLD Dataset ---
try:
    df_old = pd.read_csv('E:/final_pharma_forcast/DATA/holidays (1).csv')
    df_old['datum'] = pd.to_datetime(df_old['datum'])
    df_old = df_old.set_index('datum')
    df_old_daily = df_old.resample('D').sum()
    df_old_daily['Year'] = df_old_daily.index.year
    df_old_daily['Month'] = df_old_daily.index.month
    df_old_daily['Day'] = df_old_daily.index.day
    df_old_daily['Weekday'] = df_old_daily.index.weekday
except FileNotFoundError:
    print("Could not find 'combined_preprocessed_sales_data (1).csv'.")
    exit()

# --- 2. Load and Preprocess the NEW Dataset with Holidays ---
try:
    df_new = pd.read_csv('E:/xgboost/Data/holidays (1).csv')
    df_new['datum'] = pd.to_datetime(df_new['datum'])
    df_new = df_new.set_index('datum')
    if 'Weekday Name' in df_new.columns:
        le = LabelEncoder()
        df_new['Weekday Name'] = le.fit_transform(df_new['Weekday Name'])
except FileNotFoundError:
    print("Could not find 'holidays (1).csv'.")
    exit()

# --- 3. Define Targets and Features ---
target_cols = ['M01AB', 'M01AE', 'N02BA', 'N02BE', 'N05B', 'N05C', 'R03', 'R06']
features_old = ['Year', 'Month', 'Day', 'Weekday']
non_feature_cols = ['Hour'] + target_cols
features_new = [col for col in df_new.columns if col not in non_feature_cols]

# --- 4. Loop, Train, and Evaluate ---
results = []
for target in target_cols:
    print(f"Processing target category: {target}")

    # --- Model 1: Original Data ---
    X_old = df_old_daily[features_old]
    y_old = df_old_daily[target]
    valid_indices_old = ~y_old.isna()
    X_old, y_old = X_old[valid_indices_old], y_old[valid_indices_old]
    split_idx_old = int(len(X_old) * 0.8)
    X_train_old, X_test_old = X_old[:split_idx_old], X_old[split_idx_old:]
    y_train_old, y_test_old = y_old[:split_idx_old], y_old[split_idx_old:]

    model_old = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model_old.fit(X_train_old, y_train_old)
    preds_old = model_old.predict(X_test_old)
    rmse_old = mean_squared_error(y_test_old, preds_old, squared=False)
    mae_old = mean_absolute_error(y_test_old, preds_old)
    mape_old = calculate_mape(y_test_old, preds_old)

    # --- Model 2: New Data with Holidays ---
    X_new = df_new[features_new]
    y_new = df_new[target]
    valid_indices_new = ~y_new.isna()
    X_new, y_new = X_new[valid_indices_new], y_new[valid_indices_new]
    split_idx_new = int(len(X_new) * 0.8)
    X_train_new, X_test_new = X_new[:split_idx_new], X_new[split_idx_new:]
    y_train_new, y_test_new = y_new[:split_idx_new], y_new[split_idx_new:]

    model_new = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model_new.fit(X_train_new, y_train_new)
    preds_new = model_new.predict(X_test_new)
    rmse_new = mean_squared_error(y_test_new, preds_new, squared=False)
    mae_new = mean_absolute_error(y_test_new, preds_new)
    mape_new = calculate_mape(y_test_new, preds_new)

    # --- Store results ---
    results.append({
        "Category": target,
        "Original RMSE": rmse_old, "New Model RMSE": rmse_new,
        "Original MAE": mae_old, "New Model MAE": mae_new,
        "Original MAPE (%)": mape_old, "New Model MAPE (%)": mape_new,
    })

# --- 5. Present the Comparison ---
results_df = pd.DataFrame(results)
results_df['RMSE Improvement (%)'] = ((results_df['New Model RMSE'] - results_df['Original RMSE']) / results_df['Original RMSE']) * 100
results_df['MAE Improvement (%)'] = ((results_df['New Model MAE'] - results_df['Original MAE']) / results_df['Original MAE']) * 100
results_df['MAPE Improvement (%)'] = ((results_df['New Model MAPE (%)'] - results_df['Original MAPE (%)']) / results_df['Original MAPE (%)']) * 100

print("\n--- Model Performance Comparison (with MAPE) ---")
print(results_df.to_string())

Processing target category: M01AB




Processing target category: M01AE




Processing target category: N02BA




Processing target category: N02BE




Processing target category: N05B




Processing target category: N05C




Processing target category: R03




Processing target category: R06

--- Model Performance Comparison (with MAPE) ---
  Category  Original RMSE  New Model RMSE  Original MAE  New Model MAE  Original MAPE (%)  New Model MAPE (%)  RMSE Improvement (%)  MAE Improvement (%)  MAPE Improvement (%)
0    M01AB       3.057706        2.865203      2.434120       2.238059       8.135073e+06        7.713818e+06             -6.295648            -8.054701             -5.178248
1    M01AE       2.292322        2.268642      1.744456       1.749372       7.675147e+06        6.979039e+06             -1.033051             0.281775             -9.069639
2    N02BA       2.214596        2.134967      1.746236       1.674562       1.976458e+07        1.904642e+07             -3.595653            -4.104495             -3.633546
3    N02BE      13.862451       13.031865     10.457826       9.716472       4.144750e+07        4.781714e+07             -5.991625            -7.088982             15.367979
4     N05B       4.687463        4.397570  



Forecasting Model Comparison Analysis
Objective: To scientifically determine the most accurate forecasting model by comparing the performance of a model trained on basic date features against a model trained on an enriched dataset with holiday and seasonal information.

1. Project Overview
This notebook serves as the final and most critical step in the forecasting project. The goal is to move beyond building a single model and instead, to rigorously compare different modeling approaches to identify the superior one.

The core hypothesis being tested is: Does adding rich, contextual features (like holidays and seasonal flags) lead to a statistically significant improvement in forecast accuracy?

To answer this, the notebook directly compares the performance of a model trained on the original, simple dataset against a model trained on the feature-rich holidays.csv dataset.

2. Technical Approach
The methodology is designed for a fair, head-to-head comparison. For each of the 8 drug categories, two separate models are trained and evaluated.

Dataset Preparation:

Original Data (combined_preprocessed_sales_data (1).csv): This dataset is loaded and aggregated to a daily level. Its features are limited to simple date components (Year, Month, Day, Weekday).

New Data (holidays.csv): This feature-rich dataset is loaded. Its features include all the simple date components plus the 27 holiday and seasonal indicator columns.

Model Training:

For each drug category, a Random Forest Regressor is trained on the original dataset.

A second, separate Random Forest Regressor is trained on the new, feature-rich dataset.

Evaluation and Comparison:

Both models are evaluated on their respective test sets using the same metrics: RMSE, MAE, and MAPE.

The results are compiled into a final summary table that calculates the percentage improvement offered by the new model, providing a clear and quantifiable measure of its superiority.