In [2]:
cd ..

c:\Users\Administrator\OneDrive\Desktop\SalesAI\backend


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
import sys 
import pandas as pd
import numpy as np
import logging
from datetime import datetime

import optuna 
import mlflow
import xgboost as xgb
import lightgbm as lgb
from prophet import Prophet

from src.logger import logger
from typing import Dict, Any, List, Optional, Union,Tuple
from pathlib import Path
from src.exception import CustomException 

from src.utils.mlflow_utils import MLflowManager
from src.features.feature_pipeline import FeaturePipeline
from src.data_pipelines.validators import DataValidator
from src.models.advanced_ensemble import AdvancedEnsemble
from src.models.digonistics import diagnose_model_performance
from src.models.ensemble_model import EnsembleModel
from src.utils.config_loader import ConfigLoader
from src.visualizations.shap_visualizer import ShapExplainer 

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler


[ 2025-10-01 15:58:01,828 ] root - [32mINFO[0m - [32mLogger is configured and ready.[0m


[ 2025-10-01 15:58:03,700 ] src.utils.config_loader - [32mINFO[0m - [32mLoaded configuration from: C:\Users\Administrator\OneDrive\Desktop\SalesAI\backend\configs\ml_config.yaml[0m
[ 2025-10-01 15:58:03,703 ] src.features.feature_pipeline - [32mINFO[0m - [32m✅ Config loaded successfully using config_loader[0m


In [16]:
 

logger = logging.getLogger(__name__)


class ModelTrainer:
    def __init__(self, config_path: Optional[Union[str, Path]] = None):
        self.config_loader = ConfigLoader()

        self.config = self.config_loader.load_yaml(file_path="ml_config.yaml")

        self.training_config = self.config.get('training',{})

        self.model_config: Dict[str, Any] = self.config.get('models', {})
        self.model_config: Dict[str, Any] = self.config.get('models', {})

        self.mlflow_manager = MLflowManager(config_path)
        self.feature_engineer = None

        self.data_validator = DataValidator(config_path)

        self.models: Dict[str, Any] = {}
        self.scalers: Dict[str, Any] = {}
        self.encoders: Dict[str, Any] = {}
        self.feature_cols: List[str] = []


    def prepare_data(
        self, df: pd.DataFrame, target_col: str = "sales",
        date_col: str = "date", group_cols: Optional[List[str]] = None,
        categorical_cols: Optional[List[str]] = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:

        logger.info("🛠 Preparing data for training")

        required_cols = [date_col, target_col]
        if group_cols:
            required_cols.extend(group_cols)

        missing_cols = set(required_cols) - set(df.columns)
        if missing_cols:
            logger.error(f"❌ Missing required columns for training: {missing_cols}")
            raise CustomException(f"Missing required columns for training: {missing_cols}", sys)

        try:
            pipeline = FeaturePipeline(df, target_col=target_col, group_cols=group_cols)
            df_features = pipeline.run()
            logger.info("✅ Feature pipeline executed successfully.")

            if categorical_cols:
                df_features = pipeline.create_target_encoding(df_features, target_col, categorical_cols)
                logger.info("🎯 Applied target encoding to categorical columns.")

            # Chronological split
            df_sorted = df_features.sort_values(date_col)
            train_size = int(len(df_sorted) * (1 - self.training_config["test_size"] - self.training_config["validation_size"]))
            val_size = int(len(df_sorted) * self.training_config["validation_size"])

            train_df = df_sorted[:train_size]
            val_df = df_sorted[train_size:train_size + val_size]
            test_df = df_sorted[train_size + val_size:]

            # Drop rows with missing target
            train_df = train_df.dropna(subset=[target_col])
            val_df = val_df.dropna(subset=[target_col])
            test_df = test_df.dropna(subset=[target_col])

            logger.info(f"📊 Data split → Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
            return train_df, val_df, test_df

        except Exception as e:
            logger.error(f"❌ Error during data preparation: {e}")
            raise CustomException(e, sys)


    def preprocess_features(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
        target_col: str,
        exclude_cols: List[str] = ["date"]
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]:
        try:
            logger.info("🔄 Starting feature preprocessing...")

            feature_cols = [col for col in train_df.columns if col not in exclude_cols + [target_col]]
            self.feature_cols = feature_cols

            X_train, X_val, X_test = train_df[feature_cols].copy(), val_df[feature_cols].copy(), test_df[feature_cols].copy()
            y_train, y_val, y_test = train_df[target_col].values, val_df[target_col].values, test_df[target_col].values

            categorical_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

            for col in categorical_cols:
                if self.training_config.get("encoder", "label") == "label":
                    # Train LabelEncoder on training data
                    if col not in self.encoders:
                        le = LabelEncoder()
                        le.fit(X_train[col].astype(str).fillna("missing"))
                        self.encoders[col] = le
                    else:
                        le = self.encoders[col]

                    def transform_safe(encoder, series):
                        known_classes = set(encoder.classes_)
                        unknowns = series[~series.isin(known_classes)]
                        if not unknowns.empty:
                            logger.warning(f"⚠️ Unseen labels in column '{col}': {unknowns.unique()}")
                            # Add "unknown" class if not already present
                            if "unknown" not in encoder.classes_:
                                encoder.classes_ = np.append(encoder.classes_, "unknown")
                            series = series.apply(lambda x: x if x in known_classes else "unknown")
                        return encoder.transform(series)

                    for df_name, df in zip(["Train", "Val", "Test"], [X_train, X_val, X_test]):
                        series = df[col].astype(str).fillna("missing")
                        transformed = transform_safe(le, series)
                        df[col] = transformed.astype(np.int32)

                        logger.info(f"✅ Label encoded '{col}' in {df_name} set.")

                elif self.training_config["encoder"] == "onehot":
                    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
                    X_train_encoded = ohe.fit_transform(X_train[[col]])
                    X_val_encoded = ohe.transform(X_val[[col]])
                    X_test_encoded = ohe.transform(X_test[[col]])

                    encoded_cols = [f"{col}_{cat}" for cat in ohe.categories_[0]]
                    X_train = X_train.drop(columns=col).join(pd.DataFrame(X_train_encoded, columns=encoded_cols, index=X_train.index))
                    X_val = X_val.drop(columns=col).join(pd.DataFrame(X_val_encoded, columns=encoded_cols, index=X_val.index))
                    X_test = X_test.drop(columns=col).join(pd.DataFrame(X_test_encoded, columns=encoded_cols, index=X_test.index))

                    self.encoders[col] = ohe
                    logger.info(f"✅ One-hot encoded '{col}'.")

            # Scaling numeric features
            scaler_type = self.training_config.get("scaler", "standard")
            if scaler_type == "standard":
                scaler = StandardScaler()
            elif scaler_type == "minmax":
                scaler = MinMaxScaler()
            elif scaler_type == "robust":
                scaler = RobustScaler()
            else:
                raise CustomException(f"Unsupported scaler type: {scaler_type}")

            X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
            X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns, index=X_val.index)
            X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

            self.scalers["scaler"] = scaler

            logger.info(f"✅ Preprocessing complete. Total features used: {len(self.feature_cols)} 🧠")

            return X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, y_test

        except Exception as e:
            logger.error(f"❌ Preprocessing failed: {e}")
            raise CustomException(f"Error in preprocess_features: {e}")


    def calculate_metrics(self, y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
        try:
            logger.info("📊 Calculating model evaluation metrics...")

            # Avoid division by zero in MAPE
            non_zero_mask = y_true != 0
            if not np.any(non_zero_mask):
                mape = np.nan
                logger.warning("⚠️ All values in y_true are zero. MAPE is undefined.")
            else:
                mape = np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100

            metrics = {
                "rmse": round(np.sqrt(mean_squared_error(y_true, y_pred)), 4),
                "mae": round(mean_absolute_error(y_true, y_pred), 4),
                "mape": round(mape, 4),
                "r2": round(r2_score(y_true, y_pred), 4),
            }

            logger.info(f"✅ Metrics calculated: {metrics}")
            return metrics

        except Exception as e:
            logger.error(f"❌ Failed to calculate metrics: {e}")
            raise CustomException(f"Error in calculate_metrics: {e}")


    def train_xgboost(self, 
                  X_train: np.ndarray, y_train: np.ndarray,
                  X_val: np.ndarray, y_val: np.ndarray,
                  use_optuna: bool = True) -> xgb.XGBRegressor:
        """
        Train an XGBoost regressor with optional Optuna hyperparameter optimization.

        Args:
            X_train, y_train: Training data
            X_val, y_val: Validation data
            use_optuna (bool): Whether to perform Optuna hyperparameter search

        Returns:
            Trained XGBRegressor model
        """
        logger.info("🚀 Starting training for XGBoost model...")

        if len(X_train) == 0 or len(X_val) == 0:
            raise ValueError("❌ Training or validation data is empty.")

        # Detect GPU support
        try:
            tree_method = "gpu_hist" if xgb.get_config().get("use_gpu", False) else "hist"
            logger.info(f"⚙️ Using tree_method: `{tree_method}`")
        except Exception as e:
            logger.warning(f"⚠️ Failed to detect GPU, using CPU. Reason: {e}")
            tree_method = "hist"

        best_params = {}

        if use_optuna:
            logger.info("🔍 Running Optuna hyperparameter optimization for XGBoost...")

            def objective(trial):
                params = {
                    'n_estimators': trial.suggest_int('n_estimators', 50, 500),
                    'max_depth': trial.suggest_int('max_depth', 3, 12),
                    'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
                    'subsample': trial.suggest_float('subsample', 0.5, 1.0),
                    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
                    'gamma': trial.suggest_float('gamma', 0, 1.0),
                    'reg_alpha': trial.suggest_float('reg_alpha', 0, 5.0),
                    'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0),
                    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                    'random_state': 42,
                    'tree_method': tree_method
                }

                model = xgb.XGBRegressor(**params, early_stopping_rounds=50)
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    verbose=False
                )
                y_pred = model.predict(X_val)
                rmse = np.sqrt(mean_squared_error(y_val, y_pred))
                return rmse

            try:
                study = optuna.create_study(
                    direction="minimize",
                    sampler=optuna.samplers.TPESampler(seed=42),
                    pruner=optuna.pruners.MedianPruner()
                )
                study.optimize(objective, n_trials=self.training_config.get('optuna_trials', 50))

                best_params = study.best_params
                logger.info(f"🏆 Optuna best params found: {best_params}")
            except Exception as e:
                logger.error(f"❌ Optuna optimization failed: {e}")
                raise CustomException(f"Optuna error: {e}")
        else:
            logger.info("📦 Using config-defined hyperparameters for XGBoost...")
            best_params = self.model_config.get("xgboost", {}).get("params", {})
            if not best_params:
                raise ValueError("❌ No XGBoost parameters found in model_config.")
            logger.info(f"✅ Loaded XGBoost params: {best_params}")

        # Add fixed parameters
        best_params.update({
            "random_state": 42,
            "tree_method": tree_method
        })

        # Train final model
        try:
            logger.info("🛠️ Training final XGBoost model...")
            model = xgb.XGBRegressor(**best_params, early_stopping_rounds=50)
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                verbose=True
            )

            self.models["xgboost"] = model
            try:
                logger.info("Generating SHAP explanations for XGBoost...")
                shap_explainer = ShapExplainer(model, "lightgbm")
                background_sample = X_train.sample(n=min(100, len(X_train)), random_state=42)
                shap_explainer.fit_explainer(background_sample)
                
                if not hasattr(self, 'shap_explainers'):
                    self.shap_explainers = {}
                self.shap_explainers['xgboost'] = shap_explainer
                logger.info("SHAP explainer fitted for XGBoost")
            except Exception as e:
                logger.warning(f"SHAP generation failed for XGBoost: {e}")
            logger.info(f"✅ Model trained successfully. Best iteration: {model.best_iteration}")
            return model

        except Exception as e:
            logger.error(f"❌ XGBoost model training failed: {e}")
            raise CustomException(f"XGBoost training error: {e}")


    def train_lightgbm(self, 
                   X_train: np.ndarray, y_train: np.ndarray,
                   X_val: np.ndarray, y_val: np.ndarray,
                   use_optuna: bool = True) -> lgb.LGBMRegressor:
        """
        Train a LightGBM regressor with optional Optuna hyperparameter optimization.

        Args:
            X_train, y_train: Training dataset
            X_val, y_val: Validation dataset
            use_optuna (bool): Whether to use Optuna for hyperparameter tuning

        Returns:
            Trained LGBMRegressor model
        """

        logger.info("⚙️ Starting LightGBM training...")

        best_params = {}

        if use_optuna:
            logger.info("🔍 Optuna hyperparameter optimization enabled for LightGBM")

            def objective(trial):
                params = {
                    "num_leaves": trial.suggest_int("num_leaves", 31, 256),
                    "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.3, log=True),
                    "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
                    "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
                    "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                    "reg_alpha": trial.suggest_float("reg_alpha", 0, 5.0),
                    "reg_lambda": trial.suggest_float("reg_lambda", 0, 5.0),
                    "min_split_gain": trial.suggest_float("min_split_gain", 0, 1.0),
                    "random_state": 42,
                    "verbosity": -1,
                    "objective": "regression",
                    "metric": "rmse",
                    "boosting_type": "gbdt"
                }

                model = lgb.LGBMRegressor(**params)
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_val, y_val)],
                    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
                )

                y_pred = model.predict(X_val)
                rmse = np.sqrt(mean_squared_error(y_val, y_pred))
                return rmse

            try:
                study = optuna.create_study(
                    direction="minimize",
                    sampler=optuna.samplers.TPESampler(seed=42),
                    pruner=optuna.pruners.MedianPruner()
                )
                study.optimize(objective, n_trials=self.training_config.get("optuna_trials", 50))
                best_params = study.best_params

                logger.info(f"🏆 Best LightGBM params via Optuna: {best_params}")

            except Exception as e:
                logger.error(f"❌ Optuna optimization failed: {e}")
                raise CustomException(f"Optuna LightGBM error: {e}")

            # Add required fixed params
            best_params.update({
                "random_state": 42,
                "verbosity": -1,
                "objective": "regression",
                "metric": "rmse",
                "boosting_type": "gbdt"
            })

        else:
            logger.info("📦 Using static config parameters for LightGBM...")
            best_params = self.model_config.get("lightgbm", {}).get("params", {})
            if not best_params:
                raise ValueError("❌ No LightGBM parameters found in `model_config`")
            logger.info(f"✅ Loaded config params: {best_params}")

        # Final training
        try:
            logger.info("🛠️ Training final LightGBM model...")
            model = lgb.LGBMRegressor(**best_params)
            model.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                callbacks=[lgb.early_stopping(50), lgb.log_evaluation(50)]
            )

            self.models["lightgbm"] = model
            try:
                logger.info("Generating SHAP explanations for LightGBM...")
                shap_explainer = ShapExplainer(model, "lightgbm")
                background_sample = X_train.sample(n=min(100, len(X_train)), random_state=42)
                shap_explainer.fit_explainer(background_sample)
                
                if not hasattr(self, 'shap_explainers'):
                    self.shap_explainers = {}
                self.shap_explainers['lightgbm'] = shap_explainer
                logger.info("✅ SHAP explainer fitted for LightGBM")
            except Exception as e:
                logger.warning(f"⚠️ SHAP generation failed for LightGBM: {e}")


            logger.info("✅ LightGBM model trained and stored.")
            return model

        except Exception as e:
            logger.error(f"❌ LightGBM training failed: {e}")
            raise CustomException(f"LightGBM training error: {e}")


    def train_prophet(self, 
                    train_df: pd.DataFrame, 
                    val_df: pd.DataFrame,
                    date_col: str = 'date', 
                    target_col: str = 'sales') -> Prophet:
        """
        Train a Prophet model with optional regressors and evaluate on validation set.

        Args:
            train_df: Training DataFrame
            val_df: Validation DataFrame
            date_col: Name of date column
            target_col: Name of target variable

        Returns:
            Trained Prophet model
        """

        logger.info("📅 Starting Prophet training...")

        try:
            # --- Prepare training data ---
            prophet_train = train_df[[date_col, target_col]].rename(
                columns={date_col: 'ds', target_col: 'y'}
            ).dropna().sort_values('ds')

            # Load Prophet hyperparameters from config
            prophet_params = self.model_config.get('prophet', {}).get('params', {})
            prophet_params.update({
                'stan_backend': 'CMDSTANPY',
                'mcmc_samples': 0,             # No Bayesian sampling = faster
                'uncertainty_samples': 100     # Reasonable uncertainty
            })

            model = Prophet(**prophet_params)

            # --- Select numeric regressors ---
            numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
            regressor_cols = [c for c in numeric_cols if c not in [target_col, 'year', 'month', 'day', 'week', 'quarter']]

            # Reduce to top 5 highest variance
            if len(regressor_cols) > 5:
                variances = {col: train_df[col].var() for col in regressor_cols}
                regressor_cols = sorted(variances, key=variances.get, reverse=True)[:5]

            for col in regressor_cols:
                if col in train_df and train_df[col].std() > 0:
                    model.add_regressor(col)
                    prophet_train[col] = train_df[col]
                    logger.info(f"➕ Added regressor to Prophet: {col}")
                else:
                    logger.warning(f"⚠️ Skipping regressor '{col}' due to zero variance or missing data.")

            # --- Fit the model ---
            model.fit(prophet_train)
            self.models['prophet'] = model
            logger.info("✅ Prophet model trained successfully.")

        except Exception as e:
            logger.error(f"❌ Prophet training failed with error: {e}")

            logger.info("🔁 Retrying Prophet with fallback parameters...")

            try:
                model = Prophet(
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    daily_seasonality=False,
                    changepoint_prior_scale=0.05,
                    seasonality_prior_scale=10.0,
                    uncertainty_samples=50,
                    mcmc_samples=0
                )
                fallback_train = prophet_train[['ds', 'y']]
                model.fit(fallback_train)
                self.models['prophet'] = model
                logger.info("✅ Prophet fallback model trained successfully.")
            except Exception as fallback_error:
                logger.error("❌ Fallback Prophet training also failed.")
                raise CustomException(f"Prophet training completely failed: {fallback_error}")

        # --- Validation Prediction ---
        try:
            prophet_val = val_df[[date_col, target_col]].rename(
                columns={date_col: 'ds', target_col: 'y'}
            ).dropna().sort_values('ds')

            # Add matching regressors to validation data
            for col in regressor_cols:
                if col in val_df:
                    prophet_val[col] = val_df[col]
                else:
                    logger.warning(f"⚠️ Regressor '{col}' missing in validation set")

            forecast = model.predict(prophet_val)

            y_true = prophet_val['y'].values
            y_pred = forecast['yhat'].values

            val_rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            val_mae = mean_absolute_error(y_true, y_pred)
            val_r2 = r2_score(y_true, y_pred)

            logger.info(f"📊 Prophet Validation Metrics — RMSE: {val_rmse:.4f} | MAE: {val_mae:.4f} | R²: {val_r2:.4f}")
        except Exception as eval_error:
            logger.error(f"❌ Failed during Prophet validation: {eval_error}")
            raise CustomException(f"Prophet evaluation failed: {eval_error}")

        return model

    
    def train_all_models(self, train_df: pd.DataFrame, val_df: pd.DataFrame,
                        test_df: pd.DataFrame, target_col: str = 'sales',
                        use_optuna: bool = True) -> Dict[str, Dict[str, Any]]:
        """
        Train all models (XGBoost, LightGBM, Prophet) and build ensemble.
        Improvements applied: bug fixes, defensive checks, improved logging.
        """
        results = {}
        logger.info("🚀 Starting full model training pipeline...")

        # Start MLflow run via manager (assume manager wraps mlflow.start_run)
        run_name = f"sales_forecast_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        run_id = self.mlflow_manager.start_run(run_name, tags={"model_type": "ensemble", "use_optuna": str(use_optuna)})
        logger.info(f"🎯 MLflow run started with run_id={run_id}")

        try:
            # ------------------------
            # Preprocess Data
            # ------------------------
            logger.info("🧹 Preprocessing features and target variables...")
            X_train, X_val, X_test, y_train, y_val, y_test = self.preprocess_features(
                train_df, val_df, test_df, target_col
            )
            logger.info(f"📊 Data sizes - Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
            self.mlflow_manager.log_params({
                "train_size": len(train_df),
                "val_size": len(val_df),
                "test_size": len(test_df),
                "n_features": X_train.shape[1]
            })

            # Keep placeholders for predictions (so we can always compose test_predictions)
            xgb_pred = lgb_pred = prophet_pred = None

            # ------------
            # Train XGBoost
            # ------------
            logger.info("🔥 Training XGBoost model...")
            try:
                xgb_model = self.train_xgboost(X_train, y_train, X_val, y_val, use_optuna)
                xgb_pred = xgb_model.predict(X_test)
                xgb_metrics = self.calculate_metrics(y_test, xgb_pred)

                self.mlflow_manager.log_metrics({f"xgboost_{k}": v for k, v in xgb_metrics.items()})
                self.mlflow_manager.log_model(xgb_model, "xgboost", input_example=X_train.iloc[:5])

                # Feature importance: handle sklearn wrapper or booster object
                try:
                    if hasattr(xgb_model, "feature_importances_"):
                        imp = xgb_model.feature_importances_
                    else:
                        # try booster score fallbacks
                        booster = getattr(xgb_model, "get_booster", None)
                        if callable(booster):
                            score = xgb_model.get_booster().get_score(importance_type='gain')
                            imp = [score.get(f, 0.0) for f in self.feature_cols]
                        else:
                            imp = [0.0] * len(self.feature_cols)
                except Exception:
                    imp = [0.0] * len(self.feature_cols)

                feature_importance = pd.DataFrame({
                    'feature': self.feature_cols,
                    'importance': imp
                }).sort_values('importance', ascending=False).head(20)

                logger.info(f"🌟 Top XGBoost features:\n{feature_importance.to_string()}")
                for i, (_, row) in enumerate(feature_importance.iterrows()):
                    self.mlflow_manager.log_params({f"xgb_top_feature_{i}": f"{row['feature']} ({row['importance']:.4f})"})

                results['xgboost'] = {
                    'model': xgb_model,
                    'metrics': xgb_metrics,
                    'predictions': xgb_pred,
                    'actual': y_test
                }
                logger.info("✅ XGBoost training complete!")
            except Exception as e:
                logger.error(f"❌ XGBoost training failed: {e}", exc_info=True)
                raise

            # ------------
            # Train LightGBM
            # ------------
            logger.info("🔥 Training LightGBM model...")
            try:
                lgb_model = self.train_lightgbm(X_train, y_train, X_val, y_val, use_optuna)
                lgb_pred = lgb_model.predict(X_test)
                lgb_metrics = self.calculate_metrics(y_test, lgb_pred)

                self.mlflow_manager.log_metrics({f"lightgbm_{k}": v for k, v in lgb_metrics.items()})
                self.mlflow_manager.log_model(lgb_model, "lightgbm", input_example=X_train.iloc[:5])

                # LightGBM feature importance defensive
                try:
                    if hasattr(lgb_model, "feature_importance"):
                        imp = lgb_model.feature_importance()
                    elif hasattr(lgb_model, "booster_"):
                        imp = lgb_model.booster_.feature_importance()
                    else:
                        imp = [0.0] * len(self.feature_cols)
                except Exception:
                    imp = [0.0] * len(self.feature_cols)

                lgb_importance = pd.DataFrame({
                    'feature': self.feature_cols,
                    'importance': imp
                }).sort_values('importance', ascending=False).head(20)

                logger.info(f"🌟 Top LightGBM features:\n{lgb_importance.to_string()}")

                results['lightgbm'] = {
                    'model': lgb_model,
                    'metrics': lgb_metrics,
                    'predictions': lgb_pred,
                    'actual': y_test
                }
                logger.info("✅ LightGBM training complete!")
            except Exception as e:
                logger.error(f"❌ LightGBM training failed: {e}", exc_info=True)
                raise

            # ------------------------
            # Train Prophet (optional)
            # ------------------------
            prophet_enabled = self.model_config.get('prophet', {}).get('enabled', True)
            if prophet_enabled:
                logger.info("🌟 Training Prophet model...")
                try:
                    prophet_model = self.train_prophet(train_df, val_df)

                    future = test_df[['date']].rename(columns={'date': 'ds'}).copy()

                    # Add regressors if they exist both in model and test_df
                    regressor_cols = []
                    if hasattr(prophet_model, 'extra_regressors') and isinstance(prophet_model.extra_regressors, dict):
                        regressor_cols = [c for c in prophet_model.extra_regressors.keys() if c in test_df.columns]
                    else:
                        # fallback: check if test_df contains likely regressor columns used in training
                        regressor_cols = [c for c in test_df.columns if c not in ['date', target_col]]

                    for col in regressor_cols:
                        future[col] = test_df[col]

                    prophet_pred = prophet_model.predict(future)['yhat'].values
                    prophet_metrics = self.calculate_metrics(y_test, prophet_pred)

                    self.mlflow_manager.log_metrics({f"prophet_{k}": v for k, v in prophet_metrics.items()})
                    self.mlflow_manager.log_model(prophet_model, "prophet", input_example=future.iloc[:5])

                    results['prophet'] = {
                        'model': prophet_model,
                        'metrics': prophet_metrics,
                        'predictions': prophet_pred,
                        'actual': y_test
                    }
                    logger.info("✅ Prophet training complete!")
                except Exception as e:
                    logger.warning(f"⚠️ Prophet training failed: {e}. Falling back to XGBoost+LightGBM ensemble.")
                    prophet_enabled = False
                    prophet_pred = None
            else:
                logger.info("ℹ️ Prophet training skipped by config.")
                prophet_pred = None

            # ------------------------
            # Create test_predictions dict (must exist before ensemble optimizer)
            # ------------------------
            test_predictions = {
                'xgboost': xgb_pred,
                'lightgbm': lgb_pred,
                'prophet': prophet_pred
            }

            # ------------------------
            # Build Weighted Ensemble (if prophet missing use 2-model weighting)
            # ------------------------
            logger.info("🧩 Building stacking/weighted ensemble...")
            try:
                if prophet_enabled and prophet_pred is not None:
                    # Simple equal-weight ensemble when Prophet available (or you can compute a better blend)
                    ensemble_pred = (xgb_pred + lgb_pred + prophet_pred) / 3.0
                    ensemble_weights = {'xgboost': 1/3, 'lightgbm': 1/3, 'prophet': 1/3}
                    logger.info("⚖️ Using equal weights for XGB/LGB/Prophet ensemble.")
                else:
                    # compute weights based on validation R2 (defensive: avoid div by zero)
                    xgb_val_pred = xgb_model.predict(X_val)
                    lgb_val_pred = lgb_model.predict(X_val)

                    xgb_val_r2 = r2_score(y_val, xgb_val_pred)
                    lgb_val_r2 = r2_score(y_val, lgb_val_pred)

                    # ensure positive and min floor
                    min_weight = 0.2
                    denom = (xgb_val_r2 + lgb_val_r2) if (xgb_val_r2 + lgb_val_r2) != 0 else 1.0
                    xgb_weight = max(min_weight, xgb_val_r2 / denom)
                    lgb_weight = max(min_weight, lgb_val_r2 / denom)

                    total = xgb_weight + lgb_weight
                    xgb_weight /= total
                    lgb_weight /= total

                    ensemble_weights = {'xgboost': xgb_weight, 'lightgbm': lgb_weight}
                    ensemble_pred = xgb_weight * xgb_pred + lgb_weight * lgb_pred
                    logger.info(f"⚖️ Ensemble weights - XGBoost: {xgb_weight:.3f}, LightGBM: {lgb_weight:.3f}")
            except Exception as e:
                logger.error(f"❌ Failed to build weighted ensemble: {e}", exc_info=True)
                raise

            # ------------------------
            # Use AdvancedEnsemble to find optimal blend on test predictions
            # ------------------------
            try:
                advanced_ensemble = AdvancedEnsemble()
                # ensure we pass only available predictions to the optimizer
                available_preds = {k: v for k, v in test_predictions.items() if v is not None}
                if len(available_preds) >= 2:
                    _, optimal_weights = advanced_ensemble.create_blended_ensemble(available_preds, y_test, optimization_metric='rmse')
                    # map optimal_weights back to model keys (assumed ordering inside create_blended_ensemble)
                else:
                    optimal_weights = ensemble_weights
            except Exception as e:
                logger.warning(f"⚠️ Advanced ensemble optimization failed: {e}. Using heuristic weights.")
                optimal_weights = ensemble_weights

            # Build EnsembleModel object using models that exist
            ensemble_models = {'xgboost': xgb_model, 'lightgbm': lgb_model}
            if 'prophet' in results:
                ensemble_models['prophet'] = results['prophet']['model']

            ensemble_model = EnsembleModel(ensemble_models, optimal_weights)
            self.models['ensemble'] = ensemble_model

            # Evaluate ensemble
            ensemble_metrics = self.calculate_metrics(y_test, ensemble_pred)
            self.mlflow_manager.log_metrics({f"ensemble_{k}": v for k, v in ensemble_metrics.items()})
            self.mlflow_manager.log_model(ensemble_model, "ensemble", input_example=X_train.iloc[:5])

            results['ensemble'] = {
                'model': ensemble_model,
                'metrics': ensemble_metrics,
                'predictions': ensemble_pred,
                'actual': y_test
            }
            logger.info("🏆 Ensemble training complete!")

            # ------------------------
            # Generate SHAP visualizations (best-effort)
            # ------------------------
            logger.info("📊 Generating SHAP visualizations and explanations...")
            try:
                shap_artifacts = self._generate_shap_artifacts(X_test, y_test, test_df)
                if isinstance(shap_artifacts, dict):
                    if shap_artifacts.get('metrics'):
                        self.mlflow_manager.log_metrics(shap_artifacts['metrics'])
                    if shap_artifacts.get('params'):
                        self.mlflow_manager.log_params(shap_artifacts['params'])
                    for file_path in shap_artifacts.get('files', []):
                        self.mlflow_manager.log_artifact(file_path, "shap_explanations")
                    logger.info("✅ SHAP artifacts logged to MLflow")
                else:
                    logger.warning("⚠️ SHAP artifacts returned unexpected type; skipping MLflow logging.")
            except Exception as e:
                logger.warning(f"⚠️ SHAP visualization generation failed: {e}")

            # ------------------------
            # Diagnostics & Visualizations
            # ------------------------
            try:
                test_predictions_for_diag = {
                    k: v for k, v in {
                        'xgboost': xgb_pred,
                        'lightgbm': lgb_pred,
                        'ensemble': ensemble_pred
                    }.items() if v is not None
                }
                diagnosis = diagnose_model_performance(train_df, val_df, test_df, test_predictions_for_diag, target_col)
                logger.info("📋 Diagnostic recommendations:")
                for rec in diagnosis.get('recommendations', []):
                    logger.warning(f"⚠️ - {rec}")
            except Exception as e:
                logger.error(f"❌ Diagnostics failed: {e}", exc_info=True)

            try:
                self._generate_and_log_visualizations(results, test_df, target_col)
            except Exception as viz_error:
                logger.error(f"❌ Visualization generation failed: {viz_error}", exc_info=True)

            # Save artifacts to disk / local artifact store
            logger.info("💾 Saving artifacts...")
            try:
                self.save_artifacts()
            except Exception as e:
                logger.warning(f"⚠️ save_artifacts() failed: {e}")

            # Obtain run id from mlflow.active_run if available, else fall back to run_id returned earlier
            try:
                import mlflow
                active = mlflow.active_run()
                current_run_id = active.info.run_id if active else run_id
            except Exception:
                current_run_id = run_id

            # End the run via manager (assume it closes mlflow)
            self.mlflow_manager.end_run()
            logger.info("🏁 MLflow run ended.")

            # ------------------------
            # Sync to S3 (best-effort)
            # ------------------------
            logger.info("☁️ Syncing artifacts to S3...")
            try:
                from src.utils.mlflow_s3_utils import MLflowS3Manager
                s3_manager = MLflowS3Manager()
                s3_manager.sync_mlflow_artifacts_to_s3(current_run_id)
                logger.info("✓ Successfully synced artifacts to S3")

                from src.utils.s3_verification import verify_s3_artifacts, log_s3_verification_results
                logger.info("🔍 Verifying S3 artifact storage...")
                verification_results = verify_s3_artifacts(
                    run_id=current_run_id,
                    expected_artifacts=[
                        'models/',
                        'scalers.pkl',
                        'encoders.pkl',
                        'feature_cols.pkl',
                        'visualizations/',
                        'reports/'
                    ]
                )
                log_s3_verification_results(verification_results)
                if not verification_results.get("success", False):
                    logger.warning("⚠️ S3 artifact verification failed after sync")
            except Exception as e:
                logger.error(f"❌ Failed to sync artifacts to S3: {e}", exc_info=True)

        except Exception as e:
            # Ensure MLflow run closed and marked failed
            try:
                self.mlflow_manager.end_run(status="FAILED")
            except Exception:
                logger.warning("⚠️ Could not gracefully end MLflow run on exception.")
            logger.error(f"💥 Training pipeline failed: {e}", exc_info=True)
            raise

        return results


    def _create_combined_html_report(self, saved_files: Dict[str, str], save_dir: str) -> None:
        """Create a combined HTML report with all visualizations"""
        import os
        from datetime import datetime
        import base64

        try:
            logger.info("📝 Creating combined HTML report for visualizations...")

            html_content = """
            <!DOCTYPE html>
            <html>
            <head>
                <title>Model Comparison Report</title>
                <style>
                    body {{
                        font-family: Arial, sans-serif;
                        margin: 20px;
                        background-color: #f5f5f5;
                    }}
                    h1, h2 {
                        color: #333;
                    }
                    .section {
                        background-color: white;
                        padding: 20px;
                        margin-bottom: 20px;
                        border-radius: 8px;
                        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
                    }
                    .timestamp {
                        color: #666;
                        font-size: 14px;
                    }
                    iframe {
                        width: 100%;
                        height: 800px;
                        border: 1px solid #ddd;
                        border-radius: 4px;
                        margin-top: 10px;
                    }
                    img {
                        max-width: 100%;
                        height: auto;
                        border-radius: 4px;
                        margin-top: 10px;
                    }
                </style>
            </head>
            <body>
                <h1>Sales Forecast Model Comparison Report</h1>
                <p class="timestamp">Generated on: {timestamp}</p>
            """

            html_content = html_content.format(timestamp=datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

            sections = [
                ('metrics_comparison', 'Model Performance Metrics'),
                ('predictions_comparison', 'Predictions Comparison'),
                ('residuals_analysis', 'Residuals Analysis'),
                ('error_distribution', 'Error Distribution'),
                ('feature_importance', 'Feature Importance'),
                ('summary', 'Summary Statistics')
            ]

            for key, title in sections:
                if key in saved_files:
                    html_content += f'<div class="section"><h2>{title}</h2>'

                    try:
                        with open(saved_files[key], 'rb') as f:
                            img_data = base64.b64encode(f.read()).decode()
                        html_content += f'<img src="data:image/png;base64,{img_data}" alt="{title}">'
                    except Exception as e:
                        logger.warning(f"⚠️ Failed to embed image for section '{title}': {e}")

                    html_content += '</div>'

            html_content += """
            </body>
            </html>
            """

            report_path = os.path.join(save_dir, 'model_comparison_report.html')
            with open(report_path, 'w') as f:
                f.write(html_content)

            logger.info(f"✅ Combined HTML report created at: {report_path}")

        except Exception as e:
            logger.error(f"💥 Failed to create combined HTML report: {e}", exc_info=True)


    def _generate_shap_artifacts(self, X_test: pd.DataFrame, y_test: np.ndarray, 
                            test_df: pd.DataFrame) -> Dict[str, Any]:
        """Generate SHAP explanations and visualizations."""
        import tempfile
        import os
        from src.visualizations.shap_visualizer import ShapVisualizer
        
        artifacts = {'metrics': {}, 'params': {}, 'files': []}
        
        with tempfile.TemporaryDirectory() as temp_dir:
            for model_name in ['xgboost', 'lightgbm']:
                if not hasattr(self, 'shap_explainers') or model_name not in self.shap_explainers:
                    logger.warning(f"⚠️ No SHAP explainer found for {model_name}")
                    continue
                
                try:
                    explainer = self.shap_explainers[model_name]
                    visualizer = ShapVisualizer(explainer)
                    
                    # Generate explanations
                    test_sample = X_test.sample(n=min(100, len(X_test)), random_state=42)
                    explanation = explainer.explain_prediction(test_sample)
                    
                    # Global importance
                    importance_df = explainer.get_global_importance(explanation, top_k=15)
                    csv_path = os.path.join(temp_dir, f"{model_name}_feature_importance.csv")
                    importance_df.to_csv(csv_path, index=False)
                    artifacts['files'].append(csv_path)
                    
                    # Global importance visualization
                    html_path = os.path.join(temp_dir, f"{model_name}_global_importance.html")
                    visualizer.plot_global_importance(explanation, save_path=html_path)
                    artifacts['files'].append(html_path)
                    
                    # Summary plot
                    summary_path = os.path.join(temp_dir, f"{model_name}_summary.html")
                    visualizer.plot_summary_plot(explanation, save_path=summary_path)
                    artifacts['files'].append(summary_path)
                    
                    # Individual waterfall plots (first 3 samples)
                    for i in range(min(3, len(test_sample))):
                        waterfall_path = os.path.join(temp_dir, f"{model_name}_waterfall_{i}.html")
                        visualizer.plot_waterfall_explanation(explanation, sample_idx=i, save_path=waterfall_path)
                        artifacts['files'].append(waterfall_path)
                    
                    # Business explanations
                    business_exp = visualizer.create_business_explanation(
                        explanation, 
                        sample_idx=0,
                        store_id=str(test_sample.iloc[0].get('store_id', 'Unknown')),
                        item_id=str(test_sample.iloc[0].get('item_id', 'Unknown')),
                        pred_date=str(test_sample.iloc[0].get('date', 'Unknown'))
                    )
                    
                    # Log metrics
                    artifacts['metrics'][f'{model_name}_shap_top_feature'] = importance_df.iloc[0]['feature']
                    artifacts['metrics'][f'{model_name}_shap_top_importance'] = float(importance_df.iloc[0]['importance'])
                    
                    # Log params (top 5 features)
                    for i, row in importance_df.head(5).iterrows():
                        artifacts['params'][f'{model_name}_feature_{i+1}'] = f"{row['feature']} ({row['importance']:.4f})"
                    
                    logger.info(f"✅ Generated SHAP artifacts for {model_name}")
                    
                except Exception as e:
                    logger.error(f"❌ Failed to generate SHAP artifacts for {model_name}: {e}")
                    continue
        
        return artifacts


    def save_artifacts(self, version: str = None):
        """
        Save scalers, encoders, feature columns, and trained models.
        Also logs everything to MLflow for version tracking.
        """
        import os
        import joblib
        from datetime import datetime

        version = version or datetime.now().strftime("%Y%m%d_%H%M%S")
        base_dir = f'/tmp/artifacts/{version}'

        try:
            os.makedirs(base_dir, exist_ok=True)
            logger.info(f"📁 Created artifact directory: {base_dir}")

            # Save preprocessing objects
            joblib.dump(self.scalers, os.path.join(base_dir, 'scalers.pkl'))
            joblib.dump(self.encoders, os.path.join(base_dir, 'encoders.pkl'))
            joblib.dump(self.feature_cols, os.path.join(base_dir, 'feature_cols.pkl'))
            logger.info("💾 Saved scalers, encoders, and feature columns.")

            # Save model directories
            model_dirs = {
                'xgboost': os.path.join(base_dir, 'models/xgboost'),
                'lightgbm': os.path.join(base_dir, 'models/lightgbm'),
                'ensemble': os.path.join(base_dir, 'models/ensemble')
            }

            for mname, mdir in model_dirs.items():
                os.makedirs(mdir, exist_ok=True)
                if mname in self.models:
                    model = self.models[mname]
                    joblib.dump(model, os.path.join(mdir, f"{mname}_model.pkl"))
                    logger.info(f"🛠️ Saved model: {mname} -> {mdir}")

            # Save metadata for reproducibility
            metadata = {
                "version": version,
                "timestamp": datetime.now().isoformat(),
                "models_saved": list(self.models.keys()),
                "feature_count": len(self.feature_cols),
            }
            joblib.dump(metadata, os.path.join(base_dir, 'metadata.pkl'))
            logger.info("📜 Saved metadata.")

            # Log artifacts to MLflow
            if hasattr(self, "mlflow_manager") and self.mlflow_manager:
                self.mlflow_manager.log_artifacts(base_dir)
                logger.info("🚀 Artifacts logged to MLflow.")
            else:
                logger.warning("⚠️ mlflow_manager not found or None. Skipping MLflow logging.")

            logger.info(f"✅ Artifacts saved successfully in {base_dir}")

        except Exception as e:
            logger.error(f"❌ Failed to save artifacts: {e}", exc_info=True)


In [17]:
trainer = ModelTrainer()

[ 2025-10-01 16:21:05,420 ] src.utils.config_loader - [32mINFO[0m - [32mLoaded configuration from: C:\Users\Administrator\OneDrive\Desktop\SalesAI\backend\configs\ml_config.yaml[0m
[ 2025-10-01 16:21:05,482 ] src.utils.config_loader - [32mINFO[0m - [32mLoaded configuration from: C:\Users\Administrator\OneDrive\Desktop\SalesAI\backend\configs\ml_config.yaml[0m
[ 2025-10-01 16:21:05,489 ] src.utils.service_discovery - [32mINFO[0m - [32mUsing MLFLOW_TRACKING_URI from env: http://localhost:5001[0m
[ 2025-10-01 16:21:05,496 ] src.utils.mlflow_utils - [32mINFO[0m - [32mMlflow tracking URI discovered: http://localhost:5001[0m
[ 2025-10-01 16:21:07,331 ] src.utils.mlflow_utils - [32mINFO[0m - [32mSet MLflow experiment: sales_forecasting[0m
[ 2025-10-01 16:21:07,335 ] src.utils.service_discovery - [32mINFO[0m - [32mUsing MLFLOW_S3_ENDPOINT_URL from env: http://localhost:9000[0m
[ 2025-10-01 16:21:07,353 ] src.utils.mlflow_utils - [32mINFO[0m - [32mConfigured MinIO end

In [7]:
df = pd.read_parquet(r"C:\Users\Administrator\OneDrive\Desktop\SalesAI\backend\data\features\m5\m5_features.parquet")

In [8]:
df.shape

(2958084, 82)

In [9]:
n = len(df)
new_df = df.iloc[:n//9]

In [10]:
new_df.shape

(328676, 82)

In [11]:
train_df,value_df,test_df = trainer.prepare_data(
    df=new_df,
    target_col="sales",
    date_col="date",
    group_cols=['store_id'],
    categorical_cols=['store_id']
)

[ 2025-10-01 16:06:12,550 ] __main__ - [32mINFO[0m - [32m🛠 Preparing data for training[0m
[ 2025-10-01 16:06:12,686 ] src.features.feature_pipeline - [32mINFO[0m - [32m🚀 Initialized FeaturePipeline with target: sales, groups: ['store_id'], country: US[0m
[ 2025-10-01 16:06:12,693 ] src.features.feature_pipeline - [32mINFO[0m - [32m🚦 Starting Feature Pipeline...[0m


[ 2025-10-01 16:06:12,702 ] src.features.feature_pipeline - [32mINFO[0m - [32m📅 Adding date features[0m
[ 2025-10-01 16:06:15,553 ] src.features.feature_pipeline - [32mINFO[0m - [32m✅ Added date features: ['year', 'month', 'day', 'dayofweek', 'quarter', 'weekofyear', 'is_weekend', 'is_holiday'][0m
[ 2025-10-01 16:06:15,558 ] src.features.feature_pipeline - [32mINFO[0m - [32m📅 Date features added.[0m
[ 2025-10-01 16:06:15,562 ] src.features.feature_pipeline - [32mINFO[0m - [32m🕰 Adding lag features[0m
[ 2025-10-01 16:06:16,073 ] src.features.feature_pipeline - [32mINFO[0m - [32m✅ Created lag feature: sales_lag_1[0m
[ 2025-10-01 16:06:16,082 ] src.features.feature_pipeline - [32mINFO[0m - [32m✅ Created lag feature: sales_lag_2[0m
[ 2025-10-01 16:06:16,099 ] src.features.feature_pipeline - [32mINFO[0m - [32m✅ Created lag feature: sales_lag_3[0m
[ 2025-10-01 16:06:16,114 ] src.features.feature_pipeline - [32mINFO[0m - [32m✅ Created lag feature: sales_lag_7[0

  means = df.groupby(col)[target_col].mean()


[ 2025-10-01 16:06:22,176 ] src.features.feature_pipeline - [32mINFO[0m - [32m🏁 Target encoding completed for 1 columns.[0m
[ 2025-10-01 16:06:22,182 ] __main__ - [32mINFO[0m - [32m🎯 Applied target encoding to categorical columns.[0m
[ 2025-10-01 16:06:23,664 ] __main__ - [32mINFO[0m - [32m📊 Data split → Train: 230073, Val: 32867, Test: 65736[0m


In [12]:
train_df.isnull().sum()

id                         0
item_id                    0
dept_id                    0
cat_id                     0
store_id                   0
                          ..
month_sin                  0
month_cos                  0
dow_sin                    0
dow_cos                    0
store_id_target_encoded    0
Length: 111, dtype: int64

In [13]:
X_train, X_val, X_test, y_train, y_val, y_test = trainer.preprocess_features(
                train_df, value_df, test_df, target_col='sales'
            )

[ 2025-10-01 16:06:28,247 ] __main__ - [32mINFO[0m - [32m🔄 Starting feature preprocessing...[0m
[ 2025-10-01 16:06:29,344 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'id' in Train set.[0m
[ 2025-10-01 16:06:29,421 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'id' in Val set.[0m
[ 2025-10-01 16:06:29,531 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'id' in Test set.[0m
[ 2025-10-01 16:06:29,964 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'item_id' in Train set.[0m
[ 2025-10-01 16:06:30,017 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'item_id' in Val set.[0m
[ 2025-10-01 16:06:30,091 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'item_id' in Test set.[0m
[ 2025-10-01 16:06:30,439 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'dept_id' in Train set.[0m
[ 2025-10-01 16:06:30,488 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'dept_id' in Val set.[0m
[ 2025-10-01 16:06:30,553 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'dept_id' in Te

In [49]:
model_light = trainer.train_lightgbm(X_train,y_train,X_val,y_val,use_optuna=False)

[ 2025-09-29 23:56:54,026 ] __main__ - [32mINFO[0m - [32m⚙️ Starting LightGBM training...[0m
[ 2025-09-29 23:56:54,030 ] __main__ - [32mINFO[0m - [32m📦 Using static config parameters for LightGBM...[0m
[ 2025-09-29 23:56:54,037 ] __main__ - [32mINFO[0m - [32m✅ Loaded config params: {'num_leaves': 31, 'learning_rate': 0.05, 'n_estimators': 100, 'objective': 'regression', 'random_state': 42}[0m
[ 2025-09-29 23:56:54,040 ] __main__ - [32mINFO[0m - [32m🛠️ Training final LightGBM model...[0m
Training until validation scores don't improve for 50 rounds
[50]	valid_0's l2: 0.0580903
[100]	valid_0's l2: 0.0087255
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.0087255
[ 2025-09-29 23:57:00,590 ] __main__ - [32mINFO[0m - [32m✅ LightGBM model trained and stored.[0m


In [50]:
y_pred = model_light.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

In [51]:
rmse

0.09341039322822894

In [53]:
target_col = 'sales'
exclude_cols = 'date'
feature_cols = [col for col in train_df.columns ]


In [57]:
lgb_importance =({
                'feature': feature_cols,
                'importance': model_light.feature_importances_
            })
lgb_importance

{'feature': ['id',
  'item_id',
  'dept_id',
  'cat_id',
  'store_id',
  'state_id',
  'd',
  'sales',
  'date',
  'wm_yr_wk',
  'weekday',
  'wday',
  'month',
  'year',
  'event_name_1',
  'event_type_1',
  'event_name_2',
  'event_type_2',
  'snap_CA',
  'snap_TX',
  'snap_WI',
  'day',
  'quarter',
  'week_of_year',
  'is_weekend',
  'has_event',
  'snap_any',
  'sell_price',
  'revenue',
  'is_holiday',
  'snap_benefit_period',
  'price_lag_1',
  'price_change_1d',
  'price_increased_1d',
  'price_decreased_1d',
  'price_lag_7',
  'price_change_7d',
  'price_increased_7d',
  'price_decreased_7d',
  'price_lag_14',
  'price_change_14d',
  'price_increased_14d',
  'price_decreased_14d',
  'price_lag_28',
  'price_change_28d',
  'price_increased_28d',
  'price_decreased_28d',
  'price_volatility_7d',
  'price_volatility_28d',
  'event_sporting',
  'event_cultural',
  'event_national',
  'event_religious',
  'sales_lag_1',
  'sales_lag_2',
  'sales_lag_3',
  'sales_lag_7',
  'sales_la

In [15]:
model_xgb = trainer.train_xgboost(X_train, y_train, X_val, y_val, use_optuna=False)

[ 2025-09-30 01:41:38,381 ] __main__ - [32mINFO[0m - [32m🚀 Starting training for XGBoost model...[0m
[ 2025-09-30 01:41:38,422 ] __main__ - [32mINFO[0m - [32m⚙️ Using tree_method: `hist`[0m
[ 2025-09-30 01:41:38,424 ] __main__ - [32mINFO[0m - [32m📦 Using config-defined hyperparameters for XGBoost...[0m
[ 2025-09-30 01:41:38,428 ] __main__ - [32mINFO[0m - [32m✅ Loaded XGBoost params: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1, 'objective': 'reg:squarederror', 'random_state': 42}[0m
[ 2025-09-30 01:41:38,431 ] __main__ - [32mINFO[0m - [32m🛠️ Training final XGBoost model...[0m
[0]	validation_0-rmse:2.29048
[1]	validation_0-rmse:2.06796
[2]	validation_0-rmse:1.86772
[3]	validation_0-rmse:1.68659
[4]	validation_0-rmse:1.52195
[5]	validation_0-rmse:1.37407
[6]	validation_0-rmse:1.24226
[7]	validation_0-rmse:1.12233
[8]	validation_0-rmse:1.01449
[9]	validation_0-rmse:0.91729
[10]	validation_0-rmse:0.82957
[11]	validation_0-rmse:0.75128
[12]	validation_0-rms

In [60]:
xgp_pred = model_xgb.predict(X_test)
xgb_metric = trainer.calculate_metrics(y_test,xgp_pred)

[ 2025-09-30 00:02:23,872 ] __main__ - [32mINFO[0m - [32m📊 Calculating model evaluation metrics...[0m
[ 2025-09-30 00:02:23,951 ] __main__ - [32mINFO[0m - [32m✅ Metrics calculated: {'rmse': 0.1025, 'mae': 0.0249, 'mape': 1.9689, 'r2': 0.9987}[0m


In [61]:
xgb_metric

{'rmse': 0.1025, 'mae': 0.0249, 'mape': 1.9689, 'r2': 0.9987}

In [63]:
feature_importance = ({
                'feature': feature_cols,
                'importance': model_xgb.feature_importances_
            })

In [64]:
feature_importance

{'feature': ['id',
  'item_id',
  'dept_id',
  'cat_id',
  'store_id',
  'state_id',
  'd',
  'sales',
  'date',
  'wm_yr_wk',
  'weekday',
  'wday',
  'month',
  'year',
  'event_name_1',
  'event_type_1',
  'event_name_2',
  'event_type_2',
  'snap_CA',
  'snap_TX',
  'snap_WI',
  'day',
  'quarter',
  'week_of_year',
  'is_weekend',
  'has_event',
  'snap_any',
  'sell_price',
  'revenue',
  'is_holiday',
  'snap_benefit_period',
  'price_lag_1',
  'price_change_1d',
  'price_increased_1d',
  'price_decreased_1d',
  'price_lag_7',
  'price_change_7d',
  'price_increased_7d',
  'price_decreased_7d',
  'price_lag_14',
  'price_change_14d',
  'price_increased_14d',
  'price_decreased_14d',
  'price_lag_28',
  'price_change_28d',
  'price_increased_28d',
  'price_decreased_28d',
  'price_volatility_7d',
  'price_volatility_28d',
  'event_sporting',
  'event_cultural',
  'event_national',
  'event_religious',
  'sales_lag_1',
  'sales_lag_2',
  'sales_lag_3',
  'sales_lag_7',
  'sales_la

In [69]:
model_prophet = trainer.train_prophet(train_df,value_df)

[ 2025-09-30 00:07:34,982 ] __main__ - [32mINFO[0m - [32mTraining Prophet model[0m


00:08:18 - cmdstanpy - INFO - Chain [1] start processing


[ 2025-09-30 00:08:18,272 ] cmdstanpy - [32mINFO[0m - [32mChain [1] start processing[0m


00:13:18 - cmdstanpy - INFO - Chain [1] done processing


[ 2025-09-30 00:13:18,668 ] cmdstanpy - [32mINFO[0m - [32mChain [1] done processing[0m
[ 2025-09-30 00:13:23,636 ] __main__ - [32mINFO[0m - [32mProphet Val RMSE: 2.5317, MAE: 1.1956, R2: 0.0011[0m


In [18]:
results = trainer.train_all_models(train_df,value_df,test_df,target_col='sales',use_optuna=False)

[ 2025-10-01 16:21:19,920 ] __main__ - [32mINFO[0m - [32m🚀 Starting full model training pipeline...[0m
[ 2025-10-01 16:21:20,931 ] src.utils.mlflow_utils - [32mINFO[0m - [32mStarted MLflow run: 9f1e2b0dfcb44bcc92772b5fb47fd35d[0m
[ 2025-10-01 16:21:20,938 ] __main__ - [32mINFO[0m - [32m🎯 MLflow run started with run_id=9f1e2b0dfcb44bcc92772b5fb47fd35d[0m
[ 2025-10-01 16:21:20,940 ] __main__ - [32mINFO[0m - [32m🧹 Preprocessing features and target variables...[0m
[ 2025-10-01 16:21:20,943 ] __main__ - [32mINFO[0m - [32m🔄 Starting feature preprocessing...[0m
[ 2025-10-01 16:21:22,894 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'id' in Train set.[0m
[ 2025-10-01 16:21:22,984 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'id' in Val set.[0m
[ 2025-10-01 16:21:23,132 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'id' in Test set.[0m
[ 2025-10-01 16:21:23,810 ] __main__ - [32mINFO[0m - [32m✅ Label encoded 'item_id' in Train set.[0m
[ 2025-10-01 16:21:23

  self.get_booster().save_model(fname)


[ 2025-10-01 16:22:03,379 ] src.utils.mlflow_utils - [32mINFO[0m - [32mSuccessfully logged XGBoost model 'xgboost'[0m
[ 2025-10-01 16:22:03,432 ] __main__ - [32mINFO[0m - [32m🌟 Top XGBoost features:
                   feature  importance
73          sales_velocity    0.385631
78         zero_sales_flag    0.233212
84        sales_roll_3_std    0.130486
75   sales_ratio_to_7d_avg    0.068551
76  sales_ratio_to_28d_avg    0.056559
86        sales_roll_3_max    0.055788
69             sales_ewm_7    0.052685
65      sales_roll_28_mean    0.004663
26                 revenue    0.004244
70            sales_ewm_14    0.003047
51             sales_lag_1    0.000935
33             price_lag_7    0.000438
83       sales_roll_3_mean    0.000403
37            price_lag_14    0.000346
68       sales_roll_56_std    0.000294
61       sales_roll_7_mean    0.000291
71            sales_ewm_28    0.000173
25              sell_price    0.000159
52             sales_lag_2    0.000141
72            



[ 2025-10-01 16:22:28,907 ] src.utils.mlflow_utils - [32mINFO[0m - [32mSuccessfully logged LightGBM model 'lightgbm'[0m
[ 2025-10-01 16:22:28,930 ] __main__ - [32mINFO[0m - [32m🌟 Top LightGBM features:
                   feature  importance
86        sales_roll_3_max         706
69             sales_ewm_7         335
76  sales_ratio_to_28d_avg         262
75   sales_ratio_to_7d_avg         254
65      sales_roll_28_mean         244
73          sales_velocity         180
83       sales_roll_3_mean         162
51             sales_lag_1         148
74      sales_acceleration         143
26                 revenue         116
84        sales_roll_3_std         108
52             sales_lag_2          74
61       sales_roll_7_mean          44
87     sales_roll_3_median          38
63      sales_roll_14_mean          19
66       sales_roll_28_std          17
94      sales_roll_21_mean          13
97       sales_roll_21_max          12
41            price_lag_28          11
25         


invalid value encountered in divide


invalid value encountered in divide



[ 2025-10-01 16:23:09,298 ] src.models.digonistics - [32mINFO[0m - [32m💡 Many zero sales (160711 in training). Consider log-transform or zero-inflated models.[0m



invalid value encountered in divide


invalid value encountered in divide



[ 2025-10-01 16:23:09,309 ] src.models.digonistics - [32mINFO[0m - [32m📂 Diagnosis JSON saved to diagnostics\diagnosis.json[0m
[ 2025-10-01 16:23:09,315 ] src.models.digonistics - [32mINFO[0m - [32m📄 Markdown diagnostic report saved to diagnostics\diagnosis_report.md[0m
[ 2025-10-01 16:23:09,317 ] __main__ - [32mINFO[0m - [32m📋 Diagnostic recommendations:[0m
[ 2025-10-01 16:23:09,332 ] __main__ - [31mERROR[0m - [31m❌ Visualization generation failed: 'ModelTrainer' object has no attribute '_generate_and_log_visualizations'[0m
Traceback (most recent call last):
  File "C:\Users\Administrator\AppData\Local\Temp\ipykernel_21988\1788269474.py", line 814, in train_all_models
    self._generate_and_log_visualizations(results, test_df, target_col)
AttributeError: 'ModelTrainer' object has no attribute '_generate_and_log_visualizations'
[ 2025-10-01 16:23:09,344 ] __main__ - [32mINFO[0m - [32m💾 Saving artifacts...[0m
[ 2025-10-01 16:23:09,356 ] __main__ - [32mINFO[0m - [3

Downloading artifacts:   0%|          | 0/20 [00:00<?, ?it/s]2025/10/01 16:23:18 INFO mlflow.store.artifact.artifact_repo: The progress bar can be disabled by setting the environment variable MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR to false
Downloading artifacts: 100%|██████████| 20/20 [00:00<00:00, 106.91it/s]


[ 2025-10-01 16:23:23,479 ] src.utils.mlflow_s3_utils - [32mINFO[0m - [32mSuccessfully synced 20 artifact(s) for run 9f1e2b0dfcb44bcc92772b5fb47fd35d to S3[0m
[ 2025-10-01 16:23:23,481 ] __main__ - [32mINFO[0m - [32m✓ Successfully synced artifacts to S3[0m
[ 2025-10-01 16:23:23,486 ] __main__ - [32mINFO[0m - [32m🔍 Verifying S3 artifact storage...[0m
[ 2025-10-01 16:23:23,729 ] src.utils.s3_verification - [31mERROR[0m - [31m✗ S3 artifact verification FAILED[0m
[ 2025-10-01 16:23:23,729 ] src.utils.s3_verification - [31mERROR[0m - [31m  - Artifact URI: mlflow-artifacts:/111669681926201532/9f1e2b0dfcb44bcc92772b5fb47fd35d/artifacts[0m
[ 2025-10-01 16:23:23,737 ] src.utils.s3_verification - [31mERROR[0m - [31m  - Error: Artifact URI is not S3: mlflow-artifacts:/111669681926201532/9f1e2b0dfcb44bcc92772b5fb47fd35d/artifacts[0m
