In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import optuna
import warnings
warnings.filterwarnings('ignore')

# Indicator columns that each get their own forecast.
# The list order is the order in which indicators are processed and reported.
financial_indicators = [
    '1_year_rate',
    '3_months_rate',
    '6_months_rate',
    'CPI',
    'INDPRO',
    '10_year_rate',
    'share_price',
    'unemployment_rate',
    'PPI',
    'OECD_CLI_index',
    'CSI_index',
    'gdp_per_capita',
]

# Recession-probability label columns; these are stripped from the feature set.
recession_targets = [
    'recession_probability',
    '1_month_recession_probability',
    '3_month_recession_probability',
    '6_month_recession_probability',
]

def prepare_existing_features(train_df, test_df, target_indicator):
    """Select the feature columns shared by the train and test frames.

    Every column of ``train_df`` is a candidate feature except ``'date'``,
    the columns named in the module-level ``recession_targets`` list and
    ``target_indicator`` itself. Candidates that are absent from ``test_df``
    are reported with a warning and dropped.

    Returns:
        tuple: ``(train_features, test_features, common_features)`` where the
        two frames are independent copies restricted to ``common_features``
        (a list of column names in ``train_df`` column order).
    """
    print(f"\nPreparing existing features for {target_indicator}...")

    # Columns that must never be used as model inputs
    excluded = ['date', *recession_targets, target_indicator]

    candidates = []
    for column in train_df.columns:
        if column not in excluded:
            candidates.append(column)

    print(f"Available existing features: {len(candidates)}")
    if len(candidates) > 10:
        print(f"Features: {candidates[:10]}...")
    else:
        print(f"Features: {candidates}")

    # Only keep features that the test frame can also supply
    shared = [name for name in candidates if name in test_df.columns]
    absent = set(candidates) - set(shared)
    if absent:
        print(f"Warning: {len(absent)} features missing in test data: {absent}")

    return train_df[shared].copy(), test_df[shared].copy(), shared

def clean_existing_features(train_features, test_features, feature_cols):
    """Impute missing/infinite values and drop constant feature columns.

    For every column in ``feature_cols`` (when present in the respective
    frame):

    1. ``+/-inf`` is converted to NaN.
    2. Gaps are forward-filled, then back-filled.
       NOTE: the back-fill uses later observations to fill leading gaps.
    3. Anything still missing is filled with the *training* column median
       (``0`` for a test column that has no training counterpart), and
       finally with ``0`` if the median itself is NaN.

    Columns whose cleaned training values have a single unique value are
    then removed from both frames.

    The input frames are not modified; cleaning is done on copies.

    Args:
        train_features: Training feature DataFrame.
        test_features: Test feature DataFrame.
        feature_cols: Iterable of column names to clean and consider.

    Returns:
        tuple: ``(train_clean, test_clean, varying_cols)`` where
        ``varying_cols`` lists the retained column names in
        ``feature_cols`` order.
    """
    print("Cleaning existing feature data...")
    print(f"  Initial shapes - Train: {train_features.shape}, Test: {test_features.shape}")

    # Work on copies so the caller's frames are left untouched.
    train_features = train_features.copy()
    test_features = test_features.copy()

    for col in feature_cols:
        if col in train_features.columns:
            # Series.ffill()/bfill() replace the deprecated fillna(method=...).
            train_col = train_features[col].replace([np.inf, -np.inf], np.nan).ffill().bfill()
            if train_col.isna().any():
                train_col = train_col.fillna(train_col.median())
            if train_col.isna().any():
                # Median was NaN too (column entirely missing)
                train_col = train_col.fillna(0)
            train_features[col] = train_col

        if col in test_features.columns:
            test_col = test_features[col].replace([np.inf, -np.inf], np.nan).ffill().bfill()
            if test_col.isna().any():
                # Use the (already cleaned) training median for consistency
                fill_value = train_features[col].median() if col in train_features.columns else 0
                test_col = test_col.fillna(fill_value)
            if test_col.isna().any():
                test_col = test_col.fillna(0)
            test_features[col] = test_col

    # A feature that never changes in training carries no signal
    varying_cols = [
        col for col in feature_cols
        if col in train_features.columns and train_features[col].nunique() > 1
    ]

    train_features_clean = train_features[varying_cols]
    test_features_clean = test_features[[c for c in varying_cols if c in test_features.columns]]

    print(f"  Final shapes - Train: {train_features_clean.shape}, Test: {test_features_clean.shape}")
    print(f"  Features with variation: {len(varying_cols)}")
    print(f"  Remaining missing values - Train: {train_features_clean.isnull().sum().sum()}, Test: {test_features_clean.isnull().sum().sum()}")

    return train_features_clean, test_features_clean, varying_cols

def make_xgb_objective(X_train, y_train, X_val, y_val, random_state=42, early_stopping_rounds=100):
    """Build an Optuna objective that scores XGBoost settings by validation RMSE.

    Args:
        X_train, y_train: Data the candidate model is fitted on.
        X_val, y_val: Held-out data used both for early stopping and for the
            RMSE that the objective returns.
        random_state: Seed forwarded to ``XGBRegressor``.
        early_stopping_rounds: Early-stopping patience forwarded to
            ``XGBRegressor``.

    Returns:
        callable: ``objective(trial) -> float`` suitable for
        ``optuna.Study.optimize`` with ``direction="minimize"``.
    """
    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 2000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 2, 8),
            "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 10.0),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.5, 5.0),
            "random_state": random_state,
            "objective": "reg:squarederror",
            "tree_method": "hist",
            # Early stopping is an estimator setting: passing it to fit() was
            # deprecated in XGBoost 1.6 and is rejected by XGBoost 2.x.
            "early_stopping_rounds": early_stopping_rounds,
        }
        model = XGBRegressor(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )
        preds = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        return rmse
    return objective

def time_val_split(X, y, val_ratio=0.2):
    """Chronological hold-out split: the trailing ``val_ratio`` share is validation.

    No shuffling is performed. Returns ``(X_head, X_tail, y_head, y_tail)``.
    """
    train_share = 1 - val_ratio
    boundary = int(np.floor(len(X) * train_share))
    head = slice(None, boundary)
    tail = slice(boundary, None)
    return X[head], X[tail], y[head], y[tail]

def forecast_indicator_with_xgb(train_df, test_df, indicator, n_trials=50):
    """Tune, fit and evaluate an XGBoost regressor for one indicator.

    The other columns of the frames (see ``prepare_existing_features``) are
    used as features. Hyperparameters are searched with Optuna on a
    chronological train/validation split of the training rows; the best
    setting is then refit on that same split (the validation tail drives
    early stopping) and scored on the test rows.

    Side effects: prints progress and writes the fitted model to
    ``xgb_models/{indicator}_xgb_model.pkl`` (directory created if needed).

    Args:
        train_df: Training DataFrame; must contain ``indicator`` and ``'date'``.
        test_df: Test DataFrame with the same columns.
        indicator: Name of the target column.
        n_trials: Number of Optuna trials.

    Returns:
        dict | None: ``None`` on any failure (missing column, fewer than 10
        usable training rows, no usable features, or an exception during
        feature preparation / training). Otherwise a dict with keys
        ``model``, ``best_params``, ``best_val_rmse``, ``forecast``,
        ``actual``, ``mae``, ``rmse``, ``mape``, ``feature_cols``,
        ``train_length``, ``test_length``, ``train_dates``, ``test_dates``.
    """
    print(f"\n{'='*60}\nFORECASTING: {indicator}\n{'='*60}")

    if indicator not in train_df.columns or indicator not in test_df.columns:
        print(f"ERROR: {indicator} not in datasets")
        return None

    # Get target series (rows with a missing target are dropped)
    train_series = train_df[indicator].dropna()
    test_series = test_df[indicator].dropna()
    print(f"Series lengths - Train: {len(train_series)}, Test: {len(test_series)}")

    if len(train_series) < 10:
        print(f"ERROR: Insufficient training data for {indicator}")
        return None

    # Prepare existing features
    try:
        train_features, test_features, available_features = prepare_existing_features(train_df, test_df, indicator)
        train_features_clean, test_features_clean, feature_cols = clean_existing_features(train_features, test_features, available_features)

        if len(feature_cols) == 0:
            print(f"WARNING: No features available for {indicator}")
            return None

    except Exception as e:
        print(f"ERROR preparing features: {e}")
        return None

    # Align features with target series (handle any index mismatches)
    train_features_aligned = train_features_clean.loc[train_series.index]
    test_features_aligned = test_features_clean.loc[test_series.index]

    # Remove any remaining NaN rows
    train_mask = ~(train_series.isna() | train_features_aligned.isna().any(axis=1))
    test_mask = ~(test_series.isna() | test_features_aligned.isna().any(axis=1))

    X_train_full = train_features_aligned.loc[train_mask].values
    y_train_full = train_series.loc[train_mask].values
    X_test = test_features_aligned.loc[test_mask].values
    y_test = test_series.loc[test_mask].values

    if len(X_train_full) < 10:
        print(f"ERROR: Insufficient clean training data for {indicator}: {len(X_train_full)} rows")
        return None

    print(f"Clean data shapes - Train: {X_train_full.shape}, Test: {X_test.shape}")
    print(f"Using {len(feature_cols)} features")

    # Chronological split: the tail is used for tuning and early stopping
    X_train, X_val, y_train, y_val = time_val_split(X_train_full, y_train_full)

    # Hyperparameter tuning with Optuna
    print(f"\nRunning XGBoost hyperparameter optimization...")
    print(f"Training shape: {X_train.shape}, Validation shape: {X_val.shape}")

    try:
        study = optuna.create_study(direction="minimize")
        study.optimize(
            make_xgb_objective(X_train, y_train, X_val, y_val, random_state=42),
            n_trials=n_trials,
            show_progress_bar=False
        )

        best_params = study.best_params
        print(f"Best parameters: {best_params}")
        print(f"Best validation RMSE: {study.best_value:.4f}")

        # Refit the best configuration on the same train/validation split;
        # the validation tail is held out so early stopping has a signal.
        final_model = XGBRegressor(
            **{k: v for k, v in best_params.items() if k != "random_state"},
            random_state=42,
            objective="reg:squarederror",
            tree_method="hist",
            # Early stopping is an estimator setting: passing it to fit() was
            # deprecated in XGBoost 1.6 and is rejected by XGBoost 2.x.
            early_stopping_rounds=100,
        )

        final_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        # Make predictions on test data
        forecast = final_model.predict(X_test)

        # Calculate metrics
        mae = mean_absolute_error(y_test, forecast)
        rmse = np.sqrt(mean_squared_error(y_test, forecast))

        # MAPE is undefined where the actual value is zero, so skip those rows
        mask = y_test != 0
        mape = np.mean(np.abs((y_test[mask] - forecast[mask]) / y_test[mask])) * 100 if mask.any() else np.inf

        print(f"\nAccuracy Metrics:\n  MAE: {mae:.4f}\n  RMSE: {rmse:.4f}\n  MAPE: {mape:.2f}%")

        # Save model (pickle files should only ever be loaded from trusted sources)
        os.makedirs("xgb_models", exist_ok=True)
        with open(f"xgb_models/{indicator}_xgb_model.pkl", "wb") as f:
            pickle.dump(final_model, f)
        print(f"Model saved to xgb_models/{indicator}_xgb_model.pkl")

        return {
            'model': final_model,
            'best_params': best_params,
            'best_val_rmse': study.best_value,
            'forecast': forecast,
            'actual': y_test,
            'mae': mae,
            'rmse': rmse,
            'mape': mape,
            'feature_cols': feature_cols,
            'train_length': len(y_train_full),
            'test_length': len(y_test),
            'train_dates': train_df.loc[train_series.loc[train_mask].index, 'date'].values,
            'test_dates': test_df.loc[test_series.loc[test_mask].index, 'date'].values
        }

    except Exception as e:
        # Best-effort pipeline: report the failure and let the caller move on
        print(f"ERROR training XGBoost for {indicator}: {e}")
        return None

def plot_xgb_forecast_results(results, indicator, train_df, test_df):
    """Draw recent training history plus actual vs. forecast test values.

    Does nothing when ``results`` is ``None``. Otherwise plots the last 50
    non-missing training observations of ``indicator`` followed by the
    ``'actual'`` and ``'forecast'`` arrays from ``results`` against
    ``results['test_dates']``, with the error metrics in the title.
    ``test_df`` is accepted but not read by this function.
    """
    if results is None:
        return

    plt.figure(figsize=(15, 8))

    # Recent training history only, so the test window stays readable
    history = train_df[indicator].dropna().tail(50)
    history_dates = train_df.loc[history.index, 'date']
    plt.plot(history_dates, history.values, label='Training', color='blue', alpha=0.7)

    # Held-out actuals against the model's predictions
    test_dates = results['test_dates']
    line_style = dict(linewidth=2, markersize=4)
    plt.plot(test_dates, results['actual'], label='Actual', color='green', marker='o', **line_style)
    plt.plot(test_dates, results['forecast'], label='Forecast', color='red', marker='s', **line_style)

    # Title carries the headline metrics
    headline = f'{indicator} - XGBoost Forecast\n'
    metrics_line = f'MAE: {results["mae"]:.4f}, RMSE: {results["rmse"]:.4f}, MAPE: {results["mape"]:.2f}%'
    plt.title(headline + metrics_line, fontsize=14)

    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.xticks(rotation=45)

    # Dashed marker where the test period begins
    if len(test_dates) > 0:
        plt.axvline(x=test_dates[0], color='gray', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()

def run_xgb_forecasting_existing_features(train_df, test_df, financial_indicators=None, n_trials=40, plot_results=True):
    """Run the XGBoost forecasting pipeline for a list of indicators.

    For every requested indicator present in both frames,
    ``forecast_indicator_with_xgb`` is called; successful results are
    collected, optionally plotted, and their forecasts/actuals written into
    a per-test-row table.

    Args:
        train_df: Training DataFrame with a ``'date'`` column.
        test_df: Test DataFrame with a ``'date'`` column.
        financial_indicators: Indicator column names; defaults to the
            built-in twelve-indicator list when ``None``.
        n_trials: Optuna trials per indicator.
        plot_results: Whether to plot each successful forecast.

    Returns:
        tuple: ``(all_results, forecasted)``. ``all_results`` maps indicator
        name to its result dict. ``forecasted`` has one row per test row
        with ``'date'`` plus ``'{ind}_forecast'`` / ``'{ind}_actual'``
        columns; rows the model did not score stay NaN.
    """
    print("=" * 80)
    print("XGBoost TIME SERIES FORECASTING - EXISTING FEATURES ONLY")
    print("=" * 80)

    if financial_indicators is None:
        financial_indicators = [
            '1_year_rate', '3_months_rate', '6_months_rate', 'CPI', 'INDPRO',
            '10_year_rate', 'share_price', 'unemployment_rate', 'PPI',
            'OECD_CLI_index', 'CSI_index', 'gdp_per_capita'
        ]

    # Work on copies so the caller's frames keep their original 'date' dtype
    train_work = train_df.copy()
    test_work = test_df.copy()
    train_work['date'] = pd.to_datetime(train_work['date'])
    test_work['date'] = pd.to_datetime(test_work['date'])

    # Check available indicators
    available = [i for i in financial_indicators if i in train_work.columns and i in test_work.columns]
    print(f"Indicators to forecast: {available}")
    print(f"Excluding recession features: {recession_targets}")

    all_results = {}
    forecasted = pd.DataFrame({'date': test_work['date'].copy()})

    for ind in available:
        res = forecast_indicator_with_xgb(
            train_work, test_work, ind,
            n_trials=n_trials
        )

        if not res:
            continue

        all_results[ind] = res

        forecast_col = f'{ind}_forecast'
        actual_col = f'{ind}_actual'
        forecasted[forecast_col] = np.nan
        forecasted[actual_col] = np.nan

        # Locate the test rows that were actually scored. The previous
        # merge + dropna() approach dropped every row (the freshly created
        # NaN columns were part of the merge), so nothing was ever stored.
        # Both arrays come from the same 'date' column, and scored rows keep
        # test-row order, so a boolean membership mask lines them up.
        scored_rows = np.isin(forecasted['date'].values, res['test_dates'])

        if scored_rows.sum() == len(res['forecast']):
            forecasted.loc[scored_rows, forecast_col] = res['forecast']
            forecasted.loc[scored_rows, actual_col] = res['actual']
        else:
            # e.g. duplicate dates where only some of the rows were scored
            print(f"WARNING: could not align {ind} forecasts with test dates; columns left empty")

        if plot_results:
            plot_xgb_forecast_results(res, ind, train_work, test_work)

    print(f"\n{'='*60}")
    print("FORECASTING COMPLETE")
    print(f"{'='*60}")
    print(f"Successfully forecasted {len(all_results)} indicators")

    # Print summary metrics
    if all_results:
        print(f"\nSUMMARY METRICS:")
        for ind, res in all_results.items():
            print(f"{ind:20s} - MAE: {res['mae']:.4f}, RMSE: {res['rmse']:.4f}, MAPE: {res['mape']:.2f}%")

    return all_results, forecasted



In [None]:
# Example usage:
# Load your data with train/test split.
# Placeholder paths: the training path was previously an empty string, which
# pd.read_csv cannot open. Point both at your own CSV files.
TRAIN_DATA_PATH = 'train_data.csv'
TEST_DATA_PATH = 'test_data.csv'

train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)

results, forecasts = run_xgb_forecasting_existing_features(
    train_df, test_df,
    financial_indicators=financial_indicators,
    n_trials=40,
    plot_results=True
)