In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import pandas_datareader as pdr
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb

# --- Configuration ---
holding_period = 20
transaction_cost = 0.000025
total_cost = 2 * transaction_cost

# --- Data Loading ---
# Load and concatenate E-mini S&P 500 futures data
df_es = pd.DataFrame()
for i in range(1, 12):
    df_temp = pd.read_csv(f'ES_part_{i}.csv')
    df_es = pd.concat([df_es, df_temp], ignore_index=True)

# Load economic surprise data
df_surprise = pd.read_csv('US_economic_releases_events.csv')

# --- Data Cleaning and Preprocessing ---
df_surprise.drop(columns=['S', 'Month', 'Surv(A)', 'Surv(H)', 'Surv(L)'], inplace=True)
df_surprise.drop(columns=['Flag', 'Country/Region', 'Day', 'C', 'Category','Subcategory', 'Std Dev', 'Period', 'Actual'], inplace=True, errors='ignore')


df_surprise.replace("--", pd.NA, inplace=True)
df_surprise.dropna(subset=['Surprise'], inplace=True)
df_surprise = df_surprise[df_surprise['Surprise'] != 0]

df_surprise['Surprise'] = pd.to_numeric(df_surprise['Surprise'], errors='coerce')
df_surprise.dropna(subset=['Surprise'], inplace=True)
df_surprise = df_surprise[df_surprise['Surprise'] != 0]

df_surprise.dropna(subset=['Time'], inplace=True)

# Winsorizing the 'Surprise' column
lower_bound = df_surprise['Surprise'].quantile(0.005)
upper_bound = df_surprise['Surprise'].quantile(0.995)
df_surprise = df_surprise[(df_surprise['Surprise'] >= lower_bound) & (df_surprise['Surprise'] <= upper_bound)]

# --- DateTime Conversion ---
df_surprise.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)
df_surprise.reset_index(drop=True, inplace=True)
# FIX: Use str.cat() for robust string concatenation to avoid alignment errors
df_surprise['DateTime'] = pd.to_datetime(
    df_surprise['Date'].astype(str).str.cat(df_surprise['Time'].astype(str), sep=' '),
    errors='coerce'
)

# FIX: Use str.cat() for robust string concatenation here as well
df_es['DateTime'] = pd.to_datetime(
    df_es['Date'].astype(str).str.cat(df_es['Time'].astype(str), sep=' '),
    format='%m/%d/%Y %H:%M',
    errors='coerce'
)
df_es['Time'] = df_es['DateTime'].dt.strftime('%H:%M:%S')


# --- Merging DataFrames ---
df_combined = pd.merge(df_es, df_surprise, on='DateTime', how='outer', suffixes=('_es', '_surprise'))
df_combined.dropna(subset=['Open'], inplace=True)
df_combined = df_combined.sort_values(by='DateTime').reset_index(drop=True)

# --- Feature Engineering ---
df_combined['Surprise Occurred'] = df_combined['Surprise'].notna()
df_combined['First Post Surprise'] = df_combined['Surprise Occurred'].shift(1).fillna(False)

# --- Return and Profit Calculation ---
holding_period_half = int(holding_period * 0.5)
holding_period_double = int(holding_period * 2)

df_combined['Return'] = (df_combined['Open'].shift(-holding_period) - df_combined['Open']) / df_combined['Open']
df_combined['Return_half'] = (df_combined['Open'].shift(-holding_period_half) - df_combined['Open']) / df_combined['Open']
df_combined['Return_double'] = (df_combined['Open'].shift(-holding_period_double) - df_combined['Open']) / df_combined['Open']

for horizon in ['', '_half', '_double']:
    df_combined[f'Profit{horizon}'] = np.nan
    profitable_long = df_combined[f'Return{horizon}'] > total_cost
    profitable_short = -df_combined[f'Return{horizon}'] > total_cost
    df_combined.loc[profitable_long, f'Profit{horizon}'] = 1
    df_combined.loc[profitable_short, f'Profit{horizon}'] = 2
    df_combined[f'Profit{horizon}'].fillna(0, inplace=True)

# --- Technical Features ---
def create_technical_features(df):
    df['Price'] = df['Open']
    # Calculate the 'R' (Range) feature
    df['R'] = df['High'] - df['Low']
    ma_windows = [5, 10, 15, 20, 50, 100, 200]
    for window in ma_windows:
        df[f'SMA{window}'] = df['Open'].rolling(window).mean()
        df[f'SMA{window}Cross'] = (df['Open'] > df[f'SMA{window}']).astype(int)

    trend_periods = [10, 15, 50]
    for period in trend_periods:
        df[f'UpDown{period}'] = np.sign(df['Open'].pct_change(periods=period, fill_method='pad'))
    return df

df_combined = create_technical_features(df_combined)
df_combined['Volume_L1'] = df_combined['Volume'].shift(1)

# --- Prepare Data for ML ---
df_intm = df_combined[df_combined['First Post Surprise']].copy()
df_intm['Surprise'] = df_intm['Surprise'].ffill()
df_intm = df_intm.dropna(subset=['Return', 'Return_half', 'Return_double'])

ticker_dummies = pd.get_dummies(df_intm['Ticker'], prefix='Ticker')
df_ml = pd.concat([df_intm, ticker_dummies], axis=1)

ticker_cols = [col for col in df_ml.columns if 'Ticker_' in col]
feature_cols = ['Surprise', 'Volume_L1', 'R', 'SMA5', 'SMA10', 'SMA20', 'SMA200',
                'SMA5Cross', 'SMA10Cross','SMA15Cross', 'SMA20Cross',
                'SMA50Cross', 'SMA100Cross', 'SMA200Cross', 'UpDown10',
                'UpDown15', 'UpDown50'] + ticker_cols

# --- Macroeconomic Data ---
start = datetime.datetime(1997, 1, 1)
end = datetime.datetime.today()

# Get GDP Growth Data from FRED
gdp_gr = pdr.DataReader('A191RL1Q225SBEA', 'fred', start, end)
gdp_gr_ml = gdp_gr.reset_index()
gdp_gr_ml.rename(columns={'A191RL1Q225SBEA': 'gdp_gr'}, inplace=True)
gdp_gr_ml['DateTime'] = pd.to_datetime(gdp_gr_ml['DATE']) + pd.Timedelta(hours=23, minutes=59)
gdp_gr_ml = gdp_gr_ml[['DateTime', 'gdp_gr']]

# Get VIX Data from FRED
vix = pdr.DataReader('VIXCLS', 'fred', start, end)
vix_ml = vix.reset_index()
vix_ml.rename(columns={'VIXCLS': 'VIX'}, inplace=True)
vix_ml['DateTime'] = pd.to_datetime(vix_ml['DATE']) + pd.Timedelta(hours=23, minutes=59)
vix_ml = vix_ml[['DateTime', 'VIX']]


df_ml = pd.merge_asof(df_ml.sort_values('DateTime'), gdp_gr_ml.sort_values('DateTime'), on='DateTime', direction='backward')
df_ml = pd.merge_asof(df_ml.sort_values('DateTime'), vix_ml.sort_values('DateTime'), on='DateTime', direction='backward')
df_ml.rename(columns={'gdp_gr': 'last_gdp_gr', 'VIX': 'last_vix'}, inplace=True)
feature_cols.extend(['last_gdp_gr', 'last_vix'])

df_ml = df_ml.dropna(subset=feature_cols).reset_index(drop=True)

# --- XGBoost Model and Backtesting ---
min_date = df_ml['DateTime'].min()
max_date = df_ml['DateTime'].max()
train_end_date = min_date + (max_date - min_date) * 0.3

results = []
all_test_returns, all_dates, all_test_preds, all_test_truths, all_test_probas, fold_mean_returns, best_horizons_list = [], [], [], [], [], [], []

def calculate_grouped_returns(df_group, horizon):
    active_trades = df_group[df_group['pred'] != 0]
    if active_trades.empty:
        return 0.0
    longs = active_trades[active_trades['pred'] == 1]
    shorts = active_trades[active_trades['pred'] == 2]
    total_confidence = longs['proba'].sum() + shorts['proba'].sum()
    if total_confidence == 0:
        return 0.0
    num_trades = len(longs) + len(shorts)
    total_costs = total_cost * num_trades
    long_returns = (longs[f'Return{horizon}'] * (longs['proba'] / total_confidence)).sum()
    short_returns = (-shorts[f'Return{horizon}'] * (shorts['proba'] / total_confidence)).sum()
    return (long_returns + short_returns) - total_costs

while True:
    val_start_date = train_end_date + pd.DateOffset(days=1)
    val_end_date = val_start_date + pd.DateOffset(years=1)
    test_start_date = val_end_date + pd.DateOffset(days=1)
    test_end_date = test_start_date + pd.DateOffset(years=1)

    if val_end_date > max_date or test_end_date > max_date:
        break

    train_data = df_ml[df_ml['DateTime'] <= train_end_date]
    val_data = df_ml[(df_ml['DateTime'] >= val_start_date) & (df_ml['DateTime'] <= val_end_date)]
    test_data = df_ml[(df_ml['DateTime'] >= test_start_date) & (df_ml['DateTime'] <= test_end_date)]

    if val_data.empty or test_data.empty:
        train_end_date = val_end_date
        continue

    horizon_performance = {}
    models = {}
    for horizon in ['', '_half', '_double']:
        X_train = train_data[feature_cols]
        y_train = train_data[f'Profit{horizon}']
        X_val = val_data[feature_cols]
        y_val = val_data[f'Profit{horizon}']
        
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)

        model = xgb.XGBClassifier(num_class=3, n_estimators=60, max_depth=7, random_state=42, seed=42, use_label_encoder=False, eval_metric='mlogloss')
        model.fit(X_train_scaled, y_train, eval_set=[(X_val_scaled, y_val)], verbose=False)
        
        val_preds = model.predict(X_val_scaled)
        val_probas = model.predict_proba(X_val_scaled)

        val_trade_df = pd.DataFrame({
            'DateTime': val_data['DateTime'], 'pred': val_preds, 
            'proba': [p[pred] for pred, p in zip(val_preds, val_probas)], 
            f'Return{horizon}': val_data[f'Return{horizon}']
        })
        val_returns = val_trade_df.groupby('DateTime').apply(lambda x: calculate_grouped_returns(x, horizon))
        horizon_performance[horizon] = (1 + val_returns).prod() - 1
        models[horizon] = model

    best_horizon = max(horizon_performance, key=horizon_performance.get)
    best_model = models[best_horizon]
    best_horizons_list.append(best_horizon.replace('_', '') or 'standard')
    
    X_test = test_data[feature_cols]
    y_test = test_data[f'Profit{best_horizon}']
    X_test_scaled = scaler.transform(X_test)

    test_preds = best_model.predict(X_test_scaled)
    test_probas = best_model.predict_proba(X_test_scaled)

    test_trade_df = pd.DataFrame({
        'DateTime': test_data['DateTime'], 'pred': test_preds,
        'proba': [p[pred] for pred, p in zip(test_preds, test_probas)],
        f'Return{best_horizon}': test_data[f'Return{best_horizon}']
    })
    grouped_returns = test_trade_df.groupby('DateTime').apply(lambda x: calculate_grouped_returns(x, best_horizon))
    
    all_dates.extend(grouped_returns.index)
    all_test_returns.extend(grouped_returns)
    all_test_preds.extend(test_preds)
    all_test_truths.extend(y_test)
    all_test_probas.extend(test_probas)
    fold_mean_returns.append(grouped_returns.mean())

    train_end_date = val_end_date

# --- Results ---
if all_test_returns:
    cumulative_returns = (1 + np.array(all_test_returns)).cumprod() - 1
    
    plt.figure(figsize=(12, 6))
    plt.plot(all_dates, cumulative_returns * 100)
    plt.title('Cumulative Returns (XGBoost)')
    plt.xlabel('Date')
    plt.ylabel('Cumulative Return (%)')
    plt.grid(True)
    plt.show()

    print(f"Final Cumulative Compounded Return: {cumulative_returns[-1]*100:.2f}%")
    print(f"\nOverall Test Accuracy : {accuracy_score(all_test_truths, all_test_preds)*100:.2f}%")
    print(f"Overall Precision : {precision_score(all_test_truths, all_test_preds, average='macro', zero_division=0)*100:.2f}%")
    print(f"Overall Recall : {recall_score(all_test_truths, all_test_preds, average='macro', zero_division=0)*100:.2f}%")
    print(f"Overall F1 Score : {f1_score(all_test_truths, all_test_preds, average='macro', zero_division=0)*100:.2f}%")
    print(f"Overall ROC AUC : {roc_auc_score(all_test_truths, np.vstack(all_test_probas), multi_class='ovo', average='macro')*100:.2f}%")
else:
    print("No valid test periods found.")


  df_surprise = pd.read_csv('US_economic_releases_events.csv')


AttributeError: 'DataFrame' object has no attribute 'str'