In [2]:
import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Build `df_combined`: the ES price bars joined with the US economic-release
# surprises that share the same timestamp.
# ---------------------------------------------------------------------------

# Load ES_part_1.csv ... ES_part_5.csv and concatenate them once. Growing a
# frame with pd.concat inside the loop re-copies every earlier row each time.
df_es = pd.concat(
    [pd.read_csv(f'ES_part_{i}.csv') for i in range(1, 6)],
    ignore_index=True,
)

# Load the release calendar and discard the columns that are not used below.
df_surprise = pd.read_csv('US_economic_releases_events.csv').drop(
    columns=['S', 'Month', 'Surv(A)', 'Surv(H)', 'Surv(L)', 'Flag', 'Country/Region',
             'Day', 'C', 'Category', 'Subcategory', 'Std Dev', 'Period', 'Actual']
)

# Clean the surprise data in a single pass:
#   * non-numeric Surprise entries (e.g. the "--" placeholder) are coerced to NaN,
#   * "--" in any other column (notably 'Time') becomes a missing value,
#   * rows without a Surprise, without a Time, or with a zero Surprise are removed.
df_surprise['Surprise'] = pd.to_numeric(df_surprise['Surprise'], errors='coerce')
df_surprise = df_surprise.replace("--", pd.NA).dropna(subset=['Surprise', 'Time'])
df_surprise = df_surprise[df_surprise['Surprise'] != 0]

# Trim the extreme 0.5% tails of the Surprise distribution (bounds inclusive).
lower_bound, upper_bound = df_surprise['Surprise'].quantile([0.005, 0.995])
# .copy() so the column assignments below operate on an independent frame.
df_surprise = df_surprise[df_surprise['Surprise'].between(lower_bound, upper_bound)].copy()

# Create DateTime columns.
# Events: the first 10 characters of 'Unnamed: 0' hold the date, 'Time' the time of day.
df_surprise['Time'] = df_surprise['Time'].astype(str)
df_surprise['DateTime'] = pd.to_datetime(
    df_surprise['Unnamed: 0'].astype(str).str[:10] + ' ' + df_surprise['Time'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)
# Rows whose timestamp could not be parsed (NaT) are dropped: pandas matches
# null join keys against each other, so NaT rows on both sides would otherwise
# be joined to one another by the merge below.
df_surprise = df_surprise.drop(columns=['Unnamed: 0']).dropna(subset=['DateTime'])

# Prices: 'Time' is given as HH:MM, so seconds are appended before parsing.
df_es['Date'] = df_es['Date'].astype(str)
df_es['Time'] = df_es['Time'].astype(str) + ':00'
df_es['DateTime'] = pd.to_datetime(
    df_es['Date'] + ' ' + df_es['Time'],
    format='%m/%d/%Y %H:%M:%S',
    errors='coerce',
)
df_es = df_es.dropna(subset=['DateTime'])

# Merge, keep only rows that have a price bar, and sort chronologically.
# A stable sort keeps the relative order of rows that share a timestamp
# (several events can be released in the same minute).
df_combined = (
    pd.merge(df_es, df_surprise, on='DateTime', how='outer',
             suffixes=('_es', '_surprise'), indicator=True)
    .dropna(subset=['Open'])
    .sort_values(by='DateTime', kind='stable')
)

  df_surprise = pd.read_csv('US_economic_releases_events.csv')


In [3]:
# Display the column labels of df_combined (the Index repr is rendered as the cell output).
df_combined.columns

Index(['Date', 'Time_es', 'Open', 'Close', 'Volume', 'DateTime', 'Event',
       'Ticker', 'Prior', 'Revised', 'Freq.', 'First Rev.', 'Last Rev.',
       'Time_surprise', 'R', 'Surv(M)', '# Ests.', 'Surprise', '_merge'],
      dtype='object')

In [4]:
# Display df_combined as the cell output to inspect the merged rows.
df_combined

Unnamed: 0,Date,Time_es,Open,Close,Volume,DateTime,Event,Ticker,Prior,Revised,Freq.,First Rev.,Last Rev.,Time_surprise,R,Surv(M),# Ests.,Surprise,_merge
2,09/10/1997,00:01:00,0.00,0.00,0.0,1997-09-10 00:01:00,,,,,,,,,,,,,left_only
3,09/10/1997,00:02:00,0.00,0.00,0.0,1997-09-10 00:02:00,,,,,,,,,,,,,left_only
4,09/10/1997,00:03:00,0.00,0.00,0.0,1997-09-10 00:03:00,,,,,,,,,,,,,left_only
5,09/10/1997,00:04:00,0.00,0.00,0.0,1997-09-10 00:04:00,,,,,,,,,,,,,left_only
6,09/10/1997,00:05:00,0.00,0.00,0.0,1997-09-10 00:05:00,,,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9695761,12/19/2024,15:56:00,5941.75,5941.75,318.0,2024-12-19 15:56:00,,,,,,,,,,,,,left_only
9695762,12/19/2024,15:57:00,5941.75,5941.50,386.0,2024-12-19 15:57:00,,,,,,,,,,,,,left_only
9695763,12/19/2024,15:58:00,5941.50,5941.00,484.0,2024-12-19 15:58:00,,,,,,,,,,,,,left_only
9695764,12/19/2024,15:59:00,5940.75,5941.00,6462.0,2024-12-19 15:59:00,,,,,,,,,,,,,left_only


In [6]:
# Restrict df_combined to the columns the modelling cell reads.
df_combined = df_combined[['DateTime', 'Close', 'Surprise', 'Ticker', 'Event']]

# Hold out the final 10% of rows by position (no random sampling) and
# write them to CSV without the index.
holdout_start = int(len(df_combined) * 0.9)
df_last_10_percent = df_combined.iloc[holdout_start:]
df_last_10_percent.to_csv('last_10_percent.csv', index=False)

# Show the held-out slice as the cell output.
df_last_10_percent

Unnamed: 0,DateTime,Close,Surprise,Ticker,Event
8726186,2022-03-30 04:59:00,5101.44,,,
8726187,2022-03-30 05:00:00,5102.83,,,
8726188,2022-03-30 05:01:00,5098.40,,,
8726189,2022-03-30 05:02:00,5099.78,,,
8726190,2022-03-30 05:03:00,5100.06,,,
...,...,...,...,...,...
9695761,2024-12-19 15:56:00,5941.75,,,
9695762,2024-12-19 15:57:00,5941.50,,,
9695763,2024-12-19 15:58:00,5941.00,,,
9695764,2024-12-19 15:59:00,5941.00,,,


<h1>Machine Learning Model</h1>
<p>First, create the classification variables (using a TC of 0.0025bps) and the explanatory variables to use in the models.</p>

In [7]:
!pip install pandas numpy scikit-learn matplotlib tensorflow




In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import traceback # Import traceback for detailed error printing

# Ensure a clean TensorFlow state (important if running multiple times in the same environment)
tf.keras.backend.clear_session()
# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation, dropout and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# --- Configuration ---
# Tunable settings read by the loading, labelling, splitting and training stages of this cell.
FILENAME = 'last_10_percent.csv' # CSV read by the load step; make sure it is in the working directory or provide the full path
TRADING_WINDOW = 20 # Minutes added to each event timestamp to locate the exit price (holding period)
PRICE_CHANGE_THRESHOLD = 0.0005 # 0.05%: forward return above +threshold -> Long, below -threshold -> Short, otherwise Hold
TEST_SIZE = 0.2 # Fraction of surprise events, taken from the end of the time-sorted set, used for testing
N_CLASSES = 3 # Short (-1), Hold (0), Long (1) -> encoded as 0, 1, 2 for Keras
VALIDATION_SPLIT = 0.2 # Fraction of the training data passed to model.fit as validation_split
EPOCHS = 50 # Max number of epochs to train
BATCH_SIZE = 32 # Number of samples per gradient update
EARLY_STOPPING_PATIENCE = 5 # Stop training if validation loss doesn't improve for this many epochs

# --- Initialize Variables ---
# Each name that gates a later stage starts out as None; the stages below
# test these names for None to decide whether they can run.
df = df_surprise = None
X_train_processed = y_train = None
X_test_processed = y_test = None
preprocessing_info = preprocessor = None
model = history = None
trade_results = predicted_signals = None

# --- Load Data ---
# Read FILENAME into df. A missing file is reported instead of raised, and
# df is then left at the value it had before this step.
print(f"Loading data from {FILENAME}...")
try:
    df = pd.read_csv(FILENAME)
except FileNotFoundError:
    print(f"Error: '{FILENAME}' not found. Please place it in the correct directory.")
else:
    print("Data loaded successfully.")

# --- Data Preprocessing ---
# Turn df into a frame indexed by a unique, ascending DateTime index.
# Any failure is reported and df is set to None so later stages are skipped.
if df is not None:
    print("Starting preprocessing...")
    try:
        # 1. Parse the timestamp column (it must exist).
        if 'DateTime' not in df.columns:
            raise KeyError("'DateTime' column not found in the CSV file.")

        df['DateTime'] = pd.to_datetime(df['DateTime'])
        print("DateTime converted.")

        # Keep a single row per timestamp.
        # NOTE(review): keep='first' retains only one row when several rows share
        # a timestamp (e.g. several events in the same minute) - confirm that
        # discarding the others is intended.
        rows_before = len(df)
        df = df.drop_duplicates(subset=['DateTime'], keep='first')
        duplicates_removed = rows_before - len(df)
        if duplicates_removed > 0:
            print(f"Removed {duplicates_removed} rows with duplicate DateTime entries.")

        # 2. Index by timestamp, in ascending order.
        df = df.set_index('DateTime').sort_index()
        print("DateTime index set and sorted.")

    except KeyError as ke:
        # An essential column is missing: stop processing.
        print(f"Error: {ke}")
        df = None
    except Exception as e:
        print(f"Error processing DateTime column: {e}")
        df = None

# --- Feature and target preparation ---
# Builds the model inputs from the event rows of `df` in four guarded stages:
#   1. validate `df` and isolate the rows that carry a Surprise value,
#   2. look up the Close TRADING_WINDOW minutes after each event,
#   3. derive the forward return and drop events where it is undefined,
#   4. label the events, split them chronologically, and fit the scaler /
#      one-hot encoder on the training part only.
# On success X_train_processed, X_test_processed, y_train, y_test, preprocessor
# and preprocessing_info are set. When the data is unusable df_surprise is set
# to None; when stage 4 raises, the processed arrays are reset to None.
essential_cols = ['Close', 'Surprise', 'Event']
min_events = 50  # fewer events than this is treated as too little data to proceed

# Stage 1: validate the input frame and isolate the surprise events.
if df is None:
    # df was None from the start or was reset after a DateTime processing error.
    print("Error: Data loading or initial processing failed.")
    df_surprise = None
elif not all(col in df.columns for col in essential_cols):
    missing_cols = [col for col in essential_cols if col not in df.columns]
    print(f"Error: Missing essential columns after initial processing: {missing_cols}")
    df_surprise = None
else:
    df_full = df.copy()  # full series, kept for the future-price lookup
    df = df[essential_cols].copy()  # work with the relevant columns only

    df_surprise = df.dropna(subset=['Surprise']).copy()
    print(f"Found {len(df_surprise)} initial surprise events.")
    if len(df_surprise) < min_events:
        print("Error: Not enough data in the initial DataFrame to proceed.")
        df_surprise = None

# Stage 2: price TRADING_WINDOW minutes after each event.
if df_surprise is not None:
    print("Calculating future returns...")
    df_surprise['future_price'] = np.nan
    target_times = df_surprise.index + pd.Timedelta(minutes=TRADING_WINDOW)

    # Only look up target times that fall inside the available data range.
    in_range = target_times <= df_full.index.max()
    valid_target_times = target_times[in_range]
    valid_surprise_indices = df_surprise.index[in_range]

    if valid_surprise_indices.empty:
        print("Error: Could not determine valid future time targets (check data range and TRADING_WINDOW).")
        df_surprise = None
    else:
        print(f"Attempting reindex on df_full (size: {len(df_full)}, index unique: {df_full.index.is_unique}) using {len(valid_target_times)} target times.")
        try:
            # method='ffill' uses the last available Close at or before each
            # target time when there is no bar exactly at that time.
            future_prices = df_full['Close'].reindex(valid_target_times, method='ffill')
            future_prices.index = valid_surprise_indices  # align back to the event time
            df_surprise.loc[valid_surprise_indices, 'future_price'] = future_prices
            print("Reindex successful.")
        except ValueError as ve:
            print(f"ERROR during reindex even after dropping duplicates: {ve}")
            df_surprise = None

# Stage 3: forward return per event.
if df_surprise is not None:
    entry_price = df_surprise['Close']
    # A Close of 0 makes the ratio +/-inf, which dropna would keep; convert it
    # to NaN so such events are removed together with the undefined ones.
    df_surprise['future_return'] = (
        (df_surprise['future_price'] - entry_price) / entry_price
    ).replace([np.inf, -np.inf], np.nan)

    initial_count = len(df_surprise)
    df_surprise = df_surprise.dropna(subset=['future_return']).copy()
    dropped_count = initial_count - len(df_surprise)
    if dropped_count > 0:
        print(f"Dropped {dropped_count} events where future return could not be calculated (e.g., near end of data).")
    print(f"Number of events with valid future returns: {len(df_surprise)}")

    if len(df_surprise) < min_events:
        print("Error: Not enough data remaining after calculating future returns (<50 events).")
        df_surprise = None

# Stage 4: labels, chronological split, and feature preprocessing.
if df_surprise is not None:
    try:
        # Signal from the forward return: 1 = Long, -1 = Short, 0 = Hold.
        df_surprise['signal'] = 0
        df_surprise.loc[df_surprise['future_return'] > PRICE_CHANGE_THRESHOLD, 'signal'] = 1
        df_surprise.loc[df_surprise['future_return'] < -PRICE_CHANGE_THRESHOLD, 'signal'] = -1

        # Class indices for Keras: Short -> 0, Hold -> 1, Long -> 2.
        signal_mapping = {-1: 0, 0: 1, 1: 2}
        df_surprise['target_keras'] = df_surprise['signal'].map(signal_mapping)
        print("Target signals defined.")
        print("Signal distribution (Actual):\n", df_surprise['signal'].value_counts(normalize=True))

        # The one-hot encoder needs a value for every row.
        df_surprise['Event'] = df_surprise['Event'].fillna('Unknown_Event')
        print("Filled potential NaNs in 'Event' column.")

        # 5. Features (X) and one-hot target (y).
        print("Preparing features (X) and target (y)...")
        features = ['Surprise', 'Event']
        target = 'target_keras'
        X = df_surprise[features]
        y = df_surprise[target]
        y_cat = to_categorical(y, num_classes=N_CLASSES)
        print("Features and target prepared.")

        # 6. Chronological split: the first (1 - TEST_SIZE) share of the
        # time-sorted events trains the model, the remainder tests it.
        print("Splitting data...")
        split_index = int(len(X) * (1 - TEST_SIZE))
        if split_index <= 0 or split_index >= len(X):  # both sets must be non-empty
            raise ValueError(f"Test size ({TEST_SIZE}) results in an empty train or test set (Split index: {split_index}, Total size: {len(X)}).")

        # .iloc makes the positional intent explicit on a DatetimeIndex.
        X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
        y_train_cat, y_test_cat = y_cat[:split_index], y_cat[split_index:]
        signals_test = df_surprise['signal'].iloc[split_index:]
        returns_test = df_surprise['future_return'].iloc[split_index:]
        dates_test = df_surprise.index[split_index:]
        print(f"Data split: {len(X_train)} train, {len(X_test)} test samples.")

        # Publish the targets only after the split succeeded.
        y_train = y_train_cat
        y_test = y_test_cat

        # 7. Preprocessing: scale the numeric feature, one-hot encode the event.
        print("Defining preprocessing pipeline...")
        numerical_features = ['Surprise']
        categorical_features = ['Event']
        numerical_transformer = StandardScaler()
        # handle_unknown='ignore': events absent from the training part encode as all zeros.
        categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_features),
                ('cat', categorical_transformer, categorical_features),
            ],
            remainder='passthrough',
        )
        print("Pipeline defined.")

        # Fit on the training part only, then apply the fitted transform to the test part.
        print("Applying preprocessing (fit_transform on train)...")
        X_train_processed = preprocessor.fit_transform(X_train)
        print(f"Shape of processed training features: {X_train_processed.shape}")
        print("Applying preprocessing (transform on test)...")
        X_test_processed = preprocessor.transform(X_test)
        print(f"Shape of processed testing features: {X_test_processed.shape}")
        n_features = X_train_processed.shape[1]
        print(f"Preprocessing complete. Number of features: {n_features}")

        # Everything the model, evaluation and backtest stages need later.
        print("Storing preprocessing info...")
        preprocessing_info = {
            'n_features': n_features,
            'dates_test': dates_test,
            'signals_test': signals_test,
            'returns_test': returns_test,
            'signal_mapping_inv': {v: k for k, v in signal_mapping.items()},
        }
        print("Preprocessing info stored.")

    except Exception as e:
        print(f"ERROR during data preparation/splitting/preprocessing: {e}")
        # Reset partially assigned outputs so the later stages are skipped.
        X_train_processed = None
        y_train = None
        X_test_processed = None
        y_test = None
        preprocessing_info = None
        print("Variables reset due to error.")
        traceback.print_exc()


# --- Build and Train Neural Network ---
# Runs only when the preparation stage produced training features, targets and
# preprocessing_info. Builds a small dense softmax classifier, trains it with
# early stopping on val_loss, and plots the training history. If training
# raises, the error is printed and `model` is set to None so that evaluation
# and backtesting are skipped.
if X_train_processed is not None and y_train is not None and preprocessing_info is not None:
    n_inputs = preprocessing_info['n_features']
    print(f"\nProceeding to build model with {n_inputs} input features...")

    # Architecture: two ReLU hidden layers with dropout, softmax over N_CLASSES.
    model = Sequential(name="Macro_Surprise_Trader")
    model.add(Input(shape=(n_inputs,), name='Input_Layer'))
    model.add(Dense(64, activation='relu', name='Hidden_Layer_1'))
    model.add(Dropout(0.25, name='Dropout_1'))  # Regularization
    model.add(Dense(32, activation='relu', name='Hidden_Layer_2'))
    model.add(Dropout(0.25, name='Dropout_2'))  # Regularization
    model.add(Dense(N_CLASSES, activation='softmax', name='Output_Layer'))

    # categorical_crossentropy matches the one-hot encoded targets.
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()

    # Stop once val_loss has not improved for EARLY_STOPPING_PATIENCE epochs
    # and restore the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss',
                                   patience=EARLY_STOPPING_PATIENCE,
                                   restore_best_weights=True,
                                   verbose=1)

    print("\nTraining the model...")
    try:
        # The targets must be one-hot encoded with N_CLASSES columns.
        if y_train.shape[1] != N_CLASSES:
            raise ValueError(f"y_train shape {y_train.shape} is not compatible with N_CLASSES={N_CLASSES}")

        history = model.fit(X_train_processed, y_train,
                            epochs=EPOCHS,
                            batch_size=BATCH_SIZE,
                            validation_split=VALIDATION_SPLIT,  # part of the training data is used for validation
                            callbacks=[early_stopping],
                            verbose=1)
        print("\nModel training finished.")

        # Plot training history (loss and accuracy). DataFrame.plot labels each
        # line with its own column name, so the legend always matches the
        # curves. Passing a separate label list to plt.legend would relabel the
        # lines by position and can mislabel them when the key order of
        # history.history differs from that list.
        ax = pd.DataFrame(history.history).plot(figsize=(10, 6))
        ax.grid(True)
        ax.set_title('Model Training History')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Metric Value')
        plt.show()

    except Exception as e:
        print(f"\nERROR during model training: {e}")
        traceback.print_exc()
        model = None  # a failed training run must not be evaluated

else:
    print("\nSkipping model building and training due to error or lack of data during preprocessing steps.")


# --- Evaluate Model ---
# Scores the trained model on the held-out test features and converts its
# class predictions back to trading signals (-1, 0, 1). `predicted_signals`
# is the output consumed by the backtest; it is None if evaluation fails.
evaluation_inputs = (model, X_test_processed, y_test, preprocessing_info)
if any(item is None for item in evaluation_inputs):
    # Explain why evaluation is skipped.
    if model is None:
        print("\nSkipping model evaluation because the model was not trained successfully.")
    else:
        print("\nSkipping model evaluation due to lack of processed test data.")
else:
    print("\nEvaluating model on the test set...")
    try:
        # The test targets must be one-hot encoded with N_CLASSES columns.
        if y_test.shape[1] != N_CLASSES:
            raise ValueError(f"y_test shape {y_test.shape} is not compatible with N_CLASSES={N_CLASSES}")

        test_loss, test_accuracy = model.evaluate(X_test_processed, y_test, verbose=0)
        print(f"Test Loss: {test_loss:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")

        # Class index with the highest predicted probability, and the true class index.
        class_probabilities = model.predict(X_test_processed)
        predicted_class_idx = np.argmax(class_probabilities, axis=1)
        true_class_idx = np.argmax(y_test, axis=1)

        # Translate class indices back to signals; unmapped indices fall back to 0.
        index_to_signal = preprocessing_info['signal_mapping_inv']
        predicted_signals = np.array([index_to_signal.get(idx, 0) for idx in predicted_class_idx])
        actual_signals_test = np.array([index_to_signal.get(idx, 0) for idx in true_class_idx])

        signal_labels = [-1, 0, 1]  # fixed order for the report and the matrix

        print("\nClassification Report:")
        report_text = classification_report(
            actual_signals_test,
            predicted_signals,
            labels=signal_labels,
            target_names=['Short (-1)', 'Hold (0)', 'Long (1)'],
            zero_division=0,
        )
        print(report_text)

        print("\nConfusion Matrix:")
        confusion = confusion_matrix(actual_signals_test, predicted_signals, labels=signal_labels)
        confusion_table = pd.DataFrame(
            confusion,
            index=['Actual Short', 'Actual Hold', 'Actual Long'],
            columns=['Pred Short', 'Pred Hold', 'Pred Long'],
        )
        print(confusion_table)

    except Exception as e:
        print(f"ERROR during model evaluation: {e}")
        traceback.print_exc()
        predicted_signals = None  # no predictions available for the backtest


# --- Backtesting ---
# Applies the predicted signal to the realised return of each test event:
# a Long earns the return, a Short earns its negative, a Hold earns nothing.
# The per-event frame is stored in `trade_results` (None if anything fails).
if predicted_signals is None or preprocessing_info is None:
    # Explain why backtesting is skipped.
    if predicted_signals is None:
        print("\nSkipping backtesting because model predictions are not available.")
    else:
        print("\nSkipping backtesting due to lack of preprocessing information.")
else:
    print("\nStarting backtest simulation...")
    try:
        # One row per test event, indexed by the event timestamp.
        backtest_df = pd.DataFrame(
            {
                'Actual_Return': preprocessing_info['returns_test'],
                'Predicted_Signal': predicted_signals,
                'Actual_Signal': preprocessing_info['signals_test'],
            },
            index=preprocessing_info['dates_test'],
        )

        # Strategy return per event, selected by the predicted signal.
        signal_col = backtest_df['Predicted_Signal']
        realised = backtest_df['Actual_Return']
        backtest_df['Strategy_Return'] = np.where(
            signal_col == 1,
            realised,
            np.where(signal_col == -1, -realised, 0.0),
        )

        # Geometric compounding of the per-event returns.
        backtest_df['Cumulative_Strategy_Return'] = (1 + backtest_df['Strategy_Return']).cumprod() - 1

        print("\nBacktest Simulation Complete.")

        # --- Display Results ---
        has_rows = not backtest_df.empty
        if has_rows:
            total_strategy_return = backtest_df['Cumulative_Strategy_Return'].iloc[-1]
        else:
            total_strategy_return = 0
        print(f"\nTotal Strategy Return (Test Period): {total_strategy_return:.4%}")

        # Cumulative return curve.
        if has_rows:
            fig, ax = plt.subplots(figsize=(12, 6))
            backtest_df['Cumulative_Strategy_Return'].plot(ax=ax, label='Strategy Cumulative Return')
            ax.set_title(f'Backtest Cumulative Returns ({TRADING_WINDOW} min holding)')
            ax.set_xlabel('Date')
            ax.set_ylabel('Cumulative Return')
            ax.legend()
            ax.grid(True)
            plt.show()
        else:
            print("No backtesting data to plot.")

        # Compounded return per calendar year of the test period.
        print("\nYearly Strategy Returns (Test Period):")
        if has_rows and isinstance(backtest_df.index, pd.DatetimeIndex) and len(backtest_df.index.year.unique()) > 0:
            def compound(event_returns):
                return (1 + event_returns).prod() - 1

            yearly_returns_geo = backtest_df['Strategy_Return'].groupby(backtest_df.index.year).apply(compound)
            if yearly_returns_geo.empty:
                print("Could not calculate yearly returns (possibly insufficient data span).")
            else:
                print(yearly_returns_geo.map('{:.4%}'.format))
        else:
            print("No data or insufficient time span for yearly returns calculation.")

        # Keep the per-event results for inspection.
        trade_results = backtest_df

    except Exception as e:
        print(f"ERROR during backtesting: {e}")
        traceback.print_exc()
        trade_results = None  # backtest did not complete


# --- Final Status Check ---
# Report the earliest stage that left its output as None; if none did, the
# whole pipeline completed.
print("\n--- Final Status ---")

# (stage failed?, message) pairs in pipeline order; the first failure wins.
halt_checks = [
    (df is None,
     "Process Halted: Input data could not be loaded or processed initially."),
    (df_surprise is None,
     "Process Halted: Not enough valid surprise event data after processing."),
    (X_train_processed is None or y_train is None or preprocessing_info is None,
     "Process Halted: Error during data preparation, splitting, or preprocessing."),
    (model is None,
     "Process Halted: Model training did not complete successfully."),
    (predicted_signals is None,
     "Process Halted: Model evaluation did not complete successfully."),
    (trade_results is None,
     "Process Halted: Backtesting did not complete successfully."),
]
final_status = next(
    (message for failed, message in halt_checks if failed),
    "Process Completed: Model trained, evaluated, and backtested.",
)

print(final_status)
completed = final_status.startswith("Process Completed")
print(
    "Review the plots and printed metrics for performance details."
    if completed
    else "Please review the error messages above to diagnose the issue."
)

print("--------------------")

Loading data from last_10_percent.csv...
Data loaded successfully.
Starting preprocessing...
DateTime converted.
Removed 2155 rows with duplicate DateTime entries.
DateTime index set and sorted.
Found 1094 surprise events.
Calculating future returns...
Attempting reindex on df_full (size: 967412, index unique: True) using 1094 target times.
Reindex successful.
Calculated future returns for 1094 events.
Target signals defined.
Signal distribution (Actual):
 signal
 1    0.372943
-1    0.364717
 0    0.262340
Name: proportion, dtype: float64
Filled potential NaNs in 'Event' column.

Skipping model building and training due to lack of processed data or preprocessing failure.

Skipping model evaluation as the model was not trained or test data is unavailable.

Skipping backtesting due to lack of model predictions or necessary info.

--- Final Status ---
Process Halted: Model training did not complete successfully.
--------------------
