In [1]:
import pandas as pd
import os
import numpy as np
import json
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import seaborn as sns
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Activation, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# Set seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

#import warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Technical Indicator Functions
def calculate_moving_averages(df, windows=[50, 200]):
    """Calculate moving averages for specified windows."""
    for window in windows:
        df[f'MA_{window}'] = df['Close'].rolling(window=window).mean()
    return df

def calculate_rsi(df, window=14):
    """Calculate Relative Strength Index (RSI)."""
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    return df

def calculate_macd(df, short_window=12, long_window=26, signal_window=9):
    """Calculate Moving Average Convergence Divergence (MACD)."""
    short_ema = df['Close'].ewm(span=short_window, adjust=False).mean()
    long_ema = df['Close'].ewm(span=long_window, adjust=False).mean()
    df['MACD'] = short_ema - long_ema
    df['Signal_Line'] = df['MACD'].ewm(span=signal_window, adjust=False).mean()
    return df

def calculate_bollinger_bands(df, window=20):
    """Calculate Bollinger Bands."""
    df['BB_Middle'] = df['Close'].rolling(window=window).mean()
    df['BB_Upper'] = df['BB_Middle'] + 2 * df['Close'].rolling(window=window).std()
    df['BB_Lower'] = df['BB_Middle'] - 2 * df['Close'].rolling(window=window).std()
    return df


In [3]:
# Feature Selection with Mutual Information
def categories_features(df):
    # Define feature categories
    price_indicators = ['Close', 'Returns', 'Log_Returns', 'Price_Range', 'Price_Range_Pct']
    moving_averages = [col for col in df.columns if col.startswith(('MA_', 'EMA_', 'Returns_'))]
    volatility_metrics = [col for col in df.columns if col.startswith(('Volatility_', 'Volume_MA_', 'BB_Width_'))]
    technical_indicators = ['RSI_9', 'RSI_14', 'RSI_25', 'MACD', 'Signal_Line', 'MACD_Histogram',
                        'Momentum_14', 'ROC_14', 'MFI_14', 'MFI_28'] + \
                        [col for col in df.columns if col.startswith('Channel_Width_')]
    volume_indicators = ['OBV', 'Volume_Ratio', 'Volume_StdDev']
    fundamental_features = ['PE_Ratio', 'PB_Ratio', 'Dividend_Yield', 'Profit_Margin', 'Beta', 
                        'Enterprise_Value', 'Forward_EPS', 'Trailing_EPS']
    market_features = ['Market_Return', 'Market_Volatility', 'Rolling_Beta', 'VIX', 'VIX_MA_10']
    all_features = (price_indicators + moving_averages + volatility_metrics +
                technical_indicators + volume_indicators + fundamental_features + market_features)
    features = [f for f in all_features if f in df.columns]
    
    return features
# Function to select features using mutual information
def select_features_with_mi(df, features, target_col='Target', n_select=30):

    """Select top features using mutual information."""
    data = df.dropna(subset=[target_col])
    mi_scores = mutual_info_classif(data[features], data[target_col])
    mi_df = pd.DataFrame({'Feature': features, 'MI Score': mi_scores})
    mi_df = mi_df.sort_values('MI Score', ascending=False)
    print(f"Top {n_select} features by mutual information:")
    print(mi_df.head(n_select))
    return mi_df.head(n_select)['Feature'].tolist()
# Function to load and preprocess data
def load_and_preprocess_data(file_path, symbol, seq_length):
    """
    Load stock data from a CSV file, preprocess it, and create sequences for the specified symbol.
    
    Args:
        file_path (str): Path to the CSV file containing stock data.
        symbol (str): Stock symbol to filter (e.g., 'AAPL').
        seq_length (int): Number of time steps in each input sequence.
    
    Returns:
        np.array: Input sequences (X).
        np.array: Target values (y).
        list: List of feature column names.
    """
    # Load data
    df = pd.read_csv(file_path)
    if 'Symbol' in df.columns:
        df = df[df['Symbol'] == symbol].copy()
    else:
        raise ValueError(f"No data found for symbol '{symbol}' in the dataset.")
    
    # Convert 'Date' to datetime and set as index
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    
    # Calculate technical indicators
    df = calculate_moving_averages(df)
    df = calculate_rsi(df)
    df = calculate_macd(df)
    df = calculate_bollinger_bands(df)

    # Handle missing values with forward fill
    df.ffill(inplace=True)
    
    # Define target: 1 if next day's return > 0, else 0 include sell if
    df['Return'] = df['Close'].pct_change().shift(-1)
    df['Target'] = np.where(df['Return'] > 0, 1, 0)
    df = df.dropna()
    
    # Define features
    features = categories_features(df)
    # Feature selection
    #selected_features = select_features_with_mi(df, features, 'Target', n_select=30)
    selected_features = ['Close', 'Volume', 'Return', 'MA_50', 'MA_200', 'RSI', 'MACD', 'Signal_Line', 'BB_Middle', 'BB_Upper', 'BB_Lower']
    print(f"Selected {len(selected_features)} features")
    
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df[selected_features])
    
    # Create sequences
    X, y = [], []
    for i in range(seq_length, len(scaled_data)):
        X.append(scaled_data[i - seq_length:i])
        y.append(df['Target'].iloc[i])
    X, y = np.array(X), np.array(y)
    
    return X, y, features, df

# Function to build the CNN-BiLSTM model
def build_cnn_bilstm_model(seq_length, num_features, num_classes=1):
    """
    Build a CNN-BiLSTM model with attention for time series forecasting in a trading system.
    
    Args:
        seq_length (int): Number of time steps in each input sequence.
        num_features (int): Number of features in the input data.
        num_classes (int): Number of output classes (default: 1 for binary classification).
    
    Returns:
        Model: Compiled Keras model.
    """
    # Define input layer
    inputs = Input(shape=(seq_length, num_features))
    
    # CNN layers for feature extraction
    x = Conv1D(filters=64, kernel_size=3, activation='relu')(inputs)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.2)(x)
    
    # First BiLSTM layer for temporal dependencies
    x = Bidirectional(LSTM(128, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    
    # Second BiLSTM layer, outputting a sequence for attention
    sequence = Bidirectional(LSTM(32, return_sequences=True))(x)
    x = Dropout(0.2)(sequence)

    # Final BiLSTM layer
    x = Bidirectional(LSTM(32))(x)
    x = Dropout(0.2)(x)

    # Attention mechanism using Keras operations
    attention_scores = Dense(1)(sequence)  # Shape: (batch_size, time_steps, 1)
    attention_weights = Activation('softmax')(attention_scores)  # Normalize weights across time steps
    #attention_output = Lambda(lambda x: tf.reduce_sum(x[0] * x[1], axis=1))([sequence, attention_weights])  # Weighted sum: (batch_size, 64)
    # Update inside build_cnn_bilstm_model where Lambda is used:
    attention_output = Lambda(lambda x: tf.reduce_sum(x[0] * tf.expand_dims(tf.squeeze(x[1], -1), -1), axis=1))([sequence, attention_weights])
    # Dense layers for prediction
    x = Dense(32, activation='relu')(attention_output)
    outputs = Dense(num_classes, activation='sigmoid' if num_classes == 1 else 'softmax')(x)
    
    # Create and compile the model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy' if num_classes == 1 else 'categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

# Function to split data into train, validation, and test sets
def split_data(X, y, train_ratio=0.7, val_ratio=0.15):
    """
    Split the data into training, validation, and test sets.
    
    Args:
        X (np.array): Input sequences.
        y (np.array): Target values.
        train_ratio (float): Proportion of data for training.
        val_ratio (float): Proportion of data for validation.
    
    Returns:
        tuple: X_train, X_val, X_test, y_train, y_val, y_test, train_size, val_size
    """
    num_samples = len(X)
    train_size = int(train_ratio * num_samples)
    val_size = int(val_ratio * num_samples)
    X_train, X_val, X_test = X[:train_size], X[train_size:train_size + val_size], X[train_size + val_size:]
    y_train, y_val, y_test = y[:train_size], y[train_size:train_size + val_size], y[train_size + val_size:]
    return X_train, X_val, X_test, y_train, y_val, y_test, train_size, val_size

# Function to evaluate and visualize model performance
def evaluate_and_visualize_model(y_test_labels, y_pred, history, symbol, symbol_dir):
    """
    Evaluate the model's performance and visualize the results for a given stock symbol.

    Args:
        y_test_labels (array-like): True labels for the test set.
        y_pred (array-like): Predicted labels for the test set.
        history (History): Training history object from model.fit, containing accuracy and loss metrics.
        symbol (str): Stock symbol (e.g., 'AAPL') used for naming plots.
        symbol_dir (str): Directory path where the plots will be saved.

    Returns:
        None: Prints metrics and saves plots to the specified directory.
    """
    # Calculate performance metrics
    accuracy = accuracy_score(y_test_labels, y_pred)
    precision = precision_score(y_test_labels, y_pred, average='weighted')
    recall = recall_score(y_test_labels, y_pred, average='weighted')
    f1 = f1_score(y_test_labels, y_pred, average='weighted')

    # Print metrics and classification report
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_labels, y_pred))

    # Plot and save confusion matrix
    cm = confusion_matrix(y_test_labels, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Hold', 'Buy'], yticklabels=['Hold', 'Buy'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {symbol}')
    plt.savefig(os.path.join(symbol_dir, f'confusion_matrix_{symbol}.png'))
    plt.close()

    # Plot and save training history (accuracy and loss)
    plt.figure(figsize=(12, 5))
    
    # Accuracy plot
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title(f'Model Accuracy - {symbol}')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    
    # Loss plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(f'Model Loss - {symbol}')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    
    plt.tight_layout()
    plt.savefig(os.path.join(symbol_dir, f'training_history_{symbol}.png'))
    plt.close()
    return accuracy, precision, recall, f1
# Placeholder for advanced trading signal generation
def generate_advanced_trading_signals(model, X_test, df_test, confidence_threshold):
    """Generate trading signals based on model predictions."""
    y_pred_proba = model.predict(X_test)
    
    # Ensure proper shape for binary classification
    if len(y_pred_proba.shape) > 1 and y_pred_proba.shape[1] == 1:
        y_pred_proba = y_pred_proba.flatten()
    
    signals = pd.DataFrame(index=df_test.index[:len(y_pred_proba)])
    
    # For binary classification (0 = down, 1 = up)
    signals['pred_prob'] = y_pred_proba  
    signals['predicted_class'] = (signals['pred_prob'] > confidence_threshold).astype(int)
    signals['confidence'] = np.where(signals['pred_prob'] > 0.5, 
                                    signals['pred_prob'], 
                                    1 - signals['pred_prob'])
    
    # Generate positions (-1 for short, 0 for neutral, 1 for long)
    signals['position'] = 0
    signals.loc[(signals['predicted_class'] == 1) & (signals['confidence'] > confidence_threshold), 'position'] = 1  # Long
    signals.loc[(signals['predicted_class'] == 0) & (signals['confidence'] > confidence_threshold), 'position'] = -1  # Short
    
    # Add trading signals label
    signals['Signal'] = 'Hold'  # Default
    signals.loc[signals['position'] == 1, 'Signal'] = 'Buy'
    signals.loc[signals['position'] == -1, 'Signal'] = 'Sell'
    
    # Calculate position size based on confidence
    signals['position_size'] = signals['position'] * (signals['confidence'] - 0.5) * 2
    signals.loc[signals['position_size'] < 0, 'position_size'] = signals['position_size'].abs()
    
    # Add market data
    signals['price'] = df_test['Close'].values[:len(signals)]
    
    # Calculate returns
    signals['market_return'] = np.log(signals['price'] / signals['price'].shift(1))
    signals['strategy_return'] = signals['position'].shift(1) * signals['market_return']
    signals['sized_strategy_return'] = signals['position_size'].shift(1) * signals['market_return']
    
    # Handle NaN values in return calculations
    signals.dropna(subset=['market_return', 'strategy_return', 'sized_strategy_return'], inplace=True)
    
    # Calculate cumulative returns and drawdowns
    signals['cumulative_market_return'] = np.exp(signals['market_return'].cumsum()) - 1
    signals['cumulative_strategy_return'] = np.exp(signals['sized_strategy_return'].cumsum()) - 1
    signals['drawdown'] = signals['cumulative_strategy_return'] - signals['cumulative_strategy_return'].cummax()
    
    # Calculate performance metrics
    if len(signals) > 0:
        total_return = np.exp(signals['sized_strategy_return'].sum()) - 1
        annual_return = np.exp(signals['sized_strategy_return'].mean() * 252) - 1
        sharpe_ratio = np.sqrt(252) * signals['sized_strategy_return'].mean() / (signals['sized_strategy_return'].std() or 1e-8)
        max_drawdown = signals['drawdown'].min()
        win_rate = len(signals[signals['sized_strategy_return'] > 0]) / (len(signals[signals['sized_strategy_return'] != 0]) or 1)
        gross_profits = signals.loc[signals['sized_strategy_return'] > 0, 'sized_strategy_return'].sum()
        gross_losses = abs(signals.loc[signals['sized_strategy_return'] < 0, 'sized_strategy_return'].sum())
        profit_factor = gross_profits / (gross_losses or 1e-8)
        
        performance_metrics = {
            'total_return': total_return,
            'annual_return': annual_return,
            'sharpe_ratio': sharpe_ratio,
            'max_drawdown': max_drawdown,
            'win_rate': win_rate,
            'profit_factor': profit_factor
        }
    else:
        performance_metrics = {
            'total_return': 0,
            'annual_return': 0,
            'sharpe_ratio': 0,
            'max_drawdown': 0,
            'win_rate': 0,
            'profit_factor': 0
        }
    
    return signals, performance_metrics
# Function to visualize trading performance
def visualize_trading_performance(signals, performance_metrics, symbol_name,symbol_dir):
    """Visualize trading performance metrics and signals."""
    fig = plt.figure(figsize=(15, 12))
    ax1 = fig.add_subplot(3, 1, 1)
    signals['cumulative_market_return'].plot(ax=ax1, label=f'{symbol_name} Return', color='blue', alpha=0.7)
    signals['cumulative_strategy_return'].plot(ax=ax1, label='Strategy Return', color='green')
    ax1.set_title(f'Cumulative Returns Comparison - {symbol_name}')
    ax1.set_ylabel('Return (%)')
    ax1.legend()
    ax1.grid(True)
    ax2 = fig.add_subplot(3, 1, 2)
    signals['drawdown'].plot(ax=ax2, color='red')
    ax2.set_title('Strategy Drawdown')
    ax2.set_ylabel('Drawdown (%)')
    ax2.grid(True)
    ax3 = fig.add_subplot(3, 1, 3)
    ax3.plot(signals.index, signals['price'], color='black', alpha=0.7)
    buy_signals = signals[signals['position'].diff() > 0]
    sell_signals = signals[signals['position'].diff() < 0]
    ax3.scatter(buy_signals.index, buy_signals['price'], marker='^', color='green', s=100, label='Buy')
    ax3.scatter(sell_signals.index, sell_signals['price'], marker='v', color='red', s=100, label='Sell')
    ax3.set_title(f'Trading Signals - {symbol_name}')
    ax3.set_ylabel('Price')
    ax3.legend()
    plt.figtext(0.01, 0.01, f"""
    {symbol_name} Performance Metrics:
    - Total Return: {performance_metrics['total_return']*100:.2f}%
    - Annual Return: {performance_metrics['annual_return']*100:.2f}%
    - Sharpe Ratio: {performance_metrics['sharpe_ratio']:.2f}
    - Max Drawdown: {performance_metrics['max_drawdown']*100:.2f}%
    - Win Rate: {performance_metrics['win_rate']*100:.2f}%
    - Profit Factor: {performance_metrics['profit_factor']:.2f}
    """, fontsize=12, bbox=dict(facecolor='white', alpha=0.8))
    plt.tight_layout(rect=[0, 0.05, 1, 0.95])
    #plt.savefig(f'{symbol_name}_trading_performance.png')
    plt.savefig(os.path.join(symbol_dir, f'{symbol_name}_trading_performance.png'))
    plt.close()
# Function to save trading signals and performance metrics    
def simulate_trades(signals, df, stop_loss_pct=0.02, take_profit_pct=0.05, max_holding_days=30):
    """Simulate trades with stop-loss and take-profit."""
    portfolio = 10000  # Initial capital
    position = 0  # 0: no position, 1: long position
    entry_price = 0
    days_held = 0
    trade_returns = []
    portfolio_values = [portfolio]

    # Ensure indices are aligned
    if isinstance(signals.index, pd.DatetimeIndex) and isinstance(df.index, pd.DatetimeIndex):
        common_index = signals.index.intersection(df.index)
        signals = signals.loc[common_index]
        df = df.loc[common_index]

    for date, row in signals.iterrows():
        try:
            signal = row['position']
            close_price = df.loc[date, 'Close'] if date in df.index else None
            
            if close_price is None:
                continue
                
            if position == 0:  # No position
                if signal == 1:  # Buy signal
                    position = 1
                    entry_price = close_price
                    days_held = 0
                    print(f"Entering position at {close_price} on {date}")
            elif position == 1:  # Holding position
                days_held += 1
                if close_price <= entry_price * (1 - stop_loss_pct):
                    position = 0
                    trade_return = (close_price - entry_price) / entry_price
                    portfolio *= (1 + trade_return)
                    trade_returns.append(trade_return)
                    print(f"Stop-loss triggered at {close_price} on {date}, return: {trade_return:.2%}")
                elif close_price >= entry_price * (1 + take_profit_pct):
                    position = 0
                    trade_return = (close_price - entry_price) / entry_price
                    portfolio *= (1 + trade_return)
                    trade_returns.append(trade_return)
                    print(f"Take-profit triggered at {close_price} on {date}, return: {trade_return:.2%}")
                elif days_held >= max_holding_days:
                    position = 0
                    trade_return = (close_price - entry_price) / entry_price
                    portfolio *= (1 + trade_return)
                    trade_returns.append(trade_return)
                    print(f"Max holding period reached, exiting at {close_price} on {date}, return: {trade_return:.2%}")

            portfolio_values.append(portfolio)
            
        except Exception as e:
            print(f"Error processing trade on {date}: {e}")
            continue

    # Calculate performance metrics
    daily_returns = pd.Series(portfolio_values).pct_change().dropna()
    total_return = (portfolio - 10000) / 10000
    sharpe_ratio = daily_returns.mean() / (daily_returns.std() or 1e-8) * np.sqrt(252)
    max_drawdown = (pd.Series(portfolio_values).cummax() - pd.Series(portfolio_values)).max() / pd.Series(portfolio_values).cummax().max()

    return {
        'total_return': total_return,
        'sharpe_ratio': sharpe_ratio,
        'max_drawdown': max_drawdown,
        'num_trades': len(trade_returns),
        'win_rate': sum(r > 0 for r in trade_returns) / (len(trade_returns) or 1)
    }
# Main function to run the trading system
def run_trading_system(file_path, symbol='AAPL', seq_length=30, confidence_threshold=0.6,results_dir='results'):
    """
    Run the trading system for a given stock symbol using a CNN-BiLSTM model.
    
    Args:
        file_path (str): Path to the CSV file with stock data.
        symbol (str): Stock symbol (default: 'AAPL').
        seq_length (int): Number of days in each sequence (default: 30).
        confidence_threshold (float): Threshold for generating buy signals (default: 0.6).
    
    Returns:
        model: Trained CNN-BiLSTM model.
        signals (pd.DataFrame): Generated trading signals.
        performance_metrics (dict): Performance metrics including accuracy, precision, recall, and F1 score.
    """
    # In the run_trading_system function, after SMOTE and before model.fit, add class weighting calculation
    from sklearn.utils.class_weight import compute_class_weight
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    # Create symbol-specific directory
    symbol_dir = os.path.join(results_dir, symbol)
    os.makedirs(symbol_dir, exist_ok=True)
    # Load and preprocess data
    X, y, feature_cols, df = load_and_preprocess_data(file_path, symbol, seq_length)
    
    # Split data into train, validation, and test sets
    X_train, X_val, X_test, y_train, y_val, y_test, train_size, val_size = split_data(X, y)
    
    # Reshape X_train for SMOTE (from 3D to 2D)
    num_samples, seq_len, num_features = X_train.shape
    X_train_reshaped = X_train.reshape(num_samples, seq_len * num_features)

    # Apply SMOTE to balance the training data
    # Apply SMOTE to balance the training data
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_reshaped, y_train)
    
    # Reshape the data back to 3D after SMOTE
    new_num_samples = X_train_resampled.shape[0]
    X_train_resampled = X_train_resampled.reshape(new_num_samples, seq_length, num_features)
    
    # Ensure validation and test data are in correct shape
    X_val = X_val.reshape(X_val.shape[0], seq_length, num_features)
    X_test = X_test.reshape(X_test.shape[0], seq_length, num_features)
    
    # Print shapes to verify
    print("Shapes after reshaping:")
    print(f"X_train_resampled shape: {X_train_resampled.shape}")
    print(f"X_val shape: {X_val.shape}")
    print(f"X_test shape: {X_test.shape}")

    
    # Calculate class weights for the resampled training set
    y_train_labels = y_train_resampled if len(y_train_resampled.shape) == 1 else y_train_resampled.astype(int)
    classes = np.unique(y_train_labels)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train_labels)
    class_weight_dict = {i: w for i, w in enumerate(class_weights)}

    # Build the CNN-BiLSTM model
    model = build_cnn_bilstm_model(seq_length, num_features)

    # Add advanced regularization callbacks
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
    ]

    # Train the model with class weights and callbacks
    history = model.fit(
        X_train_resampled, y_train_resampled,
        validation_data=(X_val, y_val),
        epochs=50,
        batch_size=32,
        class_weight=class_weight_dict,
        callbacks=callbacks,
        verbose=0
    )
    # Evaluate model
    print("Evaluating model...")
    y_pred_proba = model.predict(X_test)
    y_pred = (y_pred_proba > confidence_threshold).astype(int)
    y_test_labels = y_test  # Binary classification, no argmax needed

    # Plot training and validation loss
    accuracy, precision, recall, f1 = evaluate_and_visualize_model(y_test_labels, y_pred, history, symbol, symbol_dir)
    
    # Generate trading signals
    print(f"Generating trading signals for {symbol}...")
    start_index = seq_length + train_size + val_size
    df_test = df.iloc[start_index:start_index + len(y_test)]
    signals, performance_metrics = generate_advanced_trading_signals(
        model, 
        X_test, 
        df_test, 
        confidence_threshold
    )
    
    # Visualize trading performance (placeholder)
    visualize_trading_performance(signals, performance_metrics, symbol, symbol_dir)
    
    # Simulate trades
    trades = simulate_trades(signals, df)

    # Update performance metrics with evaluation results
    performance_metrics.update({
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    })
    with open(os.path.join(symbol_dir, f'metrics_{symbol}.json'), 'w') as f:
        json.dump(performance_metrics, f, indent=4)

    # Simulate trades
    #Save signals and trades to CSV
    signals.to_csv(os.path.join(symbol_dir, f'signals_{symbol}.csv'))
    pd.DataFrame([trades]).to_csv(os.path.join(symbol_dir, f'trades_{symbol}.csv'), index=False)
    
    return model, signals, performance_metrics, trades


#test


In [4]:
def calculate_atr(df, window=14):
    high_low = df['High'] - df['Low']
    high_close = np.abs(df['High'] - df['Close'].shift())
    low_close = np.abs(df['Low'] - df['Close'].shift())
    ranges = pd.concat([high_low, high_close, low_close], axis=1)
    true_range = np.max(ranges, axis=1)
    atr = true_range.rolling(window).mean()
    return atr

def calculate_stochastic(df, window=14):
    low_min = df['Low'].rolling(window=window).min()
    high_max = df['High'].rolling(window=window).max()
    k = 100 * ((df['Close'] - low_min) / (high_max - low_min))
    return k

def calculate_ichimoku(df):
    # Tenkan-sen (Conversion Line)
    tenkan_sen_high = df['High'].rolling(window=9).max()
    tenkan_sen_low = df['Low'].rolling(window=9).min()
    tenkan_sen = (tenkan_sen_high + tenkan_sen_low) / 2
    
    # Kijun-sen (Base Line)
    kijun_sen_high = df['High'].rolling(window=26).max()
    kijun_sen_low = df['Low'].rolling(window=26).min()
    kijun_sen = (kijun_sen_high + kijun_sen_low) / 2
    
    # Senkou Span A (Leading Span A)
    senkou_span_a = ((tenkan_sen + kijun_sen) / 2).shift(26)
    
    # Senkou Span B (Leading Span B)
    senkou_span_b_high = df['High'].rolling(window=52).max()
    senkou_span_b_low = df['Low'].rolling(window=52).min()
    senkou_span_b = ((senkou_span_b_high + senkou_span_b_low) / 2).shift(26)
    
    # Chikou Span (Lagging Span)
    chikou_span = df['Close'].shift(-26)
    
    return tenkan_sen, kijun_sen, senkou_span_a, senkou_span_b, chikou_span

In [5]:
def build_enhanced_cnn_bilstm_model(seq_length, num_features, num_classes=1):
    # Input layer
    inputs = Input(shape=(seq_length, num_features))
    
    # Multiple CNN blocks with residual connections
    x = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(inputs)
    x_res = x
    
    # First CNN block
    x = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)
    
    # Second CNN block with residual connection
    x_res = Conv1D(filters=128, kernel_size=1, padding='same')(x_res)
    x_res = MaxPooling1D(pool_size=2)(x_res)
    x = Add()([x, x_res])
    x = Activation('relu')(x)
    
    # Bi-directional LSTM layers with attention
    lstm_out = Bidirectional(LSTM(128, return_sequences=True))(x)
    lstm_out = Dropout(0.3)(lstm_out)
    
    # Self-attention mechanism
    attention = Dense(1, activation='tanh')(lstm_out)
    attention = Flatten()(attention)
    attention = Activation('softmax')(attention)
    attention = RepeatVector(256)(attention)
    attention = Permute([2, 1])(attention)
    
    # Apply attention to LSTM output
    sent_representation = Multiply()([lstm_out, attention])
    sent_representation = Lambda(lambda x: K.sum(x, axis=1))(sent_representation)
    
    # Deep dense layers
    x = Dense(128, activation='relu')(sent_representation)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    # Output layer
    outputs = Dense(num_classes, activation='sigmoid')(x)
    
    # Compile model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(), tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )
    
    return model

In [6]:
def add_advanced_features(df):
    # Add Bollinger Band indicators
    df['BB_Squeeze'] = (df['BB_Upper'] - df['BB_Lower']) / df['BB_Middle']
    
    # Add momentum indicators
    df['ROC_5'] = df['Close'].pct_change(periods=5)
    df['ROC_10'] = df['Close'].pct_change(periods=10)
    df['ROC_20'] = df['Close'].pct_change(periods=20)
    
    # Add volatility indicators
    df['ATR_14'] = calculate_atr(df, window=14)
    
    # Add volume indicators
    df['Volume_ROC'] = df['Volume'].pct_change()
    df['Volume_Price_Trend'] = (df['Volume'] * df['Close'].diff()) / df['Close'].shift(1)
    
    # Add price indicators
    df['VWAP'] = (df['Close'] * df['Volume']).rolling(window=20).sum() / df['Volume'].rolling(window=20).sum()
    df['VWAP_Ratio'] = df['Close'] / df['VWAP']
    
    # Add oscillators
    df['Stochastic_K'] = calculate_stochastic(df, window=14)
    df['Stochastic_D'] = df['Stochastic_K'].rolling(window=3).mean()
    
    # Add Ichimoku Cloud components
    tenkan_sen, kijun_sen, senkou_span_a, senkou_span_b, chikou_span = calculate_ichimoku(df)
    df['Tenkan_Sen'] = tenkan_sen
    df['Kijun_Sen'] = kijun_sen
    df['Senkou_Span_A'] = senkou_span_a
    df['Senkou_Span_B'] = senkou_span_b
    df['Chikou_Span'] = chikou_span
    
    return df

In [10]:
def train_with_ensemble(file_path, symbol, seq_length=30):
    # Load data with enhanced features
    X, y, feature_cols, df = load_and_preprocess_data(file_path, symbol, seq_length)
    X = add_advanced_features(X)
    
    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test, train_size, val_size = split_data(X, y)
    
    # Reshape for SMOTE
    num_samples, seq_len, num_features = X_train.shape
    X_train_reshaped = X_train.reshape(num_samples, seq_len * num_features)
    
    # Apply SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_reshaped, y_train)
    
    # Reshape back to 3D
    new_num_samples = X_train_resampled.shape[0]
    X_train_resampled = X_train_resampled.reshape(new_num_samples, seq_length, num_features)

    # Add debugging information
    print(f"X_train_resampled shape: {X_train_resampled.shape}")
    print(f"X_val shape: {X_val.shape}")
    print(f"X_test shape: {X_test.shape}")
    
    # Create ensemble of models
    models = []
    
    # Model 1: CNN-BiLSTM with attention
    model1 = build_enhanced_cnn_bilstm_model(seq_length, num_features)
    
    # Model 2: GRU-based model
    model2 = build_gru_model(seq_length, num_features)
    
    # Model 3: Transformer-based model
    model3 = build_transformer_model(seq_length, num_features)
    
    # Train models with cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Train each model
    for model, name in zip([model1, model2, model3], ['CNN-BiLSTM', 'GRU', 'Transformer']):
        model.fit(
            X_train_resampled, y_train_resampled,
            validation_data=(X_val, y_val),
            epochs=100,
            batch_size=32,
            callbacks=[
                EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
                ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=7, min_lr=1e-6),
                ModelCheckpoint(f'best_{name}_{symbol}.h5', save_best_only=True)
            ],
            verbose=1
        )
        models.append(model)
    
    # Create ensemble prediction
    y_preds = [model.predict(X_test) for model in models]
    y_pred_ensemble = np.mean(y_preds, axis=0)
    
    return models, y_pred_ensemble, y_test

In [8]:
def build_gru_model(seq_length, num_features, num_classes=1):
    inputs = Input(shape=(seq_length, num_features))
    
    # GRU layers
    x = GRU(128, return_sequences=True)(inputs)
    x = Dropout(0.3)(x)
    x = GRU(64)(x)
    x = Dropout(0.3)(x)
    
    # Dense layers
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    outputs = Dense(num_classes, activation='sigmoid')(x)
    
    # Create and compile model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

def build_transformer_model(seq_length, num_features, num_classes=1):
    inputs = Input(shape=(seq_length, num_features))
    
    # Transformer encoder
    x = inputs
    for _ in range(2):  # 2 transformer blocks
        # Multi-head attention
        attention_output = MultiHeadAttention(
            key_dim=64, num_heads=4, dropout=0.1
        )(x, x)
        x = LayerNormalization(epsilon=1e-6)(x + attention_output)
        
        # Feed-forward network
        ffn = Dense(256, activation='relu')(x)
        ffn = Dropout(0.1)(ffn)
        ffn = Dense(num_features)(ffn)
        x = LayerNormalization(epsilon=1e-6)(x + ffn)
    
    # Global average pooling
    x = GlobalAveragePooling1D()(x)
    
    # Output
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    outputs = Dense(num_classes, activation='sigmoid')(x)
    
    # Create and compile model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [19]:
def train_with_ensemble(file_path, symbol, seq_length=30):
    # Import necessary libraries
    from sklearn.model_selection import KFold
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    from tensorflow.keras.layers import (Input, Dense, Dropout, Conv1D, MaxPooling1D, LSTM, GRU,
                                        Bidirectional, BatchNormalization, Flatten,
                                        Activation, Multiply, RepeatVector, Permute)
    from tensorflow.keras.models import Model
    from tensorflow.keras.optimizers import Adam
    import tensorflow as tf
    import numpy as np
    
    # Load and preprocess data
    X, y, feature_cols, df = load_and_preprocess_data(file_path, symbol, seq_length)
    
    # Split data
    X_train, X_val, X_test, y_train, y_val, y_test, train_size, val_size = split_data(X, y)
    
    # Reshape for SMOTE
    num_samples, seq_len, num_features = X_train.shape
    X_train_reshaped = X_train.reshape(num_samples, seq_len * num_features)
    
    # Apply SMOTE
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_reshaped, y_train)
    
    # Reshape back to 3D
    new_num_samples = X_train_resampled.shape[0]
    X_train_resampled = X_train_resampled.reshape(new_num_samples, seq_len, num_features)
    
    print(f"X_train_resampled shape: {X_train_resampled.shape}")
    print(f"X_val shape: {X_val.shape}")
    print(f"X_test shape: {X_test.shape}")
    
    # Define the models
    
    # 1. Enhanced CNN-BiLSTM model without Lambda layers
    def build_enhanced_cnn_bilstm_model(seq_length, num_features, num_classes=1):
        # Input layer
        inputs = Input(shape=(seq_length, num_features))
        
        # CNN layers for feature extraction
        x = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(inputs)
        x = MaxPooling1D(pool_size=2)(x)
        x = BatchNormalization()(x)
        x = Dropout(0.3)(x)
        
        # BiLSTM layers
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
        x = Dropout(0.3)(x)
        sequence = Bidirectional(LSTM(64))(x)  # Remove return_sequences=True
        
        # Use Global Average Pooling instead of attention
        x = Dense(64, activation='relu')(sequence)
        x = BatchNormalization()(x)
        x = Dropout(0.3)(x)
        
        # Output layer
        outputs = Dense(num_classes, activation='sigmoid')(x)
        
        # Create and compile model
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    # 2. GRU model
    def build_gru_model(seq_length, num_features, num_classes=1):
        inputs = Input(shape=(seq_length, num_features))
        
        # GRU layers
        x = GRU(128, return_sequences=True)(inputs)
        x = Dropout(0.3)(x)
        x = GRU(64)(x)
        x = Dropout(0.3)(x)
        
        # Dense layers
        x = Dense(32, activation='relu')(x)
        x = Dropout(0.2)(x)
        outputs = Dense(num_classes, activation='sigmoid')(x)
        
        # Create and compile model
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        
        return model
    
    # Create models
    print("Building models...")
    model1 = build_enhanced_cnn_bilstm_model(seq_len, num_features)
    model2 = build_gru_model(seq_len, num_features)
    
    models = []
    model_names = ['CNN-BiLSTM', 'GRU']
    
    # Train each model
    for i, (model, name) in enumerate(zip([model1, model2], model_names)):
        print(f"Training {name} model...")
        
        # Create callbacks
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=7, min_lr=1e-6)
        ]
        
        # Train model
        history = model.fit(
            X_train_resampled, y_train_resampled,
            validation_data=(X_val, y_val),
            epochs=50,
            batch_size=32,
            callbacks=callbacks,
            verbose=1
        )
        
        # Evaluate without saving
        models.append(model)
        
        # Print model performance
        y_pred = model.predict(X_test)
        y_pred_binary = (y_pred > 0.5).astype(int)
        accuracy = accuracy_score(y_test, y_pred_binary)
        precision = precision_score(y_test, y_pred_binary, average='weighted')
        recall = recall_score(y_test, y_pred_binary, average='weighted')
        f1 = f1_score(y_test, y_pred_binary, average='weighted')
        
        print(f"\n{name} Model Performance:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
    
    # Create ensemble prediction
    print("\nGenerating ensemble predictions...")
    y_preds = [model.predict(X_test) for model in models]
    y_pred_ensemble = np.mean(y_preds, axis=0)
    
    # Evaluate ensemble
    y_pred_ensemble_binary = (y_pred_ensemble > 0.5).astype(int)
    accuracy = accuracy_score(y_test, y_pred_ensemble_binary)
    precision = precision_score(y_test, y_pred_ensemble_binary, average='weighted')
    recall = recall_score(y_test, y_pred_ensemble_binary, average='weighted')
    f1 = f1_score(y_test, y_pred_ensemble_binary, average='weighted')
    
    print("\nEnsemble Model Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return models, y_pred_ensemble, y_test

In [20]:
if __name__ == "__main__":
    # Process a single stock symbol for testing
    symbol = 'AAPL'
    print(f"Processing {symbol} with ensemble approach...")
    
    try:
        models, y_pred_ensemble, y_test = train_with_ensemble(
            file_path='sp500_master_data.csv',
            symbol=symbol,
            seq_length=30
        )
        
        # Apply threshold and get metrics
        y_pred_classes = (y_pred_ensemble > 0.5).astype(int)
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred_classes)
        precision = precision_score(y_test, y_pred_classes, average='weighted')
        recall = recall_score(y_test, y_pred_classes, average='weighted')
        f1 = f1_score(y_test, y_pred_classes, average='weighted')
        
        print(f"\nFinal Performance Metrics for {symbol}:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        
    except Exception as e:
        print(f"Error processing {symbol}: {e}")
        import traceback
        traceback.print_exc()

Processing AAPL with ensemble approach...
Selected 11 features
X_train_resampled shape: (744, 30, 11)
X_val shape: (153, 30, 11)
X_test shape: (155, 30, 11)
Building models...
Training CNN-BiLSTM model...
Epoch 1/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m282s[0m 1s/step - accuracy: 0.5112 - loss: 0.8723 - val_accuracy: 0.4837 - val_loss: 0.6979 - learning_rate: 0.0010
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 111ms/step - accuracy: 0.4932 - loss: 0.8420 - val_accuracy: 0.4837 - val_loss: 0.7109 - learning_rate: 0.0010
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 180ms/step - accuracy: 0.5212 - loss: 0.8074 - val_accuracy: 0.4837 - val_loss: 0.6991 - learning_rate: 0.0010
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 360ms/step - accuracy: 0.5082 - loss: 0.7793 - val_accuracy: 0.4837 - val_loss: 0.7030 - learning_rate: 0.0010
Epoch 5/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━

test

In [12]:
# List of top 20 stock symbols
top_20_symbols = ['AAPL', 'NVDA', 'MSFT', 'AMZN', 'META', 'GOOGL', 'AVGO', 'TSLA',
                  'BRK.B', 'GOOG', 'JPM', 'LLY', 'V', 'COST', 'MA', 'UNH',
                  'NFLX', 'WMT', 'PG', 'JNJ', 'HD', 'ABBV', 'BAC', 'CRM']
#top_20_symbols = ['AAPL']
# Example usage
results_dir = 'results'
os.makedirs(results_dir, exist_ok=True)
if __name__ == "__main__":
    for symbol in top_20_symbols:
        print(f"Processing {symbol}...")
        try:
            model, signals, performance,trades = run_trading_system(
                file_path='sp500_master_data.csv',
                symbol=symbol,
                seq_length=30,
                confidence_threshold=0.6,
                results_dir=results_dir
            )
            print(f"\nPerformance Metrics {symbol}:", performance)
            print(f"\nTrades {symbol}:", trades)
        except Exception as e:
            print(f"Error processing {symbol}: {e}")

Processing AAPL...
Top 30 features by mutual information:
              Feature  MI Score
52  Market_Volatility  0.047995
48   Enterprise_Value  0.033587
53       Rolling_Beta  0.021552
54                VIX  0.021465
24     Volatility_60d  0.019219
2         Log_Returns  0.017328
13        Returns_20d  0.017272
1             Returns  0.017012
45     Dividend_Yield  0.013395
27        BB_Width_50  0.012483
9              EMA_10  0.011591
25      Volume_MA_60d  0.011515
28              RSI_9  0.010004
42      Volume_StdDev  0.008788
46      Profit_Margin  0.008283
4     Price_Range_Pct  0.007850
8               MA_10  0.005194
21       Volume_MA_5d  0.004087
31               MACD  0.003784
10        Returns_10d  0.003647
15             EMA_50  0.002938
39   Channel_Width_50  0.000994
51      Market_Return  0.000896
14              MA_50  0.000365
12             EMA_20  0.000346
43           PE_Ratio  0.000000
44           PB_Ratio  0.000000
50       Trailing_EPS  0.000000
47            

In [16]:
import pandas as pd
import json
# List of top 20 stock symbols
#top_20_symbols = ['AAPL', 'NVDA', 'MSFT', 'META', 'GOOGL', 'AVGO', 
#                   'GOOG', 'JPM', 'LLY', 'V', 'COST', 'MA', 'UNH',
#                   'WMT', 'PG', 'JNJ', 'HD', 'ABBV', 'BAC', 'CRM','MMM','T']

top_20_symbols = ['AAPL', 'NVDA', 'MSFT', 'META', 'GOOGL', 'AVGO',
                   'GOOG', 'JPM', 'LLY', 'V', 'COST', 'MA', 'UNH',
                  'WMT', 'PG', 'JNJ', 'HD', 'ABBV', 'BAC', 'CRM']
#top_20_symbols = ['T']
# load the data with symbol as the frist column in dataframe from the results folder
def load_data(symbol):
    symbol_dir = f'results/{symbol}'
    signals = pd.read_csv(f'{symbol_dir}/signals_{symbol}.csv', index_col=0)
    trades = pd.read_csv(f'{symbol_dir}/trades_{symbol}.csv')
    with open(f'{symbol_dir}/metrics_{symbol}.json', 'r') as f:
        metrics = json.load(f)
    return signals, trades, metrics

# Load data for each symbol and store to dataframe
data = []
all_trades = []
all_signals = []
for symbol in top_20_symbols:
    signals, trades, metrics = load_data(symbol)
    data.append({
        'Symbol': symbol,
        'accuracy': metrics['accuracy'],
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1_score': metrics['f1_score']
    })
      # Add symbol column to trades dataframe
    trades['Symbol'] = symbol
    
    # Reorder columns to have Symbol first
    cols = trades.columns.tolist()
    cols.remove('Symbol')
    cols = ['Symbol'] + cols
    trades = trades[cols]
    
    #print(trades)
    all_trades.append(trades)

    # Add symbol column to trades dataframe
    signals['Symbol'] = symbol
    
    # Reorder columns to have Symbol first
    cols = signals.columns.tolist()
    cols.remove('Symbol')
    cols = ['Symbol'] + cols
    signals = signals[cols]
    
    #print(trades)
    all_signals.append(signals)

# Create a DataFrame from the list of dictionaries for metrics
df_metrics = pd.DataFrame(data)
#print(df_metrics)
#save to file
df_metrics.to_csv('metrics.csv', index=False)

# Combine all trades data
combined_trades = pd.concat(all_trades, ignore_index=True)
#create df for trades
df_trades = pd.DataFrame(combined_trades)
#save to file
df_trades.to_csv('trades.csv', index=False)

# Combine all signals data
#print("\nAll trades with Symbol column:")
#print(df_trades)
# Combine all signals data
combined_signals = pd.concat(all_signals, ignore_index=True)
#create df for signals
df_signals = pd.DataFrame(combined_signals)
#save to file
df_signals.to_csv('signals.csv', index=False)
#print("\nAll trades with Symbol column:")
#print(df_signals)

In [23]:
# do Analysis on the df_metrics
# Perform statistical analysis on df_metrics
print("Basic Statistics for Trading Metrics:")
print("====================================")
print("\nDescriptive Statistics:")
print(df_metrics.describe())

print("\nCorrelation Analysis:")
# Calculate correlations between numeric columns
correlation_matrix = df_metrics.select_dtypes(include=[np.number]).corr()
print(correlation_matrix)

# Check for any extreme values or outliers
print("\nChecking for Outliers:")
for column in df_metrics.select_dtypes(include=[np.number]).columns:
    Q1 = df_metrics[column].quantile(0.25)
    Q3 = df_metrics[column].quantile(0.75)
    IQR = Q3 - Q1
    outliers = df_metrics[(df_metrics[column] < (Q1 - 1.5 * IQR)) | 
                         (df_metrics[column] > (Q3 + 1.5 * IQR))]
    if not outliers.empty:
        print(f"\nOutliers in {column}:")
        print(outliers[['Symbol', column]])

# Calculate performance rankings
print("\nPerformance Rankings:")
performance_metrics = ['total_return', 'sharpe_ratio', 'win_rate']
for metric in performance_metrics:
    print(f"\nTop performers by {metric}:")
    print(df_trades.nlargest(3, metric)[['Symbol', metric]])

# Calculate summary statistics
print("\nSummary Statistics:")
print(f"Average Accuracy: {df_metrics['accuracy'].mean():.2%}")
print(f"Average Precision: {df_metrics['precision'].mean():.2%}")
print(f"Average Recall: {df_metrics['recall'].mean():.2%}")
print(f"Average F1 Score: {df_metrics['f1_score'].mean():.2%}")



Basic Statistics for Trading Metrics:

Descriptive Statistics:
        accuracy  precision     recall   f1_score
count  20.000000  20.000000  20.000000  20.000000
mean    0.466129   0.285484   0.466129   0.324353
std     0.040311   0.130465   0.040311   0.078878
min     0.400000   0.160000   0.400000   0.228571
25%     0.438710   0.208356   0.438710   0.286109
50%     0.464516   0.231113   0.464516   0.309327
75%     0.496774   0.305919   0.496774   0.334571
max     0.541935   0.534056   0.541935   0.536200

Correlation Analysis:
           accuracy  precision    recall  f1_score
accuracy   1.000000   0.376129  1.000000  0.785111
precision  0.376129   1.000000  0.376129  0.826611
recall     1.000000   0.376129  1.000000  0.785111
f1_score   0.785111   0.826611  0.785111  1.000000

Checking for Outliers:

Outliers in precision:
   Symbol  precision
1    NVDA   0.461046
2    MSFT   0.526669
9       V   0.525731
11     MA   0.534056

Outliers in f1_score:
   Symbol  f1_score
2    MSFT  0.

In [27]:
# what is overall performance of the trading system
# Calculate overall performance metrics
total_return = df_trades['total_return'].mean()
sharpe_ratio = df_trades['sharpe_ratio'].mean()
max_drawdown = df_trades['max_drawdown'].max()
win_rate =df_trades['win_rate'].mean()


# Print overall performance metrics
print("\nOverall Performance Metrics:")
print(f"Total Return: {total_return:.2%}")
print(f"Sharpe Ratio: {sharpe_ratio:.2f}")
print(f"Max Drawdown: {max_drawdown:.2%}")
print(f"Win Rate: {win_rate:.2%}")
#save to file



Overall Performance Metrics:
Total Return: 0.31%
Sharpe Ratio: 0.05
Max Drawdown: 36.21%
Win Rate: 9.95%


In [28]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.gridspec import GridSpec

# Set the style for all plots
plt.style.use('fivethirtyeight')
sns.set_palette('Set2')

# Create a directory for saving visualizations
import os
os.makedirs('visualizations', exist_ok=True)


In [21]:
# 1. VISUALIZING df_metrics DATA
#=================================

# First, reset the index if Symbol is set as the index
if 'Symbol' not in df_metrics.columns and df_metrics.index.name == 'Symbol':
    df_metrics = df_metrics.reset_index()

# 1.1 - Performance Metrics Heatmap
plt.figure(figsize=(14, 10))
# Sort by Total Return
df_metrics_sorted = df_metrics.sort_values('Total Return', ascending=False)
# Set Symbol as index for the heatmap if it's not already
if 'Symbol' in df_metrics_sorted.columns:
    df_metrics_sorted = df_metrics_sorted.set_index('Symbol')
# Create a heatmap
metrics_heatmap = sns.heatmap(df_metrics_sorted, annot=True, fmt='.3f', cmap='RdYlGn', linewidths=0.5)
plt.title('Performance Metrics Heatmap (Sorted by Total Return)', fontsize=16)
plt.tight_layout()
plt.savefig('visualizations/1_1metrics_heatmap.png', dpi=300)
plt.close()

# For the next visualizations, make sure to use Symbol properly
# First get Symbol back as a column if it's an index
if 'Symbol' not in df_metrics.columns:
    df_metrics = df_metrics.reset_index()

# 1.2 - Top 10 Symbols by Total Return
plt.figure(figsize=(12, 6))
top_symbols = df_metrics.sort_values('Total Return', ascending=False).head(10)
ax = sns.barplot(x='Symbol', y='Total Return', data=top_symbols)
plt.title('Top 10 Symbols by Total Return', fontsize=16)
plt.xlabel('Symbol')
plt.ylabel('Total Return')
plt.xticks(rotation=45)
# Add value labels on top of bars
for i, v in enumerate(top_symbols['Total Return']):
    ax.text(i, v + 0.001, f'{v:.3f}', ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.savefig('visualizations/1_2top_symbols_by_return.png', dpi=300)
plt.close()

# 1.3 - Risk-Return Scatter Plot
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    df_metrics['Max Drawdown'], 
    df_metrics['Total Return'], 
    s=df_metrics['Sharpe Ratio']*500, 
    c=df_metrics['Win Rate'],
    cmap='viridis',
    alpha=0.7
)
plt.colorbar(scatter, label='Win Rate')
plt.title('Risk-Return Analysis with Sharpe Ratio and Win Rate', fontsize=16)
plt.xlabel('Risk (Max Drawdown)')
plt.ylabel('Total Return')

# Add labels for each symbol
for i, symbol in enumerate(df_metrics.index):
    plt.annotate(
        symbol, 
        (df_metrics['Max Drawdown'].iloc[i], df_metrics['Total Return'].iloc[i]),
        textcoords="offset points",
        xytext=(0,10), 
        ha='center'
    )

plt.tight_layout()
plt.savefig('visualizations/1_3risk_return_scatter.png', dpi=300)
plt.close()


# 2. VISUALIZING df_trades DATA
#===============================

# 2.1 - Number of Trades by Symbol
plt.figure(figsize=(14, 7))
trade_counts = df_trades.groupby('Symbol')['num_trades'].sum().sort_values(ascending=False)
ax = sns.barplot(x=trade_counts.index, y=trade_counts.values)
plt.title('Number of Trades by Symbol', fontsize=16)
plt.xlabel('Symbol')
plt.ylabel('Number of Trades')
plt.xticks(rotation=45)
# Add value labels on top of bars
for i, v in enumerate(trade_counts.values):
    ax.text(i, v + 0.1, f'{int(v)}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig('visualizations/2_1trades_by_symbol.png', dpi=300)
plt.close()

# 2.2 - Win Rate vs. Total Return
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df_trades, x='win_rate', y='total_return', hue='Symbol', size='num_trades', 
                sizes=(50, 500), alpha=0.7)
plt.title('Win Rate vs. Total Return', fontsize=16)
plt.xlabel('Win Rate')
plt.ylabel('Total Return')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('visualizations/2_2win_rate_vs_return.png', dpi=300)
plt.close()

# 2.3 - Combined Performance Dashboard
plt.figure(figsize=(20, 16))
gs = GridSpec(3, 2)

# Return Distribution
ax1 = plt.subplot(gs[0, 0])
sns.histplot(df_trades['total_return'], kde=True, ax=ax1)
ax1.set_title('Distribution of Total Returns')
ax1.set_xlabel('Total Return')

# Sharpe Ratio vs Max Drawdown
ax2 = plt.subplot(gs[0, 1])
sns.scatterplot(data=df_trades, x='max_drawdown', y='sharpe_ratio', size='num_trades',
                sizes=(50, 500), alpha=0.7, ax=ax2)
ax2.set_title('Sharpe Ratio vs Max Drawdown')
ax2.set_xlabel('Max Drawdown')
ax2.set_ylabel('Sharpe Ratio')

# Win Rate Distribution
ax3 = plt.subplot(gs[1, 0])
sns.boxplot(data=df_trades, y='win_rate', x='Symbol', ax=ax3)
ax3.set_title('Win Rate Distribution by Symbol')
ax3.set_ylabel('Win Rate')
ax3.set_xticklabels(ax3.get_xticklabels(), rotation=90)

# Trade Count vs Sharpe Ratio
ax4 = plt.subplot(gs[1, 1])
sns.regplot(data=df_trades, x='num_trades', y='sharpe_ratio', ax=ax4, scatter_kws={'alpha':0.5})
ax4.set_title('Number of Trades vs Sharpe Ratio')
ax4.set_xlabel('Number of Trades')
ax4.set_ylabel('Sharpe Ratio')

# Top 5 and Bottom 5 Performers
ax5 = plt.subplot(gs[2, :])
top5 = df_trades.sort_values('total_return', ascending=False).head(5)
bottom5 = df_trades.sort_values('total_return').head(5)
compare = pd.concat([top5, bottom5])
compare_melted = pd.melt(compare.reset_index(), id_vars=['Symbol'], 
                          value_vars=['total_return', 'sharpe_ratio', 'max_drawdown', 'win_rate'])
sns.barplot(data=compare_melted, x='Symbol', y='value', hue='variable', ax=ax5)
ax5.set_title('Comparison of Top 5 and Bottom 5 Performers')
ax5.set_xticklabels(ax5.get_xticklabels(), rotation=45)
ax5.legend(title='Metric')

plt.suptitle('Trading Performance Dashboard', fontsize=20, y=0.98)
plt.tight_layout()
plt.savefig('visualizations/2_3performance_dashboard.png', dpi=300)
plt.close()

# 2.4 - Sharpe Ratio Dashboard
# Filter for top 10 sharpe ratios
top_sharpe = df_trades.sort_values('sharpe_ratio', ascending=False).head(10)

plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Symbol', y='sharpe_ratio', data=top_sharpe)
plt.title('Top 10 Symbols by Sharpe Ratio', fontsize=16)
#plt.xlabel('Symbol')
plt.ylabel('Sharpe Ratio')
plt.xticks(rotation=45)
# Add value labels on top of bars
for i, v in enumerate(top_sharpe['sharpe_ratio']):
    ax.text(i, v + 0.02, f'{v:.3f}', ha='center', va='bottom', fontsize=9)
plt.tight_layout()
plt.savefig('visualizations/2_4top_symbols_by_sharpe.png', dpi=300)
plt.close()

print("Visualizations have been created and saved to the 'visualizations' folder.")

Visualizations have been created and saved to the 'visualizations' folder.
