In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Conv1D, MaxPooling1D, LSTM, Bidirectional
from tensorflow.keras.layers import BatchNormalization, Activation, Flatten, Multiply, Lambda, RepeatVector, Permute
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam

import ta
import itertools
import warnings
warnings.filterwarnings('ignore')

In [None]:
def add_correlation_features(symbol_data):
    """Add correlation-based features specific to this symbol"""
    # Calculate correlation between returns and volume
    symbol_data['Returns_Volume_Corr'] = symbol_data['Returns'].rolling(20).corr(symbol_data['Volume'])
    
    # Calculate correlation between symbol returns and market returns
    if 'Market_Return' in symbol_data.columns:
        symbol_data['Market_Corr_20d'] = symbol_data['Returns'].rolling(20).corr(symbol_data['Market_Return'])
        symbol_data['Market_Corr_60d'] = symbol_data['Returns'].rolling(60).corr(symbol_data['Market_Return'])
    
    # Calculate correlation between price and volatility
    symbol_data['Price_Volatility_Corr'] = symbol_data['Close'].rolling(30).corr(
        symbol_data['Returns'].rolling(20).std())
    
    # Add price momentum correlation
    symbol_data['Mom_Corr'] = symbol_data['Returns'].rolling(10).corr(
        symbol_data['Returns'].shift(10).rolling(10).mean())
    
    return symbol_data
def detect_technical_pattern(data, pattern_type):
    """Detect technical chart patterns in price data"""
    result = np.zeros(len(data))
    
    if pattern_type == 'Double_Bottom':
        # Simple double bottom detection
        for i in range(30, len(data)):
            window = data['Low'].iloc[i-30:i]
            # Find two significant lows with a higher point between them
            if len(window) >= 20:
                min_idx = window.nsmallest(2).index
                if len(min_idx) >= 2 and abs(min_idx[0] - min_idx[1]) >= 10:
                    mid_point = window.iloc[min(min_idx):max(min_idx)].max()
                    left_low, right_low = window.iloc[min_idx]
                    # If middle point is significantly higher and lows are at similar levels
                    if mid_point > left_low * 1.03 and abs(left_low - right_low) / left_low < 0.03:
                        result[i] = 1
    
    elif pattern_type == 'Head_Shoulders':
        # Simple head and shoulders detection
        for i in range(40, len(data)):
            if i+20 < len(data):
                window = data['High'].iloc[i-40:i+20]
                if len(window) >= 60:
                    # Find 3 peaks
                    peaks = window.nlargest(3).index.tolist()
                    if len(peaks) == 3:
                        peaks.sort()
                        if len(peaks) == 3 and peaks[1] - peaks[0] >= 10 and peaks[2] - peaks[1] >= 10:
                            # Check if middle peak is higher
                            left, middle, right = window.iloc[peaks]
                            if middle > left * 1.02 and middle > right * 1.02 and abs(left - right) / left < 0.05:
                                result[i] = 1
    
    elif pattern_type == 'Triangle':
        # Simple triangle pattern detection
        for i in range(30, len(data)):
            window_high = data['High'].iloc[i-30:i]
            window_low = data['Low'].iloc[i-30:i]
            
            if len(window_high) >= 20 and len(window_low) >= 20:
                # Check decreasing highs
                highs = window_high.rolling(5).max().dropna()
                # Check increasing lows
                lows = window_low.rolling(5).min().dropna()
                
                if len(highs) >= 3 and len(lows) >= 3:
                    high_slope = np.polyfit(range(len(highs)), highs.values, 1)[0]
                    low_slope = np.polyfit(range(len(lows)), lows.values, 1)[0]
                    
                    # Triangle patterns: highs slope down, lows slope up
                    if high_slope < -0.001 and low_slope > 0.001:
                        result[i] = 1
    
    return result
def calculate_mutual_information(X, y):
    """Calculate mutual information between features and target"""
    from sklearn.feature_selection import mutual_info_classif
    
    # Convert target to class labels if one-hot encoded
    if len(y.shape) > 1 and y.shape[1] > 1:
        y_labels = np.argmax(y, axis=1)
    else:
        y_labels = y
    
    # Calculate mutual information
    mi_scores = mutual_info_classif(X, y_labels)
    
    # Create list of (feature, score) tuples
    mi_features = [(X.columns[i], score) for i, score in enumerate(mi_scores)]
    
    # Sort by mutual information score in descending order
    mi_features.sort(key=lambda x: x[1], reverse=True)
    
    return mi_features
def prepare_prediction_data(symbol_data, sequence_length, features):
    """Prepare sequence data for prediction"""
    # Extract features
    data = symbol_data[features].values
    
    # Create sequences
    X = []
    for i in range(len(data) - sequence_length + 1):
        X.append(data[i:i+sequence_length])
    
    return np.array(X)
def rebalance_portfolio(portfolio, target_allocations, df, day):
    """Rebalance portfolio based on target allocations"""
    # Calculate current portfolio value
    current_value = calculate_portfolio_value(portfolio, df, day)
    
    # Get current positions
    current_positions = portfolio['positions'].copy()
    
    # Calculate trades needed
    trades = {}
    for symbol, target_alloc in target_allocations.items():
        target_value = current_value * abs(target_alloc)
        target_direction = 1 if target_alloc > 0 else -1
        
        # Get current position
        current_position = current_positions.get(symbol, {'shares': 0, 'direction': 0})
        current_shares = current_position['shares']
        current_direction = current_position['direction']
        
        # Get current price
        symbol_data = df[(df['Symbol'] == symbol) & (df.index == day)]
        if len(symbol_data) == 0:
            continue  # Skip if no price data for today
            
        current_price = symbol_data['Close'].values[0]
        
        # Calculate target shares
        target_shares = target_value / current_price
        
        # Handle direction change
        if current_direction != target_direction and current_shares > 0:
            # Close current position
            portfolio['cash'] += current_shares * current_price * current_direction
            # Remove position
            if symbol in portfolio['positions']:
                del portfolio['positions'][symbol]
            # Reset current shares
            current_shares = 0
            current_direction = 0
        
        # Calculate shares to trade
        if target_direction == current_direction or current_shares == 0:
            shares_to_trade = target_shares - current_shares
            if abs(shares_to_trade) < 1:  # Skip small trades
                continue
                
            # Execute trade
            cost = shares_to_trade * current_price
            if shares_to_trade > 0:  # Buy
                if portfolio['cash'] >= cost:
                    portfolio['cash'] -= cost
                    # Update position
                    portfolio['positions'][symbol] = {
                        'shares': current_shares + shares_to_trade,
                        'direction': target_direction,
                        'entry_price': current_price
                    }
            else:  # Sell
                portfolio['cash'] += abs(cost)
                # Update position
                if current_shares - abs(shares_to_trade) <= 0:
                    if symbol in portfolio['positions']:
                        del portfolio['positions'][symbol]
                else:
                    portfolio['positions'][symbol] = {
                        'shares': current_shares - abs(shares_to_trade),
                        'direction': current_direction,
                        'entry_price': current_position['entry_price']
                    }
    
    return portfolio
def calculate_portfolio_value(portfolio, df, day):
    """Calculate total portfolio value"""
    # Start with cash
    total_value = portfolio['cash']
    
    # Add position values
    for symbol, position in portfolio['positions'].items():
        # Get current price
        symbol_data = df[(df['Symbol'] == symbol) & (df.index == day)]
        if len(symbol_data) == 0:
            continue  # Skip if no price data for today
            
        current_price = symbol_data['Close'].values[0]
        
        # Add position value
        position_value = position['shares'] * current_price
        total_value += position_value
    
    return total_value
def calculate_performance_metrics(portfolio):
    """Calculate performance metrics for the portfolio"""
    # Get portfolio value history
    values = [entry['value'] for entry in portfolio['value_history']]
    dates = [entry['date'] for entry in portfolio['value_history']]
    
    # Calculate returns
    returns = np.diff(values) / values[:-1]
    
    # Calculate performance metrics
    total_return = (values[-1] / values[0]) - 1
    annual_return = (1 + total_return) ** (252 / len(values)) - 1
    volatility = np.std(returns) * np.sqrt(252)
    sharpe_ratio = annual_return / volatility if volatility > 0 else 0
    
    # Calculate drawdowns
    peak = values[0]
    drawdowns = []
    for value in values:
        if value > peak:
            peak = value
        drawdown = (peak - value) / peak
        drawdowns.append(drawdown)
    
    max_drawdown = max(drawdowns)
    
    # Calculate win rate for trades
    # This would require a transaction history that we don't have
    # in this simplified implementation
    
    return {
        'total_return': total_return,
        'annual_return': annual_return,
        'volatility': volatility,
        'sharpe_ratio': sharpe_ratio,
        'max_drawdown': max_drawdown
    }
def visualize_symbol_performance(symbol_performances, top_n=10):
    """Visualize performance of top and bottom symbols"""
    # Sort symbols by total return
    sorted_symbols = sorted(symbol_performances.keys(), 
                           key=lambda s: symbol_performances[s]['total_return'],
                           reverse=True)
    
    # Get top N and bottom N performers
    top_performers = sorted_symbols[:top_n]
    bottom_performers = sorted_symbols[-top_n:]
    
    # Plot top performers
    plt.figure(figsize=(12, 8))
    plt.subplot(2, 1, 1)
    
    for symbol in top_performers:
        perf = symbol_performances[symbol]
        plt.bar(symbol, perf['total_return'] * 100)
    
    plt.title(f'Top {top_n} Performing Symbols')
    plt.ylabel('Total Return (%)')
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xticks(rotation=45)
    
    # Plot bottom performers
    plt.subplot(2, 1, 2)
    
    for symbol in bottom_performers:
        perf = symbol_performances[symbol]
        plt.bar(symbol, perf['total_return'] * 100)
    
    plt.title(f'Bottom {top_n} Performing Symbols')
    plt.ylabel('Total Return (%)')
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()
def visualize_sector_performance(symbol_performances):
    """Visualize performance by sector"""
    # Group performances by sector
    sector_performances = {}
    
    for symbol, perf in symbol_performances.items():
        sector = get_sector_for_symbol(symbol)
        if sector not in sector_performances:
            sector_performances[sector] = []
        
        sector_performances[sector].append(perf['total_return'])
    
    # Calculate average performance by sector
    sector_avg_performance = {}
    for sector, performances in sector_performances.items():
        sector_avg_performance[sector] = np.mean(performances)
    
    # Plot sector performance
    plt.figure(figsize=(12, 6))
    
    sectors = list(sector_avg_performance.keys())
    performances = [sector_avg_performance[s] * 100 for s in sectors]
    
    plt.bar(sectors, performances)
    plt.title('Average Performance by Sector')
    plt.ylabel('Average Return (%)')
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()
def plot_portfolio_performance(portfolio, benchmark_returns=None):
    """Plot portfolio performance over time"""
    # Get portfolio value history
    values = [entry['value'] for entry in portfolio['value_history']]
    dates = [entry['date'] for entry in portfolio['value_history']]
    
    # Calculate portfolio returns
    portfolio_returns = [values[i] / values[0] - 1 for i in range(len(values))]
    
    plt.figure(figsize=(12, 6))
    
    # Plot portfolio returns
    plt.plot(dates, portfolio_returns, label='Portfolio')
    
    # Plot benchmark if provided
    if benchmark_returns is not None:
        plt.plot(dates, benchmark_returns, label='Benchmark')
    
    plt.title('Portfolio Performance')
    plt.ylabel('Return (%)')
    plt.axhline(y=0, color='r', linestyle='-')
    plt.grid(True)
    plt.legend()
    
    plt.tight_layout()
    plt.show()
def analyze_prediction_accuracy(all_signals, df):
    """Analyze prediction accuracy for all symbols"""
    accuracy_by_symbol = {}
    
    for symbol, signals in all_signals.items():
        # Get actual returns
        symbol_data = df[df['Symbol'] == symbol]
        actual_returns = symbol_data['Returns'].shift(-1)  # Next day returns
        
        # Align signals with actual returns
        aligned_data = pd.concat([signals['predicted_class'], actual_returns], axis=1)
        aligned_data.columns = ['predicted_class', 'next_return']
        aligned_data = aligned_data.dropna()
        
        if len(aligned_data) == 0:
            continue
        
        # Calculate accuracy
        correct_predictions = 0
        total_predictions = len(aligned_data)
        
        for i, row in aligned_data.iterrows():
            predicted = row['predicted_class']
            actual_return = row['next_return']
            
            # Check if prediction was correct
            if (predicted == 2 and actual_return > 0) or \
               (predicted == 0 and actual_return < 0) or \
               (predicted == 1 and -0.003 <= actual_return <= 0.003):
                correct_predictions += 1
        
        accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
        accuracy_by_symbol[symbol] = accuracy
    
    return accuracy_by_symbol
def select_symbol_features(symbol_data, base_features, top_n=25):
    """Select most predictive features for this specific symbol"""
    features = base_features.copy()
    
    # Calculate feature importance for this symbol
    X = symbol_data[features].fillna(0)
    y = symbol_data['target']
    
    # Use mutual information to determine feature importance
    importance = calculate_mutual_information(X, y)
    
    # Select top features specific to this symbol
    top_features = [f[0] for f in importance[:top_n]]
    
    # Always include certain key features regardless of importance
    essential_features = ['Returns', 'Volatility_20d', 'Market_Return', 'RSI_14']
    for feature in essential_features:
        if feature in features and feature not in top_features:
            top_features.append(feature)
    
    return top_features
def get_optimal_hyperparameters(symbol):
    """Retrieve optimal hyperparameters for this symbol"""
    
    # Get sector for this symbol
    sector = get_sector_for_symbol(symbol)
    
    # Get symbol volatility profile
    volatility = get_symbol_volatility(symbol)
    
    # Define base hyperparameters
    base_params = {
        'cnn_filters_1': 64,
        'kernel_size_1': 3,
        'cnn_filters_2': 128,
        'kernel_size_2': 3,
        'pool_size': 2,
        'lstm_units_1': 64,
        'lstm_units_2': 32,
        'dense_units': 32,
        'dropout_1': 0.2,
        'dropout_2': 0.3,
        'dropout_lstm_1': 0.3,
        'dropout_lstm_2': 0.4,
        'dropout_dense': 0.4,
        'use_second_cnn': True,
        'use_bidirectional': True,
        'activation': 'relu',
        'learning_rate': 0.001,
        'loss_function': 'categorical_crossentropy'
    }
    
    # Adjust hyperparameters based on sector
    sector_params = {
        'Technology': {'cnn_filters_1': 128, 'lstm_units_1': 128},
        'Financial': {'lstm_units_1': 96, 'dropout_lstm_1': 0.4},
        'Healthcare': {'use_bidirectional': True, 'dropout_dense': 0.5},
        'Energy': {'kernel_size_1': 5, 'learning_rate': 0.0005},
        'Consumer': {'dense_units': 64, 'dropout_2': 0.25}
    }
    
    # Adjust hyperparameters based on volatility
    if volatility == 'high':
        volatility_params = {'dropout_1': 0.3, 'dropout_lstm_1': 0.4}
    elif volatility == 'medium':
        volatility_params = {'dropout_1': 0.2, 'dropout_lstm_1': 0.3}
    else:
        volatility_params = {'dropout_1': 0.1, 'dropout_lstm_1': 0.2}
    
    # Update base parameters with sector-specific parameters
    if sector in sector_params:
        base_params.update(sector_params[sector])
    
    # Update with volatility-specific parameters
    base_params.update(volatility_params)
    
    return base_params
def train_symbol_model(model, X_train, y_train, X_val, y_val):
    """Train model with adaptive learning rate and early stopping"""
    
    # Convert y_train to integer labels if one-hot encoded
    if len(y_train.shape) > 1 and y_train.shape[1] > 1:
        y_train_labels = np.argmax(y_train, axis=1)
    else:
        y_train_labels = y_train.copy()
    
    # Compute class weights
    class_counts = np.bincount(y_train_labels)
    total_samples = len(y_train_labels)
    class_weights = {
        i: (total_samples / (len(np.unique(y_train_labels)) * count)) 
        for i, count in enumerate(class_counts)
    }
    
    # Enhanced callbacks for symbol-specific training
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
        ModelCheckpoint(f'models/{symbol}_best_model.h5', monitor='val_loss', 
                         save_best_only=True, verbose=0)
    ]
    
    # Train with adaptive batch size based on dataset size
    batch_size = min(32, len(X_train) // 20)  # Ensure at least 20 batches
    batch_size = max(8, batch_size)  # Ensure batch size is at least 8
    
    # Train model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=200,  # Use early stopping to determine actual epochs
        batch_size=batch_size,
        class_weight=class_weights,
        callbacks=callbacks,
        verbose=0  # Reduce verbosity for many symbols
    )
    
    return history, model
def generate_symbol_signals(symbol_models, df, sequence_length, symbol_features):
    """Generate trading signals for all symbols"""
    symbols = list(symbol_models.keys())
    all_signals = {}
    
    for symbol in symbols:
        # Get model and features for this symbol
        model = symbol_models[symbol]
        features = symbol_features[symbol]
        
        # Get data for this symbol
        symbol_data = df[df['Symbol'] == symbol].copy()
        
        # Prepare sequence data
        X = prepare_prediction_data(symbol_data, sequence_length, features)
        
        # Get predictions
        predictions = model.predict(X)
        predicted_classes = np.argmax(predictions, axis=1)
        prediction_probs = np.max(predictions, axis=1)
        
        # Create signals DataFrame
        signals = pd.DataFrame(index=symbol_data.index[-len(predictions):])
        signals['predicted_class'] = predicted_classes
        signals['confidence'] = prediction_probs
        signals['symbol'] = symbol
        
        # Add signal strength based on confidence
        signals['signal_strength'] = 0.0
        for i, cls in enumerate(predicted_classes):
            if cls == 2:  # Up signal
                signals['signal_strength'].iloc[i] = prediction_probs[i]
            elif cls == 0:  # Down signal
                signals['signal_strength'].iloc[i] = -prediction_probs[i]
        
        # Store signals
        all_signals[symbol] = signals
    
    return all_signals
def optimize_portfolio(all_signals, date, max_positions=20, max_allocation=0.2):
    """Create optimized portfolio based on signal strength"""
    # Get signals for the specific date
    date_signals = {}
    for symbol, signals in all_signals.items():
        if date in signals.index:
            date_signals[symbol] = signals.loc[date]
    
    # Sort symbols by signal strength (absolute value)
    symbols_ranked = sorted(date_signals.keys(), 
                           key=lambda s: abs(date_signals[s]['signal_strength']),
                           reverse=True)
    
    # Take top N signals
    top_symbols = symbols_ranked[:max_positions]
    
    # Calculate allocation based on signal strength
    total_strength = sum(abs(date_signals[s]['signal_strength']) for s in top_symbols)
    allocations = {}
    
    for symbol in top_symbols:
        # Determine position direction (long/short)
        direction = 1 if date_signals[symbol]['signal_strength'] > 0 else -1
        
        # Calculate allocation percentage (proportional to signal strength)
        allocation = (abs(date_signals[symbol]['signal_strength']) / total_strength) 
        
        # Cap at maximum allocation
        allocation = min(allocation, max_allocation)
        
        # Store allocation with direction
        allocations[symbol] = allocation * direction
    
    return allocations
def implement_symbol_strategy(symbol_models, df, sequence_length, symbol_features, 
                             start_date, end_date, initial_capital=1000000):
    """Implement the full trading strategy across all symbols"""
    # Get unique dates in the specified range
    trading_days = df[(df.index >= start_date) & (df.index <= end_date)].index.unique()
    
    # Initialize portfolio
    portfolio = {
        'cash': initial_capital,
        'positions': {},
        'value_history': []
    }
    
    # Generate all signals
    all_signals = generate_symbol_signals(symbol_models, df, sequence_length, symbol_features)
    
    # Iterate through trading days
    for day in trading_days:
        # Optimize portfolio for this day
        target_allocations = optimize_portfolio(all_signals, day)
        
        # Rebalance portfolio
        portfolio = rebalance_portfolio(portfolio, target_allocations, df, day)
        
        # Record portfolio value
        portfolio_value = calculate_portfolio_value(portfolio, df, day)
        portfolio['value_history'].append({
            'date': day,
            'value': portfolio_value
        })
    
    return portfolio
def calculate_risk_parameters(symbol, df):
    """Calculate sector-specific risk parameters"""
    # Get sector
    sector = get_sector_for_symbol(symbol)
    
    # Get last 250 trading days of data
    symbol_data = df[df['Symbol'] == symbol].tail(250)
    
    # Calculate volatility
    volatility = symbol_data['Returns'].std() * np.sqrt(252)
    
    # Determine appropriate stop-loss based on sector and volatility
    sector_multipliers = {
        'Technology': 2.0,
        'Financial': 1.5,
        'Healthcare': 1.8,
        'Energy': 2.2,
        'Consumer': 1.7,
        'Utilities': 1.2,
        'Materials': 2.0,
        'Industrial': 1.8,
        'Real Estate': 1.5,
        'Communication': 1.8
    }
    
    # Get appropriate multiplier with default of 1.8
    multiplier = sector_multipliers.get(sector, 1.8)
    
    # Calculate stop-loss percentage
    stop_loss_pct = volatility * multiplier
    
    # Ensure minimum and maximum stop-loss values
    stop_loss_pct = max(0.05, min(0.25, stop_loss_pct))
    
    # Calculate trailing stop parameters
    trailing_stop_activation = 0.05  # Activate trailing stop after 5% profit
    trailing_stop_distance = volatility * multiplier * 0.5  # Half the stop-loss distance
    
    return {
        'stop_loss_pct': stop_loss_pct,
        'trailing_stop_activation': trailing_stop_activation,
        'trailing_stop_distance': trailing_stop_distance
    }

In [27]:
def evaluate_symbol_model(model, X_test, y_test, test_data):
    """Evaluate model performance and return metrics"""
    # Make predictions on test data
    y_pred_proba = model.predict(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)
    
    # Convert y_test to integer class labels if one-hot encoded
    if len(y_test.shape) > 1 and y_test.shape[1] > 1:
        y_test_labels = np.argmax(y_test, axis=1)
    else:
        y_test_labels = y_test.copy()
    
    # Calculate classification metrics
    accuracy = accuracy_score(y_test_labels, y_pred)
    precision = precision_score(y_test_labels, y_pred, average='weighted')
    recall = recall_score(y_test_labels, y_pred, average='weighted')
    f1 = f1_score(y_test_labels, y_pred, average='weighted')
    
    # Calculate profit-based metrics
    # Assuming labels: 0=down, 1=neutral, 2=up
    # Map predictions to actual returns
    pred_returns = np.zeros(len(y_pred))
    
    # Set the predicted returns based on the actual returns in test_data
    for i in range(len(y_pred)):
        if y_pred[i] == 2:  # Predicted up
            pred_returns[i] = test_data['Returns'].iloc[i]
        elif y_pred[i] == 0:  # Predicted down
            pred_returns[i] = -test_data['Returns'].iloc[i]  # Reverse for short
        # Neutral predictions (1) are left as 0 return
    
    # Calculate financial metrics
    total_return = np.sum(pred_returns)
    win_rate = np.mean(pred_returns > 0)
    avg_win = np.mean(pred_returns[pred_returns > 0]) if any(pred_returns > 0) else 0
    avg_loss = np.mean(pred_returns[pred_returns < 0]) if any(pred_returns < 0) else 0
    
    # Profit factor = gross profit / gross loss
    gross_profit = np.sum(pred_returns[pred_returns > 0]) if any(pred_returns > 0) else 0
    gross_loss = abs(np.sum(pred_returns[pred_returns < 0])) if any(pred_returns < 0) else 0
    profit_factor = gross_profit / gross_loss if gross_loss > 0 else float('inf')
    
    # Return all metrics
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'total_return': total_return,
        'win_rate': win_rate,
        'avg_win': avg_win,
        'avg_loss': avg_loss,
        'profit_factor': profit_factor
    }

    """Build and train individualized models for each symbol"""
    symbol_models = {}
    symbol_features = {}
    symbol_performances = {}
    
    for symbol in symbols:
        print(f"Processing {symbol}...")
        
        # 1. Get symbol-specific data
        symbol_data = df[df['Symbol'] == symbol].copy()
        if len(symbol_data) < 500:  # Skip symbols with insufficient data
            continue
            
        # 2. Feature extraction and engineering for this specific symbol
        symbol_data = add_symbol_specific_features(symbol_data, base_features)
        
        # 3. Correlation analysis to find most relevant features for this symbol
        symbol_features[symbol] = select_symbol_features(symbol_data, base_features)
        
        # 4. Prepare sequence data
        X, y = prepare_sequence_data(symbol_data, sequence_length, 'target', symbol_features[symbol])
        
        # 5. Train/test split (time-based)
        train_size = int(0.7 * len(X))
        val_size = int(0.15 * len(X))
        
        X_train, y_train = X[:train_size], y[:train_size]
        X_val, y_val = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
        X_test, y_test = X[train_size+val_size:], y[train_size+val_size:]
        
        # 6. Build and train model with optimized hyperparameters for this symbol
        input_shape = (X_train.shape[1], X_train.shape[2])
        model = build_optimized_model_for_symbol(symbol, input_shape)
        
        # 7. Train with early stopping
        history, model = train_symbol_model(model, X_train, y_train, X_val, y_val)
        
        # 8. Evaluate performance
        performance = evaluate_symbol_model(model, X_test, y_test, symbol_data.iloc[train_size+val_size+sequence_length:])
        symbol_performances[symbol] = performance
        
        # 9. Save model
        symbol_models[symbol] = model
        
    return symbol_models, symbol_features, symbol_performances

In [None]:
def get_volatility_regime(symbol_data):
    """Determine volatility regime (high, medium, low) for the symbol"""
    if 'Volatility_20d' not in symbol_data.columns:
        return 'medium'  # Default if volatility not calculated
    
    # Get recent volatility (last 20 days)
    recent_volatility = symbol_data['Volatility_20d'].iloc[-20:].mean()
    
    # Get historical volatility (full period)
    historical_volatility = symbol_data['Volatility_20d'].mean()
    historical_std = symbol_data['Volatility_20d'].std()
    
    # Determine regime based on z-score
    z_score = (recent_volatility - historical_volatility) / historical_std
    
    if z_score > 1.0:
        return 'high'
    elif z_score < -1.0:
        return 'low'
    else:
        return 'medium'
def get_days_since_earnings(symbol_data):
    """Calculate days since last earnings announcement"""
    # This is a placeholder - in a real implementation, you would need
    # an external data source for earnings dates
    # Here we use a cyclic pattern to approximate quarterly earnings
    
    trading_days = np.arange(len(symbol_data))
    # Assuming earnings every ~63 trading days (quarterly)
    days_since = trading_days % 63
    
    return days_since
def get_sector_for_symbol(symbol):
    """Get sector classification for a symbol"""
    # This is a simplified version - in real implementation, 
    # this would use an external data source for sector information
    
    # Example sector mapping for common symbols
    sector_map = {
        # Technology
        'AAPL': 'Technology', 'MSFT': 'Technology', 'GOOGL': 'Technology',
        'AMZN': 'Technology', 'META': 'Technology', 'NVDA': 'Technology',
        'INTC': 'Technology', 'AMD': 'Technology', 'CSCO': 'Technology',
        
        # Financial
        'JPM': 'Financial', 'BAC': 'Financial', 'WFC': 'Financial',
        'GS': 'Financial', 'MS': 'Financial', 'AXP': 'Financial',
        'V': 'Financial', 'MA': 'Financial', 'BLK': 'Financial',
        
        # Healthcare
        'JNJ': 'Healthcare', 'PFE': 'Healthcare', 'MRK': 'Healthcare',
        'UNH': 'Healthcare', 'ABT': 'Healthcare', 'LLY': 'Healthcare',
        
        # Energy
        'XOM': 'Energy', 'CVX': 'Energy', 'COP': 'Energy',
        'SLB': 'Energy', 'EOG': 'Energy', 'PSX': 'Energy',
        
        # Consumer
        'PG': 'Consumer', 'KO': 'Consumer', 'PEP': 'Consumer',
        'WMT': 'Consumer', 'MCD': 'Consumer', 'SBUX': 'Consumer',
        
        # Industrial
        'GE': 'Industrial', 'BA': 'Industrial', 'CAT': 'Industrial',
        'MMM': 'Industrial', 'HON': 'Industrial', 'UPS': 'Industrial',
        
        # Utilities
        'NEE': 'Utilities', 'DUK': 'Utilities', 'SO': 'Utilities',
        
        # Communication
        'T': 'Communication', 'VZ': 'Communication', 'CMCSA': 'Communication',
        
        # Materials
        'DD': 'Materials', 'DOW': 'Materials', 'FCX': 'Materials',
        
        # Real Estate
        'AMT': 'Real Estate', 'PLD': 'Real Estate', 'CCI': 'Real Estate'
    }
    
    return sector_map.get(symbol, 'Other')
def get_sector_returns(sector):
    """Get sector index returns"""
    # This is a placeholder function
    # In a real implementation, you would use external data for sector indices
    
    # Placeholder return value - would normally fetch sector index
    return 0.001  # Placeholder 0.1% daily return


In [None]:
def add_symbol_specific_features(symbol_data, base_features):
    """Add features specific to symbol characteristics"""
    
    # Calculate sector-specific metrics
    sector = get_sector_for_symbol(symbol_data['Symbol'].iloc[0])
    
    # Add volatility regime features
    symbol_data['Volatility_Regime'] = get_volatility_regime(symbol_data)
    
    # Add earnings seasonality
    symbol_data['Days_Since_Earnings'] = get_days_since_earnings(symbol_data)
    
    # Add symbol-specific technical patterns
    for pattern in ['Double_Bottom', 'Head_Shoulders', 'Triangle']:
        symbol_data[f'Pattern_{pattern}'] = detect_technical_pattern(symbol_data, pattern)
    
    # Add relative strength compared to sector and market
    symbol_data['Sector_RS'] = symbol_data['Returns_20d'] / get_sector_returns(sector)
    symbol_data['Market_RS'] = symbol_data['Returns_20d'] / symbol_data['Market_Return_20d']
    
    # Add symbol-specific correlation features
    symbol_data = add_correlation_features(symbol_data)
    
    return symbol_data
def prepare_sequence_data(df, sequence_length, target_column, feature_columns):
    """
    Prepare sequence data for time series forecasting
    
    Parameters:
    -----------
    df : pandas DataFrame
        The input dataframe containing features and target
    sequence_length : int
        Number of time steps in each sequence
    target_column : str
        Name of the target column
    feature_columns : list
        List of feature column names
    
    Returns:
    --------
    X : numpy array of shape (n_samples, sequence_length, n_features)
        The feature sequences
    y : numpy array of shape (n_samples,)
        The target values
    """
    
    X = []
    y = []
    
    # Extract features and target
    data = df[feature_columns].values
    targets = df[target_column].values
    
    # Create sequences
    for i in range(len(data) - sequence_length):
        X.append(data[i:i+sequence_length])
        y.append(targets[i+sequence_length])
    
    return np.array(X), np.array(y)
def build_optimized_model_for_symbol(symbol, input_shape, num_classes=3):
    """Build model with hyperparameters optimized for this symbol"""
    
    # Load optimal hyperparameters for this symbol (or sector)
    hyperparams = get_optimal_hyperparameters(symbol)
    
    # Input layer
    inputs = Input(shape=input_shape)
    
    # CNN layers with optimized parameters
    x = Conv1D(filters=hyperparams['cnn_filters_1'], 
                kernel_size=hyperparams['kernel_size_1'], 
                padding='same')(inputs)
    x = BatchNormalization()(x)
    x = Activation(hyperparams['activation'])(x)
    x = MaxPooling1D(pool_size=hyperparams['pool_size'])(x)
    x = Dropout(hyperparams['dropout_1'])(x)
    
    # Optional second CNN layer based on symbol complexity
    if hyperparams['use_second_cnn']:
        x = Conv1D(filters=hyperparams['cnn_filters_2'], 
                    kernel_size=hyperparams['kernel_size_2'], 
                    padding='same')(x)
        x = BatchNormalization()(x)
        x = Activation(hyperparams['activation'])(x)
        x = MaxPooling1D(pool_size=hyperparams['pool_size'])(x)
        x = Dropout(hyperparams['dropout_2'])(x)
    
    # LSTM layers
    if hyperparams['use_bidirectional']:
        x = Bidirectional(LSTM(units=hyperparams['lstm_units_1'], 
                               return_sequences=True))(x)
    else:
        x = LSTM(units=hyperparams['lstm_units_1'], 
                 return_sequences=True)(x)
    
    x = Dropout(hyperparams['dropout_lstm_1'])(x)
    
    if hyperparams['use_bidirectional']:
        x = Bidirectional(LSTM(units=hyperparams['lstm_units_2'], 
                               return_sequences=False))(x)
    else:
        x = LSTM(units=hyperparams['lstm_units_2'], 
                 return_sequences=False)(x)
    
    x = Dropout(hyperparams['dropout_lstm_2'])(x)
    
    # Dense layers
    x = Dense(hyperparams['dense_units'], activation=hyperparams['activation'])(x)
    x = Dropout(hyperparams['dropout_dense'])(x)
    outputs = Dense(num_classes, activation='softmax')(x)
    
    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    
    # Compile with optimized learning rate
    optimizer = Adam(learning_rate=hyperparams['learning_rate'])
    model.compile(optimizer=optimizer, 
                  loss=hyperparams['loss_function'],
                  metrics=['accuracy'])
    
    return model


In [28]:
def build_symbol_specific_models(df,symbols, base_features, sequence_length=30):
    """Build and train individualized models for each symbol"""
    symbol_models = {}
    symbol_features = {}
    symbol_performances = {}
    
    for symbol in symbols:
        print(f"Processing {symbol}...")
        
        # 1. Get symbol-specific data
        symbol_data = df[df['Symbol'] == symbol].copy()
        if len(symbol_data) < 500:  # Skip symbols with insufficient data
            continue
            
        # 2. Feature extraction and engineering for this specific symbol
        symbol_data = add_symbol_specific_features(symbol_data, base_features)
        
        # 3. Correlation analysis to find most relevant features for this symbol
        symbol_features[symbol] = select_symbol_features(symbol_data, base_features)
        
        # 4. Prepare sequence data
        X, y = prepare_sequence_data(symbol_data, sequence_length, 'target', symbol_features[symbol])
        
        # 5. Train/test split (time-based)
        train_size = int(0.7 * len(X))
        val_size = int(0.15 * len(X))
        
        X_train, y_train = X[:train_size], y[:train_size]
        X_val, y_val = X[train_size:train_size+val_size], y[train_size:train_size+val_size]
        X_test, y_test = X[train_size+val_size:], y[train_size+val_size:]
        
        # 6. Build and train model with optimized hyperparameters for this symbol
        input_shape = (X_train.shape[1], X_train.shape[2])
        model = build_optimized_model_for_symbol(symbol, input_shape)
        
        # 7. Train with early stopping
        history, model = train_symbol_model(model, X_train, y_train, X_val, y_val)
        
        # 8. Evaluate performance
        performance = evaluate_symbol_model(model, X_test, y_test, symbol_data.iloc[train_size+val_size+sequence_length:])
        symbol_performances[symbol] = performance
        
        # 9. Save model
        symbol_models[symbol] = model
        
    return symbol_models, symbol_features, symbol_performances


In [29]:
# Global variable declaration at the module level
df = None
def main():
    """Main function to run the full pipeline"""
    print("Loading data...")
    # Load the S&P 500 data - replace with your data loading code
    df = pd.read_csv('sp500_master_data.csv', parse_dates=['Date'])
    df.set_index('Date', inplace=True)
    
    # Get list of symbols
    symbols = df['Symbol'].unique().tolist()
    
    # Define base features
    base_features = [
        'Close', 'Returns', 'Log_Returns', 'Volatility_20d', 
        'RSI_14', 'MACD', 'MFI_14', 'Market_Return', 'VIX'
    ]
    
    print(f"Building models for {len(symbols)} symbols...")
    # Build symbol-specific models
    symbol_models, symbol_features, symbol_performances = build_symbol_specific_models(
        df,symbols, base_features, sequence_length=30
    )
    
    print("Generating signals and implementing strategy...")
    # Implement trading strategy
    portfolio = implement_symbol_strategy(
        symbol_models, df, 30, symbol_features,
        start_date='2020-01-01', end_date='2021-12-31',
        initial_capital=1000000
    )
    
    print("Analyzing results...")
    # Calculate performance metrics
    performance_metrics = calculate_performance_metrics(portfolio)
    print("Performance Metrics:")
    for metric, value in performance_metrics.items():
        print(f"{metric}: {value:.4f}")
    
    # Analyze prediction accuracy
    all_signals = generate_symbol_signals(symbol_models, df, 30, symbol_features)
    accuracy_by_symbol = analyze_prediction_accuracy(all_signals, df)
    
    avg_accuracy = np.mean(list(accuracy_by_symbol.values()))
    print(f"Average prediction accuracy: {avg_accuracy:.4f}")
    
    # Visualize results
    visualize_symbol_performance(symbol_performances)
    visualize_sector_performance(symbol_performances)
    plot_portfolio_performance(portfolio)
    
    print("Done!")

if __name__ == "__main__":
    main()

Loading data...
Building models for 501 symbols...
Processing MMM...


TypeError: '>=' not supported between instances of 'Timedelta' and 'int'