In [None]:
#!/usr/bin/env python3
# Optimized AAPL Stock Price Prediction Model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import os
from datetime import datetime

# Seed the PyTorch and NumPy random number generators with a fixed value so
# that repeated runs start from the same RNG state.
torch.manual_seed(42)
np.random.seed(42)

# Pin the Matplotlib font family to 'DejaVu Sans' for every figure drawn below.
plt.rcParams['font.family'] = 'DejaVu Sans'

# Data loading and preprocessing function
def load_data(file_path, sequence_length=60, add_technical_indicators=True):
    """
    Load and preprocess stock data

    Reads the CSV at ``file_path``, converts the price/volume columns to
    numbers, sorts the rows chronologically, optionally derives technical
    indicators, drops incomplete rows and min-max scales every feature.

    Parameters:
        file_path: Path to the CSV data file
        sequence_length: Accepted for interface compatibility; this function
            does not use it (windowing happens in ``create_sequences``)
        add_technical_indicators: Whether to add technical indicators

    Returns:
        Tuple ``(data, scaled_df, scaler, feature_columns)``:
            data: cleaned, unscaled DataFrame ('Date' + feature columns)
            scaled_df: features scaled to [0, 1] plus the 'Date' column
            scaler: the fitted MinMaxScaler
            feature_columns: ordered feature names ('Close' is first)
    """
    print("Loading AAPL historical stock data...")

    # Read the file the caller asked for instead of a hard-coded name
    df = pd.read_csv(file_path)

    # Clean column names (the export pads them with spaces)
    df.columns = df.columns.str.strip()

    # Data preview
    print(f"\nData shape: {df.shape}")
    print("\nFirst 5 rows:")
    print(df.head())

    # Data cleaning
    df['Date'] = pd.to_datetime(df['Date'])

    # Process price columns
    price_columns = ['Close/Last', 'Open', 'High', 'Low']
    for col in price_columns:
        if col not in df.columns:
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already numeric: nothing to strip
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else:
            # Text such as " $273.36": drop the currency symbol, thousands
            # separators and padding. regex=False matters here because '$'
            # is the end-of-string anchor when interpreted as a pattern.
            cleaned = (
                df[col].astype(str)
                .str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False)
                .str.strip()
            )
            df[col] = cleaned.astype(float)

    # Process volume column
    if 'Volume' in df.columns:
        df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')

    # Rename columns
    if 'Close/Last' in df.columns:
        df = df.rename(columns={'Close/Last': 'Close'})

    # Sort by date (oldest first) so rolling windows look backwards in time
    df = df.sort_values('Date').reset_index(drop=True)

    # Calculate technical indicators
    if add_technical_indicators:
        print("\nCalculating technical indicators...")

        close = df['Close']

        # Moving averages
        for window in (5, 10, 20, 50):
            df[f'MA{window}'] = close.rolling(window=window).mean()

        # Relative Strength Index (RSI), simple-moving-average variant
        def calculate_rsi(data, window=14):
            delta = data.diff()
            gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
            # A zero average loss is replaced by a tiny value to avoid 0-division
            rs = gain / loss.replace(0, 1e-10)
            return 100 - (100 / (1 + rs))

        df['RSI'] = calculate_rsi(close, window=14)

        # MACD
        df['EMA12'] = close.ewm(span=12, adjust=False).mean()
        df['EMA26'] = close.ewm(span=26, adjust=False).mean()
        df['MACD'] = df['EMA12'] - df['EMA26']
        df['Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

        # Bollinger Bands (20-period mean +/- 2 standard deviations)
        df['BB_Middle'] = close.rolling(window=20).mean()
        df['BB_Std'] = close.rolling(window=20).std()
        df['BB_Upper'] = df['BB_Middle'] + 2 * df['BB_Std']
        df['BB_Lower'] = df['BB_Middle'] - 2 * df['BB_Std']

        # Volume change rate
        df['Volume_Change'] = df['Volume'].pct_change()

        # Price change rate
        df['Price_Change'] = close.pct_change()

    # Select required feature columns ('Close' must stay first: later stages
    # treat column 0 as the prediction target)
    basic_features = ['Close', 'Open', 'High', 'Low', 'Volume']
    if add_technical_indicators:
        technical_features = ['MA5', 'MA10', 'MA20', 'MA50', 'RSI', 'MACD', 'Signal',
                              'BB_Middle', 'BB_Upper', 'BB_Lower', 'Volume_Change', 'Price_Change']
        feature_columns = basic_features + technical_features
    else:
        feature_columns = basic_features

    data = df[['Date'] + feature_columns].copy()

    # pct_change yields +/-inf when the previous value is 0; the scaler rejects
    # infinities, so treat them as missing and let dropna remove those rows.
    data[feature_columns] = data[feature_columns].replace([np.inf, -np.inf], np.nan)

    print(f"\nCleaned data shape: {data.shape}")
    print("\nBasic data info:")
    # DataFrame.info() prints by itself and returns None, so it is not wrapped
    # in print()
    data.info()

    # Check missing values
    print("\nMissing values check:")
    print(data.isnull().sum())

    # Handle missing values (rolling-window warm-up rows and coerced entries)
    data = data.dropna().reset_index(drop=True)

    # Data normalization
    # NOTE(review): the scaler is fitted on the full history, including the
    # rows that create_sequences later assigns to the test split.
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_features = scaler.fit_transform(data[feature_columns])

    # Create normalized dataframe
    scaled_df = pd.DataFrame(scaled_features, columns=feature_columns)
    scaled_df['Date'] = data['Date'].values

    print("\nData normalization completed!")
    print(f"\nFeature columns used ({len(feature_columns)}):")
    print(feature_columns)

    return data, scaled_df, scaler, feature_columns

# Create sequence dataset
def create_sequences(data, sequence_length, feature_columns):
    """
    Create time series dataset

    Splits the rows chronologically (first 80% train, last 20% test) and
    turns each split into sliding windows of ``sequence_length`` rows. The
    target of a window is the first feature of the row that follows it.

    Parameters:
        data: Normalized data (DataFrame containing ``feature_columns``)
        sequence_length: Sequence length (rows per input window)
        feature_columns: Feature columns to use; the first one is the target

    Returns:
        Tuple ``(train_loader, test_loader, X_train_tensor, y_train_tensor,
        X_test_tensor, y_test_tensor, train_size, sequence_length)``.
        X tensors are (samples, sequence_length, features) and y tensors are
        always 1-D (samples,), also when a split yields a single sample.
    """
    scaled_data = data[feature_columns].values

    # Create train/test split (80% train, 20% test)
    train_size = int(len(scaled_data) * 0.8)
    train_data = scaled_data[:train_size]
    test_data = scaled_data[train_size:]

    print(f"Total data points: {len(scaled_data)}")
    print(f"Training data size: {len(train_data)} ({train_size/len(scaled_data)*100:.1f}%)")
    print(f"Testing data size: {len(test_data)} ({(len(scaled_data)-train_size)/len(scaled_data)*100:.1f}%)")

    # Create sequences
    def _create_sequences(block, seq_len):
        # A split shorter than seq_len + 1 rows produces zero windows; the
        # arrays keep their full rank so downstream shapes stay consistent.
        n_windows = max(len(block) - seq_len, 0)
        X = np.empty((n_windows, seq_len, block.shape[1]), dtype=block.dtype)
        for start in range(n_windows):
            X[start] = block[start:start + seq_len]
        # Predict close price (first feature) of the row after each window
        y = block[seq_len:seq_len + n_windows, 0]
        return X, y

    X_train, y_train = _create_sequences(train_data, sequence_length)
    X_test, y_test = _create_sequences(test_data, sequence_length)

    print(f"\nTraining set shapes - X: {X_train.shape}, y: {y_train.shape}")
    print(f"Testing set shapes - X: {X_test.shape}, y: {y_test.shape}")

    # Convert to PyTorch tensors. The targets are already 1-D, so no squeeze:
    # squeezing a single-sample split would give a 0-d tensor that
    # TensorDataset cannot pair with X.
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(np.ascontiguousarray(y_train), dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(np.ascontiguousarray(y_test), dtype=torch.float32)

    print(f"\nTensor shape verification:")
    print(f"X_train_tensor: {X_train_tensor.shape}")
    print(f"y_train_tensor: {y_train_tensor.shape}")
    print(f"X_test_tensor: {X_test_tensor.shape}")
    print(f"y_test_tensor: {y_test_tensor.shape}")

    # Create data loaders (only the training loader shuffles)
    batch_size = 32
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    print("\nData preparation completed!")

    return (
        train_loader, test_loader,
        X_train_tensor, y_train_tensor,
        X_test_tensor, y_test_tensor,
        train_size, sequence_length
    )

# Optimized GRU model (simplified version)
class OptimizedGRU(nn.Module):
    """Simplified GRU model with optimal hyperparameters

    A GRU over the input sequence followed by a linear layer applied to the
    hidden output of the last time step.

    Parameters:
        input_size: Number of features per time step
        hidden_size: GRU hidden width
        num_layers: Number of stacked GRU layers
        dropout: Inter-layer dropout; only applied when num_layers > 1
    """
    def __init__(self, input_size, hidden_size=32, num_layers=1, dropout=0.1):
        super().__init__()

        # Simplified GRU architecture. nn.GRU only applies dropout between
        # stacked layers, so it is disabled for a single layer.
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Single fully connected layer
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to predictions of shape (batch,)."""
        out, _ = self.gru(x)
        # Keep only the last time step
        out = out[:, -1, :]
        out = self.fc(out)
        # Drop only the trailing feature axis; a bare squeeze() would also
        # remove the batch axis for a batch of one and return a 0-d tensor.
        return out.squeeze(-1)

# Optimized LSTM model (simplified)
class OptimizedLSTM(nn.Module):
    """Simplified LSTM model with optimal hyperparameters

    An LSTM over the input sequence followed by a linear layer applied to the
    hidden output of the last time step.

    Parameters:
        input_size: Number of features per time step
        hidden_size: LSTM hidden width
        num_layers: Number of stacked LSTM layers
        dropout: Inter-layer dropout; only applied when num_layers > 1
    """
    def __init__(self, input_size, hidden_size=32, num_layers=1, dropout=0.1):
        super().__init__()

        # Simplified LSTM architecture. nn.LSTM only applies dropout between
        # stacked layers, so it is disabled for a single layer.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Single fully connected layer
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to predictions of shape (batch,)."""
        out, _ = self.lstm(x)
        # Keep only the last time step
        out = out[:, -1, :]
        out = self.fc(out)
        # Drop only the trailing feature axis; a bare squeeze() would also
        # remove the batch axis for a batch of one and return a 0-d tensor.
        return out.squeeze(-1)

# Optimized Transformer model (simplified)
class OptimizedTransformer(nn.Module):
    """Simplified Transformer model with optimal hyperparameters

    Projects each time step to ``hidden_size``, runs a Transformer encoder
    over the sequence and feeds the last position's output to a linear layer.

    NOTE(review): no positional encoding is added before the encoder, so the
    encoder itself receives no explicit ordering information.

    Parameters:
        input_size: Number of features per time step
        hidden_size: Model width (d_model); must be divisible by num_heads
        num_layers: Number of encoder layers
        num_heads: Number of attention heads
        dropout: Dropout used inside the encoder layers
    """
    def __init__(self, input_size, hidden_size=32, num_layers=1, num_heads=2, dropout=0.1):
        super().__init__()

        # Input projection
        self.input_proj = nn.Linear(input_size, hidden_size)

        # Transformer encoder
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dropout=dropout,
            dim_feedforward=hidden_size*2
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers)

        # Output layer
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to predictions of shape (batch,)."""
        # Input projection
        x = self.input_proj(x)

        # The encoder layer is built without batch_first, so it expects
        # (seq_len, batch, features)
        x = x.permute(1, 0, 2)

        # Transformer forward pass
        out = self.transformer_encoder(x)

        # Back to batch-first, then take last token output
        out = out.permute(1, 0, 2)[:, -1, :]

        # Output layer
        out = self.fc(out)
        # Drop only the trailing feature axis; a bare squeeze() would also
        # remove the batch axis for a batch of one and return a 0-d tensor.
        return out.squeeze(-1)

# Optimized Attention LSTM model (simplified)
class OptimizedAttentionLSTM(nn.Module):
    """Simplified Attention LSTM model with optimal hyperparameters

    An LSTM whose per-step outputs are combined by a learned softmax
    weighting over the time axis before the final linear layer.

    Parameters:
        input_size: Number of features per time step
        hidden_size: LSTM hidden width
        num_layers: Number of stacked LSTM layers
        dropout: Inter-layer dropout; only applied when num_layers > 1
    """
    def __init__(self, input_size, hidden_size=32, num_layers=1, dropout=0.1):
        super().__init__()

        # LSTM layer
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Attention mechanism: one score per time step, normalized with a
        # softmax over dim=1 (the sequence axis of the batch-first output)
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 1),
            nn.Softmax(dim=1)
        )

        # Output layer
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map x of shape (batch, seq_len, input_size) to predictions of shape (batch,)."""
        # LSTM forward pass -> (batch, seq_len, hidden_size)
        out, _ = self.lstm(x)

        # Attention weights -> (batch, seq_len, 1)
        attention_weights = self.attention(out)

        # Weighted sum over the time axis -> (batch, hidden_size)
        weighted_out = torch.sum(out * attention_weights, dim=1)

        # Output layer
        out = self.fc(weighted_out)
        # Drop only the trailing feature axis; a bare squeeze() would also
        # remove the batch axis for a batch of one and return a 0-d tensor.
        return out.squeeze(-1)

# Train model
def train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, device, model_name, epochs=50, patience=10):
    """
    Train model

    Runs up to ``epochs`` epochs with gradient-norm clipping, steps the
    scheduler on the evaluation loss, and stops early once that loss has not
    improved for ``patience`` consecutive epochs. The weights of the best
    epoch are restored before returning.

    NOTE(review): ``test_loader`` drives both the scheduler and early
    stopping, so the loss reported on it is not an untouched hold-out score.

    Parameters:
        model: Model to train
        train_loader: Training data loader
        test_loader: Testing data loader
        criterion: Loss function
        optimizer: Optimizer
        scheduler: Learning rate scheduler (stepped with the evaluation loss)
        device: Device (CPU/GPU)
        model_name: Model name (for saving); the best weights are written to
            'best_model_<model_name>.pth' in the working directory
        epochs: Number of training epochs
        patience: Early stopping patience

    Returns:
        Trained model and loss history (train losses, test losses; one entry
        per completed epoch)
    """
    model = model.to(device)
    checkpoint_path = f'best_model_{model_name}.pth'

    train_losses = []
    test_losses = []
    best_test_loss = float('inf')
    # In-memory copy of the best weights. Restoring from memory avoids
    # reading a missing or stale checkpoint file when no epoch ever improved
    # (epochs=0 or a NaN loss).
    best_state = None
    patience_counter = 0

    print(f"Starting training {model_name}...")

    for epoch in range(epochs):
        # Training mode
        model.train()
        train_loss = 0.0

        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()

            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            # Weight by batch size so the epoch value is a per-sample mean
            train_loss += loss.item() * batch_X.size(0)

        train_loss /= len(train_loader.dataset)
        train_losses.append(train_loss)

        # Validation mode
        model.eval()
        test_loss = 0.0

        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)

                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)

                test_loss += loss.item() * batch_X.size(0)

        test_loss /= len(test_loader.dataset)
        test_losses.append(test_loss)

        scheduler.step(test_loss)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.6f}, Test Loss: {test_loss:.6f}, LR: {optimizer.param_groups[0]['lr']:.6f}")

        if test_loss < best_test_loss:
            best_test_loss = test_loss
            patience_counter = 0
            # Clone: state_dict() returns references to the live parameters,
            # which later optimizer steps would overwrite.
            best_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered. Best test loss: {best_test_loss:.6f}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, train_losses, test_losses

# Generate predictions
def make_predictions(model, data_loader, scaler, device, feature_columns):
    """Generate predictions and inverse scaling

    Parameters:
        model: Trained model; called on each input batch in eval mode
        data_loader: Iterable of (inputs, targets) batches
        scaler: Fitted scaler whose ``inverse_transform`` expects
            ``len(feature_columns)`` columns, the target being column 0
        device: Device the inputs are moved to before the forward pass
        feature_columns: Feature column list (only its length is used)

    Returns:
        Tuple ``(predictions, actuals)`` of 1-D NumPy arrays in the original
        (unscaled) units of the first feature.
    """
    model.eval()
    prediction_chunks = []
    actual_chunks = []

    with torch.no_grad():
        for batch_X, batch_y in data_loader:
            outputs = model(batch_X.to(device))
            # reshape(-1) keeps every batch 1-D, including a 0-d output that a
            # model may return for a batch of one.
            prediction_chunks.append(outputs.detach().cpu().numpy().reshape(-1))
            actual_chunks.append(batch_y.cpu().numpy().reshape(-1))

    if prediction_chunks:
        scaled_predictions = np.concatenate(prediction_chunks)
        scaled_actuals = np.concatenate(actual_chunks)
    else:
        scaled_predictions = np.zeros(0)
        scaled_actuals = np.zeros(0)

    n_features = len(feature_columns)

    def _inverse_first_column(values):
        # The scaler was fitted on all features, so pad the target into
        # column 0 of a zero matrix, invert, and read column 0 back.
        padded = np.zeros((len(values), n_features))
        padded[:, 0] = values
        return scaler.inverse_transform(padded)[:, 0]

    predictions = _inverse_first_column(scaled_predictions)
    actuals = _inverse_first_column(scaled_actuals)

    return predictions, actuals

# Multi-model comparison experiment
def run_model_comparison(models_to_compare, train_loader, test_loader, X_train, y_train, 
                        X_test, y_test, scaler, device, feature_columns):
    """
    Run multi-model comparison experiment

    Each entry of ``models_to_compare`` is instantiated, trained with a fresh
    MSE loss / AdamW optimizer / ReduceLROnPlateau scheduler, evaluated on
    ``test_loader`` and summarized with ``calculate_metrics``.

    Parameters:
        models_to_compare: List of (model_name, model_class, model_params)
        train_loader: Training data loader
        test_loader: Testing data loader
        X_train: Training features (not used by this function)
        y_train: Training targets (not used by this function)
        X_test: Testing features (not used by this function)
        y_test: Testing targets (not used by this function)
        scaler: Data scaler
        device: Device (CPU/GPU)
        feature_columns: Feature column list

    Returns:
        Model comparison results dictionary keyed by model name; each value
        holds 'model', 'metrics', 'predictions', 'actuals', 'train_losses'
        and 'test_losses'
    """
    print("\n=== Starting Multi-Model Comparison Experiment ===")

    results = {}

    for name, factory, kwargs in models_to_compare:
        print(f"\n--- Training Model: {name} ---")

        net = factory(**kwargs)
        print("Model Architecture:")
        print(net)

        # Every candidate gets the same, freshly created training setup
        loss_fn = nn.MSELoss()
        opt = optim.AdamW(net.parameters(), lr=0.001, weight_decay=1e-5)
        lr_sched = optim.lr_scheduler.ReduceLROnPlateau(
            opt, mode='min', factor=0.5, patience=5, min_lr=1e-6
        )

        net, history_train, history_test = train_model(
            net, train_loader, test_loader, loss_fn, opt, lr_sched,
            device, name, epochs=50, patience=10
        )

        predicted, observed = make_predictions(net, test_loader, scaler, device, feature_columns)
        scores = calculate_metrics(observed, predicted)

        results[name] = dict(
            model=net,
            metrics=scores,
            predictions=predicted,
            actuals=observed,
            train_losses=history_train,
            test_losses=history_test,
        )

        print(f"\n{name} Evaluation Results:")
        for label, score in scores.items():
            # Numeric scores get four decimals; anything else prints as-is
            if not isinstance(score, (int, float)):
                print(f"{label}: {score}")
                continue
            print(f"{label}: {score:.4f}")

    print("\n=== Multi-Model Comparison Completed ===")

    return results

# Calculate evaluation metrics
def calculate_metrics(actual, predicted):
    """Calculate evaluation metrics

    Parameters:
        actual: Observed values (array-like, same length as ``predicted``)
        predicted: Predicted values

    Returns:
        Dict with keys 'MSE', 'RMSE', 'MAE', 'MedAE', 'R2', 'MAPE', 'SMAPE'
        and 'Direction Accuracy'. MAPE, SMAPE and Direction Accuracy are
        percentages. MAPE is NaN when every actual value is zero and
        Direction Accuracy is NaN with fewer than two points.
    """
    # Accept plain lists as well as arrays
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    abs_error = np.abs(actual - predicted)

    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    medae = np.median(abs_error)
    r2 = r2_score(actual, predicted)

    # Calculate MAPE over the non-zero actuals only (avoid division by zero)
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(abs_error[nonzero] / np.abs(actual[nonzero])) * 100
    else:
        mape = float('nan')

    # Calculate SMAPE; terms whose denominator is zero contribute 0. The
    # masked divide never evaluates those terms, so no 0/0 warning is raised.
    denominator = (np.abs(actual) + np.abs(predicted)) / 2.0
    smape_terms = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(abs_error),
        where=denominator != 0,
    )
    smape = np.mean(smape_terms) * 100

    # Calculate accuracy (prediction direction): share of steps where the
    # predicted series moves the same way (up vs. not up) as the actual one
    actual_direction = np.diff(actual) > 0
    predicted_direction = np.diff(predicted) > 0
    if actual_direction.size:
        direction_accuracy = np.mean(actual_direction == predicted_direction) * 100
    else:
        direction_accuracy = float('nan')

    return {
        'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'MedAE': medae,
        'R2': r2, 'MAPE': mape, 'SMAPE': smape, 'Direction Accuracy': direction_accuracy
    }

# Main function
if __name__ == "__main__":
    # End-to-end experiment: load data, train a base model, score features,
    # train and compare four architectures, and save the plots.

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Generate timestamp for unique filenames
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Plots go to the Kaggle working directory when it exists; elsewhere they
    # fall back to the current directory instead of failing in savefig.
    output_dir = '/kaggle/working' if os.path.isdir('/kaggle/working') else os.getcwd()

    def plot_path(stem):
        """Return the timestamped PNG path for a plot name."""
        return os.path.join(output_dir, f'{stem}_{timestamp}.png')

    # Load and preprocess data
    data, scaled_df, scaler, feature_columns = load_data(
        'HistoricalQuotes.csv',
        sequence_length=60,
        add_technical_indicators=True
    )

    # Create sequence dataset
    (train_loader, test_loader,
     X_train_tensor, y_train_tensor,
     X_test_tensor, y_test_tensor,
     train_size, sequence_length) = create_sequences(
        scaled_df,
        sequence_length=60,
        feature_columns=feature_columns
    )

    # Prepare raw data for analysis
    X_train = X_train_tensor.cpu().numpy()
    y_train = y_train_tensor.cpu().numpy()
    X_test = X_test_tensor.cpu().numpy()
    y_test = y_test_tensor.cpu().numpy()

    # Create time indices: the first test target sits sequence_length rows
    # after the start of the test split
    test_start_idx = train_size + sequence_length
    test_end_idx = test_start_idx + len(y_test)
    test_dates = data['Date'].iloc[test_start_idx:test_end_idx].values

    # 1. Feature importance analysis
    print("\n=== Feature Importance Analysis ===")
    input_size = len(feature_columns)

    # Train a base model for feature analysis
    base_model = OptimizedGRU(input_size, hidden_size=32, num_layers=1, dropout=0.1).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(base_model.parameters(), lr=0.001, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6)

    base_model, _, _ = train_model(base_model, train_loader, test_loader, criterion, optimizer, scheduler,
                                  device, "BaseModel", epochs=30, patience=5)

    # Analyze feature importance
    def analyze_feature_importance(model, X, y, feature_columns, device):
        """Analyze feature importance using correlation with last timestep

        The score of a feature is the absolute Pearson correlation between
        its value at the last time step of each window and the target.
        ``model`` and ``device`` are accepted but not used by the computation.
        """
        X_np = X.cpu().numpy()
        y_flat = y.cpu().numpy().flatten()

        n_features = X_np.shape[2]
        correlations = []

        for i in range(n_features):
            feature_data = X_np[:, -1, i]

            # A (near-)constant series has no defined correlation; score it 0
            if np.std(feature_data) < 1e-10 or np.std(y_flat) < 1e-10:
                corr = 0.0
            else:
                corr = np.corrcoef(feature_data, y_flat)[0, 1]
                if np.isnan(corr):
                    corr = 0.0
            correlations.append(abs(corr))

        feature_importance = pd.DataFrame({
            'Feature': feature_columns,
            'Importance': correlations
        }).sort_values('Importance', ascending=False)

        print("\nFeature Importance Analysis Results:")
        print(feature_importance)
        return feature_importance

    feature_importance = analyze_feature_importance(
        base_model,
        X_test_tensor,
        y_test_tensor,
        feature_columns,
        device
    )

    # Plot feature importance
    plt.figure(figsize=(12, 8))
    plt.barh(feature_importance['Feature'], feature_importance['Importance'], color='skyblue')
    plt.title('Feature Importance Analysis')
    plt.xlabel('Importance Score')
    plt.ylabel('Feature')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(plot_path('feature_importance'))
    plt.close()

    # Plot feature correlation heatmap
    plt.figure(figsize=(15, 12))
    corr_matrix = data[feature_columns].corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Feature Correlation Heatmap')
    plt.tight_layout()
    plt.savefig(plot_path('feature_correlation_heatmap'))
    plt.close()

    # 2. Multi-model comparison experiment
    print("\n=== Multi-Model Comparison Experiment ===")
    models_to_compare = [
        ("Baseline GRU", OptimizedGRU, {
            'input_size': input_size,
            'hidden_size': 32,
            'num_layers': 1,
            'dropout': 0.1
        }),
        ("Optimized LSTM", OptimizedLSTM, {
            'input_size': input_size,
            'hidden_size': 32,
            'num_layers': 1,
            'dropout': 0.1
        }),
        ("Optimized Transformer", OptimizedTransformer, {
            'input_size': input_size,
            'hidden_size': 32,
            'num_layers': 1,
            'num_heads': 2,
            'dropout': 0.1
        }),
        ("Optimized Attention LSTM", OptimizedAttentionLSTM, {
            'input_size': input_size,
            'hidden_size': 32,
            'num_layers': 1,
            'dropout': 0.1
        })
    ]

    # Run multi-model comparison
    model_results = run_model_comparison(
        models_to_compare,
        train_loader, test_loader,
        X_train, y_train, X_test, y_test,
        scaler, device, feature_columns
    )

    # 3. Visualize comparison results
    print("\n=== Visualization Results ===")

    # Multi-model metrics comparison. Each row is a copy of the metrics plus
    # the model name, so the dictionaries stored in model_results stay
    # purely numeric.
    metrics_list = [
        {**result['metrics'], 'Model': model_name}
        for model_name, result in model_results.items()
    ]

    metrics_df = pd.DataFrame(metrics_list)

    metrics_to_compare = ['RMSE', 'MAE', 'MAPE', 'Direction Accuracy', 'R2']

    fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))
    axes = axes.flatten()

    for i, metric in enumerate(metrics_to_compare):
        if i < len(axes) and metric in metrics_df.columns:
            ax = axes[i]
            metrics_df.plot(x='Model', y=metric, kind='bar', ax=ax, legend=False)
            ax.set_title(f'{metric} Comparison')
            ax.set_ylabel(metric)
            ax.grid(True, alpha=0.3)
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Remove the unused subplot slots of the 2x3 grid
    for i in range(len(metrics_to_compare), len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.savefig(plot_path('model_metrics_comparison'))
    plt.close()

    # Best model detailed analysis (lowest RMSE wins)
    best_model_name = min(model_results.keys(), key=lambda k: model_results[k]['metrics']['RMSE'])
    best_result = model_results[best_model_name]
    best_metrics = best_result['metrics']
    print(f"\nBest Model: {best_model_name}")
    print("Best Model Metrics:")
    for metric, value in best_metrics.items():
        if isinstance(value, (int, float)):
            print(f"{metric}: {value:.4f}")
        else:
            print(f"{metric}: {value}")

    # Plot best model predictions
    plt.figure(figsize=(18, 10))
    plt.plot(test_dates, best_result['actuals'], label='Actual Price', linewidth=3, color='blue', alpha=0.8)
    plt.plot(test_dates, best_result['predictions'], label='Predicted Price', linewidth=2, color='red', linestyle='--')
    plt.title(f'AAPL Stock Price Prediction - {best_model_name}')
    plt.xlabel('Date')
    plt.ylabel('Close Price ($)')
    plt.legend(fontsize=12)
    plt.grid(True, alpha=0.3)

    metrics_text = f"RMSE: {best_metrics['RMSE']:.2f}, MAE: {best_metrics['MAE']:.2f}, "
    metrics_text += f"MAPE: {best_metrics['MAPE']:.2f}%, Direction Accuracy: {best_metrics['Direction Accuracy']:.2f}%"
    plt.figtext(0.5, 0.01, metrics_text, ha='center', fontsize=12,
                bbox=dict(facecolor='white', alpha=0.8, edgecolor='gray'))

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.savefig(plot_path('best_model_prediction'))
    plt.close()

    # 4. Compare predictions from all models (every model was evaluated on
    # the same test loader, so one actual-price curve serves all of them)
    plt.figure(figsize=(20, 12))
    plt.plot(test_dates, best_result['actuals'], label='Actual Price', linewidth=4, color='black', alpha=0.9)

    colors = ['red', 'green', 'blue', 'purple']
    linestyles = ['--', '-.', ':', '-']

    for i, (model_name, result) in enumerate(model_results.items()):
        plt.plot(test_dates, result['predictions'],
                 label=model_name,
                 linewidth=2,
                 color=colors[i % len(colors)],
                 linestyle=linestyles[i % len(linestyles)],
                 alpha=0.7)

    plt.title('AAPL Stock Price Prediction - All Models Comparison')
    plt.xlabel('Date')
    plt.ylabel('Close Price ($)')
    plt.legend(fontsize=12, loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(plot_path('all_models_prediction_comparison'))
    plt.close()

    print("\n=== Experiment Completed ===")
    print("\nOptimized AAPL stock price prediction model experiment completed!")
    print(f"All plots have been saved to {output_dir}/ directory.")

Using device: cpu
Loading AAPL historical stock data...

Data shape: (2518, 6)

First 5 rows:
         Date Close/Last     Volume      Open      High       Low
0  02/28/2020    $273.36  106721200   $257.26   $278.41   $256.37
1  02/27/2020    $273.52   80151380    $281.1      $286   $272.96
2  02/26/2020    $292.65   49678430   $286.53   $297.88    $286.5
3  02/25/2020    $288.08   57668360   $300.95   $302.53   $286.13
4  02/24/2020    $298.18   55548830   $297.26   $304.18   $289.23

Calculating technical indicators...

Cleaned data shape: (2518, 18)

Basic data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2518 entries, 0 to 2517
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           2518 non-null   datetime64[ns]
 1   Close          2518 non-null   float64       
 2   Open           2518 non-null   float64       
 3   High           2518 non-null   float64       
 4   Low  