In [2]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import timedelta
from scipy.stats import trim_mean
import os
import logging
import cmdstanpy
import re
from pathlib import Path
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Suppress cmdstanpy debug logs to reduce console clutter
cmdstanpy.utils.get_logger().setLevel(logging.WARNING)

In [None]:
def load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker):
    """
    Load and preprocess stock and financial data, aligning them for modeling.
    """
    try:
        # Load stock data
        stock_df = pd.read_csv(stock_csv)
        stock_df['Date'] = pd.to_datetime(stock_df['Date'])
        stock_df = stock_df[stock_df['Ticker'] == ticker][['Date', 'Close', 'Volume']].sort_values('Date')
        stock_df = stock_df.drop_duplicates(subset='Date')
        
        # Validate stock data
        if stock_df['Close'].min() <= 0:
            raise ValueError("Invalid stock data: Close prices contain zero or negative values")
        
        # Create daily date range and fill missing dates
        date_range = pd.date_range(start=stock_df['Date'].min(), end=stock_df['Date'].max(), freq='D')
        stock_df = stock_df.set_index('Date').reindex(date_range, method='ffill').reset_index()
        stock_df = stock_df.rename(columns={'index': 'Date'})
        stock_df['Ticker'] = ticker
        
        print(f"Loaded {len(stock_df)} daily stock data points for {ticker}")
        
        # Load financial data
        balance_df = pd.read_csv(balance_csv)
        cashflow_df = pd.read_csv(cashflow_csv)
        income_df = pd.read_csv(income_csv)
        
        # Select key financial metrics
        financial_metrics = {
            'Diluted EPS': income_df[income_df['index'] == 'Diluted EPS'][['2024-09-30 00:00:00']].iloc[0, 0],
            'Free Cash Flow': cashflow_df[cashflow_df['index'] == 'Free Cash Flow'][['2024-09-30 00:00:00']].iloc[0, 0],
            'Net Debt': balance_df[balance_df['index'] == 'Net Debt'][['2024-09-30 00:00:00']].iloc[0, 0],
            'EBITDA': income_df[income_df['index'] == 'EBITDA'][['2024-09-30 00:00:00']].iloc[0, 0]
        }
        
        # Create financial DataFrame aligned with stock data
        financial_df = pd.DataFrame(index=stock_df['Date'])
        for metric, value in financial_metrics.items():
            financial_df[metric] = value
        
        # Merge stock and financial data
        merged_df = stock_df.merge(financial_df.reset_index(), on='Date', how='left')
        merged_df = merged_df.set_index('Date')
        
        # Scale financial features
        scaler = StandardScaler()
        financial_cols = ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']
        merged_df[financial_cols] = scaler.fit_transform(merged_df[financial_cols])
        
        print(f"Prepared data with {len(merged_df)} rows, including financial features: {list(financial_metrics.keys())}")
        return merged_df, scaler
    
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
        return None
    except Exception as e:
        print(f"Error preparing data: {e}")
        return None

In [4]:
def clean_text(text):
    """
    Clean text by removing URLs and special characters.
    Added: Text preprocessing for sentiment analysis.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

In [None]:
def load_news_data(news_csv, ticker, stock_dates):
    """
    Load and preprocess news data, compute sentiment scores, and align with stock data dates.
    Adjusted: Neutral fallback sentiment, text cleaning, performance optimization, date validation.
    """
    try:
        if not os.path.exists(news_csv):
            logging.error(f"News file not found: {news_csv}")
            raise FileNotFoundError(f"News file not found: {news_csv}")
        
        # Load news data
        news_df = pd.read_csv(news_csv)
        logging.info(f"Raw news data shape: {news_df.shape}, columns: {list(news_df.columns)}")
        
        # Parse dates
        news_df['Date'] = pd.to_datetime(news_df['Date'])
        invalid_dates = news_df['Date'].isna().sum()
        if invalid_dates > 0:
            logging.warning(f"Dropped {invalid_dates} rows due to invalid dates")
        news_df = news_df.dropna(subset=['Date'])
        logging.info(f"After date parsing, news data shape: {news_df.shape}")
        
        # Filter for ticker
        news_df['Ticker'] = news_df['Ticker'].str.strip().str.upper()
        ticker = ticker.strip().upper()
        news_df = news_df[news_df['Ticker'] == ticker]
        logging.info(f"After filtering for ticker '{ticker}', news data shape: {news_df.shape}")
        
        if news_df.empty:
            logging.warning(f"No news articles found for {ticker}. Using neutral sentiment scores.")
            return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        
        # Initialize sentiment analyzer
        analyzer = SentimentIntensityAnalyzer()
        
        # Compute sentiment with progress bar
        def get_sentiment(row):
            text = (clean_text(row['Content']) if pd.notna(row['Content']) else
                    clean_text(row['Description']) if pd.notna(row['Description']) else
                    clean_text(row['Title']) if pd.notna(row['Title']) else "")
            if not text:
                logging.debug(f"Empty text for row: {row.name}. Returning 0.0")
                return 0.0
            scores = analyzer.polarity_scores(text)
            return scores['compound']
        
        news_df['Sentiment_Score'] = [get_sentiment(row) for row in tqdm(news_df.to_dict('records'), desc="Sentiment Analysis")]
        logging.info(f"Sentiment score summary: {news_df['Sentiment_Score'].describe().to_dict()}")
        
        # Aggregate sentiment by date
        sentiment_df = news_df.groupby('Date')['Sentiment_Score'].mean().reset_index()
        sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'])
        logging.info(f"Aggregated sentiment data shape: {sentiment_df.shape}")
        
        # Create full sentiment DataFrame
        sentiment_full = pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        sentiment_full = sentiment_full.merge(sentiment_df, on='Date', how='left', suffixes=('', '_news'))
        sentiment_full['Sentiment_Score'] = sentiment_full['Sentiment_Score_news'].fillna(0.0)
        
        # Forward-fill sentiment after first non-zero score
        first_news_date = sentiment_full[sentiment_full['Sentiment_Score'] != 0.0]['Date'].min()
        if pd.notna(first_news_date):
            mask = sentiment_full['Date'] >= first_news_date
            sentiment_full.loc[mask, 'Sentiment_Score'] = sentiment_full.loc[mask, 'Sentiment_Score'].ffill()
        
        sentiment_full = sentiment_full[['Date', 'Sentiment_Score']]
        logging.info(f"Final sentiment data shape: {sentiment_full.shape}, non-zero scores: {sentiment_full['Sentiment_Score'].ne(0).sum()}")
        return sentiment_full
    
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
    except Exception as e:
        logging.error(f"Error processing news data: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})


In [6]:
def calculate_metrics(actual, predicted):
    """
    Calculate RMSE, MAE, and MAPE for model evaluation.
    Adjusted: Log filtered data points.
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    mask = (actual > 0) & (~np.isnan(actual)) & (~np.isnan(predicted))
    actual = actual[mask]
    predicted = predicted[mask]
    
    if len(actual) < len(np.array(actual, copy=True)):
        logging.info(f"Filtered {len(np.array(actual, copy=True)) - len(actual)} invalid data points for metrics calculation")
    
    if len(actual) == 0:
        logging.warning("No valid data for metrics calculation")
        return {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
    
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100 if np.all(actual != 0) else np.nan
    return {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

In [7]:
def arima_forecast(data, forecast_horizon=7):
    """
    Fit ARIMA model with stationarity check.
    Adjusted: Validate data, handle insufficient data, robust error handling.
    """
    try:
        # Validate data
        data = data.dropna()
        if data.isna().any() or np.isinf(data).any():
            logging.error("Invalid data: Contains NaN or inf values")
            return None, None, None
        if len(data) < 30:
            logging.error("Insufficient data for ARIMA (<30 days)")
            return None, None, None
        
        # Check stationarity
        result = adfuller(data)
        if result[1] > 0.05:
            logging.info("Data is non-stationary, applying differencing")
            data_diff = data.diff().dropna()
            if len(data_diff) < 10:
                logging.error("Insufficient data after differencing")
                return None, None, None
            model = auto_arima(data_diff, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data_diff, order=best_order).fit()
            forecast_diff = arima_model.forecast(steps=forecast_horizon)
            forecast = data.iloc[-1] + forecast_diff.cumsum()
        else:
            model = auto_arima(data, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data, order=best_order).fit()
            forecast = arima_model.forecast(steps=forecast_horizon)
        
        # Evaluate
        if len(data) >= forecast_horizon:
            test_data = data[-forecast_horizon:]
            forecast_test = arima_model.forecast(steps=forecast_horizon)[-forecast_horizon:]
            metrics = calculate_metrics(test_data, forecast_test)
        else:
            metrics = {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        
        logging.info(f"ARIMA Metrics: {metrics}")
        return forecast, metrics, best_order
    except Exception as e:
        logging.error(f"Error in ARIMA forecasting: {e}")
        return None, None, None

In [8]:
def prophet_forecast(data, forecast_horizon=7, changepoint_prior_scale=0.05):
    """
    Fit Prophet model with dynamic regressors.
    Adjusted: Forecast regressors, handle duplicates, enable MCMC for small datasets.
    """
    try:
        logging.info(f"Prophet input DataFrame columns: {list(data.columns)}")
        prophet_df = data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
        prophet_df = prophet_df.drop_duplicates(subset='Date')
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        logging.info(f"Prophet input data summary:\n{prophet_df.describe()}")
        
        # Fit Prophet model
        model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                       changepoint_prior_scale=changepoint_prior_scale, mcmc_samples=300 if len(prophet_df) < 1000 else 0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            model.add_regressor(regressor)
        model.fit(prophet_df)
        
        # Create future dataframe with dynamic regressors
        future = model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']:
            future[regressor] = prophet_df[regressor].mean()  # Use mean of historical data
        future['Sentiment_Score'] = prophet_df['Sentiment_Score'].iloc[-10:].mean()  # Recent sentiment trend
        forecast_df = model.predict(future)
        
        # Extract forecast
        forecast = forecast_df[['ds', 'yhat']].tail(forecast_horizon).set_index('ds')['yhat']
        
        # Evaluate
        test_data = prophet_df['y'][-forecast_horizon:]
        forecast_test = model.predict(prophet_df[-forecast_horizon:])['yhat']
        metrics = calculate_metrics(test_data, forecast_test)
        
        logging.info(f"Prophet Metrics (changepoint_prior_scale={changepoint_prior_scale}): {metrics}")
        return forecast, metrics, forecast_df
    except Exception as e:
        logging.error(f"Error in Prophet forecasting: {e}")
        return None, None, None

In [9]:
def tune_prophet(data, forecast_horizon=7):
    """
    Tune Prophet model by testing changepoint_prior_scale values.
    """
    scales = [0.05, 0.1, 0.5]
    best_metrics = {'RMSE': float('inf')}
    best_forecast = None
    best_forecast_df = None
    best_scale = 0.05
    
    for scale in scales:
        try:
            forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=scale)
            if forecast is not None and metrics is not None and not np.any(np.isnan(forecast)):
                logging.info(f"Prophet Metrics (changepoint_prior_scale={scale}): {metrics}")
                if metrics['RMSE'] < best_metrics['RMSE']:
                    best_metrics = metrics
                    best_forecast = forecast
                    best_forecast_df = forecast_df
                    best_scale = scale
            else:
                logging.warning(f"Prophet failed for scale={scale}. Skipping.")
        except Exception as e:
            logging.warning(f"Prophet failed for scale={scale}: {e}")
            continue
    
    if best_forecast is None:
        logging.warning("All Prophet models failed. Using default scale=0.05.")
        forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=0.05)
        best_metrics = metrics if metrics else {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        best_forecast = forecast
        best_forecast_df = forecast_df
        best_scale = 0.05
    
    logging.info(f"Best Prophet changepoint_prior_scale: {best_scale}")
    logging.info(f"Best Prophet Metrics: {best_metrics}")
    return best_forecast, best_metrics, best_forecast_df, best_scale

In [10]:
def backtest_strategy(data, predictions, model_name, threshold=None):
    """
    Backtest a trading strategy with volatility-based threshold.
    Adjusted: Volatility-based threshold, prevent lookahead bias, add transaction costs.
    """
    try:
        logging.info(f"Backtest input lengths - Data: {len(data)}, Predictions: {len(predictions)}")
        if len(predictions) != len(data):
            logging.warning(f"Predictions length ({len(predictions)}) does not match data length ({len(data)}). Truncating.")
            min_length = min(len(data), len(predictions))
            data = data.iloc[:min_length]
            predictions = predictions[:min_length]
        
        pred_df = pd.DataFrame({'Date': data.index, 'Close': data['Close'], 'Prediction': predictions})
        pred_df = pred_df.dropna()
        
        # Volatility-based threshold
        volatility = pred_df['Close'].pct_change().std()
        threshold = volatility * 2 if threshold is None else threshold
        
        # Generate signals without lookahead bias
        pred_df['Signal'] = 0
        pred_df.loc[1:, 'Signal'] = np.where(
            pred_df['Prediction'][1:] > pred_df['Close'][:-1] * (1 + threshold), 1,
            np.where(pred_df['Prediction'][1:] < pred_df['Close'][:-1] * (1 - threshold), -1, 0)
        )
        
        # Calculate returns with transaction costs
        pred_df['Return'] = pred_df['Close'].pct_change()
        transaction_cost = 0.001  # 0.1% per trade
        pred_df['Strategy_Return'] = pred_df['Signal'].shift(1) * pred_df['Return'] - \
                                    pred_df['Signal'].abs().shift(1) * transaction_cost
        
        # Cumulative return
        cumulative_return = (1 + pred_df['Strategy_Return'].dropna()).cumprod().iloc[-1] - 1
        num_trades = pred_df['Signal'].abs().sum()
        
        results = {
            'Cumulative Return (%)': cumulative_return * 100,
            'Number of Trades': num_trades
        }
        
        logging.info(f"Backtest Results for {model_name}: {results}")
        return results, pred_df
    except Exception as e:
        logging.error(f"Error in backtesting: {e}")
        return None, None


In [11]:
def walk_forward_validation(data, forecast_horizon=7, n_folds=10):
    """
    Perform walk-forward validation with robust averaging.
    Adjusted: Increase folds, handle missing regressors, robust averaging.
    """
    n_folds = min(n_folds, len(data) // forecast_horizon)
    arima_metrics_list = []
    prophet_metrics_list = []
    
    for i in range(n_folds):
        train_end = len(data) - (n_folds - i) * forecast_horizon
        if train_end <= forecast_horizon:
            continue
        
        train_data = data.iloc[:train_end]
        test_data = data.iloc[train_end:train_end + forecast_horizon]['Close']
        
        if len(test_data) != forecast_horizon:
            continue
        
        # ARIMA
        arima_model = auto_arima(train_data['Close'], seasonal=False, max_p=7, max_q=7, max_d=2,
                                stepwise=True, error_action='ignore')
        arima_fit = ARIMA(train_data['Close'], order=arima_model.order).fit()
        arima_pred = arima_fit.forecast(steps=forecast_horizon)
        arima_metrics = calculate_metrics(test_data, arima_pred)
        arima_metrics_list.append(arima_metrics)
        
        # Prophet
        prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                               changepoint_prior_scale=0.05, mcmc_samples=0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            prophet_model.add_regressor(regressor)
        prophet_model.fit(prophet_df)
        future = prophet_model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            future[regressor] = prophet_df[regressor].ffill().iloc[-1]
        prophet_pred_df = prophet_model.predict(future)
        prophet_pred = prophet_pred_df['yhat'].tail(forecast_horizon)
        prophet_metrics = calculate_metrics(test_data, prophet_pred)
        prophet_metrics_list.append(prophet_metrics)
    
    # Robust averaging
    avg_metrics = {
        'ARIMA': {
            'RMSE': trim_mean([m['RMSE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in arima_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in arima_metrics_list) else np.nan
        },
        'Prophet': {
            'RMSE': trim_mean([m['RMSE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in prophet_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in prophet_metrics_list) else np.nan
        }
    }
    
    logging.info(f"Walk-Forward Validation Results: {avg_metrics}")
    return avg_metrics

In [12]:
def plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df, backtest_arima_df, backtest_prophet_df, ticker, forecast_horizon=7):
    """
    Plot predictions with dynamic titles and interactive sentiment visualization using Plotly.
    Adjusted: Uses scatter points for significant sentiment, interactive subplots, saves HTML and PNG.
    """
    # Calculate future dates
    last_date = data.index[-1]
    future_dates = [last_date + timedelta(days=i+1) for i in range(forecast_horizon)]
    
    # Create 1x2 subplot with secondary y-axes
    fig = make_subplots(rows=1, cols=2, 
                        subplot_titles=(f'ARIMA: {ticker} Stock Price Prediction', 
                                        f'Prophet: {ticker} Stock Price Prediction'),
                        shared_yaxes=True,
                        specs=[[{'secondary_y': True}, {'secondary_y': True}]])
    
    # --- ARIMA Subplot (Left, Column 1) ---
    # Historical Close
    fig.add_trace(
        go.Scatter(x=data.index, y=data['Close'], name='Historical Close', line=dict(color='blue')),
        row=1, col=1, secondary_y=False
    )
    
    # ARIMA Forecast
    if arima_forecast is not None:
        fig.add_trace(
            go.Scatter(x=future_dates, y=arima_forecast, name='ARIMA Forecast', 
                       line=dict(color='red', dash='dash')),
            row=1, col=1, secondary_y=False
        )
    
    # ARIMA Buy/Sell Signals
    if backtest_arima_df is not None:
        buy_signals = backtest_arima_df[backtest_arima_df['Signal'] == 1]
        sell_signals = backtest_arima_df[backtest_arima_df['Signal'] == -1]
        fig.add_trace(
            go.Scatter(x=buy_signals['Date'], y=buy_signals['Close'], name='ARIMA Buy Signal',
                       mode='markers', marker=dict(color='green', symbol='triangle-up', size=10, opacity=0.7)),
            row=1, col=1, secondary_y=False
        )
        fig.add_trace(
            go.Scatter(x=sell_signals['Date'], y=sell_signals['Close'], name='ARIMA Sell Signal',
                       mode='markers', marker=dict(color='red', symbol='triangle-down', size=10, opacity=0.7)),
            row=1, col=1, secondary_y=False
        )
    
    # Sentiment Scores (Scatter Points)
    if 'Sentiment_Score' in data.columns:
        sentiment_mask = (data['Sentiment_Score'] != 0.0) & (data['Sentiment_Score'].abs() > 0.3)
        if sentiment_mask.any():
            fig.add_trace(
                go.Scatter(
                    x=data.index[sentiment_mask], 
                    y=data['Sentiment_Score'][sentiment_mask],
                    name='Significant Sentiment',
                    mode='markers',
                    marker=dict(color='purple', size=8, opacity=0.5),
                    text=[f"Score: {s:.2f}" for s in data['Sentiment_Score'][sentiment_mask]],
                    hoverinfo='x+text'
                ),
                row=1, col=1, secondary_y=True
            )
    
    # Historical Close
    fig.add_trace(
        go.Scatter(x=data.index, y=data['Close'], name='Historical Close', line=dict(color='blue'), showlegend=False),
        row=1, col=2, secondary_y=False
    )
    
    # Prophet Forecast
    if prophet_forecast is not None:
        fig.add_trace(
            go.Scatter(x=future_dates, y=prophet_forecast, name='Prophet Forecast', 
                       line=dict(color='green', dash='dash')),
            row=1, col=2, secondary_y=False
        )
        # Confidence Interval
        fig.add_trace(
            go.Scatter(
                x=future_dates + future_dates[::-1],
                y=list(prophet_forecast_df['yhat_upper'].tail(forecast_horizon)) + 
                  list(prophet_forecast_df['yhat_lower'].tail(forecast_horizon))[::-1],
                fill='toself',
                fillcolor='rgba(0, 128, 0, 0.1)',
                line=dict(color='rgba(255,255,255,0)'),
                name='Prophet Confidence Interval',
                hoverinfo='skip'
            ),
            row=1, col=2, secondary_y=False
        )
    
    # Prophet Buy/Sell Signals
    if backtest_prophet_df is not None:
        buy_signals = backtest_prophet_df[backtest_prophet_df['Signal'] == 1]
        sell_signals = backtest_prophet_df[backtest_prophet_df['Signal'] == -1]
        fig.add_trace(
            go.Scatter(x=buy_signals['Date'], y=buy_signals['Close'], name='Prophet Buy Signal',
                       mode='markers', marker=dict(color='lime', symbol='triangle-up', size=10, opacity=0.7)),
            row=1, col=2, secondary_y=False
        )
        fig.add_trace(
            go.Scatter(x=sell_signals['Date'], y=sell_signals['Close'], name='Prophet Sell Signal',
                       mode='markers', marker=dict(color='darkred', symbol='triangle-down', size=10, opacity=0.7)),
            row=1, col=2, secondary_y=False
        )
    
    # Sentiment Scores (Scatter Points)
    if 'Sentiment_Score' in data.columns:
        if sentiment_mask.any():
            fig.add_trace(
                go.Scatter(
                    x=data.index[sentiment_mask], 
                    y=data['Sentiment_Score'][sentiment_mask],
                    name='Significant Sentiment',
                    mode='markers',
                    marker=dict(color='purple', size=8, opacity=0.5),
                    text=[f"Score: {s:.2f}" for s in data['Sentiment_Score'][sentiment_mask]],
                    hoverinfo='x+text',
                    showlegend=False
                ),
                row=1, col=2, secondary_y=True
            )
    
    # Update layout
    fig.update_layout(
        title=dict(text=f'{ticker} Stock Price Prediction and Backtest Signals (7-Day Forecast)', 
                   x=0.5, xanchor='center', y=0.98),
        height=600, width=1200,
        showlegend=True,
        legend=dict(orientation='h', yanchor='bottom', y=-0.2, xanchor='center', x=0.5),
        hovermode='x unified'
    )
    
    # Update y-axes
    fig.update_yaxes(title_text='Close Price (USD)', row=1, col=1, secondary_y=False)
    fig.update_yaxes(title_text='Sentiment Score (-1 to 1)', row=1, col=1, secondary_y=True, 
                     showgrid=False, tickfont=dict(color='purple'))
    fig.update_yaxes(title_text='Close Price (USD)', row=1, col=2, secondary_y=False)
    fig.update_yaxes(title_text='Sentiment Score (-1 to 1)', row=1, col=2, secondary_y=True, 
                     showgrid=False, tickfont=dict(color='purple'))
    
    # Add range slider and selectors
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label='1m', step='month', stepmode='backward'),
                dict(count=6, label='6m', step='month', stepmode='backward'),
                dict(count=1, label='1y', step='year', stepmode='backward'),
                dict(step='all', label='All')
            ])
        ),
        row=1, col=1
    )
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label='1m', step='month', stepmode='backward'),
                dict(count=6, label='6m', step='month', stepmode='backward'),
                dict(count=1, label='1y', step='year', stepmode='backward'),
                dict(step='all', label='All')
            ])
        ),
        row=1, col=2
    )
    
    # Save interactive HTML and static PNG
    fig.write_image(f'{ticker}_stock_price_predictions.png', scale=2) 
    
    # Display in notebook
    fig.show()

**Main**

In [13]:
# Parameters
forecast_horizon = 7
data_dir = Path('../data')
stock_csv = data_dir / 'cleaned_stock_data.csv'

# Configure cmdstanpy
cmdstan_dir = os.path.expanduser('~/.cmdstan')
os.makedirs(cmdstan_dir, exist_ok=True)
os.environ['CMDSTAN'] = cmdstan_dir
logging.info(f"Set CMDSTAN directory to {cmdstan_dir}")

# Get tickers
tickers = pd.read_csv(stock_csv)['Ticker'].unique()[:5]  # Process up to 5 tickers
results = []

for ticker in tickers:
    logging.info(f"Processing ticker: {ticker}")
    balance_csv = data_dir / f'balance_sheet_{ticker}.csv'
    cashflow_csv = data_dir / f'cash_flow_{ticker}.csv'
    income_csv = data_dir / f'income_statement_{ticker}.csv'
    news_csv = data_dir / 'news_data.csv'
    
    # Load and prepare data
    data, scaler = load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker)
    
    # Load news data and merge
    sentiment_df = load_news_data(news_csv, ticker, data.index)
    if sentiment_df is not None:
        data = data.reset_index().merge(sentiment_df[['Date', 'Sentiment_Score']], on='Date', how='left')
        data['Sentiment_Score'] = data['Sentiment_Score'].fillna(0.0)
        data = data.set_index('Date')
        logging.info(f"Added Sentiment_Score to data for {ticker}. Columns: {list(data.columns)}")
    else:
        logging.warning(f"No sentiment data for {ticker}. Using neutral scores.")
        data['Sentiment_Score'] = 0.0

2025-05-05 22:03:23,810 - INFO - Set CMDSTAN directory to C:\Users\nguye/.cmdstan
2025-05-05 22:03:23,817 - INFO - Processing ticker: AAPL
2025-05-05 22:03:23,854 - INFO - Raw news data shape: (200, 7), columns: ['Ticker', 'Date', 'Title', 'Description', 'Source', 'URL', 'Content']
2025-05-05 22:03:23,854 - INFO - After date parsing, news data shape: (200, 7)
2025-05-05 22:03:23,854 - INFO - After filtering for ticker 'AAPL', news data shape: (200, 7)


Loaded 726 daily stock data points for AAPL
Prepared data with 726 rows, including financial features: ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']


Sentiment Analysis: 100%|██████████| 200/200 [00:02<00:00, 84.11it/s]
2025-05-05 22:03:26,253 - INFO - Sentiment score summary: {'count': 200.0, 'mean': 0.6353595, 'std': 0.5866429150331083, 'min': -0.9746, '25%': 0.33455, '50%': 0.9899, '75%': 0.9982, 'max': 0.9996}
2025-05-05 22:03:26,253 - INFO - Aggregated sentiment data shape: (5, 2)
2025-05-05 22:03:26,253 - INFO - Final sentiment data shape: (726, 2), non-zero scores: 5
2025-05-05 22:03:26,253 - INFO - Added Sentiment_Score to data for AAPL. Columns: ['Close', 'Volume', 'Ticker', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']


In [14]:
data

Unnamed: 0_level_0,Close,Volume,Ticker,Diluted EPS,Free Cash Flow,Net Debt,EBITDA,Sentiment_Score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-05-08,173.500,55962634.0,AAPL,0.0,0.0,0.0,0.0,0.000000
2023-05-09,171.770,45326874.0,AAPL,0.0,0.0,0.0,0.0,0.000000
2023-05-10,173.555,53724501.0,AAPL,0.0,0.0,0.0,0.0,0.000000
2023-05-11,173.750,49473076.0,AAPL,0.0,0.0,0.0,0.0,0.000000
2023-05-12,172.570,45533138.0,AAPL,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...
2025-04-28,210.140,38737224.0,AAPL,0.0,0.0,0.0,0.0,0.804225
2025-04-29,211.210,36827633.0,AAPL,0.0,0.0,0.0,0.0,0.803630
2025-04-30,212.500,52286454.0,AAPL,0.0,0.0,0.0,0.0,0.756735
2025-05-01,213.320,57364925.0,AAPL,0.0,0.0,0.0,0.0,0.500205


**Models**

In [13]:
# ARIMA forecast
arima_forecast, arima_metrics, arima_order = arima_forecast(data['Close'], forecast_horizon)

2025-05-05 21:56:42,253 - INFO - Data is non-stationary, applying differencing


Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=3639.272, Time=0.17 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=3655.321, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=3646.713, Time=0.02 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=3646.090, Time=0.03 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=3643.767, Time=0.09 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=3641.632, Time=0.08 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=3640.161, Time=0.16 sec
 ARIMA(2,0,3)(0,0,0)[0]             : AIC=3639.780, Time=0.09 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=3648.085, Time=0.04 sec
 ARIMA(1,0,3)(0,0,0)[0]             : AIC=3637.914, Time=0.09 sec
 ARIMA(0,0,3)(0,0,0)[0]             : AIC=3637.281, Time=0.04 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=3648.073, Time=0.03 sec
 ARIMA(0,0,4)(0,0,0)[0]             : AIC=3637.470, Time=0.07 sec


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-05 21:56:43,498 - INFO - ARIMA Metrics: {'RMSE': 210.11717332126253, 'MAE': 210.1029578810291, 'MAPE': 99.97586966264268}


 ARIMA(1,0,4)(0,0,0)[0]             : AIC=3638.421, Time=0.16 sec
 ARIMA(0,0,3)(0,0,0)[0] intercept   : AIC=3639.109, Time=0.08 sec

Best model:  ARIMA(0,0,3)(0,0,0)[0]          
Total fit time: 1.171 seconds


In [14]:
# Prophet forecast with tuning
prophet_forecast, prophet_metrics, prophet_forecast_df, prophet_scale = tune_prophet(data, forecast_horizon)

2025-05-05 21:56:43,514 - INFO - Prophet input DataFrame columns: ['Close', 'Volume', 'Ticker', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']
2025-05-05 21:56:43,543 - INFO - Prophet input data summary:
                        ds           y  Diluted EPS  Free Cash Flow  Net Debt  \
count                  726  726.000000        726.0           726.0     726.0   
mean   2024-05-04 12:00:00  202.270620          0.0             0.0       0.0   
min    2023-05-08 00:00:00  165.000000          0.0             0.0       0.0   
25%    2023-11-05 06:00:00  180.950000          0.0             0.0       0.0   
50%    2024-05-04 12:00:00  193.500000          0.0             0.0       0.0   
75%    2024-11-01 18:00:00  225.860000          0.0             0.0       0.0   
max    2025-05-02 00:00:00  259.020000          0.0             0.0       0.0   
std                    NaN   24.831609          0.0             0.0       0.0   

       EBITDA  Sentiment_Score  
count 

chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', li




	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
2025-05-05 21:57:59,449 - INFO - Prophet Metrics (changepoint_prior_scale=0.05): {'RMSE': 4.268181115498104, 'MAE': 3.4594639534728953, 'MAPE': 1.6560626873078026}
2025-05-05 21:57:59,450 - INFO - Prophet Metrics (changepoint_prior_scale=0.05): {'RMSE': 4.268181115498104, 'MAE': 3.4594639534728953, 'MAPE': 1.6560626873078026}
2025-05-05 21:57:59,451 - INFO - Prophet input DataFrame columns: ['Close', 'Volume', 'Ticker', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']
2025-05-05 21:57:59,469 - INFO - Prophet input data summary:
                        ds           y  Diluted EPS  Free Cash Flow  Net Debt  \
count                  726  726.000000        726.0          

chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', li




2025-05-05 21:59:07,098 - INFO - Prophet Metrics (changepoint_prior_scale=0.1): {'RMSE': 5.148379687563651, 'MAE': 4.061348219056567, 'MAPE': 1.941080155714326}
2025-05-05 21:59:07,100 - INFO - Prophet Metrics (changepoint_prior_scale=0.1): {'RMSE': 5.148379687563651, 'MAE': 4.061348219056567, 'MAPE': 1.941080155714326}
2025-05-05 21:59:07,102 - INFO - Prophet input DataFrame columns: ['Close', 'Volume', 'Ticker', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']
2025-05-05 21:59:07,139 - INFO - Prophet input data summary:
                        ds           y  Diluted EPS  Free Cash Flow  Net Debt  \
count                  726  726.000000        726.0           726.0     726.0   
mean   2024-05-04 12:00:00  202.270620          0.0             0.0       0.0   
min    2023-05-08 00:00:00  165.000000          0.0             0.0       0.0   
25%    2023-11-05 06:00:00  180.950000          0.0             0.0       0.0   
50%    2024-05-04 12:00:00  193.500000    

chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophe




	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
2025-05-05 22:00:14,979 - INFO - Prophet Metrics (changepoint_prior_scale=0.5): {'RMSE': 5.186403168276778, 'MAE': 4.1981014741654485, 'MAPE': 2.009013101538352}
2025-05-05 22:00:14,983 - INFO - Prophet Metrics (changepoint_prior_scale=0.5): {'RMSE': 5.186403168276778, 'MAE': 4.1981014741654485, 'MAPE': 2.009013101538352}
2025-05-05 22:00:14,984 - INFO - Best Prophet changepoint_prior_scale: 0.05
20

**Walk-forward validation**

In [15]:
avg_metrics = walk_forward_validation(data, forecast_horizon)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-05 22:00:45,869 - INFO - Walk-Fo

**Backtesting**

In [16]:
# Backtesting
if len(data) >= 100:
    # ARIMA historical predictions
    arima_model = auto_arima(data['Close'][:-forecast_horizon], seasonal=False, max_p=7, max_q=7, max_d=2)
    arima_fit = ARIMA(data['Close'][:-forecast_horizon], order=arima_model.order).fit()
    arima_hist_pred = arima_fit.predict(start=0, end=len(data)-forecast_horizon-1)
    arima_backtest_results, arima_backtest_df = backtest_strategy(data.iloc[:-forecast_horizon], arima_hist_pred, 'ARIMA')
    
    # Prophet historical predictions
    train_data = data.iloc[:-forecast_horizon]
    prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
    prophet_df = prophet_df.drop_duplicates(subset='Date')
    prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
    
    prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                            changepoint_prior_scale=prophet_scale, mcmc_samples=0)
    for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
        prophet_model.add_regressor(regressor)
    prophet_model.fit(prophet_df)
    
    prophet_pred_df = prophet_model.predict(prophet_df)
    prophet_pred_df = prophet_pred_df.set_index('ds').reindex(train_data.index, method='ffill')
    prophet_hist_pred = prophet_pred_df['yhat']
    
    prophet_backtest_results, prophet_backtest_df = backtest_strategy(train_data, prophet_hist_pred, 'Prophet')
else:
    arima_backtest_results, arima_backtest_df = None, None
    prophet_backtest_results, prophet_backtest_df = None, None
    logging.warning(f"Insufficient data for backtesting {ticker}.")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-05 22:00:48,929 - INFO - Backtest input lengths - Data: 719, Predictions: 719
2025-05-05 22:00:48,934 - ERROR - Error in backtesting: Can only compare identically-labeled Series objects
2025-05-05 22:00:49,434 - INFO - Backtest input lengths - Data: 719, Predictions: 719
2025-05-05 22:00:49,434 - ERROR - Error in backtesting: Can only compare identically-labeled Series objects


**Plot predictions**

In [None]:
# Plot predictions
plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df,
                arima_backtest_df, prophet_backtest_df, ticker, forecast_horizon)

In [None]:
# Save predictions
if arima_forecast is not None and prophet_forecast is not None:
    future_dates = [data.index[-1] + timedelta(days=i+1) for i in range(forecast_horizon)]
    pred_df = pd.DataFrame({
        'Date': future_dates,
        'ARIMA_Prediction': arima_forecast,
        'Prophet_Prediction': prophet_forecast
    })
    pred_df.to_csv(data_dir / f'{ticker}_stock_price_predictions.csv', index=False)
    logging.info(f"Saved predictions for {ticker}")

# Save metrics
metrics_df = pd.DataFrame({
    'Model': ['ARIMA', 'Prophet'],
    'RMSE': [arima_metrics.get('RMSE', np.nan), prophet_metrics.get('RMSE', np.nan)],
    'MAE': [arima_metrics.get('MAE', np.nan), prophet_metrics.get('MAE', np.nan)],
    'MAPE': [arima_metrics.get('MAPE', np.nan), prophet_metrics.get('MAPE', np.nan)],
    'Best Parameters': [f"Order: {arima_order}", f"changepoint_prior_scale: {prophet_scale or 0.05}"]
})
metrics_df.to_csv(data_dir / f'{ticker}_model_metrics.csv', index=False)
logging.info(f"Saved metrics for {ticker}")

# Save backtest results
if arima_backtest_results and prophet_backtest_results:
    backtest_df = pd.DataFrame({
        'Model': ['ARIMA', 'Prophet'],
        'Cumulative Return (%)': [arima_backtest_results['Cumulative Return (%)'], 
                                prophet_backtest_results['Cumulative Return (%)']],
        'Number of Trades': [arima_backtest_results['Number of Trades'], 
                            prophet_backtest_results['Number of Trades']]
    })
    backtest_df.to_csv(data_dir / f'{ticker}_backtest_results.csv', index=False)
    logging.info(f"Saved backtest results for {ticker}")

# Select best model
best_model = 'Prophet' if avg_metrics['Prophet']['RMSE'] < avg_metrics['ARIMA']['RMSE'] else 'ARIMA'
results.append({'Ticker': ticker, 'Best_Model': best_model, 'Metrics': avg_metrics})

# Save summary
summary_df = pd.DataFrame(results)
summary_df.to_csv(data_dir / 'summary_results.csv', index=False)
logging.info("Saved summary results")

2025-05-05 21:18:53,810 - INFO - Saved predictions for AAPL
2025-05-05 21:18:53,814 - INFO - Saved metrics for AAPL
2025-05-05 21:18:53,819 - INFO - Saved summary results
