In [152]:
import pandas as pd
import numpy as np

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima

from prophet import Prophet

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import trim_mean

import plotly.graph_objects as go
from plotly.subplots import make_subplots


import os
import logging
import re
from datetime import timedelta
from pathlib import Path
from tqdm import tqdm

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.functional import softmax

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [153]:
def load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker):
    """
    Load and preprocess stock and financial data, aligning financial metrics by year.
    Adjusted: Add technical indicators (SMA, RSI) for LSTM.
    """
    try:
        # Load stock data
        stock_df = pd.read_csv(stock_csv)
        stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.date
        stock_df = stock_df[stock_df['Ticker'] == ticker][['Date', 'Close', 'Volume']].sort_values('Date')

        # Create daily date range and fill missing dates
        date_range = pd.date_range(start=stock_df['Date'].min(), end=stock_df['Date'].max(), freq='D')
        stock_df = stock_df.set_index('Date').reindex(date_range, method='ffill').reset_index()
        stock_df = stock_df.rename(columns={'index': 'Date'})
        stock_df['Ticker'] = ticker
        stock_df['Year'] = pd.to_datetime(stock_df['Date']).dt.year
        
        # Load financial data
        balance_df = pd.read_csv(balance_csv)
        cashflow_df = pd.read_csv(cashflow_csv)
        income_df = pd.read_csv(income_csv)
        
        # Get available years (excluding index, Ticker)
        year_cols = [col for col in balance_df.columns if col not in ['index', 'Ticker']]
        
        # Select key financial metrics for all years
        financial_metrics = {}
        for metric, source_df in [
            ('Diluted EPS', income_df),
            ('EBITDA', income_df),
            ('Free Cash Flow', cashflow_df),
            ('Net Debt', balance_df)
        ]:
            metric_row = source_df[source_df['index'] == metric]
            if metric_row.empty:
                logging.warning(f"No data for {metric}")
                continue
            values = metric_row[year_cols].iloc[0].to_dict()
            financial_metrics[metric] = values
        
        # Create financial DataFrame with year-based values
        financial_df = pd.DataFrame([
            {'Year': int(year), **{metric: values.get(year, np.nan) for metric, values in financial_metrics.items()}}
            for year in year_cols
        ])
        logging.info(f"Financial data years: {financial_df['Year'].unique()}")
        logging.info(f"Financial data sample:\n{financial_df.head().to_string()}")
        
        # Merge with stock data by year
        merged_df = stock_df.merge(financial_df, on='Year', how='left')
        merged_df = merged_df.set_index('Date').asfreq('D')  # Set daily frequency
        
        # Forward-fill financial metrics
        financial_cols = ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']
        available_cols = [col for col in financial_cols if col in merged_df.columns]
        if not available_cols:
            logging.error("No financial columns available after merge.")
            return None
        
        merged_df[available_cols] = merged_df[available_cols].ffill().fillna(merged_df[available_cols].mean())
        
        # Validate for NaN and inf
        if merged_df[available_cols].isna().any().any():
            logging.error("NaN values remain in financial columns after filling.")
            return None
        if merged_df[available_cols].apply(lambda x: np.isinf(x)).any().any():
            logging.error("Inf values in financial columns.")
            return None
        
        # Check variance
        for col in available_cols + ['Close', 'Volume']:
            if merged_df[col].var() < 1e-10:
                logging.warning(f"Column '{col}' has near-zero variance: {merged_df[col].var()}. Adding noise.")
                merged_df[col] += np.random.normal(0, 0.01, len(merged_df))
        
        # Add technical indicators
        # 20-day Simple Moving Average (SMA)
        merged_df['SMA_20'] = merged_df['Close'].rolling(window=20).mean().fillna(merged_df['Close'])
        # 14-day Relative Strength Index (RSI)
        delta = merged_df['Close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        merged_df['RSI'] = 100 - (100 / (1 + rs))
        merged_df['RSI'] = merged_df['RSI'].fillna(50)  # Neutral RSI
        
        return merged_df 
    
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return None
    except Exception as e:
        logging.error(f"Error preparing data: {e}")
        return None

In [154]:
def clean_text(text):
    """
    Clean text by removing URLs and special characters.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

def prepare_text(row):
    text = (clean_text(row['Content']) if pd.notna(row['Content']) else
            clean_text(row['Description']) if pd.notna(row['Description']) else
            clean_text(row['Title']) if pd.notna(row['Title']) else "")
    return text if text else "neutral"

In [155]:
def load_news_data(news_csv, ticker, stock_dates):
    """
    Load and preprocess news data, compute sentiment scores using FinBERT, and align with stock data dates.
    """
    try:
        if not os.path.exists(news_csv):
            logging.error(f"News file not found: {news_csv}")
            raise FileNotFoundError(f"News file not found: {news_csv}")
        
        # Load news data
        news_df = pd.read_csv(news_csv)
        logging.info(f"Raw news data shape: {news_df.shape}, columns: {list(news_df.columns)}")
        
        # Parse dates
        news_df['Date'] = pd.to_datetime(news_df['Date'])
        invalid_dates = news_df['Date'].isna().sum()
        if invalid_dates > 0:
            logging.warning(f"Dropped {invalid_dates} rows due to invalid dates")
        news_df = news_df.dropna(subset=['Date'])
        logging.info(f"After date parsing, news data shape: {news_df.shape}")
        
        # Filter for ticker
        news_df['Ticker'] = news_df['Ticker'].str.strip().str.upper()
        ticker = ticker.strip().upper()
        news_df = news_df[news_df['Ticker'] == ticker]
        logging.info(f"After filtering for ticker '{ticker}', news data shape: {news_df.shape}")
        
        if news_df.empty:
            logging.warning(f"No news articles found for {ticker}. Using neutral sentiment scores.")
            return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        
        # Initialize FinBERT model and tokenizer
        logging.info("Loading FinBERT model and tokenizer...")
        tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
        model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')
        model.eval()
        if torch.cuda.is_available():
            model.cuda()
            logging.info("Using GPU for FinBERT inference")
        else:
            logging.info("Using CPU for FinBERT inference")
        
        # Prepare texts for sentiment analysis
        def prepare_text(row):
            text = (clean_text(row['Content']) if pd.notna(row['Content']) else
                    clean_text(row['Description']) if pd.notna(row['Description']) else
                    clean_text(row['Title']) if pd.notna(row['Title']) else "")
            return text if text else "neutral"
        
        texts = [prepare_text(row) for row in news_df.to_dict('records')]
        
        # Batch processing for FinBERT
        batch_size = 16
        sentiment_scores = []
        for i in tqdm(range(0, len(texts), batch_size), desc="FinBERT Sentiment Analysis"):
            batch_texts = texts[i:i + batch_size]
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
            if torch.cuda.is_available():
                inputs = {k: v.cuda() for k, v in inputs.items()}
            
            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits
                probs = softmax(logits, dim=1)
            
            probs = probs.cpu().numpy()
            for prob in probs:
                score = (prob[0] * 1) + (prob[1] * 0) + (prob[2] * -1)  # positive, neutral, negative
                sentiment_scores.append(score)
        
        news_df['Sentiment_Score'] = sentiment_scores
        logging.info(f"FinBERT Sentiment score summary: {news_df['Sentiment_Score'].describe().to_dict()}")
        
        # Aggregate sentiment by date
        sentiment_df = news_df.groupby('Date')['Sentiment_Score'].mean().reset_index()
        sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'])
        logging.info(f"Aggregated sentiment data shape: {sentiment_df.shape}")
        
        # Create full sentiment DataFrame
        sentiment_full = pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        sentiment_full = sentiment_full.merge(sentiment_df, on='Date', how='left', suffixes=('', '_news'))
        sentiment_full['Sentiment_Score'] = sentiment_full['Sentiment_Score_news'].fillna(0.0)
        
        # Forward-fill sentiment after first non-zero score
        first_news_date = sentiment_full[sentiment_full['Sentiment_Score'] != 0.0]['Date'].min()
        if pd.notna(first_news_date):
            mask = sentiment_full['Date'] >= first_news_date
            sentiment_full.loc[mask, 'Sentiment_Score'] = sentiment_full.loc[mask, 'Sentiment_Score'].ffill()
        
        sentiment_full = sentiment_full[['Date', 'Sentiment_Score']]
        logging.info(f"Final sentiment data shape: {sentiment_full.shape}, non-zero scores: {sentiment_full['Sentiment_Score'].ne(0).sum()}")
        return sentiment_full
    
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
    except Exception as e:
        logging.error(f"Error processing news data with FinBERT: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})

In [156]:
def calculate_metrics(actual, predicted):
    """
    Calculate RMSE, MAE, and MAPE for model evaluation.
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    mask = (actual > 0) & (~np.isnan(actual)) & (~np.isnan(predicted))
    actual = actual[mask]
    predicted = predicted[mask]
    
    if len(actual) == 0:
        logging.warning("No valid data for metrics calculation")
        return {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
    
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100 if np.all(actual != 0) else np.nan
    return {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

In [157]:
def create_sequences(data, seq_length, feature_cols, target_col):
    """
    Create sequences for LSTM input.
    """
    X, y = [], []
    data_array = data[feature_cols].values
    target_array = data[target_col].values
    for i in range(len(data) - seq_length):
        X.append(data_array[i:i + seq_length])
        y.append(target_array[i + seq_length])
    return np.array(X), np.array(y)

In [158]:
def arima_forecast(data, forecast_horizon=7):
    """
    Fit ARIMA model with stationarity check.
    Adjusted: Validate data, handle insufficient data, robust error handling.
    """
    try:
        # Check stationarity
        result = adfuller(data)
        if result[1] > 0.05:
            logging.info("Data is non-stationary, applying differencing")
            data_diff = data.diff().dropna()
            if len(data_diff) < 10:
                logging.error("Insufficient data after differencing")
                return None, None, None
            model = auto_arima(data_diff, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data_diff, order=best_order).fit()
            forecast_diff = arima_model.forecast(steps=forecast_horizon)
            forecast = data.iloc[-1] + forecast_diff.cumsum()
        else:
            model = auto_arima(data, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data, order=best_order).fit()
            forecast = arima_model.forecast(steps=forecast_horizon)
        
        if len(data) >= forecast_horizon:
            test_data = data[-forecast_horizon:]
            forecast_test = arima_model.forecast(steps=forecast_horizon)[-forecast_horizon:]
            metrics = calculate_metrics(test_data, forecast_test)
        else:
            metrics = {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        
        logging.info(f"ARIMA Metrics: {metrics}")
        return forecast, metrics, best_order
    except Exception as e:
        logging.error(f"Error in ARIMA forecasting: {e}")
        return None, None, None

In [159]:
def prophet_forecast(data, forecast_horizon=7, changepoint_prior_scale=0.05):
    """
    Fit Prophet model with dynamic regressors.
    """
    try:
        prophet_df = data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']]
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                       changepoint_prior_scale=changepoint_prior_scale, mcmc_samples=300 if len(prophet_df) < 1000 else 0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']:
            model.add_regressor(regressor)
        model.fit(prophet_df)
        
        future = model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'SMA_20', 'RSI']:
            future[regressor] = prophet_df[regressor].mean()
        future['Sentiment_Score'] = prophet_df['Sentiment_Score'].iloc[-10:].mean()
        forecast_df = model.predict(future)
        
        forecast = forecast_df[['ds', 'yhat']].tail(forecast_horizon).set_index('ds')['yhat']
        
        test_data = prophet_df['y'][-forecast_horizon:]
        forecast_test = model.predict(prophet_df[-forecast_horizon:])['yhat']
        metrics = calculate_metrics(test_data, forecast_test)
        
        logging.info(f"Prophet Metrics (changepoint_prior_scale={changepoint_prior_scale}): {metrics}")
        return forecast, metrics, forecast_df
    except Exception as e:
        logging.error(f"Error in Prophet forecasting: {e}")
        return None, None, None

In [160]:
def tune_prophet(data, forecast_horizon=7):
    """
    Tune Prophet model by testing changepoint_prior_scale values.
    """
    scales = [0.05, 0.1, 0.5]
    best_metrics = {'RMSE': float('inf')}
    best_forecast = None
    best_forecast_df = None
    best_scale = 0.05
    
    for scale in scales:
        forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=scale)
        if forecast is not None and metrics is not None and not np.any(np.isnan(forecast)) and metrics['RMSE'] < best_metrics['RMSE']:
                best_metrics = metrics
                best_forecast = forecast
                best_forecast_df = forecast_df
                best_scale = scale
    
    if best_forecast is None:
        logging.warning("All Prophet models failed. Using default scale=0.05.")
        forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=0.05)
        best_metrics = metrics if metrics else {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        best_forecast = forecast
        best_forecast_df = forecast_df
        best_scale = 0.05
    
    logging.info(f"Best Prophet changepoint_prior_scale: {best_scale}")
    logging.info(f"Best Prophet Metrics: {best_metrics}")
    return best_forecast, best_metrics, best_forecast_df, best_scale

In [161]:
def lstm_forecast(data, forecast_horizon=7, seq_length=60, epochs=50, batch_size=32):
    """
    Fit LSTM model for stock price prediction.
    """
    try:
        feature_cols = ['Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']
        target_col = 'Close'
        
        data = data.dropna()
        # Check for NaN or inf only on numeric feature columns
        numeric_data = data[feature_cols]
        if numeric_data.isna().any().any() or np.isinf(numeric_data).any().any():
            logging.error("Invalid data: Contains NaN or inf values in numeric columns")
            return None, None
        
        if len(data) < seq_length + forecast_horizon:
            logging.error(f"Insufficient data for LSTM (need at least {seq_length + forecast_horizon} days)")
            return None, None
        
        scaler_features = MinMaxScaler()
        scaler_target = MinMaxScaler()
        scaled_data = data.copy()
        scaled_data[feature_cols] = scaler_features.fit_transform(data[feature_cols])
        scaled_data[target_col] = scaler_target.fit_transform(data[[target_col]])
        
        X, y = create_sequences(scaled_data, seq_length, feature_cols, target_col)
        if len(X) == 0:
            logging.error("No sequences created for LSTM")
            return None, None
        
        train_size = int(len(X) * 0.8)
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]
        
        model = Sequential([
            Input((seq_length, len(feature_cols))),
            LSTM(50, return_sequences=True),
            Dropout(0.2),
            LSTM(50),
            Dropout(0.2),
            Dense(25),
            Dense(1)
        ])
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)
        
        test_predictions = model.predict(X_test, verbose=0)
        test_predictions = scaler_target.inverse_transform(test_predictions)
        y_test_actual = scaler_target.inverse_transform([y_test])[0]
        metrics = calculate_metrics(y_test_actual, test_predictions.flatten())
        logging.info(f"LSTM Metrics: {metrics}")
        
        last_sequence = scaled_data[feature_cols].values[-seq_length:]
        future_predictions = []
        current_sequence = last_sequence.copy()
        
        for _ in range(forecast_horizon):
            current_sequence_reshaped = current_sequence.reshape((1, seq_length, len(feature_cols)))
            next_pred = model.predict(current_sequence_reshaped, verbose=0)
            future_predictions.append(next_pred[0, 0])
            new_row = current_sequence[-1].copy()
            new_row[0] = next_pred[0, 0]
            current_sequence = np.vstack((current_sequence[1:], new_row))
        
        future_predictions = np.array(future_predictions).reshape(-1, 1)
        future_predictions = scaler_target.inverse_transform(future_predictions).flatten()
        
        return future_predictions, metrics
    
    except Exception as e:
        logging.error(f"Error in LSTM forecasting: {e}")
        return None, None

In [162]:
def backtest_strategy(data, predictions, model_name, threshold=None):
    """
    Backtest a trading strategy with volatility-based threshold.
    """
    try:
        # Ensure predictions is a Pandas Series with the same index as data
        if not isinstance(predictions, pd.Series):
            predictions = pd.Series(predictions, index=data.index)
        else:
            predictions = predictions.reindex(data.index, method='ffill').bfill()

        # Create pred_df with aligned indices
        pred_df = pd.DataFrame({
            'Date': data.index,
            'Close': data['Close'],
            'Prediction': predictions
        }).dropna()

        # Calculate volatility for threshold
        volatility = pred_df['Close'].pct_change().std()
        threshold = volatility * 2 if threshold is None else threshold

        # Reset index to ensure continuous integer indices for slicing
        pred_df_reset = pred_df.reset_index(drop=True)

        # Generate signals using positional slicing on the reset index
        pred_df_reset['Signal'] = 0
        if len(pred_df_reset) > 1:
            pred_df_reset.loc[1:, 'Signal'] = np.where(
                pred_df_reset['Prediction'][1:].values > pred_df_reset['Close'][:-1].values * (1 + threshold), 1,
                np.where(pred_df_reset['Prediction'][1:].values < pred_df_reset['Close'][:-1].values * (1 - threshold), -1, 0)
            )

        # Restore the original index
        pred_df_reset.index = pred_df.index
        pred_df = pred_df_reset

        # Calculate returns
        transaction_cost = 0.001
        pred_df['Return'] = pred_df['Close'].pct_change()
        pred_df['Strategy_Return'] = pred_df['Signal'].shift(1) * pred_df['Return'] - \
                                    pred_df['Signal'].abs().shift(1) * transaction_cost

        cumulative_return = (1 + pred_df['Strategy_Return'].dropna()).cumprod().iloc[-1] - 1
        num_trades = pred_df['Signal'].abs().sum()

        results = {
            'Cumulative Return (%)': cumulative_return * 100,
            'Number of Trades': num_trades
        }

        logging.info(f"Backtest Results for {model_name}: {results}")
        return results, pred_df

    except Exception as e:
        logging.error(f"Error in backtesting: {e}")
        return None, None

In [163]:
def walk_forward_validation(data, forecast_horizon=7, n_folds=10, seq_length=60):
    """
    Perform walk-forward validation for ARIMA, Prophet, and LSTM.
    """
    # Debug: Confirm lstm_forecast is a function before starting
    logging.debug(f"lstm_forecast type at start of walk_forward_validation: {type(lstm_forecast)}")
    if not callable(lstm_forecast):
        raise ValueError("lstm_forecast is not a function! It has been overwritten with type: " + str(type(lstm_forecast)))

    n_folds = min(n_folds, len(data) // forecast_horizon)
    arima_metrics_list = []
    prophet_metrics_list = []
    lstm_metrics_list = []
    
    for i in range(n_folds):
        train_end = len(data) - (n_folds - i) * forecast_horizon
        if train_end <= max(forecast_horizon, seq_length):
            continue
        
        train_data = data.iloc[:train_end]
        test_data = data.iloc[train_end:train_end + forecast_horizon]['Close']
        
        if len(test_data) != forecast_horizon:
            continue
        
        # ARIMA
        arima_model = auto_arima(train_data['Close'], seasonal=False, max_p=7, max_q=7, max_d=2,
                                stepwise=True, error_action='ignore')
        arima_fit = ARIMA(train_data['Close'], order=arima_model.order).fit()
        arima_pred = arima_fit.forecast(steps=forecast_horizon)
        arima_metrics = calculate_metrics(test_data, arima_pred)
        arima_metrics_list.append(arima_metrics)
        
        # Prophet
        prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']]
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                               changepoint_prior_scale=0.05, mcmc_samples=0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']:
            prophet_model.add_regressor(regressor)
        prophet_model.fit(prophet_df)
        future = prophet_model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'SMA_20', 'RSI']:
            future[regressor] = prophet_df[regressor].ffill().iloc[-1]
        future['Sentiment_Score'] = prophet_df['Sentiment_Score'].ffill().iloc[-1]
        prophet_pred_df = prophet_model.predict(future)
        prophet_pred = prophet_pred_df['yhat'].tail(forecast_horizon).values
        prophet_metrics = calculate_metrics(test_data.values, prophet_pred)
        prophet_metrics_list.append(prophet_metrics)
        
        # LSTM
        # Debug: Confirm lstm_forecast is still a function before calling
        logging.debug(f"lstm_forecast type before call in fold {i}: {type(lstm_forecast)}")
        lstm_pred, lstm_metrics = lstm_forecast(train_data, forecast_horizon=forecast_horizon, seq_length=seq_length, epochs=10)
        if lstm_pred is not None:
            lstm_metrics = calculate_metrics(test_data.values, lstm_pred)
            lstm_metrics_list.append(lstm_metrics)
    
    # Robust averaging
    avg_metrics = {
        'ARIMA': {
            'RMSE': trim_mean([m['RMSE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in arima_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in arima_metrics_list) else np.nan
        },
        'Prophet': {
            'RMSE': trim_mean([m['RMSE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in prophet_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in prophet_metrics_list) else np.nan
        },
        'LSTM': {
            'RMSE': trim_mean([m['RMSE'] for m in lstm_metrics_list], proportiontocut=0.1) if lstm_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in lstm_metrics_list], proportiontocut=0.1) if lstm_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in lstm_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in lstm_metrics_list) else np.nan
        }
    }
    
    logging.info(f"Walk-Forward Validation Results: {avg_metrics}")
    return avg_metrics

In [175]:
def plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df, lstm_forecast, 
                    backtest_arima_df, backtest_prophet_df, backtest_lstm_df, ticker, forecast_horizon=7):
    """
    Plot predictions with dynamic titles and interactive sentiment visualization using Plotly.
    Adjusted: Add LSTM subplot, 1x3 layout.
    """
    last_date = data.index[-1]
    future_dates = [last_date + timedelta(days=i+1) for i in range(forecast_horizon)]
    
    # Create 1x3 subplot with secondary y-axes
    fig = make_subplots(
        rows=1, cols=3, 
        subplot_titles=(f'ARIMA: {ticker} Stock Price Prediction',
                        f'Prophet: {ticker} Stock Price Prediction',
                        f'LSTM: {ticker} Stock Price Prediction'),
        shared_yaxes=True,
        specs=[[{'secondary_y': True}, {'secondary_y': True}, {'secondary_y': True}]]
    )
    
    # --- ARIMA Subplot (Column 1) ---
    fig.add_trace(
        go.Scatter(x=data.index, y=data['Close'], name='Historical Close', line=dict(color='blue')),
        row=1, col=1, secondary_y=False
    )
    
    if arima_forecast is not None:
        fig.add_trace(
            go.Scatter(x=future_dates, y=arima_forecast, name='ARIMA Forecast', 
                       line=dict(color='red', dash='dash')),
            row=1, col=1, secondary_y=False
        )
    
    if backtest_arima_df is not None:
        signals = backtest_arima_df[backtest_arima_df['Signal'].isin([1, -1])]
        marker_colors = ['green' if s == 1 else 'red' for s in signals['Signal']]
        marker_symbols = ['triangle-up' if s == 1 else 'triangle-down' for s in signals['Signal']]
        fig.add_trace(
            go.Scatter(
                x=signals['Date'], 
                y=signals['Close'], 
                name='ARIMA Signals',
                mode='markers', 
                marker=dict(color=marker_colors, symbol=marker_symbols, size=10, opacity=0.7),
                text=['Buy' if s == 1 else 'Sell' for s in signals['Signal']],
                hovertemplate='%{text}<br>Date: %{x}<br>Close: %{y:.2f}'
            ),
            row=1, col=1, secondary_y=False
        )
    
    if 'Sentiment_Score' in data.columns:
        sentiment_mask = (data['Sentiment_Score'] != 0.0) & (data['Sentiment_Score'].abs() > 0.3)
        if sentiment_mask.any():
            fig.add_trace(
                go.Scatter(
                    x=data.index[sentiment_mask], 
                    y=data['Sentiment_Score'][sentiment_mask],
                    name='Significant Sentiment',
                    mode='markers',
                    marker=dict(color='purple', size=8, opacity=0.5),
                    text=[f"Score: {s:.2f}" for s in data['Sentiment_Score'][sentiment_mask]],
                    hoverinfo='x+text'
                ),
                row=1, col=1, secondary_y=True
            )
    
    # --- Prophet Subplot (Column 2) ---
    fig.add_trace(
        go.Scatter(x=data.index, y=data['Close'], name='Historical Close', line=dict(color='blue'), showlegend=False),
        row=1, col=2, secondary_y=False
    )
    
    if prophet_forecast is not None:
        fig.add_trace(
            go.Scatter(x=future_dates, y=prophet_forecast, name='Prophet Forecast', 
                       line=dict(color='green', dash='dash')),
            row=1, col=2, secondary_y=False
        )
        if prophet_forecast_df is not None and 'yhat_upper' in prophet_forecast_df.columns:
            fig.add_trace(
                go.Scatter(
                    x=future_dates + future_dates[::-1],
                    y=list(prophet_forecast_df['yhat_upper'].tail(forecast_horizon)) + 
                      list(prophet_forecast_df['yhat_lower'].tail(forecast_horizon))[::-1],
                    fill='toself',
                    fillcolor='rgba(0, 128, 0, 0.1)',
                    line=dict(color='rgba(255,255,255,0)'),
                    name='Prophet Confidence Interval',
                    hoverinfo='skip'
                ),
                row=1, col=2, secondary_y=False
            )
    
    if backtest_prophet_df is not None:
        signals = backtest_prophet_df[backtest_prophet_df['Signal'].isin([1, -1])]
        marker_colors = ['lime' if s == 1 else 'darkred' for s in signals['Signal']]
        marker_symbols = ['triangle-up' if s == 1 else 'triangle-down' for s in signals['Signal']]
        fig.add_trace(
            go.Scatter(
                x=signals['Date'], 
                y=signals['Close'], 
                name='Prophet Signals',
                mode='markers', 
                marker=dict(color=marker_colors, symbol=marker_symbols, size=10, opacity=0.7),
                text=['Buy' if s == 1 else 'Sell' for s in signals['Signal']],
                hovertemplate='%{text}<br>Date: %{x}<br>Close: %{y:.2f}'
            ),
            row=1, col=2, secondary_y=False
        )
    
    if 'Sentiment_Score' in data.columns and sentiment_mask.any():
        fig.add_trace(
            go.Scatter(
                x=data.index[sentiment_mask], 
                y=data['Sentiment_Score'][sentiment_mask],
                name='Significant Sentiment',
                mode='markers',
                marker=dict(color='purple', size=8, opacity=0.5),
                text=[f"Score: {s:.2f}" for s in data['Sentiment_Score'][sentiment_mask]],
                hoverinfo='x+text',
                showlegend=False
            ),
            row=1, col=2, secondary_y=True
        )
    
    # --- LSTM Subplot (Column 3) ---
    fig.add_trace(
        go.Scatter(x=data.index, y=data['Close'], name='Historical Close', line=dict(color='blue'), showlegend=False),
        row=1, col=3, secondary_y=False
    )
    

    fig.add_trace(
        go.Scatter(x=future_dates, y=lstm_forecast, name='LSTM Forecast', 
                    line=dict(color='orange', dash='dash')),
        row=1, col=3, secondary_y=False
    )

    
    if backtest_lstm_df is not None:
        signals = backtest_lstm_df[backtest_lstm_df['Signal'].isin([1, -1])]
        marker_colors = ['yellow' if s == 1 else 'purple' for s in signals['Signal']]
        marker_symbols = ['triangle-up' if s == 1 else 'triangle-down' for s in signals['Signal']]
        fig.add_trace(
            go.Scatter(
                x=signals['Date'], 
                y=signals['Close'], 
                name='LSTM Signals',
                mode='markers', 
                marker=dict(color=marker_colors, symbol=marker_symbols, size=10, opacity=0.7),
                text=['Buy' if s == 1 else 'Sell' for s in signals['Signal']],
                hovertemplate='%{text}<br>Date: %{x}<br>Close: %{y:.2f}'
            ),
            row=1, col=3, secondary_y=False
        )
    
    if 'Sentiment_Score' in data.columns and sentiment_mask.any():
        fig.add_trace(
            go.Scatter(
                x=data.index[sentiment_mask], 
                y=data['Sentiment_Score'][sentiment_mask],
                name='Significant Sentiment',
                mode='markers',
                marker=dict(color='purple', size=8, opacity=0.5),
                text=[f"Score: {s:.2f}" for s in data['Sentiment_Score'][sentiment_mask]],
                hoverinfo='x+text',
                showlegend=False
            ),
            row=1, col=3, secondary_y=True
        )
    
    # Update layout
    fig.update_layout(
        title=dict(text=f'{ticker} Stock Price Prediction and Backtest Signals (7-Day Forecast)',
                   x=0.5, xanchor='center', y=0.98),
        height=1000, width=1800,  # Wider to accommodate 3 subplots
        showlegend=True,
        legend=dict(orientation='h', yanchor='bottom', y=-0.35, xanchor='center', x=0.5),
        hovermode='x unified',
        plot_bgcolor='white',
        margin=dict(l=50, r=50, t=100, b=100)
    )
    
    # Update x-axes
    for col in range(1, 4):
        fig.update_xaxes(title_text='Date', gridcolor='lightgrey', row=1, col=col)
        fig.update_xaxes(
            rangeslider_visible=True,
            rangeslider_thickness=0.15,
            rangeselector=dict(
                buttons=list([
                    dict(count=1, label='1mo', step='month', stepmode='backward'),
                    dict(count=6, label='6mo', step='month', stepmode='backward'),
                    dict(count=1, label='1y', step='year', stepmode='backward'),
                    dict(step='all', label='All'),
                ]),
                x=0.02 + (col-1) * 0.33,
                y=0.95,
            ),
            row=1, col=col
        )
    
    # Update y-axes
    for col in range(1, 4):
        fig.update_yaxes(title_text='Close Price (USD)', gridcolor='lightgrey', row=1, col=col, secondary_y=False)
        fig.update_yaxes(title_text='Sentiment Score (-1 to 1)', row=1, col=col, secondary_y=True, 
                         showgrid=False, tickfont=dict(color='purple'))
    
    fig.show()

**Main**

In [165]:
# Parameters
forecast_horizon = 7
seq_length = 60
data_dir = Path('../data')
stock_csv = data_dir / 'cleaned_stock_data.csv'

# Get tickers
tickers = pd.read_csv(stock_csv)['Ticker'].unique()[:5]
results = []

for ticker in tickers:
    logging.info(f"Processing ticker: {ticker}")
    balance_csv = data_dir / f'balance_sheet_{ticker}.csv'
    cashflow_csv = data_dir / f'cash_flow_{ticker}.csv'
    income_csv = data_dir / f'income_statement_{ticker}.csv'
    news_csv = data_dir / 'news_data.csv'
    
    # Load and prepare data
    data = load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker)
    if data is None:
        logging.error(f"Skipping {ticker} due to data loading errors")
        continue
    
    # Load news data and merge
    sentiment_df = load_news_data(news_csv, ticker, data.index)
    if sentiment_df is not None:
        data = data.reset_index().merge(sentiment_df[['Date', 'Sentiment_Score']], on='Date', how='left')
        data['Sentiment_Score'] = data['Sentiment_Score'].fillna(0.0)
        data = data.set_index('Date')
        logging.info(f"Added Sentiment_Score to data for {ticker}. Columns: {list(data.columns)}")
    else:
        logging.warning(f"No sentiment data for {ticker}. Using neutral scores.")
        data['Sentiment_Score'] = 0.0

2025-05-08 21:22:45,019 - INFO - Processing ticker: AAPL
2025-05-08 21:22:45,150 - INFO - Financial data years: [2024 2023 2022 2021 2020]
2025-05-08 21:22:45,162 - INFO - Financial data sample:
   Year  Diluted EPS        EBITDA  Free Cash Flow      Net Debt
0  2024         6.08  1.346610e+11    1.088070e+11  7.668600e+10
1  2023         6.13  1.258200e+11    9.958400e+10  8.112300e+10
2  2022         6.11  1.305410e+11    1.114430e+11  9.642300e+10
3  2021         5.61  1.231360e+11    9.295300e+10  8.977900e+10
4  2020          NaN           NaN             NaN           NaN
2025-05-08 21:22:45,311 - INFO - Raw news data shape: (200, 7), columns: ['Ticker', 'Date', 'Title', 'Description', 'Source', 'URL', 'Content']
2025-05-08 21:22:45,316 - INFO - After date parsing, news data shape: (200, 7)
2025-05-08 21:22:45,320 - INFO - After filtering for ticker 'AAPL', news data shape: (200, 7)
2025-05-08 21:22:45,321 - INFO - Loading FinBERT model and tokenizer...
2025-05-08 21:22:48,202 - 

In [166]:
data

Unnamed: 0_level_0,Close,Volume,Ticker,Year,Diluted EPS,EBITDA,Free Cash Flow,Net Debt,SMA_20,RSI,Sentiment_Score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-05-09,171.770,45326874.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,171.7700,50.000000,0.000000
2023-05-10,173.555,53724501.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,173.5550,50.000000,0.000000
2023-05-11,173.750,49473076.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,173.7500,50.000000,0.000000
2023-05-12,172.570,45533138.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,172.5700,50.000000,0.000000
2023-05-13,172.570,45533138.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,172.5700,50.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
2025-04-28,210.140,38737224.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,199.3995,61.996222,0.000000
2025-04-29,211.210,36827633.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,201.3390,63.975347,0.699596
2025-04-30,212.500,52286454.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,202.4430,85.233862,0.490177
2025-05-01,213.320,57364925.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,203.2015,84.070058,0.478522


**Models**

In [167]:
# ARIMA forecast
arima_forecast, arima_metrics, arima_order = arima_forecast(data['Close'], forecast_horizon)

2025-05-08 21:25:31,835 - INFO - Data is non-stationary, applying differencing


Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=3634.728, Time=0.25 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=3650.950, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=3642.263, Time=0.02 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=3641.639, Time=0.04 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=3639.339, Time=0.15 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=3637.173, Time=0.14 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=3635.632, Time=0.27 sec
 ARIMA(2,0,3)(0,0,0)[0]             : AIC=3635.262, Time=0.15 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=3643.635, Time=0.06 sec
 ARIMA(1,0,3)(0,0,0)[0]             : AIC=3633.402, Time=0.13 sec
 ARIMA(0,0,3)(0,0,0)[0]             : AIC=3632.777, Time=0.06 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=3643.624, Time=0.06 sec
 ARIMA(0,0,4)(0,0,0)[0]             : AIC=3632.953, Time=0.10 sec
 ARIMA(1,0,4)(0,0,0)[0]             : AIC=3633.902, Time=0.22 sec
 ARIMA(0,0,3)(0,0,0)[0] intercept


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:25:33,779 - INFO - ARIMA Metrics: {'RMSE': 210.1147205183507, 'MAE': 210.10050252357516, 'MAPE': 99.97470268155942}


In [168]:
# Prophet forecast with tuning
prophet_forecast, prophet_metrics, prophet_forecast_df, prophet_scale = tune_prophet(data, forecast_horizon)

2025-05-08 21:25:33,902 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:25:34,398 - DEBUG - TBB already found in load path
2025-05-08 21:25:34,440 - DEBUG - cmd: C:\Users\nguye\AppData\Local\Programs\Python\Python312\Lib\site-packages\prophet\stan_model\prophet_model.bin info
cwd: None
2025-05-08 21:25:34,465 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\i6qyoki2.json
2025-05-08 21:25:34,546 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\5pv7fuxf.json
21:25:34 - cmdstanpy - INFO - CmdStan start processing
2025-05-08 21:25:34,557 - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

2025-05-08 21:25:34,635 - DEBUG - idx 0
2025-05-08 21:25:34,645 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:25:34,647 - DEBUG - idx 1
2025-05-08 21:25:34,648 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=79770', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\i6qyoki2.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\5pv7fuxf.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\prophet_modelaai5_v51\\prophet_model-20250508212534_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
2025-05-08 21:25:34,654 - DEBUG - idx 2
2025-05-08 21:25:34,656 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:25:34,664 - DEBUG - idx 3
2025-05-08 21:25:34,666 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:25:34,670 - DEBUG - CmdStan args: ['C:\\Us

                                                                                                                                                                                                                                                                                                                                

21:26:36 - cmdstanpy - INFO - CmdStan done processing.
2025-05-08 21:26:36,916 - INFO - CmdStan done processing.
2025-05-08 21:26:36,920 - DEBUG - runset
RunSet: chains=4, chain_ids=[1, 2, 3, 4], num_processes=4
 cmd (chain 1):
	['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=79770', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\i6qyoki2.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\5pv7fuxf.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\prophet_modelaai5_v51\\prophet_model-20250508212534_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
 retcodes=[0, 0, 0, 0]
 per-chain output files (showing chain 1 only):
 csv_file:
	C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\prophet_modelaai5_v51\prophet_model-20250508212534_1.csv
 console_msgs (if any):
	C:\Users\nguye\AppData




	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
2025-05-08 21:26:39,710 - INFO - Prophet Metrics (changepoint_prior_scale=0.05): {'RMSE': 5.52717579167394, 'MAE': 4.6044595488017865, 'MAPE': 2.1976442013451263}
2025-05-08 21:26:39,717 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:26:39,806 - DEBUG - TBB already found in load path
2025-05-08 21:26:39,849 - DEBUG - cmd: C:\Users\nguye\AppData\Local\Programs\Python\Python312\Lib\site-pack

chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

2025-05-08 21:26:40,051 - DEBUG - idx 0
2025-05-08 21:26:40,063 - DEBUG - idx 1
2025-05-08 21:26:40,078 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:26:40,076 - DEBUG - idx 2
2025-05-08 21:26:40,063 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:26:40,080 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=2', 'random', 'seed=22471', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\gtwtgrdo.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\r1iywwlk.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\prophet_modelku7k50n8\\prophet_model-20250508212640_2.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
2025-05-08 21:26:40,084 - DEBUG - idx 3
2025-05-08 21:26:40,085 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:26:40,088 - DEBUG - CmdStan args: ['C:\\Us

                                                                                                                                                                                                                                                                                                                                

21:27:48 - cmdstanpy - INFO - CmdStan done processing.
2025-05-08 21:27:48,025 - INFO - CmdStan done processing.
2025-05-08 21:27:48,028 - DEBUG - runset
RunSet: chains=4, chain_ids=[1, 2, 3, 4], num_processes=4
 cmd (chain 1):
	['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=22471', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\gtwtgrdo.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\r1iywwlk.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\prophet_modelku7k50n8\\prophet_model-20250508212640_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
 retcodes=[0, 0, 0, 0]
 per-chain output files (showing chain 1 only):
 csv_file:
	C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\prophet_modelku7k50n8\prophet_model-20250508212640_1.csv
 console_msgs (if any):
	C:\Users\nguye\AppData




Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Matrix of independent variables is inf, but must be finite! (in 'prophet.stan', line 137, column 2 to line 142, column 4)
	Exception: normal_id_glm_lpdf: Scale vector is 0, but must be positive finite! (in 'prophet.stan', li

chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

2025-05-08 21:27:51,454 - DEBUG - idx 0
2025-05-08 21:27:51,473 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:27:51,475 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=53060', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\qkohq3_v.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\uzhvjz76.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\prophet_model232lwfj0\\prophet_model-20250508212751_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
2025-05-08 21:27:51,562 - DEBUG - idx 1
2025-05-08 21:27:51,584 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:27:51,586 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=2', 'random', 'seed=5306

                                                                                                                                                                                                                                                                                                                                

21:28:53 - cmdstanpy - INFO - CmdStan done processing.
2025-05-08 21:28:53,593 - INFO - CmdStan done processing.
2025-05-08 21:28:53,596 - DEBUG - runset
RunSet: chains=4, chain_ids=[1, 2, 3, 4], num_processes=4
 cmd (chain 1):
	['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=53060', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\qkohq3_v.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\uzhvjz76.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\prophet_model232lwfj0\\prophet_model-20250508212751_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
 retcodes=[0, 0, 0, 0]
 per-chain output files (showing chain 1 only):
 csv_file:
	C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\prophet_model232lwfj0\prophet_model-20250508212751_1.csv
 console_msgs (if any):
	C:\Users\nguye\AppData




	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
2025-05-08 21:28:56,837 - INFO - Prophet Metrics (changepoint_prior_scale=0.5): {'RMSE': 4.841590662004366, 'MAE': 4.416525297816825, 'MAPE': 2.1101327116323167}
2025-05-08 21:28:56,842 - INFO - Best Prophet changepoint_prior_scale: 0.5
2025-05-08 21:28:56,843 - INFO - Best Prophet Metrics: {'RMSE': 4.841590662004366, 'MAE': 4.416525297816825, 'MAPE': 2.1101327116323167}


In [169]:
# LSTM forecast
lstm_pred, lstm_metrics = lstm_forecast(data, forecast_horizon=forecast_horizon, seq_length=seq_length)
logging.info(f"LSTM Predictions: {lstm_pred}, Metrics: {lstm_metrics}")

Epoch 1/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - loss: 0.1224
Epoch 2/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - loss: 0.0146
Epoch 3/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 0.0096
Epoch 4/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 0.0066
Epoch 5/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0051
Epoch 6/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0045
Epoch 7/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0046
Epoch 8/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0045
Epoch 9/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - loss: 0.0044
Epoch 10/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - loss: 0.0043

2025-05-08 21:29:51,714 - INFO - LSTM Metrics: {'RMSE': 8.95358219494315, 'MAE': 5.610927697662122, 'MAPE': 2.6182989052380012}
2025-05-08 21:29:52,564 - INFO - LSTM Predictions: [210.51025 210.04193 209.46649 208.93642 208.51233 208.2084  208.01523], Metrics: {'RMSE': 8.95358219494315, 'MAE': 5.610927697662122, 'MAPE': 2.6182989052380012}


**Walk-forward validation**

In [170]:
avg_metrics = walk_forward_validation(data, forecast_horizon=forecast_horizon, seq_length=seq_length)
logging.info(f"Average Metrics: {avg_metrics}")


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:29:53,997 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:29:54,055 - DEBUG - TBB already found in load path
2025-05-08 21:29:54,086 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\5oyiffwm.json
2025-05-08 21:29:54,171 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\_a0bz5lu.json
2025-05-08 21:29:54,175 - DEBUG - idx 0
2025-05-08 21:29:54,176 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:29:54,177 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=67136', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmp7iu757we\\5oyiffwm.json', 'init=C:\\Users\\nguye\\

Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 51ms/step - loss: 0.0580
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0097
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step - loss: 0.0061
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 0.0051
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 77ms/step - loss: 0.0046
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 0.0049
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - loss: 0.0044
Epoch 8/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 0.0045
Epoch 9/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - loss: 0.0038
Epoch 10/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 0.0038

2025-05-08 21:30:14,126 - INFO - LSTM Metrics: {'RMSE': 5.693864495992882, 'MAE': 4.576021092519042, 'MAPE': 1.909264944414242}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:30:16,045 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:30:16,103 - DEBUG - TBB already found in load path
2025-05-08 21:30:16,136 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\5b48bczk.json
2025-05-08 21:30:16,216 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\ycjjbwrk.json
2025-05-08 21:30:16,220 - DEBUG - idx 0
2025-05-08 21:30:16,221 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:30:16,222 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'ra

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - loss: 0.1000
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0136
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0119
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0084
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0071
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0058
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 0.0053
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0091
Epoch 9/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 0.0057
Epoch 10/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - loss: 0.0043

2025-05-08 21:30:30,291 - INFO - LSTM Metrics: {'RMSE': 6.9175514565443015, 'MAE': 5.544061314606464, 'MAPE': 2.3782144731523185}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:30:32,160 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:30:32,211 - DEBUG - TBB already found in load path
2025-05-08 21:30:32,247 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\ty4owk9k.json
2025-05-08 21:30:32,341 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\0l3qhx35.json
2025-05-08 21:30:32,345 - DEBUG - idx 0
2025-05-08 21:30:32,346 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:30:32,348 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', '

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 46ms/step - loss: 0.0557
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0117
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 0.0068
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0056
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - loss: 0.0045
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - loss: 0.0048
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 0.0050
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0041
Epoch 9/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - loss: 0.0047
Epoch 10/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0042

2025-05-08 21:30:46,736 - INFO - LSTM Metrics: {'RMSE': 5.5405306034042265, 'MAE': 4.413617273549567, 'MAPE': 1.8292123235083324}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:30:49,104 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:30:49,153 - DEBUG - TBB already found in load path
2025-05-08 21:30:49,184 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\1akm4q78.json
2025-05-08 21:30:49,265 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\xyzgzsmk.json
2025-05-08 21:30:49,268 - DEBUG - idx 0
2025-05-08 21:30:49,269 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:30:49,270 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', '

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 44ms/step - loss: 0.0914
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - loss: 0.0129
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0075
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0069
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0051
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0044
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0054
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 0.0044
Epoch 9/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0053
Epoch 10/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0048

2025-05-08 21:31:03,322 - INFO - LSTM Metrics: {'RMSE': 6.180399100607303, 'MAE': 4.663429304553614, 'MAPE': 1.9781381631752557}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:31:04,831 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:31:04,897 - DEBUG - TBB already found in load path
2025-05-08 21:31:04,937 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\2cbznwzh.json
2025-05-08 21:31:05,051 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\sb9hi8j8.json
2025-05-08 21:31:05,055 - DEBUG - idx 0
2025-05-08 21:31:05,058 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:31:05,059 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'r

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 43ms/step - loss: 0.0803
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 0.0180
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - loss: 0.0084
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0078
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 0.0072
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 0.0054
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0046
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 0.0053
Epoch 9/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - loss: 0.0050
Epoch 10/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0050

2025-05-08 21:31:18,467 - INFO - LSTM Metrics: {'RMSE': 7.67733404307637, 'MAE': 6.3543982373046894, 'MAPE': 2.684887867217528}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:31:19,645 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:31:19,696 - DEBUG - TBB already found in load path
2025-05-08 21:31:19,726 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\1e6pak4d.json
2025-05-08 21:31:19,819 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\k488ll2p.json
2025-05-08 21:31:19,823 - DEBUG - idx 0
2025-05-08 21:31:19,824 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:31:19,825 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'ra

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - loss: 0.0642
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - loss: 0.0122
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 0.0070
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - loss: 0.0057
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0055
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0051
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0058
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - loss: 0.0057
Epoch 9/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.0042
Epoch 10/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0049

2025-05-08 21:31:34,130 - INFO - LSTM Metrics: {'RMSE': 7.27729661236646, 'MAE': 6.014407159714474, 'MAPE': 2.5431929622049796}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:31:36,154 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:31:36,209 - DEBUG - TBB already found in load path
2025-05-08 21:31:36,244 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\fm_a_9co.json
2025-05-08 21:31:36,338 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\d2cm22ig.json
2025-05-08 21:31:36,341 - DEBUG - idx 0
2025-05-08 21:31:36,342 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:31:36,344 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'ra

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 45ms/step - loss: 0.0455
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0132
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 0.0081
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0049
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 0.0047
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - loss: 0.0052
Epoch 7/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - loss: 0.0045
Epoch 8/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 0.0042
Epoch 9/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0040
Epoch 10/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 0.0043

2025-05-08 21:31:50,122 - INFO - LSTM Metrics: {'RMSE': 7.260255017750833, 'MAE': 5.8956951141357425, 'MAPE': 2.48850139509957}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:31:54,402 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:31:54,454 - DEBUG - TBB already found in load path
2025-05-08 21:31:54,489 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\0qko3094.json
2025-05-08 21:31:54,581 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\bn3m33sl.json
2025-05-08 21:31:54,584 - DEBUG - idx 0
2025-05-08 21:31:54,587 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:31:54,587 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'ra

Epoch 1/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 59ms/step - loss: 0.1484
Epoch 2/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - loss: 0.0248
Epoch 3/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0089
Epoch 4/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - loss: 0.0082
Epoch 5/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 0.0066
Epoch 6/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - loss: 0.0063
Epoch 7/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 64ms/step - loss: 0.0058
Epoch 8/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 54ms/step - loss: 0.0055
Epoch 9/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - loss: 0.0043
Epoch 10/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0057

2025-05-08 21:32:10,191 - INFO - LSTM Metrics: {'RMSE': 10.65066561728899, 'MAE': 6.921468184123666, 'MAPE': 3.1777820882291987}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:32:15,786 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:32:15,933 - DEBUG - TBB already found in load path
2025-05-08 21:32:15,969 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\k_48v13c.json
2025-05-08 21:32:16,074 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\snqkzwer.json
2025-05-08 21:32:16,078 - DEBUG - idx 0
2025-05-08 21:32:16,079 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:32:16,080 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'r

Epoch 1/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 52ms/step - loss: 0.1688
Epoch 2/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0179
Epoch 3/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0090
Epoch 4/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0078
Epoch 5/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0052
Epoch 6/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 0.0052
Epoch 7/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - loss: 0.0045
Epoch 8/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.0045
Epoch 9/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - loss: 0.0041
Epoch 10/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.0044

2025-05-08 21:32:31,120 - INFO - LSTM Metrics: {'RMSE': 10.575044267703577, 'MAE': 6.913842512523855, 'MAPE': 3.213235724064468}

No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:32:36,169 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:32:36,238 - DEBUG - TBB already found in load path
2025-05-08 21:32:36,276 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\ho7duohc.json
2025-05-08 21:32:36,389 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\ajeie31l.json
2025-05-08 21:32:36,394 - DEBUG - idx 0
2025-05-08 21:32:36,395 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:32:36,397 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'r

Epoch 1/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 53ms/step - loss: 0.0359
Epoch 2/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - loss: 0.0096
Epoch 3/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 0.0068
Epoch 4/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0064
Epoch 5/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.0057
Epoch 6/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 0.0059
Epoch 7/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0045
Epoch 8/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.0047
Epoch 9/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0042
Epoch 10/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 0.0042

2025-05-08 21:32:51,658 - INFO - LSTM Metrics: {'RMSE': 10.40940658508059, 'MAE': 6.837089742024739, 'MAPE': 3.198917853322443}
2025-05-08 21:32:52,465 - INFO - Walk-Forward Validation Results: {'ARIMA': {'RMSE': 6.232461551035971, 'MAE': 4.940595416676027, 'MAPE': 2.276363201976886}, 'Prophet': {'RMSE': 7.749649131864493, 'MAE': 6.889598049921838, 'MAPE': 3.2015651859965484}, 'LSTM': {'RMSE': 8.94881084386845, 'MAE': 8.35446853637695, 'MAPE': 3.8850841737368076}}
2025-05-08 21:32:52,468 - INFO - Average Metrics: {'ARIMA': {'RMSE': 6.232461551035971, 'MAE': 4.940595416676027, 'MAPE': 2.276363201976886}, 'Prophet': {'RMSE': 7.749649131864493, 'MAE': 6.889598049921838, 'MAPE': 3.2015651859965484}, 'LSTM': {'RMSE': 8.94881084386845, 'MAE': 8.35446853637695, 'MAPE': 3.8850841737368076}}


**Backtesting**

In [171]:
# Backtesting
if len(data) >= 100 + seq_length:
    # ARIMA historical predictions
    arima_model = auto_arima(data['Close'][:-forecast_horizon], seasonal=False, max_p=7, max_q=7, max_d=2)
    arima_fit = ARIMA(data['Close'][:-forecast_horizon], order=arima_model.order).fit()
    arima_hist_pred = pd.Series(arima_fit.predict(start=0, end=len(data)-forecast_horizon-1), 
                               index=data.iloc[:-forecast_horizon].index)
    arima_backtest_results, arima_backtest_df = backtest_strategy(data.iloc[:-forecast_horizon], arima_hist_pred, 'ARIMA')
    
    # Prophet historical predictions
    train_data = data.iloc[:-forecast_horizon]
    prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']]
    prophet_df = prophet_df.drop_duplicates(subset='Date')
    prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
    
    prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                            changepoint_prior_scale=prophet_scale, mcmc_samples=0)
    for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']:
        prophet_model.add_regressor(regressor)
    prophet_model.fit(prophet_df)
    
    prophet_pred_df = prophet_model.predict(prophet_df)
    prophet_pred_df = prophet_pred_df.set_index('ds').reindex(train_data.index)
    prophet_hist_pred = prophet_pred_df['yhat']
    prophet_backtest_results, prophet_backtest_df = backtest_strategy(train_data, prophet_hist_pred, 'Prophet')
    
    # LSTM historical predictions
    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaled_data = train_data.copy()
    feature_cols = ['Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score', 'SMA_20', 'RSI']
    scaled_data[feature_cols] = scaler_features.fit_transform(train_data[feature_cols])
    scaled_data['Close'] = scaler_target.fit_transform(train_data[['Close']])
    
    X, y = create_sequences(scaled_data, seq_length, feature_cols, 'Close')
    model = Sequential([
        Input((seq_length, len(feature_cols))),
        LSTM(50, return_sequences=True),
        Dropout(0.2),
        LSTM(50),
        Dropout(0.2),
        Dense(25),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    model.fit(X, y, epochs=50, batch_size=32, verbose=1)
    
    # Generate historical predictions
    lstm_hist_pred_scaled = model.predict(X, verbose=0)
    lstm_hist_pred = scaler_target.inverse_transform(lstm_hist_pred_scaled).flatten()
    lstm_hist_pred_series = pd.Series(lstm_hist_pred, index=train_data.iloc[seq_length:].index)
    lstm_backtest_results, lstm_backtest_df = backtest_strategy(train_data.iloc[seq_length:], lstm_hist_pred_series, 'LSTM')
else:
    arima_backtest_results, arima_backtest_df = None, None
    prophet_backtest_results, prophet_backtest_df = None, None
    lstm_backtest_results, lstm_backtest_df = None, None
    logging.warning(f"Insufficient data for backtesting {ticker}.")


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.


No frequency information was provided, so inferred frequency D will be used.

2025-05-08 21:32:56,909 - INFO - Backtest Results for ARIMA: {'Cumulative Return (%)': 3.9594475370234417, 'Number of Trades': 1}
2025-05-08 21:32:56,918 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-08 21:32:56,980 - DEBUG - TBB already found in load path
2025-05-08 21:32:57,019 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\dyjdhzgd.json
2025-05-08 21:32:57,123 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmp7iu757we\lw94z553.json
2025-05-08 21:32:57,127 - DEBUG - idx 0
2025-05-08 21:32:57,136 - DEBUG - running CmdStan, num_threads: 1
2025-05-08 21:32:57,138 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', '

Epoch 1/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 69ms/step - loss: 0.0684
Epoch 2/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - loss: 0.0107
Epoch 3/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - loss: 0.0098
Epoch 4/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - loss: 0.0096
Epoch 5/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.0066
Epoch 6/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 0.0074
Epoch 7/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - loss: 0.0069
Epoch 8/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - loss: 0.0067
Epoch 9/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 61ms/step - loss: 0.0071
Epoch 10/50
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step - loss: 0.0062

2025-05-08 21:34:16,006 - INFO - Backtest Results for LSTM: {'Cumulative Return (%)': 22.64587157840081, 'Number of Trades': 56}


**Plot predictions**

In [176]:
# Plot predictions
# Plot predictions with the LSTM forecast result
plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df, lstm_pred,
                 arima_backtest_df, prophet_backtest_df, lstm_backtest_df, ticker, forecast_horizon)

In [177]:
# Save predictions
if all(x is not None for x in [arima_forecast, prophet_forecast, lstm_forecast]):
    future_dates = [data.index[-1] + timedelta(days=i+1) for i in range(forecast_horizon)]
    pred_df = pd.DataFrame({
        'Date': future_dates,
        'ARIMA_Prediction': arima_forecast,
        'Prophet_Prediction': prophet_forecast,
        'LSTM_Prediction': lstm_forecast
    })
    pred_df.to_csv(data_dir / f'{ticker}_stock_price_predictions_finbert_lstm.csv', index=False)
    logging.info(f"Saved predictions for {ticker}")

# Save metrics
metrics_df = pd.DataFrame({
    'Model': ['ARIMA', 'Prophet', 'LSTM'],
    'RMSE': [arima_metrics.get('RMSE', np.nan), prophet_metrics.get('RMSE', np.nan), lstm_metrics.get('RMSE', np.nan)],
    'MAE': [arima_metrics.get('MAE', np.nan), prophet_metrics.get('MAE', np.nan), lstm_metrics.get('MAE', np.nan)],
    'MAPE': [arima_metrics.get('MAPE', np.nan), prophet_metrics.get('MAPE', np.nan), lstm_metrics.get('MAPE', np.nan)],
    'Best Parameters': [f"Order: {arima_order}", f"changepoint_prior_scale: {prophet_scale or 0.05}", 'LSTM (50 units)']
})
metrics_df.to_csv(data_dir / f'{ticker}_model_metrics_finbert_lstm.csv', index=False)
logging.info(f"Saved metrics for {ticker}")

# Save backtest results
if all(x is not None for x in [arima_backtest_results, prophet_backtest_results, lstm_backtest_results]):
    backtest_df = pd.DataFrame({
        'Model': ['ARIMA', 'Prophet', 'LSTM'],
        'Cumulative Return (%)': [
            arima_backtest_results['Cumulative Return (%)'],
            prophet_backtest_results['Cumulative Return (%)'],
            lstm_backtest_results['Cumulative Return (%)']
        ],
        'Number of Trades': [
            arima_backtest_results['Number of Trades'],
            prophet_backtest_results['Number of Trades'],
            lstm_backtest_results['Number of Trades']
        ]
    })
    backtest_df.to_csv(data_dir / f'{ticker}_backtest_results_finbert_lstm.csv', index=False)
    logging.info(f"Saved backtest results for {ticker}")

# Select best model
rmse_values = {
    'ARIMA': avg_metrics['ARIMA']['RMSE'],
    'Prophet': avg_metrics['Prophet']['RMSE'],
    'LSTM': avg_metrics['LSTM']['RMSE']
}
best_model = min(rmse_values, key=rmse_values.get)
results.append({'Ticker': ticker, 'Best_Model': best_model, 'Metrics': avg_metrics})

# Save summary
summary_df = pd.DataFrame(results)
summary_df.to_csv(data_dir / 'summary_results_finbert_lstm.csv', index=False)
logging.info("Saved summary results")

2025-05-08 21:36:52,874 - INFO - Saved predictions for AAPL
2025-05-08 21:36:52,878 - INFO - Saved metrics for AAPL
2025-05-08 21:36:52,881 - INFO - Saved backtest results for AAPL
2025-05-08 21:36:52,885 - INFO - Saved summary results
