In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import timedelta
from scipy.stats import trim_mean
import os
import logging
import re
from pathlib import Path
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
def load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker):
    """
    Load and preprocess stock and financial data, aligning financial metrics by year.
    """
    try:
        # Load stock data
        stock_df = pd.read_csv(stock_csv)
        stock_df['Date'] = pd.to_datetime(stock_df['Date']).dt.date
        stock_df = stock_df[stock_df['Ticker'] == ticker][['Date', 'Close', 'Volume']].sort_values('Date')

        # Create daily date range and fill missing dates
        date_range = pd.date_range(start=stock_df['Date'].min(), end=stock_df['Date'].max(), freq='D')
        stock_df = stock_df.set_index('Date').reindex(date_range, method='ffill').reset_index()
        stock_df = stock_df.rename(columns={'index': 'Date'})
        stock_df['Ticker'] = ticker
        stock_df['Year'] = pd.to_datetime(stock_df['Date']).dt.year
        
        # Load financial data
        balance_df = pd.read_csv(balance_csv)
        cashflow_df = pd.read_csv(cashflow_csv)
        income_df = pd.read_csv(income_csv)
        
        # Get available years (excluding index, Ticker)
        year_cols = [col for col in balance_df.columns if col not in ['index', 'Ticker']]
        
        # Select key financial metrics for all years
        financial_metrics = {}
        for metric, source_df in [
            ('Diluted EPS', income_df),
            ('EBITDA', income_df),
            ('Free Cash Flow', cashflow_df),
            ('Net Debt', balance_df)
        ]:
            metric_row = source_df[source_df['index'] == metric]
            if metric_row.empty:
                logging.warning(f"No data for {metric}")
                continue
            # Extract values for all years
            values = metric_row[year_cols].iloc[0].to_dict()
            financial_metrics[metric] = values
        
        # Create financial DataFrame with year-based values
        financial_df = pd.DataFrame([
            {'Year': int(year), **{metric: values.get(year, np.nan) for metric, values in financial_metrics.items()}}
            for year in year_cols
        ])
        logging.info(f"Financial data years: {financial_df['Year'].unique()}")
        logging.info(f"Financial data sample:\n{financial_df.head().to_string()}")
        
        # Merge with stock data by year
        merged_df = stock_df.merge(financial_df, on='Year', how='left')
        merged_df = merged_df.set_index('Date').asfreq('D')  # Set daily frequency
        
        # Forward-fill financial metrics
        financial_cols = ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']
        available_cols = [col for col in financial_cols if col in merged_df.columns]
        
        merged_df[available_cols] = merged_df[available_cols].ffill().fillna(merged_df[available_cols].mean())
        
        # Check variance
        for col in available_cols + ['Close', 'Volume']:
            if merged_df[col].var() < 1e-10:
                logging.warning(f"Column '{col}' has near-zero variance: {merged_df[col].var()}. Adding noise.")
                merged_df[col] += np.random.normal(0, 0.01, len(merged_df))
        
        return merged_df 
    
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return None
    except Exception as e:
        logging.error(f"Error preparing data: {e}")
        return None

In [None]:
def clean_text(text):
    """
    Clean text by removing URLs and special characters.
    Added: Text preprocessing for sentiment analysis.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

def get_sentiment(row):
    # Initialize sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()
    
    text = (clean_text(row['Content']) if pd.notna(row['Content']) else
            clean_text(row['Description']) if pd.notna(row['Description']) else
            clean_text(row['Title']) if pd.notna(row['Title']) else "")
    if not text:
        logging.debug(f"Empty text for row: {row.name}. Returning 0.0")
        return 0.0
    scores = analyzer.polarity_scores(text)
    return scores['compound']

In [None]:
def load_news_data(news_csv, ticker, stock_dates):
    """
    Load and preprocess news data, compute sentiment scores, and align with stock data dates.
    Adjusted: Neutral fallback sentiment, text cleaning, performance optimization, date validation.
    """
    try:
        if not os.path.exists(news_csv):
            logging.error(f"News file not found: {news_csv}")
            raise FileNotFoundError(f"News file not found: {news_csv}")
        
        # Load news data
        news_df = pd.read_csv(news_csv)
        logging.info(f"Raw news data shape: {news_df.shape}, columns: {list(news_df.columns)}")
        
        # Parse dates
        news_df['Date'] = pd.to_datetime(news_df['Date'])
        invalid_dates = news_df['Date'].isna().sum()
        if invalid_dates > 0:
            logging.warning(f"Dropped {invalid_dates} rows due to invalid dates")
        news_df = news_df.dropna(subset=['Date'])
        logging.info(f"After date parsing, news data shape: {news_df.shape}")
        
        # Filter for ticker
        news_df['Ticker'] = news_df['Ticker'].str.strip().str.upper()
        ticker = ticker.strip().upper()
        news_df = news_df[news_df['Ticker'] == ticker]
        logging.info(f"After filtering for ticker '{ticker}', news data shape: {news_df.shape}")
        
        if news_df.empty:
            logging.warning(f"No news articles found for {ticker}. Using neutral sentiment scores.")
            return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        
        news_df['Sentiment_Score'] = [get_sentiment(row) for row in tqdm(news_df.to_dict('records'), desc="Sentiment Analysis")]
        logging.info(f"Sentiment score summary: {news_df['Sentiment_Score'].describe().to_dict()}")
        
        # Aggregate sentiment by date
        sentiment_df = news_df.groupby('Date')['Sentiment_Score'].mean().reset_index()
        sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'])
        logging.info(f"Aggregated sentiment data shape: {sentiment_df.shape}")
        
        # Create full sentiment DataFrame
        sentiment_full = pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        sentiment_full = sentiment_full.merge(sentiment_df, on='Date', how='left', suffixes=('', '_news'))
        sentiment_full['Sentiment_Score'] = sentiment_full['Sentiment_Score_news'].fillna(0.0)
        
        # Forward-fill sentiment after first non-zero score
        first_news_date = sentiment_full[sentiment_full['Sentiment_Score'] != 0.0]['Date'].min()
        if pd.notna(first_news_date):
            mask = sentiment_full['Date'] >= first_news_date
            sentiment_full.loc[mask, 'Sentiment_Score'] = sentiment_full.loc[mask, 'Sentiment_Score'].ffill()
        
        sentiment_full = sentiment_full[['Date', 'Sentiment_Score']]
        logging.info(f"Final sentiment data shape: {sentiment_full.shape}, non-zero scores: {sentiment_full['Sentiment_Score'].ne(0).sum()}")
        return sentiment_full
    
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
    except Exception as e:
        logging.error(f"Error processing news data: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})


In [5]:
def calculate_metrics(actual, predicted):
    """
    Calculate RMSE, MAE, and MAPE for model evaluation.
    Adjusted: Log filtered data points.
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    mask = (actual > 0) & (~np.isnan(actual)) & (~np.isnan(predicted))
    actual = actual[mask]
    predicted = predicted[mask]
    
    if len(actual) < len(np.array(actual, copy=True)):
        logging.info(f"Filtered {len(np.array(actual, copy=True)) - len(actual)} invalid data points for metrics calculation")
    
    if len(actual) == 0:
        logging.warning("No valid data for metrics calculation")
        return {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
    
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100 if np.all(actual != 0) else np.nan
    return {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

In [None]:
def arima_forecast(data, forecast_horizon=7):
    """
    Fit ARIMA model with stationarity check.
    Adjusted: Validate data, handle insufficient data, robust error handling.
    """
    try:
        # Check stationarity
        result = adfuller(data)
        if result[1] > 0.05:
            logging.info("Data is non-stationary, applying differencing")
            data_diff = data.diff().dropna()
            if len(data_diff) < 10:
                logging.error("Insufficient data after differencing")
                return None, None, None
            model = auto_arima(data_diff, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data_diff, order=best_order).fit()
            forecast_diff = arima_model.forecast(steps=forecast_horizon)
            forecast = data.iloc[-1] + forecast_diff.cumsum()
        else:
            model = auto_arima(data, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data, order=best_order).fit()
            forecast = arima_model.forecast(steps=forecast_horizon)
        
        # Evaluate
        if len(data) >= forecast_horizon:
            test_data = data[-forecast_horizon:]
            forecast_test = arima_model.forecast(steps=forecast_horizon)[-forecast_horizon:]
            metrics = calculate_metrics(test_data, forecast_test)
        else:
            metrics = {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        
        logging.info(f"ARIMA Metrics: {metrics}")
        return forecast, metrics, best_order
    except Exception as e:
        logging.error(f"Error in ARIMA forecasting: {e}")
        return None, None, None

In [7]:
def prophet_forecast(data, forecast_horizon=7, changepoint_prior_scale=0.05):
    """
    Fit Prophet model with dynamic regressors.
    Adjusted: Forecast regressors, handle duplicates, enable MCMC for small datasets.
    """
    try:
        prophet_df = data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        # Fit Prophet model
        model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                       changepoint_prior_scale=changepoint_prior_scale, mcmc_samples=300 if len(prophet_df) < 1000 else 0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            model.add_regressor(regressor)
        model.fit(prophet_df)
        
        # Create future dataframe with dynamic regressors
        future = model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']:
            future[regressor] = prophet_df[regressor].mean()  # Use mean of historical data
        future['Sentiment_Score'] = prophet_df['Sentiment_Score'].iloc[-10:].mean()  # Recent sentiment trend
        forecast_df = model.predict(future)
        
        # Extract forecast
        forecast = forecast_df[['ds', 'yhat']].tail(forecast_horizon).set_index('ds')['yhat']
        
        # Evaluate
        test_data = prophet_df['y'][-forecast_horizon:]
        forecast_test = model.predict(prophet_df[-forecast_horizon:])['yhat']
        metrics = calculate_metrics(test_data, forecast_test)
        
        logging.info(f"Prophet Metrics (changepoint_prior_scale={changepoint_prior_scale}): {metrics}")
        return forecast, metrics, forecast_df
    except Exception as e:
        logging.error(f"Error in Prophet forecasting: {e}")
        return None, None, None

In [8]:
def tune_prophet(data, forecast_horizon=7):
    """
    Tune Prophet model by testing changepoint_prior_scale values.
    """
    scales = [0.05, 0.1, 0.5]
    best_metrics = {'RMSE': float('inf')}
    best_forecast = None
    best_forecast_df = None
    best_scale = 0.05
    
    for scale in scales:
        forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=scale)
        if forecast is not None and metrics is not None and not np.any(np.isnan(forecast)) and metrics['RMSE'] < best_metrics['RMSE']:
                best_metrics = metrics
                best_forecast = forecast
                best_forecast_df = forecast_df
                best_scale = scale

    
    if best_forecast is None:
        logging.warning("All Prophet models failed. Using default scale=0.05.")
        forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=0.05)
        best_metrics = metrics if metrics else {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        best_forecast = forecast
        best_forecast_df = forecast_df
        best_scale = 0.05
    
    logging.info(f"Best Prophet changepoint_prior_scale: {best_scale}")
    logging.info(f"Best Prophet Metrics: {best_metrics}")
    return best_forecast, best_metrics, best_forecast_df, best_scale

In [9]:
def backtest_strategy(data, predictions, model_name, threshold=None):
    """
    Backtest a trading strategy with volatility-based threshold.
    Adjusted: Volatility-based threshold, prevent lookahead bias, add transaction costs.
    """
    try:
        logging.info(f"Backtest input lengths - Data: {len(data)}, Predictions: {len(predictions)}")
        if len(predictions) != len(data):
            logging.warning(f"Predictions length ({len(predictions)}) does not match data length ({len(data)}). Truncating.")
            min_length = min(len(data), len(predictions))
            data = data.iloc[:min_length]
            predictions = predictions[:min_length]
        
        pred_df = pd.DataFrame({'Date': data.index, 'Close': data['Close'], 'Prediction': predictions})
        pred_df = pred_df.dropna()
        
        # Volatility-based threshold
        volatility = pred_df['Close'].pct_change().std()
        threshold = volatility * 2 if threshold is None else threshold
        
        # Generate signals without lookahead bias
        pred_df['Signal'] = 0
        pred_df.loc[1:, 'Signal'] = np.where(
            pred_df['Prediction'][1:] > pred_df['Close'][:-1] * (1 + threshold), 1,
            np.where(pred_df['Prediction'][1:] < pred_df['Close'][:-1] * (1 - threshold), -1, 0)
        )
        
        # Calculate returns with transaction costs
        pred_df['Return'] = pred_df['Close'].pct_change()
        transaction_cost = 0.001  # 0.1% per trade
        pred_df['Strategy_Return'] = pred_df['Signal'].shift(1) * pred_df['Return'] - \
                                    pred_df['Signal'].abs().shift(1) * transaction_cost
        
        # Cumulative return
        cumulative_return = (1 + pred_df['Strategy_Return'].dropna()).cumprod().iloc[-1] - 1
        num_trades = pred_df['Signal'].abs().sum()
        
        results = {
            'Cumulative Return (%)': cumulative_return * 100,
            'Number of Trades': num_trades
        }
        
        logging.info(f"Backtest Results for {model_name}: {results}")
        return results, pred_df
    except Exception as e:
        logging.error(f"Error in backtesting: {e}")
        return None, None


In [10]:
def walk_forward_validation(data, forecast_horizon=7, n_folds=10):
    """
    Perform walk-forward validation with robust averaging.
    Adjusted: Increase folds, handle missing regressors, robust averaging.
    """
    n_folds = min(n_folds, len(data) // forecast_horizon)
    arima_metrics_list = []
    prophet_metrics_list = []
    
    for i in range(n_folds):
        train_end = len(data) - (n_folds - i) * forecast_horizon
        if train_end <= forecast_horizon:
            continue
        
        train_data = data.iloc[:train_end]
        test_data = data.iloc[train_end:train_end + forecast_horizon]['Close']
        
        if len(test_data) != forecast_horizon:
            continue
        
        # ARIMA
        arima_model = auto_arima(train_data['Close'], seasonal=False, max_p=7, max_q=7, max_d=2,
                                stepwise=True, error_action='ignore')
        arima_fit = ARIMA(train_data['Close'], order=arima_model.order).fit()
        arima_pred = arima_fit.forecast(steps=forecast_horizon)
        arima_metrics = calculate_metrics(test_data, arima_pred)
        arima_metrics_list.append(arima_metrics)
        
        # Prophet
        prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                               changepoint_prior_scale=0.05, mcmc_samples=0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            prophet_model.add_regressor(regressor)
        prophet_model.fit(prophet_df)
        future = prophet_model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            future[regressor] = prophet_df[regressor].ffill().iloc[-1]
        prophet_pred_df = prophet_model.predict(future)
        prophet_pred = prophet_pred_df['yhat'].tail(forecast_horizon)
        prophet_metrics = calculate_metrics(test_data, prophet_pred)
        prophet_metrics_list.append(prophet_metrics)
    
    # Robust averaging
    avg_metrics = {
        'ARIMA': {
            'RMSE': trim_mean([m['RMSE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in arima_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in arima_metrics_list) else np.nan
        },
        'Prophet': {
            'RMSE': trim_mean([m['RMSE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in prophet_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in prophet_metrics_list) else np.nan
        }
    }
    
    logging.info(f"Walk-Forward Validation Results: {avg_metrics}")
    return avg_metrics

In [23]:
def plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df, backtest_arima_df, backtest_prophet_df, ticker, forecast_horizon=7):
    """
    Plot predictions with dynamic titles and interactive sentiment visualization using Plotly.
    Adjusted: Uses scatter points for significant sentiment on secondary y-axes, interactive subplots, saves HTML and PNG.
    """
    # Calculate future dates
    last_date = data.index[-1]
    future_dates = [last_date + timedelta(days=i+1) for i in range(forecast_horizon)]
    
    # Create 1x2 subplot with secondary y-axes
    fig = make_subplots(
        rows=1, cols=2, 
        subplot_titles=(f'ARIMA: {ticker} Stock Price Prediction', 
                        f'Prophet: {ticker} Stock Price Prediction'),
        shared_yaxes=True,
        specs=[[{'secondary_y': True}, {'secondary_y': True}]]  # Enable secondary y-axes
    )
    
    # --- ARIMA Subplot (Left, Column 1) ---
    # Historical Close (Primary Y-Axis)
    fig.add_trace(
        go.Scatter(x=data.index, y=data['Close'], name='Historical Close', line=dict(color='blue')),
        row=1, col=1,
        secondary_y=False
    )
    
    # ARIMA Forecast (Primary Y-Axis)
    if arima_forecast is not None:
        fig.add_trace(
            go.Scatter(x=future_dates, y=arima_forecast, name='ARIMA Forecast', 
                       line=dict(color='red', dash='dash')),
            row=1, col=1,
            secondary_y=False
        )
    
    # ARIMA Buy/Sell Signals (Primary Y-Axis)
    if backtest_arima_df is not None:
        signals = backtest_arima_df[backtest_arima_df['Signal'].isin([1, -1])]
        marker_colors = ['green' if s == 1 else 'red' for s in signals['Signal']]
        marker_symbols = ['triangle-up' if s == 1 else 'triangle-down' for s in signals['Signal']]
        fig.add_trace(
            go.Scatter(
                x=signals['Date'], 
                y=signals['Close'], 
                name='ARIMA Signals',
                mode='markers', 
                marker=dict(
                    color=marker_colors, 
                    symbol=marker_symbols, 
                    size=10, 
                    opacity=0.7
                ),
                text=['Buy' if s == 1 else 'Sell' for s in signals['Signal']],
                hovertemplate='%{text}<br>Date: %{x}<br>Close: %{y:.2f}'
            ),
            row=1, col=1,
            secondary_y=False
        )
    
    # Sentiment Scores (Secondary Y-Axis for ARIMA)
    if 'Sentiment_Score' in data.columns:
        sentiment_mask = (data['Sentiment_Score'] != 0.0) & (data['Sentiment_Score'].abs() > 0.3)
        if sentiment_mask.any():
            fig.add_trace(
                go.Scatter(
                    x=data.index[sentiment_mask], 
                    y=data['Sentiment_Score'][sentiment_mask],
                    name='Significant Sentiment',
                    mode='markers',
                    marker=dict(color='purple', size=8, opacity=0.5),
                    text=[f"Score: {s:.2f}" for s in data['Sentiment_Score'][sentiment_mask]],
                    hoverinfo='x+text'
                ),
                row=1, col=1,
                secondary_y=True  # Assign to secondary y-axis (y2)
            )
    
    # --- Prophet Subplot (Right, Column 2) ---
    # Historical Close (Primary Y-Axis)
    fig.add_trace(
        go.Scatter(x=data.index, y=data['Close'], name='Historical Close', line=dict(color='blue'), showlegend=False),
        row=1, col=2,
        secondary_y=False
    )
    
    # Prophet Forecast (Primary Y-Axis)
    if prophet_forecast is not None:
        fig.add_trace(
            go.Scatter(x=future_dates, y=prophet_forecast, name='Prophet Forecast', 
                       line=dict(color='green', dash='dash')),
            row=1, col=2,
            secondary_y=False
        )
        # Confidence Interval (Primary Y-Axis)
        if prophet_forecast_df is not None and 'yhat_upper' in prophet_forecast_df.columns:
            fig.add_trace(
                go.Scatter(
                    x=future_dates + future_dates[::-1],
                    y=list(prophet_forecast_df['yhat_upper'].tail(forecast_horizon)) + 
                      list(prophet_forecast_df['yhat_lower'].tail(forecast_horizon))[::-1],
                    fill='toself',
                    fillcolor='rgba(0, 128, 0, 0.1)',
                    line=dict(color='rgba(255,255,255,0)'),
                    name='Prophet Confidence Interval',
                    hoverinfo='skip'
                ),
                row=1, col=2,
                secondary_y=False
            )
    
    # Prophet Buy/Sell Signals (Primary Y-Axis)
    if backtest_prophet_df is not None:
        signals = backtest_prophet_df[backtest_prophet_df['Signal'].isin([1, -1])]
        marker_colors = ['lime' if s == 1 else 'darkred' for s in signals['Signal']]
        marker_symbols = ['triangle-up' if s == 1 else 'triangle-down' for s in signals['Signal']]
        fig.add_trace(
            go.Scatter(
                x=signals['Date'], 
                y=signals['Close'], 
                name='Prophet Signals',
                mode='markers', 
                marker=dict(
                    color=marker_colors, 
                    symbol=marker_symbols, 
                    size=10, 
                    opacity=0.7
                ),
                text=['Buy' if s == 1 else 'Sell' for s in signals['Signal']],
                hovertemplate='%{text}<br>Date: %{x}<br>Close: %{y:.2f}'
            ),
            row=1, col=2,
            secondary_y=False
        )
    
    # Sentiment Scores (Secondary Y-Axis for Prophet)
    if 'Sentiment_Score' in data.columns:
        if sentiment_mask.any():
            fig.add_trace(
                go.Scatter(
                    x=data.index[sentiment_mask], 
                    y=data['Sentiment_Score'][sentiment_mask],
                    name='Significant Sentiment',
                    mode='markers',
                    marker=dict(color='purple', size=8, opacity=0.5),
                    text=[f"Score: {s:.2f}" for s in data['Sentiment_Score'][sentiment_mask]],
                    hoverinfo='x+text',
                    showlegend=False
                ),
                row=1, col=2,
                secondary_y=True  # Assign to secondary y-axis (y4)
            )
    
    # Update layout
    fig.update_layout(
        title=dict(text=f'{ticker} Stock Price Prediction and Backtest Signals (7-Day Forecast)',
                   x=0.5, xanchor='center', y=0.98),
        height=1000, width=1400,
        showlegend=True,
        legend=dict(orientation='h', yanchor='bottom', y=-0.35, xanchor='center', x=0.5),
        hovermode='x unified',
        plot_bgcolor='white',
        margin=dict(l=50, r=50, t=100, b=100)
    )
    
    # Update x-axes
    fig.update_xaxes(title_text='Date', gridcolor='lightgrey', row=1, col=1)
    fig.update_xaxes(title_text='Date', gridcolor='lightgrey', row=1, col=2)
    
    # Update y-axes
    fig.update_yaxes(title_text='Close Price (USD)', gridcolor='lightgrey', row=1, col=1, secondary_y=False)
    fig.update_yaxes(title_text='Sentiment Score (-1 to 1)', row=1, col=1, secondary_y=True, 
                     showgrid=False, tickfont=dict(color='purple'))
    fig.update_yaxes(title_text='Close Price (USD)', gridcolor='lightgrey', row=1, col=2, secondary_y=False)
    fig.update_yaxes(title_text='Sentiment Score (-1 to 1)', row=1, col=2, secondary_y=True, 
                     showgrid=False, tickfont=dict(color='purple'))
    
    # Add range slider (on both subplots)
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeslider_thickness=0.15,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label='1mo', step='month', stepmode='backward'),
                dict(count=6, label='6mo', step='month', stepmode='backward'),
                dict(count=1, label='1y', step='year', stepmode='backward'),
                dict(step='all', label='All'),
            ]),
            x=0.055,
            y=0.95,
        ),
        row=1, col=1
    )
    
    fig.update_xaxes(
        rangeslider_visible=True,
        rangeslider_thickness=0.15,
        rangeselector=dict(
            buttons=list([
                dict(count=1, label='1mo', step='month', stepmode='backward'),
                dict(count=6, label='6mo', step='month', stepmode='backward'),
                dict(count=1, label='1y', step='year', stepmode='backward'),
                dict(step='all', label='All'),
            ]),
            x=0.595,
            y=0.95,
        ),
        row=1, col=2
    )
    
    fig.show()

**Main**

In [12]:
# Parameters
forecast_horizon = 7
data_dir = Path('../data')
stock_csv = data_dir / 'cleaned_stock_data.csv'

# Get tickers
tickers = pd.read_csv(stock_csv)['Ticker'].unique()[:5]  # Process up to 5 tickers
results = []

for ticker in tickers:
    logging.info(f"Processing ticker: {ticker}")
    balance_csv = data_dir / f'balance_sheet_{ticker}.csv'
    cashflow_csv = data_dir / f'cash_flow_{ticker}.csv'
    income_csv = data_dir / f'income_statement_{ticker}.csv'
    news_csv = data_dir / 'news_data.csv'
    
    # Load and prepare data
    data = load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker)
    
    # Load news data and merge
    sentiment_df = load_news_data(news_csv, ticker, data.index)
    if sentiment_df is not None:
        data = data.reset_index().merge(sentiment_df[['Date', 'Sentiment_Score']], on='Date', how='left')
        data['Sentiment_Score'] = data['Sentiment_Score'].fillna(0.0)
        data = data.set_index('Date')
        logging.info(f"Added Sentiment_Score to data for {ticker}. Columns: {list(data.columns)}")
    else:
        logging.warning(f"No sentiment data for {ticker}. Using neutral scores.")
        data['Sentiment_Score'] = 0.0

2025-05-06 09:35:48,519 - INFO - Processing ticker: AAPL
2025-05-06 09:35:48,536 - INFO - Financial data years: [2024 2023 2022 2021 2020]
2025-05-06 09:35:48,539 - INFO - Financial data sample:
   Year  Diluted EPS        EBITDA  Free Cash Flow      Net Debt
0  2024         6.08  1.346610e+11    1.088070e+11  7.668600e+10
1  2023         6.13  1.258200e+11    9.958400e+10  8.112300e+10
2  2022         6.11  1.305410e+11    1.114430e+11  9.642300e+10
3  2021         5.61  1.231360e+11    9.295300e+10  8.977900e+10
4  2020          NaN           NaN             NaN           NaN
2025-05-06 09:35:48,564 - INFO - Raw news data shape: (200, 7), columns: ['Ticker', 'Date', 'Title', 'Description', 'Source', 'URL', 'Content']
2025-05-06 09:35:48,568 - INFO - After date parsing, news data shape: (200, 7)
2025-05-06 09:35:48,569 - INFO - After filtering for ticker 'AAPL', news data shape: (200, 7)
Sentiment Analysis: 100%|██████████| 200/200 [00:03<00:00, 54.18it/s]
2025-05-06 09:35:52,279 - IN

In [13]:
data

Unnamed: 0_level_0,Close,Volume,Ticker,Year,Diluted EPS,EBITDA,Free Cash Flow,Net Debt,Sentiment_Score
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-05-09,171.770,45326874.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,0.000000
2023-05-10,173.555,53724501.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,0.000000
2023-05-11,173.750,49473076.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,0.000000
2023-05-12,172.570,45533138.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,0.000000
2023-05-13,172.570,45533138.0,AAPL,2023,6.13,1.258200e+11,9.958400e+10,8.112300e+10,0.000000
...,...,...,...,...,...,...,...,...,...
2025-04-28,210.140,38737224.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,0.000000
2025-04-29,211.210,36827633.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,0.979700
2025-04-30,212.500,52286454.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,0.770557
2025-05-01,213.320,57364925.0,AAPL,2025,6.08,1.346610e+11,1.088070e+11,7.668600e+10,0.485988


**Models**

In [14]:
# ARIMA forecast
arima_forecast, arima_metrics, arima_order = arima_forecast(data['Close'], forecast_horizon)

2025-05-06 09:35:52,427 - INFO - Data is non-stationary, applying differencing


Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=3634.728, Time=0.26 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=3650.950, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=3642.263, Time=0.03 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=3641.639, Time=0.04 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=3639.339, Time=0.20 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=3637.173, Time=0.19 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=3635.632, Time=0.40 sec
 ARIMA(2,0,3)(0,0,0)[0]             : AIC=3635.262, Time=0.20 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=3643.635, Time=0.18 sec
 ARIMA(1,0,3)(0,0,0)[0]             : AIC=3633.402, Time=0.15 sec
 ARIMA(0,0,3)(0,0,0)[0]             : AIC=3632.777, Time=0.08 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=3643.624, Time=0.06 sec
 ARIMA(0,0,4)(0,0,0)[0]             : AIC=3632.953, Time=0.08 sec
 ARIMA(1,0,4)(0,0,0)[0]             : AIC=3633.902, Time=0.23 sec
 ARIMA(0,0,3)(0,0,0)[0] intercept

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-06 09:35:54,868 - INFO - ARIMA Metrics: {'RMSE': 210.1147205183507, 'MAE': 210.10050252357516, 'MAPE': 99.97470268155942}


In [15]:
# Prophet forecast with tuning
prophet_forecast, prophet_metrics, prophet_forecast_df, prophet_scale = tune_prophet(data, forecast_horizon)

2025-05-06 09:35:54,962 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-06 09:35:55,026 - DEBUG - Adding TBB (c:\Users\nguye\AppData\Local\Programs\Python\Python312\Lib\site-packages\prophet\stan_model\cmdstan-2.33.1\stan\lib\stan_math\lib\tbb) to PATH
2025-05-06 09:35:55,065 - DEBUG - cmd: C:\Users\nguye\AppData\Local\Programs\Python\Python312\Lib\site-packages\prophet\stan_model\prophet_model.bin info
cwd: None
2025-05-06 09:35:55,087 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\2qexr4da.json
2025-05-06 09:35:55,191 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\bxwwpq_4.json
09:35:55 - cmdstanpy - INFO - CmdStan start processing
2025-05-06 09:35:55,194 - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

2025-05-06 09:35:55,237 - DEBUG - idx 0
2025-05-06 09:35:55,241 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:35:55,242 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=26439', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\2qexr4da.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\bxwwpq_4.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\prophet_modeliqcwbvg2\\prophet_model-20250506093555_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
2025-05-06 09:35:55,242 - DEBUG - idx 1
2025-05-06 09:35:55,256 - DEBUG - idx 2
2025-05-06 09:35:55,257 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:35:55,257 - DEBUG - idx 3
2025-05-06 09:35:55,273 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:35:55,274 - DEBUG - CmdStan args: ['C:\\Us

                                                                                                                                                                                                                                                                                                                                

09:36:54 - cmdstanpy - INFO - CmdStan done processing.
2025-05-06 09:36:54,247 - INFO - CmdStan done processing.
2025-05-06 09:36:54,249 - DEBUG - runset
RunSet: chains=4, chain_ids=[1, 2, 3, 4], num_processes=4
 cmd (chain 1):
	['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=26439', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\2qexr4da.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\bxwwpq_4.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\prophet_modeliqcwbvg2\\prophet_model-20250506093555_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
 retcodes=[0, 0, 0, 0]
 per-chain output files (showing chain 1 only):
 csv_file:
	C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\prophet_modeliqcwbvg2\prophet_model-20250506093555_1.csv
 console_msgs (if any):
	C:\Users\nguye\AppData




2025-05-06 09:36:56,156 - INFO - Prophet Metrics (changepoint_prior_scale=0.05): {'RMSE': 4.4463437257174565, 'MAE': 4.053089545061921, 'MAPE': 1.9365116830060387}
2025-05-06 09:36:56,161 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-06 09:36:56,249 - DEBUG - TBB already found in load path
2025-05-06 09:36:56,297 - DEBUG - cmd: C:\Users\nguye\AppData\Local\Programs\Python\Python312\Lib\site-packages\prophet\stan_model\prophet_model.bin info
cwd: None
2025-05-06 09:36:56,331 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\4gcdklgk.json
2025-05-06 09:36:56,423 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\9ata_w39.json
09:36:56 - cmdstanpy - INFO - CmdStan start processing
2025-05-06 09:36:56,425 - INFO - CmdStan start processing


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

2025-05-06 09:36:56,475 - DEBUG - idx 0
2025-05-06 09:36:56,483 - DEBUG - idx 1
2025-05-06 09:36:56,483 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:36:56,486 - DEBUG - idx 2
2025-05-06 09:36:56,487 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:36:56,487 - DEBUG - idx 3
2025-05-06 09:36:56,489 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=29936', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\4gcdklgk.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\9ata_w39.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\prophet_model7f7oe20n\\prophet_model-20250506093656_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
2025-05-06 09:36:56,492 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:36:56,493 - DEBUG - CmdStan args: ['C:\\Us

                                                                                                                                                                                                                                                                                                                                

09:38:03 - cmdstanpy - INFO - CmdStan done processing.
2025-05-06 09:38:03,519 - INFO - CmdStan done processing.
2025-05-06 09:38:03,521 - DEBUG - runset
RunSet: chains=4, chain_ids=[1, 2, 3, 4], num_processes=4
 cmd (chain 1):
	['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=29936', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\4gcdklgk.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\9ata_w39.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\prophet_model7f7oe20n\\prophet_model-20250506093656_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
 retcodes=[0, 0, 0, 0]
 per-chain output files (showing chain 1 only):
 csv_file:
	C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\prophet_model7f7oe20n\prophet_model-20250506093656_1.csv
 console_msgs (if any):
	C:\Users\nguye\AppData




	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
	Chain 1 had 150 iterations at max treedepth (100.0%)
	Chain 2 had 150 iterations at max treedepth (100.0%)
	Chain 3 had 150 iterations at max treedepth (100.0%)
	Chain 4 had 150 iterations at max treedepth (100.0%)
	Use the "diagnose()" method on the CmdStanMCMC object to see further information.
2025-05-06 09:38:06,461 - INFO - Prophet Metrics (changepoint_prior_scale=0.1): {'RMSE': 5.665656908084688, 'MAE': 4.966340307525026, 'MAPE': 2.370467905160343}
2025-05-06 09:38:06,467 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-06 09:38:06,557 - DEBUG - TBB already found in load path
2025-05-06 09:38:06,600 - DEBUG - cmd: C:\Users\nguye\AppData\Local\Programs\Python\Python312\Lib\site-packag

chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

2025-05-06 09:38:06,827 - DEBUG - idx 0
2025-05-06 09:38:06,835 - DEBUG - idx 1
2025-05-06 09:38:06,847 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:38:06,836 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:38:06,849 - DEBUG - idx 2
2025-05-06 09:38:06,849 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=2', 'random', 'seed=66585', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\4tmitcrn.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\lbnvtnos.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\prophet_model8km_oc0n\\prophet_model-20250506093806_2.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
2025-05-06 09:38:06,851 - DEBUG - idx 3
2025-05-06 09:38:06,852 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site

                                                                                                                                                                                                                                                                                                                                

09:39:12 - cmdstanpy - INFO - CmdStan done processing.
2025-05-06 09:39:12,475 - INFO - CmdStan done processing.
2025-05-06 09:39:12,477 - DEBUG - runset
RunSet: chains=4, chain_ids=[1, 2, 3, 4], num_processes=4
 cmd (chain 1):
	['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'id=1', 'random', 'seed=66585', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\4tmitcrn.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\lbnvtnos.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\prophet_model8km_oc0n\\prophet_model-20250506093806_1.csv', 'method=sample', 'num_samples=150', 'num_warmup=150', 'algorithm=hmc', 'adapt', 'engaged=1']
 retcodes=[0, 0, 0, 0]
 per-chain output files (showing chain 1 only):
 csv_file:
	C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\prophet_model8km_oc0n\prophet_model-20250506093806_1.csv
 console_msgs (if any):
	C:\Users\nguye\AppData




2025-05-06 09:39:14,707 - INFO - Prophet Metrics (changepoint_prior_scale=0.5): {'RMSE': 5.361498968540318, 'MAE': 4.764130228207388, 'MAPE': 2.277255689340668}
2025-05-06 09:39:14,711 - INFO - Best Prophet changepoint_prior_scale: 0.05
2025-05-06 09:39:14,712 - INFO - Best Prophet Metrics: {'RMSE': 4.4463437257174565, 'MAE': 4.053089545061921, 'MAPE': 1.9365116830060387}


**Walk-forward validation**

In [16]:
avg_metrics = walk_forward_validation(data, forecast_horizon)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-06 09:39:15,844 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-06 09:39:15,915 - DEBUG - TBB already found in load path
2025-05-06 09:39:15,943 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\3yp02l7m.json
2025-05-06 09:39:16,029 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\en2ayieg.json
2025-05-06 09:39:16,034 - DEBUG - idx 0
2025-05-06 09:39:16,035 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:39:16,036 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=55874', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\3yp02l7m.json', 'init=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\en2ayieg.json', 'output', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\tmpbgj8u7h_\\prophet_modelmg1m319m

**Backtesting**

In [17]:
# Backtesting
if len(data) >= 100:
    # ARIMA historical predictions
    arima_model = auto_arima(data['Close'][:-forecast_horizon], seasonal=False, max_p=7, max_q=7, max_d=2)
    arima_fit = ARIMA(data['Close'][:-forecast_horizon], order=arima_model.order).fit()
    arima_hist_pred = arima_fit.predict(start=0, end=len(data)-forecast_horizon-1)
    arima_backtest_results, arima_backtest_df = backtest_strategy(data.iloc[:-forecast_horizon], arima_hist_pred, 'ARIMA')
    
    # Prophet historical predictions
    train_data = data.iloc[:-forecast_horizon]
    prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
    prophet_df = prophet_df.drop_duplicates(subset='Date')
    prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
    
    prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                            changepoint_prior_scale=prophet_scale, mcmc_samples=0)
    for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
        prophet_model.add_regressor(regressor)
    prophet_model.fit(prophet_df)
    
    prophet_pred_df = prophet_model.predict(prophet_df)
    prophet_pred_df = prophet_pred_df.set_index('ds').reindex(train_data.index, method='ffill')
    prophet_hist_pred = prophet_pred_df['yhat']
    
    prophet_backtest_results, prophet_backtest_df = backtest_strategy(train_data, prophet_hist_pred, 'Prophet')
else:
    arima_backtest_results, arima_backtest_df = None, None
    prophet_backtest_results, prophet_backtest_df = None, None
    logging.warning(f"Insufficient data for backtesting {ticker}.")

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
2025-05-06 09:39:45,855 - INFO - Backtest input lengths - Data: 718, Predictions: 718
2025-05-06 09:39:45,859 - ERROR - Error in backtesting: Can only compare identically-labeled Series objects
2025-05-06 09:39:45,865 - DEBUG - cmd: where.exe tbb.dll
cwd: None
2025-05-06 09:39:45,917 - DEBUG - TBB already found in load path
2025-05-06 09:39:45,946 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\92nakbew.json
2025-05-06 09:39:46,033 - DEBUG - input tempfile: C:\Users\nguye\AppData\Local\Temp\tmpbgj8u7h_\q2blqd3w.json
2025-05-06 09:39:46,035 - DEBUG - idx 0
2025-05-06 09:39:46,037 - DEBUG - running CmdStan, num_threads: 1
2025-05-06 09:39:46,038 - DEBUG - CmdStan args: ['C:\\Users\\nguye\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\prophet\\stan_model\\prophet_model.bin', 'random', 'seed=57566', 'data', 'file=C:\\Users\\nguye\\AppData\\Local\\Temp\\

**Plot predictions**

In [24]:
# Plot predictions
plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df,
                arima_backtest_df, prophet_backtest_df, ticker, forecast_horizon)

In [None]:
# Save predictions
if arima_forecast is not None and prophet_forecast is not None:
    future_dates = [data.index[-1] + timedelta(days=i+1) for i in range(forecast_horizon)]
    pred_df = pd.DataFrame({
        'Date': future_dates,
        'ARIMA_Prediction': arima_forecast,
        'Prophet_Prediction': prophet_forecast
    })
    pred_df.to_csv(data_dir / f'{ticker}_stock_price_predictions_sentiment.csv', index=False)
    logging.info(f"Saved predictions for {ticker}")

# Save metrics
metrics_df = pd.DataFrame({
    'Model': ['ARIMA', 'Prophet'],
    'RMSE': [arima_metrics.get('RMSE', np.nan), prophet_metrics.get('RMSE', np.nan)],
    'MAE': [arima_metrics.get('MAE', np.nan), prophet_metrics.get('MAE', np.nan)],
    'MAPE': [arima_metrics.get('MAPE', np.nan), prophet_metrics.get('MAPE', np.nan)],
    'Best Parameters': [f"Order: {arima_order}", f"changepoint_prior_scale: {prophet_scale or 0.05}"]
})
metrics_df.to_csv(data_dir / f'{ticker}_model_metrics_sentiment.csv', index=False)
logging.info(f"Saved metrics for {ticker}")

# Save backtest results
if arima_backtest_results and prophet_backtest_results:
    backtest_df = pd.DataFrame({
        'Model': ['ARIMA', 'Prophet'],
        'Cumulative Return (%)': [arima_backtest_results['Cumulative Return (%)'], 
                                prophet_backtest_results['Cumulative Return (%)']],
        'Number of Trades': [arima_backtest_results['Number of Trades'], 
                            prophet_backtest_results['Number of Trades']]
    })
    backtest_df.to_csv(data_dir / f'{ticker}_backtest_results_sentiment.csv', index=False)
    logging.info(f"Saved backtest results for {ticker}")

# Select best model
best_model = 'Prophet' if avg_metrics['Prophet']['RMSE'] < avg_metrics['ARIMA']['RMSE'] else 'ARIMA'
results.append({'Ticker': ticker, 'Best_Model': best_model, 'Metrics': avg_metrics})

# Save summary
summary_df = pd.DataFrame(results)
summary_df.to_csv(data_dir / 'summary_results_sentiment.csv', index=False)
logging.info("Saved summary results")

2025-05-06 09:48:28,900 - INFO - Saved predictions for AAPL
2025-05-06 09:48:28,926 - INFO - Saved metrics for AAPL
2025-05-06 09:48:28,930 - INFO - Saved summary results
