In [14]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
from datetime import timedelta
import os
import logging
import cmdstanpy
import re
from pathlib import Path
from tqdm import tqdm

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Suppress cmdstanpy debug logs to reduce console clutter
cmdstanpy.utils.get_logger().setLevel(logging.WARNING)

In [15]:
def load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker, train_end_date=None):
    """
    Load and preprocess stock and financial data, aligning them for modeling.
    Adjusted: Interpolate financial metrics, fit scaler on training data, improve error handling.
    """
    try:
        # Load stock data
        stock_df = pd.read_csv(stock_csv)
        stock_df['Date'] = pd.to_datetime(stock_df['Date'])
        stock_df = stock_df[stock_df['Ticker'] == ticker][['Date', 'Close', 'Volume']].sort_values('Date')
        stock_df = stock_df.drop_duplicates(subset=['Date', 'Ticker'])
        
        # Validate stock data
        if stock_df['Close'].min() <= 0:
            raise ValueError("Invalid stock data: Close prices contain zero or negative values")
        
        # Create daily date range and fill missing dates
        date_range = pd.date_range(start=stock_df['Date'].min(), end=stock_df['Date'].max(), freq='D')
        stock_df = stock_df.set_index('Date').reindex(date_range, method='ffill').reset_index()
        stock_df = stock_df.rename(columns={'index': 'Date'})
        stock_df['Ticker'] = ticker
        
        logging.info(f"Loaded {len(stock_df)} daily stock data points for {ticker}")
        
        # Load financial data
        balance_df = pd.read_csv(balance_csv)
        cashflow_df = pd.read_csv(cashflow_csv)
        income_df = pd.read_csv(income_csv)
        
        # Select key financial metrics with quarterly updates
        financial_metrics = {
            'Diluted EPS': income_df[income_df['index'] == 'Diluted EPS'][['2024-09-30 00:00:00']].iloc[0, 0],
            'Free Cash Flow': cashflow_df[cashflow_df['index'] == 'Free Cash Flow'][['2024-09-30 00:00:00']].iloc[0, 0],
            'Net Debt': balance_df[balance_df['index'] == 'Net Debt'][['2024-09-30 00:00:00']].iloc[0, 0],
            'EBITDA': income_df[income_df['index'] == 'EBITDA'][['2024-09-30 00:00:00']].iloc[0, 0]
        }
        
        # Create financial DataFrame with quarterly updates
        financial_df = pd.DataFrame(index=date_range)
        for metric, value in financial_metrics.items():
            # Simulate quarterly updates (e.g., every 90 days)
            financial_df[metric] = np.nan
            reporting_dates = pd.date_range(start=date_range[0], end=date_range[-1], freq='Q')
            for report_date in reporting_dates:
                if report_date in financial_df.index:
                    financial_df.loc[report_date, metric] = value
            financial_df[metric] = financial_df[metric].ffill()
        
        # Merge stock and financial data
        merged_df = stock_df.merge(financial_df.reset_index(), on='Date', how='left')
        merged_df = merged_df.set_index('Date')
        
        # Scale financial features on training data only
        scaler = StandardScaler()
        financial_cols = ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']
        if train_end_date:
            train_data = merged_df[merged_df.index <= train_end_date]
            merged_df[financial_cols] = scaler.fit_transform(train_data[financial_cols])
        else:
            merged_df[financial_cols] = scaler.fit_transform(merged_df[financial_cols])
        
        logging.info(f"Prepared data with {len(merged_df)} rows, including financial features: {list(financial_metrics.keys())}")
        return merged_df, scaler
    
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}. Skipping ticker {ticker}.")
        return None, None
    except Exception as e:
        logging.error(f"Error preparing data for {ticker}: {e}. Skipping ticker.")
        return None, None

In [16]:
def clean_text(text):
    """
    Clean text by removing URLs and special characters.
    Added: Text preprocessing for sentiment analysis.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.strip()

In [17]:
def load_news_data(news_csv, ticker, stock_dates):
    """
    Load and preprocess news data, compute sentiment scores, and align with stock data dates.
    Adjusted: Neutral fallback sentiment, text cleaning, performance optimization, date validation.
    """
    try:
        if not os.path.exists(news_csv):
            logging.error(f"News file not found: {news_csv}")
            raise FileNotFoundError(f"News file not found: {news_csv}")
        
        # Load news data
        news_df = pd.read_csv(news_csv)
        logging.info(f"Raw news data shape: {news_df.shape}, columns: {list(news_df.columns)}")
        
        # Parse dates
        news_df['Date'] = pd.to_datetime(news_df['Date'], errors='coerce').dt.date
        invalid_dates = news_df['Date'].isna().sum()
        if invalid_dates > 0:
            logging.warning(f"Dropped {invalid_dates} rows due to invalid dates")
        news_df = news_df.dropna(subset=['Date'])
        logging.info(f"After date parsing, news data shape: {news_df.shape}")
        
        # Filter for ticker
        news_df['Ticker'] = news_df['Ticker'].str.strip().str.upper()
        ticker = ticker.strip().upper()
        news_df = news_df[news_df['Ticker'] == ticker]
        logging.info(f"After filtering for ticker '{ticker}', news data shape: {news_df.shape}")
        
        if news_df.empty:
            logging.warning(f"No news articles found for {ticker}. Using neutral sentiment scores.")
            return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        
        # Initialize sentiment analyzer
        analyzer = SentimentIntensityAnalyzer()
        
        # Compute sentiment with progress bar
        def get_sentiment(row):
            text = (clean_text(row['Content']) if pd.notna(row['Content']) else
                    clean_text(row['Description']) if pd.notna(row['Description']) else
                    clean_text(row['Title']) if pd.notna(row['Title']) else "")
            if not text:
                logging.debug(f"Empty text for row: {row.name}. Returning 0.0")
                return 0.0
            scores = analyzer.polarity_scores(text)
            return scores['compound']
        
        news_df['Sentiment_Score'] = [get_sentiment(row) for row in tqdm(news_df.to_dict('records'), desc="Sentiment Analysis")]
        logging.info(f"Sentiment score summary: {news_df['Sentiment_Score'].describe().to_dict()}")
        
        # Aggregate sentiment by date
        sentiment_df = news_df.groupby('Date')['Sentiment_Score'].mean().reset_index()
        sentiment_df['Date'] = pd.to_datetime(sentiment_df['Date'])
        logging.info(f"Aggregated sentiment data shape: {sentiment_df.shape}")
        
        # Create full sentiment DataFrame
        sentiment_full = pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
        sentiment_full = sentiment_full.merge(sentiment_df, on='Date', how='left', suffixes=('', '_news'))
        sentiment_full['Sentiment_Score'] = sentiment_full['Sentiment_Score_news'].fillna(0.0)
        
        # Forward-fill sentiment after first non-zero score
        first_news_date = sentiment_full[sentiment_full['Sentiment_Score'] != 0.0]['Date'].min()
        if pd.notna(first_news_date):
            mask = sentiment_full['Date'] >= first_news_date
            sentiment_full.loc[mask, 'Sentiment_Score'] = sentiment_full.loc[mask, 'Sentiment_Score'].ffill()
        
        sentiment_full = sentiment_full[['Date', 'Sentiment_Score']]
        logging.info(f"Final sentiment data shape: {sentiment_full.shape}, non-zero scores: {sentiment_full['Sentiment_Score'].ne(0).sum()}")
        return sentiment_full
    
    except FileNotFoundError as e:
        logging.error(f"File not found: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})
    except Exception as e:
        logging.error(f"Error processing news data: {e}")
        return pd.DataFrame({'Date': stock_dates, 'Sentiment_Score': 0.0})


In [18]:
def calculate_metrics(actual, predicted):
    """
    Calculate RMSE, MAE, and MAPE for model evaluation.
    Adjusted: Log filtered data points.
    """
    actual = np.array(actual)
    predicted = np.array(predicted)
    mask = (actual > 0) & (~np.isnan(actual)) & (~np.isnan(predicted))
    actual = actual[mask]
    predicted = predicted[mask]
    
    if len(actual) < len(np.array(actual, copy=True)):
        logging.info(f"Filtered {len(np.array(actual, copy=True)) - len(actual)} invalid data points for metrics calculation")
    
    if len(actual) == 0:
        logging.warning("No valid data for metrics calculation")
        return {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
    
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    mae = mean_absolute_error(actual, predicted)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100 if np.all(actual != 0) else np.nan
    return {'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

In [None]:
def arima_forecast(data, forecast_horizon=7):
    """
    Fit ARIMA model with stationarity check.
    Adjusted: Validate data, handle insufficient data, robust error handling.
    """
    try:
        # Validate data
        data = data.dropna()
        if data.isna().any() or np.isinf(data).any():
            logging.error("Invalid data: Contains NaN or inf values")
            return None, None, None
        if len(data) < 30:
            logging.error("Insufficient data for ARIMA (<30 days)")
            return None, None, None
        
        # Check stationarity
        result = adfuller(data)
        if result[1] > 0.05:
            logging.info("Data is non-stationary, applying differencing")
            data_diff = data.diff().dropna()
            if len(data_diff) < 10:
                logging.error("Insufficient data after differencing")
                return None, None, None
            model = auto_arima(data_diff, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data_diff, order=best_order).fit()
            forecast_diff = arima_model.forecast(steps=forecast_horizon)
            forecast = data.iloc[-1] + forecast_diff.cumsum()
        else:
            model = auto_arima(data, seasonal=False, max_p=7, max_q=7, max_d=2,
                              stepwise=True, trace=True, error_action='ignore')
            best_order = model.order
            arima_model = ARIMA(data, order=best_order).fit()
            forecast = arima_model.forecast(steps=forecast_horizon)
        
        # Evaluate
        if len(data) >= forecast_horizon:
            test_data = data[-forecast_horizon:]
            forecast_test = arima_model.forecast(steps=forecast_horizon)[-forecast_horizon:]
            metrics = calculate_metrics(test_data, forecast_test)
        else:
            metrics = {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        
        logging.info(f"ARIMA Metrics: {metrics}")
        return forecast, metrics, best_order
    except Exception as e:
        logging.error(f"Error in ARIMA forecasting: {e}")
        return None, None, None

In [20]:
def prophet_forecast(data, forecast_horizon=7, changepoint_prior_scale=0.05):
    """
    Fit Prophet model with dynamic regressors.
    Adjusted: Forecast regressors, handle duplicates, enable MCMC for small datasets.
    """
    try:
        logging.info(f"Prophet input DataFrame columns: {list(data.columns)}")
        prophet_df = data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
        prophet_df = prophet_df.drop_duplicates(subset='Date')
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        logging.info(f"Prophet input data summary:\n{prophet_df.describe()}")
        
        # Fit Prophet model
        model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                       changepoint_prior_scale=changepoint_prior_scale, mcmc_samples=300 if len(prophet_df) < 1000 else 0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            model.add_regressor(regressor)
        model.fit(prophet_df)
        
        # Create future dataframe with dynamic regressors
        future = model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA']:
            future[regressor] = prophet_df[regressor].mean()  # Use mean of historical data
        future['Sentiment_Score'] = prophet_df['Sentiment_Score'].iloc[-10:].mean()  # Recent sentiment trend
        forecast_df = model.predict(future)
        
        # Extract forecast
        forecast = forecast_df[['ds', 'yhat']].tail(forecast_horizon).set_index('ds')['yhat']
        
        # Evaluate
        test_data = prophet_df['y'][-forecast_horizon:]
        forecast_test = model.predict(prophet_df[-forecast_horizon:])['yhat']
        metrics = calculate_metrics(test_data, forecast_test)
        
        logging.info(f"Prophet Metrics (changepoint_prior_scale={changepoint_prior_scale}): {metrics}")
        return forecast, metrics, forecast_df
    except Exception as e:
        logging.error(f"Error in Prophet forecasting: {e}")
        return None, None, None

In [21]:
def tune_prophet(data, forecast_horizon=7):
    """
    Tune Prophet model with expanded scale range.
    Adjusted: Wider scale range, add cross-validation.
    """
    from prophet.diagnostics import cross_validation
    scales = [0.01, 0.05, 0.1, 0.25, 0.5, 1.0]
    best_metrics = {'RMSE': float('inf')}
    best_forecast = None
    best_forecast_df = None
    best_scale = 0.05
    
    for scale in scales:
        forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=scale)
        if forecast is not None and not np.any(np.isnan(forecast)):
            # Cross-validation
            prophet_df = data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
            prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
            model = Prophet(changepoint_prior_scale=scale)
            for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
                model.add_regressor(regressor)
            model.fit(prophet_df)
            cv_df = cross_validation(model, horizon=f'{forecast_horizon} days', parallel="processes")
            rmse = np.sqrt(mean_squared_error(cv_df['y'], cv_df['yhat']))
            if rmse < best_metrics['RMSE']:
                best_metrics = {'RMSE': rmse}
                best_forecast = forecast
                best_forecast_df = forecast_df
                best_scale = scale
    
    if best_forecast is None:
        logging.warning("All Prophet models failed. Using default scale=0.05.")
        forecast, metrics, forecast_df = prophet_forecast(data, forecast_horizon, changepoint_prior_scale=0.05)
        best_metrics = metrics if metrics else {'RMSE': np.nan, 'MAE': np.nan, 'MAPE': np.nan}
        best_forecast = forecast
        best_forecast_df = forecast_df
        best_scale = 0.05
    
    logging.info(f"Best Prophet changepoint_prior_scale: {best_scale}")
    logging.info(f"Best Prophet Metrics: {best_metrics}")
    return best_forecast, best_metrics, best_forecast_df, best_scale

In [22]:
def backtest_strategy(data, predictions, model_name, threshold=None):
    """
    Backtest a trading strategy with volatility-based threshold.
    Adjusted: Volatility-based threshold, prevent lookahead bias, add transaction costs.
    """
    try:
        logging.info(f"Backtest input lengths - Data: {len(data)}, Predictions: {len(predictions)}")
        if len(predictions) != len(data):
            logging.warning(f"Predictions length ({len(predictions)}) does not match data length ({len(data)}). Truncating.")
            min_length = min(len(data), len(predictions))
            data = data.iloc[:min_length]
            predictions = predictions[:min_length]
        
        pred_df = pd.DataFrame({'Date': data.index, 'Close': data['Close'], 'Prediction': predictions})
        pred_df = pred_df.dropna()
        
        # Volatility-based threshold
        volatility = pred_df['Close'].pct_change().std()
        threshold = volatility * 2 if threshold is None else threshold
        
        # Generate signals without lookahead bias
        pred_df['Signal'] = 0
        pred_df.loc[1:, 'Signal'] = np.where(
            pred_df['Prediction'][1:] > pred_df['Close'][:-1] * (1 + threshold), 1,
            np.where(pred_df['Prediction'][1:] < pred_df['Close'][:-1] * (1 - threshold), -1, 0)
        )
        
        # Calculate returns with transaction costs
        pred_df['Return'] = pred_df['Close'].pct_change()
        transaction_cost = 0.001  # 0.1% per trade
        pred_df['Strategy_Return'] = pred_df['Signal'].shift(1) * pred_df['Return'] - \
                                    pred_df['Signal'].abs().shift(1) * transaction_cost
        
        # Cumulative return
        cumulative_return = (1 + pred_df['Strategy_Return'].dropna()).cumprod().iloc[-1] - 1
        num_trades = pred_df['Signal'].abs().sum()
        
        results = {
            'Cumulative Return (%)': cumulative_return * 100,
            'Number of Trades': num_trades
        }
        
        logging.info(f"Backtest Results for {model_name}: {results}")
        return results, pred_df
    except Exception as e:
        logging.error(f"Error in backtesting: {e}")
        return None, None


In [23]:
def walk_forward_validation(data, forecast_horizon=7, n_folds=10):
    """
    Perform walk-forward validation with robust averaging.
    Adjusted: Increase folds, handle missing regressors, robust averaging.
    """
    from scipy.stats import trim_mean
    n_folds = min(n_folds, len(data) // forecast_horizon)
    arima_metrics_list = []
    prophet_metrics_list = []
    
    for i in range(n_folds):
        train_end = len(data) - (n_folds - i) * forecast_horizon
        if train_end <= forecast_horizon:
            continue
        
        train_data = data.iloc[:train_end]
        test_data = data.iloc[train_end:train_end + forecast_horizon]['Close']
        
        if len(test_data) != forecast_horizon:
            continue
        
        # ARIMA
        arima_model = auto_arima(train_data['Close'], seasonal=False, max_p=7, max_q=7, max_d=2,
                                stepwise=True, error_action='ignore')
        arima_fit = ARIMA(train_data['Close'], order=arima_model.order).fit()
        arima_pred = arima_fit.forecast(steps=forecast_horizon)
        arima_metrics = calculate_metrics(test_data, arima_pred)
        arima_metrics_list.append(arima_metrics)
        
        # Prophet
        prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
        prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
        
        prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                               changepoint_prior_scale=0.05, mcmc_samples=0)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            prophet_model.add_regressor(regressor)
        prophet_model.fit(prophet_df)
        future = prophet_model.make_future_dataframe(periods=forecast_horizon)
        for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
            future[regressor] = prophet_df[regressor].ffill().iloc[-1]
        prophet_pred_df = prophet_model.predict(future)
        prophet_pred = prophet_pred_df['yhat'].tail(forecast_horizon)
        prophet_metrics = calculate_metrics(test_data, prophet_pred)
        prophet_metrics_list.append(prophet_metrics)
    
    # Robust averaging
    avg_metrics = {
        'ARIMA': {
            'RMSE': trim_mean([m['RMSE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in arima_metrics_list], proportiontocut=0.1) if arima_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in arima_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in arima_metrics_list) else np.nan
        },
        'Prophet': {
            'RMSE': trim_mean([m['RMSE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAE': trim_mean([m['MAE'] for m in prophet_metrics_list], proportiontocut=0.1) if prophet_metrics_list else np.nan,
            'MAPE': trim_mean([m['MAPE'] for m in prophet_metrics_list if not np.isnan(m['MAPE'])], proportiontocut=0.1) if any(not np.isnan(m['MAPE']) for m in prophet_metrics_list) else np.nan
        }
    }
    
    logging.info(f"Walk-Forward Validation Results: {avg_metrics}")
    return avg_metrics

In [24]:
def plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df, backtest_arima_df, backtest_prophet_df, ticker, forecast_horizon=7):
    """
    Plot predictions with dynamic titles and enhanced sentiment visualization.
    Adjusted: Dynamic titles, significant sentiment highlighting, save plot.
    """
    last_date = data.index[-1]
    future_dates = [last_date + timedelta(days=i+1) for i in range(forecast_horizon)]
    
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8), sharey=True)
    
    # ARIMA subplot
    ax1.plot(data.index, data['Close'], label='Historical Close', color='blue')
    if arima_forecast is not None:
        ax1.plot(future_dates, arima_forecast, label='ARIMA Forecast', color='red', linestyle='--')
    if backtest_arima_df is not None:
        buy_signals = backtest_arima_df[backtest_arima_df['Signal'] == 1]
        sell_signals = backtest_arima_df[backtest_arima_df['Signal'] == -1]
        ax1.scatter(buy_signals['Date'], buy_signals['Close'], color='green', marker='^', label='ARIMA Buy Signal', alpha=0.7)
        ax1.scatter(sell_signals['Date'], sell_signals['Close'], color='red', marker='v', label='ARIMA Sell Signal', alpha=0.7)
    
    # Sentiment on secondary axis
    if 'Sentiment_Score' in data.columns:
        ax1_sent = ax1.twinx()
        sentiment_mask = (data['Sentiment_Score'] != 0.0) & (data['Sentiment_Score'].abs() > 0.3)
        if sentiment_mask.any():
            ax1_sent.plot(data.index[sentiment_mask], data['Sentiment_Score'][sentiment_mask], 
                         label='Significant Sentiment', color='purple', alpha=0.3)
            ax1_sent.set_ylabel('Sentiment Score (-1 to 1)', color='purple')
            ax1_sent.tick_params(axis='y', labelcolor='purple')
            ax1_sent.legend(loc='upper right')
    
    ax1.set_title(f'ARIMA: {ticker} Stock Price Prediction')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Close Price (USD)')
    ax1.legend(loc='upper left')
    ax1.grid(True)
    
    # Prophet subplot
    ax2.plot(data.index, data['Close'], label='Historical Close', color='blue')
    if prophet_forecast is not None:
        ax2.plot(future_dates, prophet_forecast, label='Prophet Forecast', color='green', linestyle='--')
        ax2.fill_between(future_dates, 
                         prophet_forecast_df['yhat_lower'].tail(forecast_horizon), 
                         prophet_forecast_df['yhat_upper'].tail(forecast_horizon), 
                         color='green', alpha=0.1, label='Prophet Confidence Interval')
    if backtest_prophet_df is not None:
        buy_signals = backtest_prophet_df[backtest_prophet_df['Signal'] == 1]
        sell_signals = backtest_prophet_df[backtest_prophet_df['Signal'] == -1]
        ax2.scatter(buy_signals['Date'], buy_signals['Close'], color='lime', marker='^', label='Prophet Buy Signal', alpha=0.7)
        ax2.scatter(sell_signals['Date'], sell_signals['Close'], color='darkred', marker='v', label='Prophet Sell Signal', alpha=0.7)
    
    # Sentiment on secondary axis
    if 'Sentiment_Score' in data.columns:
        ax2_sent = ax2.twinx()
        if sentiment_mask.any():
            ax2_sent.plot(data.index[sentiment_mask], data['Sentiment_Score'][sentiment_mask], 
                         label='Significant Sentiment', color='purple', alpha=0.3)
            ax2_sent.set_ylabel('Sentiment Score (-1 to 1)', color='purple')
            ax2_sent.tick_params(axis='y', labelcolor='purple')
            ax2_sent.legend(loc='upper right')
    
    ax2.set_title(f'Prophet: {ticker} Stock Price Prediction')
    ax2.set_xlabel('Date')
    ax2.legend(loc='upper left')
    ax2.grid(True)
    
    plt.suptitle(f'{ticker} Stock Price Prediction and Backtest Signals (7-Day Forecast)', y=1.02)
    plt.tight_layout()
    plt.savefig(f'{ticker}_stock_price_predictions.png', dpi=300)
    plt.close()

**Main**

In [25]:
# Parameters
forecast_horizon = 7
data_dir = Path('../data')
stock_csv = data_dir / 'cleaned_stock_data.csv'

# Configure cmdstanpy
cmdstan_dir = os.path.expanduser('~/.cmdstan')
os.makedirs(cmdstan_dir, exist_ok=True)
os.environ['CMDSTAN'] = cmdstan_dir
logging.info(f"Set CMDSTAN directory to {cmdstan_dir}")

# Get tickers
tickers = pd.read_csv(stock_csv)['Ticker'].unique()[:5]  # Process up to 5 tickers
results = []

for ticker in tickers:
    logging.info(f"Processing ticker: {ticker}")
    balance_csv = data_dir / f'balance_sheet_{ticker}.csv'
    cashflow_csv = data_dir / f'cash_flow_{ticker}.csv'
    income_csv = data_dir / f'income_statement_{ticker}.csv'
    news_csv = data_dir / 'news_data.csv'
    
    # Load and prepare data
    data, scaler = load_and_prepare_data(stock_csv, balance_csv, cashflow_csv, income_csv, ticker)
    if data is None or len(data) < 100:
        logging.error(f"Skipping {ticker}: Insufficient data (<100 days)")
        continue
    
    # Load news data and merge
    sentiment_df = load_news_data(news_csv, ticker, data.index)
    if sentiment_df is not None:
        data = data.reset_index().merge(sentiment_df[['Date', 'Sentiment_Score']], on='Date', how='left')
        data['Sentiment_Score'] = data['Sentiment_Score'].fillna(0.0)
        data = data.set_index('Date')
        logging.info(f"Added Sentiment_Score to data for {ticker}. Columns: {list(data.columns)}")
    else:
        logging.warning(f"No sentiment data for {ticker}. Using neutral scores.")
        data['Sentiment_Score'] = 0.0

2025-05-05 19:03:55,256 - INFO - Set CMDSTAN directory to C:\Users\nguye/.cmdstan
2025-05-05 19:03:55,261 - INFO - Processing ticker: AAPL
2025-05-05 19:03:55,268 - ERROR - Error preparing data for AAPL: Index(['Ticker'], dtype='object'). Skipping ticker.
2025-05-05 19:03:55,269 - ERROR - Skipping AAPL: Insufficient data (<100 days)


**Models**

In [26]:
# ARIMA forecast
arima_forecast, arima_metrics, arima_order = arima_forecast(data['Close'], forecast_horizon)

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Prophet forecast with tuning
prophet_forecast, prophet_metrics, prophet_forecast_df, prophet_scale = tune_prophet(data, forecast_horizon)

**Walk-forward validation**

In [None]:
avg_metrics = walk_forward_validation(data, forecast_horizon)

**Backtesting**

In [None]:
# Backtesting
if len(data) >= 100:
    # ARIMA historical predictions
    arima_model = auto_arima(data['Close'][:-forecast_horizon], seasonal=False, max_p=7, max_q=7, max_d=2)
    arima_fit = ARIMA(data['Close'][:-forecast_horizon], order=arima_model.order).fit()
    arima_hist_pred = arima_fit.predict(start=0, end=len(data)-forecast_horizon-1)
    arima_backtest_results, arima_backtest_df = backtest_strategy(data.iloc[:-forecast_horizon], arima_hist_pred, 'ARIMA')
    
    # Prophet historical predictions
    train_data = data.iloc[:-forecast_horizon]
    prophet_df = train_data.reset_index()[['Date', 'Close', 'Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']]
    prophet_df = prophet_df.drop_duplicates(subset='Date')
    prophet_df = prophet_df.rename(columns={'Date': 'ds', 'Close': 'y'})
    
    prophet_model = Prophet(daily_seasonality=True, yearly_seasonality=True, weekly_seasonality=True,
                            changepoint_prior_scale=prophet_scale, mcmc_samples=0)
    for regressor in ['Diluted EPS', 'Free Cash Flow', 'Net Debt', 'EBITDA', 'Sentiment_Score']:
        prophet_model.add_regressor(regressor)
    prophet_model.fit(prophet_df)
    
    prophet_pred_df = prophet_model.predict(prophet_df)
    prophet_pred_df = prophet_pred_df.set_index('ds').reindex(train_data.index, method='ffill')
    prophet_hist_pred = prophet_pred_df['yhat']
    
    prophet_backtest_results, prophet_backtest_df = backtest_strategy(train_data, prophet_hist_pred, 'Prophet')
else:
    arima_backtest_results, arima_backtest_df = None, None
    prophet_backtest_results, prophet_backtest_df = None, None
    logging.warning(f"Insufficient data for backtesting {ticker}.")

**Plot predictions**

In [None]:
# Plot predictions
plot_predictions(data, arima_forecast, prophet_forecast, prophet_forecast_df,
                arima_backtest_df, prophet_backtest_df, ticker, forecast_horizon)

In [None]:
# Save predictions
if arima_forecast is not None and prophet_forecast is not None:
    future_dates = [data.index[-1] + timedelta(days=i+1) for i in range(forecast_horizon)]
    pred_df = pd.DataFrame({
        'Date': future_dates,
        'ARIMA_Prediction': arima_forecast,
        'Prophet_Prediction': prophet_forecast
    })
    pred_df.to_csv(data_dir / f'{ticker}_stock_price_predictions.csv', index=False)
    logging.info(f"Saved predictions for {ticker}")

# Save metrics
metrics_df = pd.DataFrame({
    'Model': ['ARIMA', 'Prophet'],
    'RMSE': [arima_metrics.get('RMSE', np.nan), prophet_metrics.get('RMSE', np.nan)],
    'MAE': [arima_metrics.get('MAE', np.nan), prophet_metrics.get('MAE', np.nan)],
    'MAPE': [arima_metrics.get('MAPE', np.nan), prophet_metrics.get('MAPE', np.nan)],
    'Best Parameters': [f"Order: {arima_order}", f"changepoint_prior_scale: {prophet_scale or 0.05}"]
})
metrics_df.to_csv(data_dir / f'{ticker}_model_metrics.csv', index=False)
logging.info(f"Saved metrics for {ticker}")

# Save backtest results
if arima_backtest_results and prophet_backtest_results:
    backtest_df = pd.DataFrame({
        'Model': ['ARIMA', 'Prophet'],
        'Cumulative Return (%)': [arima_backtest_results['Cumulative Return (%)'], 
                                prophet_backtest_results['Cumulative Return (%)']],
        'Number of Trades': [arima_backtest_results['Number of Trades'], 
                            prophet_backtest_results['Number of Trades']]
    })
    backtest_df.to_csv(data_dir / f'{ticker}_backtest_results.csv', index=False)
    logging.info(f"Saved backtest results for {ticker}")

# Select best model
best_model = 'Prophet' if avg_metrics['Prophet']['RMSE'] < avg_metrics['ARIMA']['RMSE'] else 'ARIMA'
results.append({'Ticker': ticker, 'Best_Model': best_model, 'Metrics': avg_metrics})

# Save summary
summary_df = pd.DataFrame(results)
summary_df.to_csv(data_dir / 'summary_results.csv', index=False)
logging.info("Saved summary results")