In [1]:
import pandas as pd
import sqlite3
import plotly.express as px
from tqdm import tqdm
conn = sqlite3.connect("data/database.db")

In [2]:
df = pd.read_sql_query("SELECT * FROM price_data", conn)

In [3]:
df.head()

Unnamed: 0,date,ticker,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,...,return_2d,return_3d,return_6d,return_9d,return_12d,Mkt-RF,SMB,HML,RMW,CMA
0,2015-01-27 00:00:00,A,38.75,39.25,38.580002,38.700001,1703500.0,-0.002277,42.551154,3.572676,...,-0.000773,-0.007624,0.002167,-0.002268,-0.00447,-0.013229,-0.0401,0.0492,0.050537,-0.007434
1,2015-01-27 00:00:00,AAPL,27.285,28.120001,27.2575,28.105,382274800.0,-0.007667,45.914745,3.195314,...,-0.017141,-0.009763,0.004893,-0.001093,-0.002072,0.073535,-0.020393,-0.015092,0.155138,0.208851
2,2015-01-27 00:00:00,ABBV,63.099998,63.5,62.200001,62.330002,6532300.0,-0.059758,41.454726,3.738412,...,0.005352,0.000635,-0.003754,-0.000509,-0.005141,0.072632,0.0454,-0.073537,0.175087,-0.034093
3,2015-01-27 00:00:00,ABT,43.68,44.02,43.43,43.880001,4847400.0,-0.013836,39.157346,3.615259,...,-0.002054,-0.006106,-0.003058,-0.002338,-0.003285,-0.030629,0.048403,0.014877,0.15112,-0.068622
4,2015-01-27 00:00:00,ACGL,19.953333,20.16,19.683332,19.76,1379700.0,-0.000351,58.501939,2.967072,...,0.00378,0.00106,0.001709,0.001308,0.000321,-0.06449,-0.056542,0.17124,-0.082922,-0.165375


In [6]:
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class FinancialBayesianModel:
    def __init__(self, data, max_tickers=10):
        """
        Initialize the Bayesian financial model with ticker limit
        
        Parameters:
        data (pd.DataFrame): Financial time series data
        max_tickers (int): Maximum number of unique tickers to process
        """
        # Limit the number of unique tickers
        unique_tickers = data['ticker'].unique()
        if len(unique_tickers) > max_tickers:
            selected_tickers = np.random.choice(unique_tickers, max_tickers, replace=False)
            self.original_data = data[data['ticker'].isin(selected_tickers)].copy()
        else:
            self.original_data = data.copy()
        
        self.data = None
        self.scaler = StandardScaler()
    
    def preprocess_data(self, 
                         features=['close', 'volume', 'rsi', 'sharpe_ratio'],
                         target='return_2d', 
                         test_size=0.2):
        """
        Preprocess data with memory-efficient approach
        """
        # Sort and prepare data
        df = self.original_data.sort_values(['date', 'ticker'])
        
        # Select and prepare columns
        cols_to_use = features + [target, 'ticker']
        df_subset = df[cols_to_use].dropna()
        
        # Use categorical encoding for tickers
        df_subset['ticker_code'] = pd.Categorical(df_subset['ticker']).codes
        
        # Prepare features and target
        X = df_subset[features].values
        y = df_subset[target].values
        tickers = df_subset['ticker_code'].values
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        
        # Split data with stratification
        self.X_train, self.X_test, self.y_train, self.y_test, \
        self.tickers_train, self.tickers_test = \
            train_test_split(X_scaled, y, tickers, 
                             test_size=test_size, 
                             stratify=tickers, 
                             random_state=42)
        
        self.features = features
        self.target = target
        self.unique_tickers = np.unique(tickers)
        
        return self
    
    def create_hierarchical_model(self):
        """
        Simplified hierarchical model to avoid recursion
        """
        with pm.Model() as model:
            # Global parameters with less complexity
            mu_alpha = pm.Normal('mu_alpha', mu=0, sigma=1)
            sigma_alpha = pm.HalfNormal('sigma_alpha', sigma=0.5)
            
            # Global coefficients
            mu_betas = pm.Normal('mu_betas', mu=0, sigma=0.5, 
                                 shape=self.X_train.shape[1])
            sigma_betas = pm.HalfNormal('sigma_betas', sigma=0.5, 
                                        shape=self.X_train.shape[1])
            
            # Model variance
            sigma = pm.HalfNormal('sigma', sigma=1)
            
            # Linear model with vectorized computation
            mu = pm.math.dot(self.X_train, mu_betas)
            
            # Likelihood
            likelihood = pm.Normal('returns', 
                                   mu=mu, 
                                   sigma=sigma, 
                                   observed=self.y_train)
        
        return model
    
    def run_inference(self, model, draws=1000, tune=500):
        """
        Run inference with reduced complexity
        """
        with model:
            # Use NUTS sampler with adjusted parameters
            trace = pm.sample(draws=draws, 
                              tune=tune, 
                              #return_inferrable=True,
                              cores=1,  # Avoid multiprocessing issues
                              target_accept=0.9)
        
        return trace
    
    def predict(self, trace):
        """
        Make predictions with simplified approach
        """
        # Extract posterior samples
        mu_betas = trace.posterior['mu_betas']
        sigma = trace.posterior['sigma']
        
        # Predict on test data
        y_pred_samples = np.dot(self.X_test, mu_betas.T)
        
        return {
            'true_returns': self.y_test,
            'mean_prediction': y_pred_samples.mean(axis=0),
            'lower_ci': np.percentile(y_pred_samples, 2.5, axis=0),
            'upper_ci': np.percentile(y_pred_samples, 97.5, axis=0)
        }
    
    def evaluate_predictions(self, predictions):
        """
        Evaluate model predictions
        """
        mse = np.mean((predictions['true_returns'] - predictions['mean_prediction'])**2)
        mae = np.mean(np.abs(predictions['true_returns'] - predictions['mean_prediction']))
        
        return {
            'MSE': mse,
            'MAE': mae,
            'Coverage': np.mean((predictions['true_returns'] >= predictions['lower_ci']) & 
                                (predictions['true_returns'] <= predictions['upper_ci']))
        }

def main():
   
    # Initialize model with ticker limit
    model_runner = FinancialBayesianModel(df, max_tickers=50)
    
    # Preprocess data
    model_runner.preprocess_data(
        features=['close', 'volume', 'rsi', 'sharpe_ratio'],
        target='return_2d'
    )
    
    # Create and run model
    bayesian_model = model_runner.create_hierarchical_model()
    trace = model_runner.run_inference(bayesian_model)
    
    # Make predictions
    predictions = model_runner.predict(trace)
    
    # Evaluate
    performance = model_runner.evaluate_predictions(predictions)
    print("Performance Metrics:", performance)
    
    # Optional visualization
    plt.figure(figsize=(10, 6))
    plt.plot(predictions['true_returns'], label='Actual Returns')
    plt.plot(predictions['mean_prediction'], label='Predicted Returns')
    plt.fill_between(range(len(predictions['lower_ci'])), 
                     predictions['lower_ci'], 
                     predictions['upper_ci'], 
                     alpha=0.2)
    plt.legend()
    plt.show()

if __name__ == '__main__':
    main()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [mu_alpha, sigma_alpha, mu_betas, sigma_betas, sigma]


Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 319 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


ValueError: shapes (24877,4) and (4,1000,2) not aligned: 4 (dim 1) != 1000 (dim 1)