In [22]:
import pandas as pd
import sqlite3
import plotly.express as px
from tqdm import tqdm
conn = sqlite3.connect("data/database.db")

In [23]:
df = pd.read_sql_query("SELECT * FROM price_data", conn)

In [24]:
df.head()

Unnamed: 0,date,ticker,close,high,low,open,volume,garman_klass_vol,rsi,bb_low,...,return_2d,return_3d,return_6d,return_9d,return_12d,Mkt-RF,SMB,HML,RMW,CMA
0,2014-02-07 00:00:00,A,42.503578,42.539341,41.58083,41.816879,2749586.0,-0.001839,56.320226,3.634578,...,0.017178,0.009198,-0.000252,0.002117,-0.002089,-0.018428,-0.034321,0.027925,0.014862,0.02653
1,2014-02-07 00:00:00,AAPL,18.559999,18.676071,18.477858,18.620714,370280400.0,-0.006857,44.058787,2.7857,...,0.009901,0.00909,0.00753,-0.005722,-0.004447,0.053902,0.008192,-0.03389,0.145374,0.193726
2,2014-02-07 00:00:00,ABBV,48.889999,49.029999,47.66,48.0,6979500.0,-0.067514,46.153243,3.435299,...,0.012079,0.006492,0.001991,0.004795,-0.000323,0.058951,0.079235,-0.09755,0.208368,-0.057851
3,2014-02-07 00:00:00,ABT,37.18,37.209999,36.650002,36.77,12028000.0,-0.014735,46.146218,3.379105,...,0.011352,0.00848,0.002944,0.002665,-0.001969,-0.026242,0.059898,-0.027832,0.136349,-0.043127
4,2014-02-07 00:00:00,ACGL,17.583332,17.663334,17.41,17.603333,1496100.0,0.000104,30.310212,2.913167,...,-0.000757,-0.001826,-0.004815,-0.001795,-0.003656,-0.06459,-0.034905,0.148707,-0.067339,-0.136078


In [None]:
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class FinancialBayesianModel:
    def __init__(self, data, max_tickers=50):
        """
        Initialize the Bayesian financial model with ticker limit
        
        Parameters:
        data (pd.DataFrame): Financial time series data
        max_tickers (int): Maximum number of unique tickers to process
        """
        # Limit the number of unique tickers
        unique_tickers = data['ticker'].unique()
        if len(unique_tickers) > max_tickers:
            selected_tickers = np.random.choice(unique_tickers, max_tickers, replace=False)
            self.original_data = data[data['ticker'].isin(selected_tickers)].copy()
        else:
            self.original_data = data.copy()
        
        self.data = None
        self.scaler = StandardScaler()
    
    def preprocess_data(self, 
                         features=['close', 'volume', 'rsi', 'sharpe_ratio'],
                         target='return_2d', 
                         test_size=0.2):
        """
        Preprocess data with memory-efficient approach
        """
        # Sort and prepare data
        df = self.original_data.sort_values(['date', 'ticker'])
        
        # Select and prepare columns
        cols_to_use = features + [target, 'ticker']
        df_subset = df[cols_to_use].dropna()
        
        # Use categorical encoding for tickers
        df_subset['ticker_code'] = pd.Categorical(df_subset['ticker']).codes
        
        # Prepare features and target
        X = df_subset[features].values
        y = df_subset[target].values
        tickers = df_subset['ticker_code'].values
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        
        # Split data with stratification
        self.X_train, self.X_test, self.y_train, self.y_test, \
        self.tickers_train, self.tickers_test = \
            train_test_split(X_scaled, y, tickers, 
                             test_size=test_size, 
                             stratify=tickers, 
                             random_state=42)
        
        self.features = features
        self.target = target
        self.unique_tickers = np.unique(tickers)
        
        return self
    
    def create_hierarchical_model(self):
        """
        Simplified hierarchical model to avoid recursion
        """
        with pm.Model() as model:
            # Global parameters with less complexity
            mu_alpha = pm.Normal('mu_alpha', mu=0, sigma=1)
            sigma_alpha = pm.HalfNormal('sigma_alpha', sigma=0.5)
            
            # Global coefficients
            mu_betas = pm.Normal('mu_betas', mu=0, sigma=0.5, 
                                 shape=self.X_train.shape[1])
            sigma_betas = pm.HalfNormal('sigma_betas', sigma=0.5, 
                                        shape=self.X_train.shape[1])
            
            # Model variance
            sigma = pm.HalfNormal('sigma', sigma=1)
            
            # Linear model with vectorized computation
            mu = pm.math.dot(self.X_train, mu_betas)
            
            # Likelihood
            likelihood = pm.Normal('returns', 
                                   mu=mu, 
                                   sigma=sigma, 
                                   observed=self.y_train)
        
        return model
    
    def run_inference(self, model, draws=1000, tune=500):
        """
        Run inference with reduced complexity
        """
        with model:
            # Use NUTS sampler with adjusted parameters
            trace = pm.sample(draws=draws, 
                              tune=tune, 
                              return_inferrable=True,
                              cores=1,  # Avoid multiprocessing issues
                              target_accept=0.9)
        
        return trace
    
    def predict(self, trace):
        """
        Make predictions with simplified approach
        """
        # Extract posterior samples
        mu_betas = trace.posterior['mu_betas']
        sigma = trace.posterior['sigma']
        
        # Predict on test data
        y_pred_samples = np.dot(self.X_test, mu_betas.T)
        
        return {
            'true_returns': self.y_test,
            'mean_prediction': y_pred_samples.mean(axis=0),
            'lower_ci': np.percentile(y_pred_samples, 2.5, axis=0),
            'upper_ci': np.percentile(y_pred_samples, 97.5, axis=0)
        }
    
    def evaluate_predictions(self, predictions):
        """
        Evaluate model predictions
        """
        mse = np.mean((predictions['true_returns'] - predictions['mean_prediction'])**2)
        mae = np.mean(np.abs(predictions['true_returns'] - predictions['mean_prediction']))
        
        return {
            'MSE': mse,
            'MAE': mae,
            'Coverage': np.mean((predictions['true_returns'] >= predictions['lower_ci']) & 
                                (predictions['true_returns'] <= predictions['upper_ci']))
        }

def main():
    # Load your financial data
    df = pd.read_csv('your_financial_data.csv')
    
    # Initialize model with ticker limit
    model_runner = FinancialBayesianModel(df, max_tickers=50)
    
    # Preprocess data
    model_runner.preprocess_data(
        features=['close', 'volume', 'rsi', 'sharpe_ratio'],
        target='return_2d'
    )
    
    # Create and run model
    bayesian_model = model_runner.create_hierarchical_model()
    trace = model_runner.run_inference(bayesian_model)
    
    # Make predictions
    predictions = model_runner.predict(trace)
    
    # Evaluate
    performance = model_runner.evaluate_predictions(predictions)
    print("Performance Metrics:", performance)
    
    # Optional visualization
    plt.figure(figsize=(10, 6))
    plt.plot(predictions['true_returns'], label='Actual Returns')
    plt.plot(predictions['mean_prediction'], label='Predicted Returns')
    plt.fill_between(range(len(predictions['lower_ci'])), 
                     predictions['lower_ci'], 
                     predictions['upper_ci'], 
                     alpha=0.2)
    plt.legend()
    plt.show()

if __name__ == '__main__':
    main()

RecursionError: maximum recursion depth exceeded in comparison

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np

class Preprocessor:
    def __init__(self, data):
        self.original_data = data.copy()
        self.features = [ 'garman_klass_vol', 'rsi', 'bb_low', 'bb_mid', 'bb_high',
       'sharpe_ratio', 'atr', 'macd', 'dollar_volume', 'return_1d',
       'return_2d', 'return_3d', 'return_6d', 'return_9d', 'return_12d',
       'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']
        self.data = None
        self.target = None
        self.scaler = StandardScaler()
    

    def create_MA(self):
        df = self.original_data.sort_values(["date","ticker"])
        df['MA_10'] = df.groupby('ticker')['close'].transform(lambda x: x.rolling(window=10).mean())
        df['MA_50'] = df.groupby('ticker')['close'].transform(lambda x: x.rolling(window=50).mean())

        self.data = df
    
    def scale_group(group): 
        scaler = StandardScaler()
        numerical_cols = self.features + ["MA_10","MA_50"]
        group[numerical_cols] = scaler.fit_transform(group[numerical_cols])
    return df.groupby("ticker").apply(scale_group).reset_index(drop=True)

prepro = Preprocessor(df)

prepro.create_MA()

UnboundLocalError: cannot access local variable 'df' where it is not associated with a value

In [31]:
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class FinancialBayesianModel:
    def __init__(self, data, max_tickers=50):
        """
        Initialize the Bayesian financial model with ticker limit
        
        Parameters:
        data (pd.DataFrame): Financial time series data
        max_tickers (int): Maximum number of unique tickers to process
        """
        # Limit the number of unique tickers
        unique_tickers = data['ticker'].unique()
        if len(unique_tickers) > max_tickers:
            selected_tickers = np.random.choice(unique_tickers, max_tickers, replace=False)
            self.original_data = data[data['ticker'].isin(selected_tickers)].copy()
        else:
            self.original_data = data.copy()
        
        self.data = None
        self.scaler = StandardScaler()
    
    def preprocess_data(self, 
                         features=['close', 'volume', 'rsi', 'sharpe_ratio'],
                         target='return_2d', 
                         test_size=0.2):
        """
        Preprocess data with memory-efficient approach
        """
        # Sort and prepare data
        df = self.original_data.sort_values(['date', 'ticker'])
        
        # Select and prepare columns
        cols_to_use = features + [target, 'ticker']
        df_subset = df[cols_to_use].dropna()
        
        # Use categorical encoding for tickers
        df_subset['ticker_code'] = pd.Categorical(df_subset['ticker']).codes
        
        # Prepare features and target
        X = df_subset[features].values
        y = df_subset[target].values
        tickers = df_subset['ticker_code'].values
        
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        
        # Split data with stratification
        self.X_train, self.X_test, self.y_train, self.y_test, \
        self.tickers_train, self.tickers_test = \
            train_test_split(X_scaled, y, tickers, 
                             test_size=test_size, 
                             stratify=tickers, 
                             random_state=42)
        
        self.features = features
        self.target = target
        self.unique_tickers = np.unique(tickers)
        
        return self
    
    def create_hierarchical_model(self):
        """
        Simplified hierarchical model to avoid recursion
        """
        with pm.Model() as model:
            # Global parameters with less complexity
            mu_alpha = pm.Normal('mu_alpha', mu=0, sigma=1)
            sigma_alpha = pm.HalfNormal('sigma_alpha', sigma=0.5)
            
            # Global coefficients
            mu_betas = pm.Normal('mu_betas', mu=0, sigma=0.5, 
                                 shape=self.X_train.shape[1])
            sigma_betas = pm.HalfNormal('sigma_betas', sigma=0.5, 
                                        shape=self.X_train.shape[1])
            
            # Model variance
            sigma = pm.HalfNormal('sigma', sigma=1)
            
            # Linear model with vectorized computation
            mu = pm.math.dot(self.X_train, mu_betas)
            
            # Likelihood
            likelihood = pm.Normal('returns', 
                                   mu=mu, 
                                   sigma=sigma, 
                                   observed=self.y_train)
        
        return model
    
    def run_inference(self, model, draws=1000, tune=500):
        """
        Run inference with reduced complexity
        """
        with model:
            # Use NUTS sampler with adjusted parameters
            trace = pm.sample(draws=draws, 
                              tune=tune, 
                              #return_inferrable=True,
                              cores=1,  # Avoid multiprocessing issues
                              target_accept=0.9)
        
        return trace
    
    def predict(self, trace):
        """
        Make predictions with simplified approach
        """
        # Extract posterior samples
        mu_betas = trace.posterior['mu_betas']
        sigma = trace.posterior['sigma']
        
        # Predict on test data
        y_pred_samples = np.dot(self.X_test, mu_betas.T)
        
        return {
            'true_returns': self.y_test,
            'mean_prediction': y_pred_samples.mean(axis=0),
            'lower_ci': np.percentile(y_pred_samples, 2.5, axis=0),
            'upper_ci': np.percentile(y_pred_samples, 97.5, axis=0)
        }
    
    def evaluate_predictions(self, predictions):
        """
        Evaluate model predictions
        """
        mse = np.mean((predictions['true_returns'] - predictions['mean_prediction'])**2)
        mae = np.mean(np.abs(predictions['true_returns'] - predictions['mean_prediction']))
        
        return {
            'MSE': mse,
            'MAE': mae,
            'Coverage': np.mean((predictions['true_returns'] >= predictions['lower_ci']) & 
                                (predictions['true_returns'] <= predictions['upper_ci']))
        }

def main():
   
    # Initialize model with ticker limit
    model_runner = FinancialBayesianModel(df, max_tickers=50)
    
    # Preprocess data
    model_runner.preprocess_data(
        features=['close', 'volume', 'rsi', 'sharpe_ratio'],
        target='return_2d'
    )
    
    # Create and run model
    bayesian_model = model_runner.create_hierarchical_model()
    trace = model_runner.run_inference(bayesian_model)
    
    # Make predictions
    predictions = model_runner.predict(trace)
    
    # Evaluate
    performance = model_runner.evaluate_predictions(predictions)
    print("Performance Metrics:", performance)
    
    # Optional visualization
    plt.figure(figsize=(10, 6))
    plt.plot(predictions['true_returns'], label='Actual Returns')
    plt.plot(predictions['mean_prediction'], label='Predicted Returns')
    plt.fill_between(range(len(predictions['lower_ci'])), 
                     predictions['lower_ci'], 
                     predictions['upper_ci'], 
                     alpha=0.2)
    plt.legend()
    plt.show()

if __name__ == '__main__':
    main()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [mu_alpha, sigma_alpha, mu_betas, sigma_betas, sigma]


Sampling 2 chains for 500 tune and 1_000 draw iterations (1_000 + 2_000 draws total) took 373 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


ValueError: shapes (24628,4) and (4,1000,2) not aligned: 4 (dim 1) != 1000 (dim 1)