In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import json
import os
from scipy.stats import t as student_t

class SyntheticMarketGenerator:
    def __init__(self,
                 global_seed=None,
                 trading_days_per_year=252,
                 years=10,
                 default_bull_drift=0.12,
                 default_bear_drift=-0.10,
                 default_upward_bias=0.08,
                 default_bull_vol=0.15,
                 default_bear_vol=0.25):
        
        if global_seed is not None:
            np.random.seed(global_seed)

        self.trading_days_per_year = trading_days_per_year
        self.years = years
        self.total_days = trading_days_per_year * years

        self.default_bull_drift = default_bull_drift
        self.default_bear_drift = default_bear_drift
        self.default_upward_bias = default_upward_bias
        self.default_bull_vol = default_bull_vol
        self.default_bear_vol = default_bear_vol

        self.params = {
            'flash_crash_prob': 0.0002,
            'flash_crash_magnitude': (-0.15, -0.05),
            'earnings_jump_prob': 0.01,
            'earnings_jump_magnitude': (-0.08, 0.12),
            'degrees_of_freedom': 8,
            'vol_of_vol': 0.05,
            'vol_mean_reversion': 0.80,
            'base_vol': 0.10,
        }

        self.regime_transitions = {
            'bull_to_bull': 0.95,
            'bull_to_bear': 0.01,
            'bull_to_correction': 0.04,
            'bear_to_bear': 0.90,
            'bear_to_bull': 0.10,
            'correction_length': (5, 15),
            'correction_depth': (-0.10, -0.03),
        }

        self.BULL = "bull"
        self.BEAR = "bear"
        self.CORRECTION = "correction"
        self.CRASH = "crash"
        self.RECOVERY = "recovery"

    def generate_stock_data(self, ticker="STK", initial_price=None, randomize_params=True):
        """Generate daily OHLCV data for a single stock and return a DataFrame."""
        if randomize_params:
            bull_drift = np.random.normal(self.default_bull_drift, 0.03)
            bear_drift = np.random.normal(self.default_bear_drift, 0.02)
            upward_bias = np.random.normal(self.default_upward_bias, 0.02)
            bull_vol = np.random.normal(self.default_bull_vol, 0.03)
            bear_vol = np.random.normal(self.default_bear_vol, 0.03)
        else:
            bull_drift = self.default_bull_drift
            bear_drift = self.default_bear_drift
            upward_bias = self.default_upward_bias
            bull_vol = self.default_bull_vol
            bear_vol = self.default_bear_vol

        bull_vol = max(bull_vol, 0.02)
        bear_vol = max(bear_vol, 0.05)

        dates = self._generate_dates_with_offset()
        
        N = len(dates)
        close_prices = np.zeros(N)
        open_prices = np.zeros(N)
        high_prices = np.zeros(N)
        low_prices = np.zeros(N)
        volumes = np.zeros(N)
        regimes = np.array([self.BULL]*N, dtype=object)
        daily_vols = np.zeros(N)
        log_returns = np.zeros(N)
        
        # Track current price and date for volume generation
        self.last_close = initial_price
        self.current_date = None

        # Initialize first day properly
        if initial_price is None:
            initial_price = np.random.uniform(50, 150)
            self.last_close = initial_price
            
        # For the first day, we'll generate realistic OHLC values
        current_regime = self.BULL
        daily_vol = bull_vol / np.sqrt(self.trading_days_per_year)
        daily_vols[0] = daily_vol
        
        # Set current date for first day
        self.current_date = dates[0]
        
        # Generate first day's price action
        close_prices[0] = initial_price
        
        # First day's open is typically near the previous day's close
        # We'll use a small random deviation from the initial price
        open_deviation = np.random.normal(0, daily_vol)
        open_prices[0] = initial_price * (1 + open_deviation)
        
        # Generate realistic high/low for first day
        if open_deviation > 0:  # If opened up
            high_prices[0] = max(open_prices[0], close_prices[0]) * (1 + abs(np.random.normal(0, daily_vol)))
            low_prices[0] = min(open_prices[0], close_prices[0]) * (1 - abs(np.random.normal(0, daily_vol * 0.5)))
        else:  # If opened down
            high_prices[0] = max(open_prices[0], close_prices[0]) * (1 + abs(np.random.normal(0, daily_vol * 0.5)))
            low_prices[0] = min(open_prices[0], close_prices[0]) * (1 - abs(np.random.normal(0, daily_vol)))
            
        # Ensure OHLC relationships are maintained
        high_prices[0] = max(high_prices[0], open_prices[0], close_prices[0])
        low_prices[0] = min(low_prices[0], open_prices[0], close_prices[0])
        
        # Generate first day's volume
        volumes[0] = self._make_volume(open_deviation, daily_vol, current_regime)

        correction_target = None
        correction_end = None

        # Generate subsequent days
        for i in range(1, N):
            self.current_date = dates[i]
            
            current_regime, correction_target, correction_end = self._update_regime(
                current_regime, i, regimes, correction_target, correction_end
            )
            regimes[i] = current_regime

            drift_annual, vol_annual = self._get_regime_drift_vol(
                current_regime, bull_drift, bear_drift, bull_vol, bear_vol,
                i, correction_target, correction_end
            )

            daily_drift = np.log(1 + drift_annual) / self.trading_days_per_year
            desired_vol = vol_annual / np.sqrt(self.trading_days_per_year)
            daily_drift += upward_bias / self.trading_days_per_year

            daily_vol = (
                self.params['vol_mean_reversion']*desired_vol +
                (1 - self.params['vol_mean_reversion'])*daily_vols[i-1] +
                np.random.normal(0, self.params['vol_of_vol']/self.trading_days_per_year)
            )
            min_daily_vol = self.params['base_vol']/np.sqrt(self.trading_days_per_year)
            daily_vol = max(daily_vol, min_daily_vol)
            daily_vols[i] = daily_vol

            shock = student_t.rvs(df=self.params['degrees_of_freedom'])
            shock /= np.sqrt(self.params['degrees_of_freedom']/(self.params['degrees_of_freedom'] - 2))

            daily_log_return = daily_drift + daily_vol*shock

            if current_regime not in [self.CRASH, self.CORRECTION]:
                daily_log_return = self._special_events(daily_log_return)

            log_returns[i] = daily_log_return
            close_prices[i] = close_prices[i-1]*np.exp(daily_log_return)
            self.last_close = close_prices[i]

            o, h, l = self._make_ohlc(close_prices[i-1], close_prices[i], daily_vol, current_regime)
            open_prices[i], high_prices[i], low_prices[i] = o, h, l

            volumes[i] = self._make_volume(daily_log_return, daily_vol, current_regime)

        # Final validation to ensure OHLC relationships
        for i in range(N):
            high_prices[i] = max(open_prices[i], high_prices[i], low_prices[i], close_prices[i])
            low_prices[i] = min(open_prices[i], high_prices[i], low_prices[i], close_prices[i])

        # Create the DataFrame
        df = pd.DataFrame({
            'Date': dates,
            'Open': open_prices,
            'High': high_prices,
            'Low': low_prices,
            'Close': close_prices,
            'Volume': volumes.astype(int),
            'Regime': regimes,
            'Volatility': daily_vols,
            'LogReturn': log_returns
        })
        df.set_index('Date', inplace=True)
        
        # Round price columns to 2 decimal places for realism
        df['Open'] = np.round(df['Open'], 2)
        df['High'] = np.round(df['High'], 2) 
        df['Low'] = np.round(df['Low'], 2)
        df['Close'] = np.round(df['Close'], 2)
        
        df['LogReturn'] = np.round(df['LogReturn'], 6)  # 6 decimal places for returns
        df['Volatility'] = np.round(df['Volatility'], 6)  # 6 decimal places for volatility

        # Store original attributes
        df.attrs['ticker'] = ticker
        df.attrs['bull_drift'] = bull_drift
        df.attrs['bear_drift'] = bear_drift
        df.attrs['upward_bias'] = upward_bias
        df.attrs['bull_vol'] = bull_vol
        df.attrs['bear_vol'] = bear_vol

        # Calculate performance metrics
        total_return = df['Close'][-1] / df['Close'][0] - 1
        years_held = len(dates) / self.trading_days_per_year
        annualized_return = (1 + total_return) ** (1 / years_held) - 1
        annualized_vol = np.std(df['LogReturn'][1:]) * np.sqrt(self.trading_days_per_year)
        sharpe_ratio = annualized_return / annualized_vol if annualized_vol > 0 else 0

        # Add to DataFrame attributes
        df.attrs['total_return'] = total_return
        df.attrs['annualized_return'] = annualized_return
        df.attrs['sharpe_ratio'] = sharpe_ratio

        return df

    def _generate_dates_with_offset(self):
        """Generate a list of trading dates with a random offset up to 60 days."""
        offset = np.random.randint(0, 61)
        start_date = datetime.datetime(2010,1,1) + datetime.timedelta(days=offset)

        dates = []
        current = start_date
        while len(dates) < self.total_days:
            if current.weekday() < 5:  # Monday to Friday
                dates.append(current)
            current += datetime.timedelta(days=1)
        return dates

    def _update_regime(self, current_regime, i, regimes, corr_target, corr_end):
        r = np.random.random()
        if current_regime == self.BULL:
            if r < self.regime_transitions['bull_to_bear']:
                current_regime = self.BEAR
            elif r < (self.regime_transitions['bull_to_bear'] +
                      self.regime_transitions['bull_to_correction']):
                current_regime = self.CORRECTION
                dur = np.random.randint(*self.regime_transitions['correction_length'])
                corr_end = i + dur
                corr_target = np.random.uniform(*self.regime_transitions['correction_depth'])
        elif current_regime == self.BEAR:
            if r < self.regime_transitions['bear_to_bull']:
                current_regime = self.RECOVERY
                bear_days = np.sum(regimes[:i] == self.BEAR)
                corr_end = i + int(bear_days*0.5)
        elif current_regime == self.CORRECTION:
            if corr_end is not None and i >= corr_end:
                current_regime = self.BULL
                corr_target = None
                corr_end = None
        elif current_regime == self.RECOVERY:
            if corr_end is not None and i >= corr_end:
                current_regime = self.BULL
                corr_end = None
        elif current_regime == self.CRASH:
            current_regime = self.BULL
        return current_regime, corr_target, corr_end

    def _get_regime_drift_vol(self, regime, bull_drift, bear_drift,
                              bull_vol, bear_vol,
                              day_i, corr_target, corr_end):
        """Return annual drift & vol depending on the current regime."""
        if regime == self.BULL:
            drift = bull_drift
            vol   = bull_vol
        elif regime == self.BEAR:
            drift = bear_drift
            vol   = bear_vol
        elif regime == self.CORRECTION:
            if corr_target is None:
                corr_target = np.random.uniform(*self.regime_transitions['correction_depth'])
            drift = corr_target
            vol   = 0.5*(bull_vol + bear_vol)
        elif regime == self.RECOVERY:
            drift = bull_drift*1.5
            vol   = bull_vol + 0.3*(bear_vol - bull_vol)
        elif regime == self.CRASH:
            drift = np.random.uniform(*self.params['flash_crash_magnitude'])
            vol   = bear_vol*2
        else:
            drift = bull_drift
            vol   = bull_vol
        return drift, vol

    def _special_events(self, daily_log_return):
        """Flash crash or earnings jump with given probabilities."""
        if np.random.random() < self.params['flash_crash_prob']:
            return np.random.uniform(*self.params['flash_crash_magnitude'])
        if np.random.random() < self.params['earnings_jump_prob']:
            if np.random.random() < 0.55:
                jump = np.random.uniform(0, self.params['earnings_jump_magnitude'][1])
            else:
                jump = np.random.uniform(self.params['earnings_jump_magnitude'][0], 0)
            return daily_log_return + jump
        return daily_log_return

    def _make_ohlc(self, prev_close, curr_close, daily_vol, regime):
        """Construct open/high/low from close-to-close movement + random intraday range."""
        if regime in [self.BEAR, self.CRASH]:
            daily_range = 0.03
        else:
            daily_range = 0.02

        bull_daily_vol = self.default_bull_vol / np.sqrt(self.trading_days_per_year)
        factor = daily_vol / bull_daily_vol if bull_daily_vol > 0 else 1
        daily_range *= factor

        open_frac = np.clip(np.random.normal(0.5, 0.2), 0, 1)
        open_price = prev_close + (curr_close - prev_close)*open_frac

        if curr_close > prev_close:
            up_wick = np.random.uniform(0, daily_range*0.7)
            down_wick = np.random.uniform(0, daily_range*0.3)
        else:
            up_wick = np.random.uniform(0, daily_range*0.3)
            down_wick = np.random.uniform(0, daily_range*0.7)

        high_price = max(open_price, curr_close) + up_wick
        low_price  = min(open_price, curr_close) - down_wick

        if high_price < low_price:
            high_price = low_price * 1.001

        return open_price, high_price, low_price

    def _make_volume(self, daily_log_return, daily_vol, regime):
        """Generate realistic trading volume based on stock characteristics and market conditions."""
        # Base volume varies by 'market cap' (approximated by price)
        price = self.last_close  # We'll need to track this in generate_stock_data
        
        # Scale base volume by price to approximate market cap influence
        if price < 10:  # Penny/small cap
            base_volume = np.random.randint(50_000, 500_000)
        elif price < 50:  # Small-mid cap
            base_volume = np.random.randint(500_000, 2_000_000)
        elif price < 200:  # Mid-large cap
            base_volume = np.random.randint(2_000_000, 10_000_000)
        else:  # Large cap
            base_volume = np.random.randint(8_000_000, 30_000_000)

        # Apply existing scaling factors
        bull_daily_vol = self.default_bull_vol / np.sqrt(self.trading_days_per_year)
        if bull_daily_vol <= 0:
            bull_daily_vol = 1e-9

        vol_factor = 1 + 1.5*(daily_vol / bull_daily_vol - 1)
        
        # Higher volume on big price moves
        move_factor = 1
        if daily_vol > 0:
            move_factor = 1 + 0.8*(abs(daily_log_return)/daily_vol)
        
        # Day-to-day randomness in volume
        random_factor = np.random.lognormal(0, 0.6)

        # Volume tends to be higher on Mondays and Fridays
        day_of_week = self.current_date.weekday()
        weekday_factor = 1.0
        if day_of_week == 0:  # Monday
            weekday_factor = 1.1
        elif day_of_week == 4:  # Friday
            weekday_factor = 1.15

        volume = base_volume * vol_factor * move_factor * random_factor * weekday_factor

        # Volume is much higher during market stress
        if regime == self.CRASH:
            volume *= 5
        elif regime == self.BEAR:
            volume *= 1.3
        elif regime == self.CORRECTION:
            volume *= 1.2

        # Ensure volume is a whole number
        return int(volume)

    def _calculate_max_drawdown(self, prices):
        """Calculate the maximum drawdown from a series of prices."""
        running_max = np.maximum.accumulate(prices)
        drawdowns = prices / running_max - 1
        return float(np.min(drawdowns))

    def plot_stock(self, df, ticker, save_path):
        """
        Basic plot: line chart of 'Close' with color shading for each regime.
        Saves the figure to 'save_path'.

        ``df`` must provide 'Close' and 'Regime' columns. The title shows
        'annualized_return' and 'sharpe_ratio' from ``df.attrs`` (0 when
        missing). The figure is closed even if plotting or saving raises, so
        failures in a long portfolio run do not accumulate open figures.
        """
        regime_colors = {
            self.BULL: 'lightgreen',
            self.BEAR: 'lightcoral',
            self.CORRECTION: 'yellow',
            self.CRASH: 'red',
            self.RECOVERY: 'lightblue'
        }

        fig, ax = plt.subplots(figsize=(10,6))
        try:
            ax.plot(df.index, df['Close'], 'k-', lw=1.5, label='Close')

            # Shade from 0 to 10% above the highest close wherever a regime is active.
            max_y = df['Close'].max() * 1.1
            for regime_val, color in regime_colors.items():
                mask = (df['Regime'] == regime_val)
                if mask.any():
                    ax.fill_between(
                        df.index, 0, max_y,
                        where=mask, color=color, alpha=0.2,
                        label=regime_val
                    )

            # Add performance metrics to the title
            annualized_return = df.attrs.get('annualized_return', 0) * 100
            sharpe_ratio = df.attrs.get('sharpe_ratio', 0)
            title = f"{ticker} Synthetic Price\nAnn. Return: {annualized_return:.2f}%, Sharpe Ratio: {sharpe_ratio:.2f}"

            ax.set_title(title, fontsize=14)
            ax.set_ylabel("Price")
            ax.grid(True, alpha=0.3)
            ax.legend(loc='best')
            fig.savefig(save_path, dpi=150, bbox_inches='tight')
        finally:
            plt.close(fig)

    def generate_random_ticker(self, length=4):
        """
        Create a random uppercase 'ticker' name of given length, e.g. 'ABCD'.
        """
        letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
        # We'll pick random letters from the above
        # If you want to ensure uniqueness, you could store used tickers in a set,
        # but for demonstration, this is fine.
        arr = np.random.choice(letters, size=length, replace=True)
        return "".join(arr)

    def generate_portfolio(self, num_stocks=5, output_dir="synthetic_portfolio"):
        """
        Generate data for 'num_stocks' random tickers,
        save CSV + PNG + JSON for each.
        """
        os.makedirs(output_dir, exist_ok=True)
        all_stocks = {}

        for _ in range(num_stocks):
            # Make a random 3- or 4-letter ticker name
            name_len = np.random.choice([3,4])
            ticker_name = self.generate_random_ticker(length=name_len)

            df = self.generate_stock_data(ticker=ticker_name)
            all_stocks[ticker_name] = df

            # Save CSV
            csv_path = os.path.join(output_dir, f"{ticker_name}.csv")
            df.to_csv(csv_path)

            # Save figure
            fig_path = os.path.join(output_dir, f"{ticker_name}.png")
            self.plot_stock(df, ticker_name, fig_path)

            # Save metadata as JSON
            meta = {
                'ticker': ticker_name,
                'start_date': df.index[0].strftime('%Y-%m-%d'),
                'end_date': df.index[-1].strftime('%Y-%m-%d'),
                'initial_price': float(df['Close'][0]),
                'final_price': float(df['Close'][-1]),
                'total_return': float(df.attrs['total_return'] * 100),
                'annualized_return': float(df.attrs['annualized_return'] * 100),
                'sharpe_ratio': float(df.attrs['sharpe_ratio']),
                'max_drawdown': float(self._calculate_max_drawdown(df['Close'])),
                'annualized_volatility': float(np.std(df['LogReturn']) * np.sqrt(self.trading_days_per_year) * 100),
                'regime_distribution': {
                    regime: int(np.sum(df['Regime'] == regime))
                    for regime in [self.BULL, self.BEAR, self.CORRECTION, self.RECOVERY, self.CRASH]
                },
                'bull_drift': df.attrs['bull_drift'],
                'bear_drift': df.attrs['bear_drift'],
                'upward_bias': df.attrs['upward_bias'],
                'bull_vol': df.attrs['bull_vol'],
                'bear_vol': df.attrs['bear_vol']
            }
            meta_path = os.path.join(output_dir, f"{ticker_name}_metadata.json")
            with open(meta_path, 'w') as f:
                json.dump(meta, f, indent=2)

        print(f"Generated {num_stocks} random stocks into '{output_dir}'.")
        return all_stocks

def main():
    """Generate a 50-stock synthetic portfolio with the global seed fixed at 42."""
    generator = SyntheticMarketGenerator(global_seed=42)
    generator.generate_portfolio(
        num_stocks=50,
        output_dir="sonnet_ticker_portfolio",
    )

# Run the example portfolio generation only when this code is executed as the
# main module (not when it is imported).
if __name__ == "__main__":
    main()

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import talib
from tqdm import tqdm

def calculate_technical_indicators(df):
    """Calculate technical indicators from OHLC data.

    Works on a copy of ``df`` (which must have 'High', 'Low' and 'Close'
    columns) and returns it with these columns added: MA5/MA20/MA50/MA200,
    MA5_cross_MA20, MA50_cross_MA200, RSI, BB_upper/BB_middle/BB_lower,
    BB_width and ATR.

    The two ``*_cross_*`` columns are 1 while the faster average is above the
    slower one and 0 otherwise, including the warm-up rows where either
    average is still NaN.
    """
    ohlc_df = df.copy()

    # TA-Lib only accepts float64 arrays, so cast once and reuse. This keeps
    # integer-typed price columns from raising inside talib.
    close = ohlc_df['Close'].to_numpy(dtype=np.float64)
    high = ohlc_df['High'].to_numpy(dtype=np.float64)
    low = ohlc_df['Low'].to_numpy(dtype=np.float64)

    # Simple moving averages
    ohlc_df['MA5'] = talib.SMA(close, timeperiod=5)
    ohlc_df['MA20'] = talib.SMA(close, timeperiod=20)
    ohlc_df['MA50'] = talib.SMA(close, timeperiod=50)
    ohlc_df['MA200'] = talib.SMA(close, timeperiod=200)

    # Fast-above-slow state flags
    ohlc_df['MA5_cross_MA20'] = (ohlc_df['MA5'] > ohlc_df['MA20']).astype(int)
    ohlc_df['MA50_cross_MA200'] = (ohlc_df['MA50'] > ohlc_df['MA200']).astype(int)

    # 14-period RSI
    ohlc_df['RSI'] = talib.RSI(close, timeperiod=14)

    # 20-period Bollinger Bands at +/- 2 standard deviations
    upper, middle, lower = talib.BBANDS(close, timeperiod=20,
                                        nbdevup=2, nbdevdn=2, matype=0)
    ohlc_df['BB_upper'] = upper
    ohlc_df['BB_middle'] = middle
    ohlc_df['BB_lower'] = lower
    ohlc_df['BB_width'] = (upper - lower) / middle

    # 14-period Average True Range
    ohlc_df['ATR'] = talib.ATR(high, low, close, timeperiod=14)

    return ohlc_df

def calculate_rolling_max_drawdown(df, window=252):
    """
    Calculate the worst drawdown within a rolling window of specified length.

    For each row, look at that row and up to ``window - 1`` rows before it,
    and report the largest peak-to-trough decline of 'Close' inside that
    span, as a value <= 0 (0 means no decline).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Close' column.
    window : int
        Number of rows per window; must be at least 1.

    Returns
    -------
    pandas.Series
        float64 series aligned to ``df.index``.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.

    Notes
    -----
    NaN closes are skipped: they neither set the running peak nor count as a
    drawdown. A window with no valid price yields 0.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    closes = df['Close'].to_numpy(dtype=float)
    worst = np.zeros(len(closes), dtype='float64')

    for end in range(len(closes)):
        start = max(0, end - window + 1)
        segment = closes[start:end + 1]

        # Running peak inside the window; fmax ignores NaN so one missing
        # price cannot poison every later peak.
        running_peak = np.fmax.accumulate(segment)
        drawdowns = segment / running_peak - 1
        valid = drawdowns[~np.isnan(drawdowns)]

        if valid.size:
            worst[end] = min(0.0, valid.min())

    return pd.Series(worst, index=df.index, dtype='float64')

def calculate_return_features(df):
    """Add return, momentum, drawdown and rolling-Sharpe columns to a copy of ``df``.

    Requires a 'Close' column. Writes 'LogReturn' (overwriting any existing
    column of that name), 'Return_1W', 'Return_1M', 'Return_3M',
    'CurrentDrawdown', 'MaxDrawdown_252d', 'Sharpe_20d' and 'Sharpe_60d'.
    The Sharpe columns use a zero risk-free rate and are annualised with
    sqrt(252).
    """
    ret_df = df.copy()
    close = ret_df['Close']

    # Daily log return versus the previous row.
    ret_df['LogReturn'] = np.log(close / close.shift(1))

    # Momentum over roughly one week, one month and three months of trading days.
    for column, periods in (('Return_1W', 5), ('Return_1M', 21), ('Return_3M', 63)):
        ret_df[column] = close.pct_change(periods)

    # Distance below the highest close seen so far.
    ret_df['CurrentDrawdown'] = (close / close.cummax()) - 1

    # Worst drawdown inside the trailing window (helper default: 252 rows).
    ret_df['MaxDrawdown_252d'] = calculate_rolling_max_drawdown(ret_df)

    # Rolling mean/std of log returns, annualised.
    log_returns = ret_df['LogReturn']
    for column, span in (('Sharpe_20d', 20), ('Sharpe_60d', 60)):
        rolling = log_returns.rolling(span)
        ret_df[column] = (rolling.mean() / rolling.std()) * np.sqrt(252)

    return ret_df

def preprocess_stocks(input_dir, output_dir):
    """Build a feature file for every ticker CSV found in ``input_dir``.

    Each ``*.csv`` whose name does not contain "_metadata" is read with its
    'Date' column as a parsed index, enriched with technical indicators and
    return features, stripped of rows containing NaN, and written to
    ``output_dir`` as ``preprocessed_<original name>``. ``output_dir`` is
    created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = Path(output_dir)

    csv_files = list(Path(input_dir).glob("*.csv"))
    print(f"Found {len(csv_files)} CSV files to process")

    for csv_file in tqdm(csv_files, desc="Processing stocks"):
        source_name = csv_file.name

        # Only ticker price files are wanted.
        if "_metadata" in source_name:
            continue

        frame = pd.read_csv(csv_file, index_col='Date', parse_dates=True)

        print(f"Calculating technical indicators for {source_name}")
        frame = calculate_technical_indicators(frame)

        print(f"Calculating return features for {source_name}")
        frame = calculate_return_features(frame)

        # Indicator warm-up periods leave NaN in the leading rows; drop them.
        frame = frame.dropna()

        output_file = destination / f"preprocessed_{source_name}"
        frame.to_csv(output_file)

        print(f"Processed {source_name} -> {output_file.name}")

# Script entry point: preprocess the CSV files found in the input directory.
# NOTE(review): both directories are absolute paths under /Users/newuser, so
# they only resolve on a machine with that layout — adjust before running
# elsewhere.
if __name__ == "__main__":
    input_directory = "/Users/newuser/Projects/robust_algo_trader/data/gen_synthetic_data/sonnet_ticker_portfolio"
    output_directory = "/Users/newuser/Projects/robust_algo_trader/data/gen_synthetic_data/preprocessed_data"
    
    preprocess_stocks(input_directory, output_directory)
    print("Preprocessing complete!")