In [1]:
import pandas as pd
import os

def _read_day_csv(path: str, sep: str, day: int):
    """
    Read one day's CSV file, tag it with its day number and sort it by time.

    Args:
        path (str): Full path of the CSV file
        sep (str): Field delimiter used by the file
        day (int): Day number written into the added 'day' column

    Returns:
        pd.DataFrame or None: The loaded frame, or None when the file does not exist
    """
    if not os.path.exists(path):
        return None
    frame = pd.read_csv(path, sep=sep)
    frame['day'] = day
    # Files without a 'timestamp' column are kept in their on-disk order.
    if 'timestamp' in frame.columns:
        frame = frame.sort_values('timestamp')
    return frame


def load_trading_data(data_folder: str, days=(1, 2, 3), round_number: int = 4) -> dict[str, pd.DataFrame]:
    """
    Load trading data from CSV files into pandas DataFrames.

    For every requested day the files ``prices_round_<R>_day_<D>.csv``,
    ``trades_round_<R>_day_<D>.csv`` (both ';'-separated) and
    ``observations_round_<R>_day_<D>.csv`` (','-separated) are read if they
    exist. Each frame gets a 'day' column and is sorted by 'timestamp' when
    that column is present; missing files are skipped silently.

    Args:
        data_folder (str): Path to the folder containing the CSV files
        days (iterable of int, optional): Day numbers to load, in the order
            they should be concatenated. Defaults to (1, 2, 3).
        round_number (int, optional): Round number used in the file names.
            Defaults to 4.

    Returns:
        Dict[str, pd.DataFrame]: Dictionary containing the following DataFrames
        (a key is omitted when no file of that kind was found):
            - 'prices': Price data for all days
            - 'trades': Trade data for all days
            - 'observations': Observation data for all days
    """
    # (result key / file-name prefix, CSV delimiter)
    sources = (
        ('prices', ';'),
        ('trades', ';'),
        ('observations', ','),
    )
    frames_by_kind = {kind: [] for kind, _ in sources}

    for day in days:
        for kind, sep in sources:
            path = os.path.join(data_folder, f'{kind}_round_{round_number}_day_{day}.csv')
            frame = _read_day_csv(path, sep, day)
            if frame is not None:
                frames_by_kind[kind].append(frame)

    # Only kinds for which at least one file was found appear in the result.
    return {
        kind: pd.concat(frames, ignore_index=True)
        for kind, frames in frames_by_kind.items()
        if frames
    }

def get_product_data(df: pd.DataFrame, product: str) -> pd.DataFrame:
    """
    Select the rows of a trading frame that belong to one product.

    Args:
        df (pd.DataFrame): Trading data with a 'product' column
        product (str): Product name the 'product' column must equal

    Returns:
        pd.DataFrame: The matching rows, with their original index labels
    """
    is_requested_product = df['product'] == product
    return df.loc[is_requested_product]

def get_day_data(df: pd.DataFrame, day: int) -> pd.DataFrame:
    """
    Select the rows of a trading frame that belong to one day.

    Args:
        df (pd.DataFrame): Trading data with a 'day' column
        day (int): Day number the 'day' column must equal

    Returns:
        pd.DataFrame: The matching rows, with their original index labels
    """
    is_requested_day = df['day'] == day
    return df.loc[is_requested_day]

def get_product_day_data(df: pd.DataFrame, product: str, day: int) -> pd.DataFrame:
    """
    Select the rows of a trading frame for one product on one day.

    Args:
        df (pd.DataFrame): Trading data with 'product' and 'day' columns
        product (str): Product name the 'product' column must equal
        day (int): Day number the 'day' column must equal

    Returns:
        pd.DataFrame: Rows satisfying both conditions, with their original index labels
    """
    same_product = df['product'] == product
    same_day = df['day'] == day
    return df.loc[same_product & same_day]

def get_price_data(df: pd.DataFrame, product: str = None, day: int = None) -> pd.DataFrame:
    """
    Get price data with optional filtering by product and/or day.

    A filter is applied whenever its argument is not None, so ``day=0`` is a
    valid filter value rather than being treated as "no filter".

    Args:
        df (pd.DataFrame): DataFrame containing price data
        product (str, optional): Product name to filter for
        day (int, optional): Day number to filter for

    Returns:
        pd.DataFrame: Filtered price data as a new DataFrame, sorted by
        'timestamp' when that column exists. The input frame is not modified.
    """
    result = df
    # Compare against None explicitly: a truthiness test would ignore day 0.
    if product is not None:
        result = result[result['product'] == product]
    if day is not None:
        result = result[result['day'] == day]
    if 'timestamp' in result.columns:
        # sort_values already returns a new frame, no extra copy needed.
        return result.sort_values('timestamp')
    return result.copy()

def get_order_book_data(df: pd.DataFrame, product: str = None, day: int = None) -> pd.DataFrame:
    """
    Get order book data with optional filtering by product and/or day.

    A filter is applied whenever its argument is not None, so ``day=0`` is a
    valid filter value rather than being treated as "no filter".

    Args:
        df (pd.DataFrame): DataFrame containing price data
        product (str, optional): Product name to filter for
        day (int, optional): Day number to filter for

    Returns:
        pd.DataFrame: Filtered order book data as a new DataFrame, sorted by
        'timestamp' when that column exists. The input frame is not modified.
    """
    result = df
    # Compare against None explicitly: a truthiness test would ignore day 0.
    if product is not None:
        result = result[result['product'] == product]
    if day is not None:
        result = result[result['day'] == day]
    if 'timestamp' in result.columns:
        # sort_values already returns a new frame, no extra copy needed.
        return result.sort_values('timestamp')
    return result.copy()

def get_volume_data(df: pd.DataFrame, product: str = None, day: int = None) -> pd.DataFrame:
    """
    Get volume data with optional filtering by product and/or day.

    A filter is applied whenever its argument is not None, so ``day=0`` is a
    valid filter value rather than being treated as "no filter".

    Args:
        df (pd.DataFrame): DataFrame containing trade data
        product (str, optional): Product name to filter for
        day (int, optional): Day number to filter for

    Returns:
        pd.DataFrame: Filtered volume data as a new DataFrame, sorted by
        'timestamp' when that column exists. The input frame is not modified.
    """
    result = df
    # Compare against None explicitly: a truthiness test would ignore day 0.
    if product is not None:
        result = result[result['product'] == product]
    if day is not None:
        result = result[result['day'] == day]
    if 'timestamp' in result.columns:
        # sort_values already returns a new frame, no extra copy needed.
        return result.sort_values('timestamp')
    return result.copy()

def convert_timestamp(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the 'day' and 'timestamp' columns into one running time column 't'.

    't' is computed as ``(day - 1) * 1_000_000 + timestamp``, so day 1 starts
    at 0 and each later day is offset by a further 1,000,000.

    Args:
        df (pd.DataFrame): Frame with 'day' and 'timestamp' columns

    Returns:
        pd.DataFrame: New frame with 't' added and 'day'/'timestamp' removed;
        the input frame is left unchanged
    """
    running_time = (df['day'] - 1) * 1_000_000 + df['timestamp']
    with_running_time = df.assign(t=running_time)
    return with_running_time.drop(columns=['day', 'timestamp'])

In [2]:
# Load whatever price, trade and observation CSVs exist in the local data
# folder (path is relative to the notebook's working directory).
trading_data = load_trading_data('round-4-island-data-bottle')

In [3]:
# Take the concatenated price frame and replace its 'day'/'timestamp' pair
# with the single running time column 't'.
price_df = trading_data['prices']
price_df = convert_timestamp(price_df)
# Bare expression so the notebook renders the resulting frame.
price_df

Unnamed: 0,product,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,mid_price,profit_and_loss,t
0,VOLCANIC_ROCK,10515.0,143.0,10514.0,57.0,,,10517,200,,,,,10516.0,0.0,0
1,DJEMBES,13409.0,61.0,,,,,13410,27,13411.0,34.0,,,13409.5,0.0,0
2,VOLCANIC_ROCK_VOUCHER_10000,517.0,12.0,,,,,518,12,,,,,517.5,0.0,0
3,JAMS,6541.0,237.0,,,,,6543,237,,,,,6542.0,0.0,0
4,KELP,2032.0,23.0,,,,,2035,23,,,,,2033.5,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449995,DJEMBES,13372.0,23.0,13371.0,42.0,,,13373,65,,,,,13372.5,0.0,2999900
449996,MAGNIFICENT_MACARONS,794.0,12.0,791.0,18.0,785.0,23.0,802,6,810.0,23.0,816.0,18.0,798.0,0.0,2999900
449997,VOLCANIC_ROCK,9920.0,137.0,9919.0,69.0,,,9921,131,9922.0,69.0,,,9920.5,0.0,2999900
449998,KELP,2037.0,25.0,,,,,2040,25,,,,,2038.5,0.0,2999900


In [4]:
# Same treatment for the observation frame: swap 'day'/'timestamp' for the
# running time column 't'.
observation_df = trading_data['observations']
observation_df = convert_timestamp(observation_df)
# Bare expression so the notebook renders the resulting frame.
observation_df

Unnamed: 0,bidPrice,askPrice,transportFees,exportTariff,importTariff,sugarPrice,sunlightIndex,t
0,627.0,628.5,1.0,9.0,-3.0,200.000000,60.00,0
1,630.0,631.5,1.0,9.0,-3.0,200.098445,60.00,100
2,630.0,631.5,1.0,9.0,-3.0,200.103915,60.00,200
3,632.0,633.5,1.0,9.0,-3.0,200.352462,60.01,300
4,630.5,632.0,1.0,9.0,-3.0,200.201545,60.01,400
...,...,...,...,...,...,...,...,...
29995,797.5,799.0,1.7,9.5,-5.5,212.124918,54.95,2999500
29996,801.5,803.0,1.7,9.5,-5.5,212.147650,54.96,2999600
29997,804.0,805.5,1.7,9.5,-5.5,212.202086,54.97,2999700
29998,798.5,800.0,1.7,9.5,-5.5,212.112734,54.98,2999800


In [5]:
# Keep only products whose name contains 'VOLCANIC' (the rock and its vouchers).
df = price_df[price_df['product'].str.contains('VOLCANIC')]
# NOTE(review): the bound is inclusive, so rows at t == 1000000 (the first tick
# of day 2 under convert_timestamp's offset) are kept too - confirm that is intended.
# .copy() detaches the result from price_df so later column assignments on `df`
# neither warn (SettingWithCopyWarning) nor depend on pandas' view/copy rules.
df = df[df['t'] <= 1000000].copy()

In [6]:
# List the columns available in the filtered frame.
df.columns

Index(['product', 'bid_price_1', 'bid_volume_1', 'bid_price_2', 'bid_volume_2',
       'bid_price_3', 'bid_volume_3', 'ask_price_1', 'ask_volume_1',
       'ask_price_2', 'ask_volume_2', 'ask_price_3', 'ask_volume_3',
       'mid_price', 'profit_and_loss', 't'],
      dtype='object')

In [7]:
# Show which product names passed the 'VOLCANIC' filter.
df['product'].unique()

array(['VOLCANIC_ROCK', 'VOLCANIC_ROCK_VOUCHER_10000',
       'VOLCANIC_ROCK_VOUCHER_10500', 'VOLCANIC_ROCK_VOUCHER_9750',
       'VOLCANIC_ROCK_VOUCHER_9500', 'VOLCANIC_ROCK_VOUCHER_10250'],
      dtype=object)

In [10]:
!uv pip install numba

[2K[2mResolved [1m3 packages[0m [2min 89ms[0m[0m                                          [0m
[2K[2mInstalled [1m2 packages[0m [2min 28ms[0m[0m                                [0m
 [32m+[39m [1mllvmlite[0m[2m==0.44.0[0m
 [32m+[39m [1mnumba[0m[2m==0.61.2[0m


In [None]:
import math

import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from scipy.optimize import minimize_scalar
from scipy.stats import norm

# Optimize Black-Scholes functions with numba
@numba.njit
def d1(S, K, T, r, sigma):
    """
    Black-Scholes d1 term.

    Args:
        S: Underlying price
        K: Strike price
        T: Time to expiry
        r: Interest rate
        sigma: Volatility

    Returns:
        (ln(S/K) + (r + sigma^2 / 2) * T) / (sigma * sqrt(T))
    """
    log_moneyness = np.log(S/K)
    drift = (r + 0.5 * sigma**2) * T
    total_vol = sigma * np.sqrt(T)
    return (log_moneyness + drift) / total_vol

@numba.njit
def d2(S, K, T, r, sigma):
    """
    Black-Scholes d2 term: d1 minus sigma * sqrt(T).

    Args:
        S: Underlying price
        K: Strike price
        T: Time to expiry
        r: Interest rate
        sigma: Volatility

    Returns:
        d1(S, K, T, r, sigma) - sigma * sqrt(T)
    """
    d1_term = d1(S, K, T, r, sigma)
    total_vol = sigma * np.sqrt(T)
    return d1_term - total_vol

@numba.njit
def bs_call(S, K, T, r, sigma):
    """
    Black-Scholes price of a European call.

    The standard normal CDF is evaluated as ``0.5 * erfc(-x / sqrt(2))``
    instead of ``scipy.stats.norm.cdf``: numba cannot compile the scipy
    distribution object in nopython mode, whereas ``math.erfc`` is supported.
    As a consequence the arguments must be scalars.

    Args:
        S: Underlying price
        K: Strike price
        T: Time to expiry
        r: Interest rate
        sigma: Volatility

    Returns:
        S * N(d1) - K * exp(-r * T) * N(d2)
    """
    d1_val = d1(S, K, T, r, sigma)
    d2_val = d2(S, K, T, r, sigma)
    inv_sqrt2_scale = math.sqrt(2.0)
    cdf_d1 = 0.5 * math.erfc(-d1_val / inv_sqrt2_scale)
    cdf_d2 = 0.5 * math.erfc(-d2_val / inv_sqrt2_scale)
    return S * cdf_d1 - K * np.exp(-r * T) * cdf_d2

@numba.njit
def bs_put(S, K, T, r, sigma):
    """
    Black-Scholes price of a European put.

    The standard normal CDF is evaluated through ``math.erfc``
    (N(-x) = 0.5 * erfc(x / sqrt(2))) instead of ``scipy.stats.norm.cdf``:
    numba cannot compile the scipy distribution object in nopython mode,
    whereas ``math.erfc`` is supported. As a consequence the arguments must
    be scalars.

    Args:
        S: Underlying price
        K: Strike price
        T: Time to expiry
        r: Interest rate
        sigma: Volatility

    Returns:
        K * exp(-r * T) * N(-d2) - S * N(-d1)
    """
    d1_val = d1(S, K, T, r, sigma)
    d2_val = d2(S, K, T, r, sigma)
    inv_sqrt2_scale = math.sqrt(2.0)
    cdf_neg_d1 = 0.5 * math.erfc(d1_val / inv_sqrt2_scale)
    cdf_neg_d2 = 0.5 * math.erfc(d2_val / inv_sqrt2_scale)
    return K * np.exp(-r * T) * cdf_neg_d2 - S * cdf_neg_d1

# Function to estimate implied volatility - optimized
def implied_volatility(price, S, K, T, r, option_type='call'):
    """
    Estimate the Black-Scholes implied volatility of a single option quote.

    The volatility is found by minimising |model price - price| over
    sigma in [0.001, 5.0] with scipy's bounded scalar minimiser. The search
    always returns a point inside those bounds, so a result sitting at a
    bound usually means no volatility reproduces the quote.

    Args:
        price: Observed option price
        S: Underlying price
        K: Strike price
        T: Time to expiry
        r: Interest rate
        option_type: 'call' prices with bs_call; any other value uses bs_put

    Returns:
        float: The minimising sigma, or NaN when the inputs are unusable
        (NaN price, non-positive S, K or T) or the pricing/optimisation raises.
    """
    # `price != price` is the NaN test; the positivity checks also reject NaN.
    if not (S > 0 and K > 0 and T > 0) or price != price:
        return np.nan

    pricer = bs_call if option_type == 'call' else bs_put

    def objective(sigma):
        if sigma <= 0:
            return 1e10
        return abs(pricer(S, K, T, r, sigma) - price)

    try:
        result = minimize_scalar(objective, bounds=(0.001, 5.0), method='bounded')
    except Exception:
        # Best effort: numerical failures yield NaN. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt/SystemExit through,
        # so a long batch run can still be interrupted.
        return np.nan
    return result.x

# Extract strike prices from product names - vectorized
def extract_strikes(products):
    """
    Parse the strike out of voucher product names.

    Names containing 'VOUCHER' contribute the text after their last '_'
    converted to float; every other name yields NaN.

    Args:
        products (pd.Series): Product names

    Returns:
        np.ndarray: Float array with one strike (or NaN) per input element
    """
    is_voucher = products.str.contains('VOUCHER')
    last_tokens = products[is_voucher].str.rsplit('_', n=1).str[-1]

    strike_values = np.full(len(products), np.nan)
    strike_values[is_voucher] = last_tokens.astype(float)
    return strike_values

# Preprocess data more efficiently
def preprocess_data(df):
    """
    Build the voucher (option) frame used for the implied-volatility analysis.

    Steps:
        1. Parse a 'strike' for every row from its product name.
        2. Take the 'VOLCANIC_ROCK' rows as the underlying price lookup.
        3. Keep only rows that have a strike and left-join the underlying's
           mid price on 't' as 'underlying_price'.
        4. Add 'moneyness' = strike / underlying_price.

    Args:
        df (pd.DataFrame): Price data with 'product', 't' and 'mid_price'
            columns, containing both the vouchers and 'VOLCANIC_ROCK'.

    Returns:
        pd.DataFrame: New frame (fresh 0..n-1 index) with one row per voucher
        quote plus 'strike', 'underlying_price' and 'moneyness'. The input
        frame is not modified.
    """
    # Work on a copy so the caller's frame (typically a slice) is left untouched.
    df = df.copy()
    df['strike'] = extract_strikes(df['product'])

    # Build the lookup BEFORE dropping strike-less rows: the underlying itself
    # has no strike, so filtering first would leave this lookup empty and
    # every 'underlying_price' NaN.
    underlying_prices = (
        df.loc[df['product'] == 'VOLCANIC_ROCK', ['t', 'mid_price']]
        .rename(columns={'mid_price': 'underlying_price'})
    )

    # Keep only the option rows and attach the underlying price at the same time.
    options = df[df['strike'].notna()]
    options = pd.merge(options, underlying_prices, on='t', how='left')

    # Calculate moneyness
    options['moneyness'] = options['strike'] / options['underlying_price']

    return options

# Calculate implied volatility row by row
def calculate_implied_volatility(df, r=0.0, T=4/365):
    """
    Compute an implied volatility for every row of a preprocessed option frame.

    Each row is inverted on its out-of-the-money side: with the call formula
    when strike > underlying, otherwise with the put formula. 'mid_price' is
    taken to be the price of a call on the underlying (in the notebook's data
    a 10000-strike voucher trades near S - K), so when the put formula is
    used the quote is first converted with put-call parity,
    ``put = call - S + K * exp(-r * T)``.
    NOTE(review): confirm the vouchers are calls; if they are not, drop the
    parity conversion.

    Args:
        df (pd.DataFrame): Frame with 'strike', 'underlying_price' and
            'mid_price' columns. An 'option_type' column ('call'/'put') is
            written to it as a side effect.
        r (float): Interest rate passed to the pricing formulas
        T (float): Time to expiry passed to the pricing formulas

    Returns:
        np.ndarray: One value per row, in row order. NaN where the strike,
        price or underlying is missing, or where the parity-implied put price
        is not positive (quote at or below its no-arbitrage floor).
    """
    # Vectorize option type determination
    option_types = np.where(df['strike'] > df['underlying_price'], 'call', 'put')
    df['option_type'] = option_types

    prices = df['mid_price'].to_numpy(dtype=float)
    spots = df['underlying_price'].to_numpy(dtype=float)
    strikes = df['strike'].to_numpy(dtype=float)
    discount = np.exp(-r * T)

    implied_vol = np.full(len(df), np.nan)

    # Iterate by position, not by index label: the result array is positional,
    # so this stays correct for frames whose index is not 0..n-1.
    for pos in range(len(df)):
        price, spot, strike = prices[pos], spots[pos], strikes[pos]
        if np.isnan(price) or np.isnan(spot) or np.isnan(strike):
            continue

        kind = str(option_types[pos])
        if kind == 'put':
            # Translate the quoted call price into the equivalent put price.
            price = price - spot + strike * discount
            if price <= 0:
                continue

        implied_vol[pos] = implied_volatility(
            price=float(price),
            S=float(spot),
            K=float(strike),
            T=T,
            r=r,
            option_type=kind
        )

    return implied_vol

# Polynomial function for fitting
def poly_func(x, a, b, c):
    """
    Evaluate the quadratic a*x^2 + b*x + c.

    Args:
        x: Point(s) at which to evaluate (scalar or array)
        a: Quadratic coefficient
        b: Linear coefficient
        c: Constant term

    Returns:
        Value of the quadratic at ``x``
    """
    quadratic_term = a * x**2
    linear_term = b * x
    return quadratic_term + linear_term + c

# Plot volatility smile
def plot_volatility_smile(df):
    """
    Scatter implied volatility against moneyness and overlay quadratic fits.

    Points are coloured by strike (one legend entry per strike). For every
    timestamp with at least three rows, a quadratic least-squares fit through
    that timestamp's finite (moneyness, implied_vol) points is drawn as a
    dashed gray line; timestamps with fewer than three rows are not plotted.

    Args:
        df (pd.DataFrame): Frame with 't', 'strike', 'moneyness' and
            'implied_vol' columns.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    plt.figure(figsize=(12, 8))

    # Get unique strikes for coloring
    strikes = sorted(df['strike'].unique())
    colors = plt.cm.viridis(np.linspace(0, 1, len(strikes)))

    # Skip timestamps with insufficient data (fewer than three rows).
    rows_per_timestamp = df['t'].map(df['t'].value_counts())
    plottable = df[rows_per_timestamp >= 3]

    # One scatter call per strike instead of one per (timestamp, strike):
    # same points, one legend entry each, far fewer artists.
    for strike, color in zip(strikes, colors):
        strike_data = plottable[plottable['strike'] == strike]
        if strike_data.empty:
            continue
        plt.scatter(
            strike_data['moneyness'],
            strike_data['implied_vol'],
            color=color,
            label=f'Strike {strike}',
            alpha=0.7
        )

    # Fit a quadratic to each timestamp's data. np.polyfit solves the same
    # least-squares problem without needing any name beyond numpy.
    for _, t_data in plottable.groupby('t', sort=False):
        x = t_data['moneyness'].to_numpy(dtype=float)
        y = t_data['implied_vol'].to_numpy(dtype=float)
        usable = np.isfinite(x) & np.isfinite(y)
        x, y = x[usable], y[usable]

        if len(x) < 3:
            continue
        try:
            coefficients = np.polyfit(x, y, 2)
        except (np.linalg.LinAlgError, ValueError):
            # A fit that does not converge only costs this timestamp's curve.
            continue

        x_smooth = np.linspace(x.min(), x.max(), 100)
        y_smooth = np.polyval(coefficients, x_smooth)
        plt.plot(x_smooth, y_smooth, '--', color='gray', alpha=0.5)

    plt.xlabel('Moneyness (Strike/Underlying Price)')
    plt.ylabel('Implied Volatility')
    plt.title('Volatility Smile')
    # Avoid matplotlib's "no artists with labels" warning on an empty plot.
    handles, _ = plt.gca().get_legend_handles_labels()
    if handles:
        plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# Main workflow
# Runs the whole pipeline on the module-level `df`: preprocess, invert the
# implied volatilities, then plot the smile.
if __name__ == "__main__":
    # Imported inside the guard instead of with the other imports, so the name
    # `curve_fit` is only defined once this block has run.
    from scipy.optimize import curve_fit
    
    # Assuming df is filtered properly from the source code
    # df = price_df[price_df['product'].str.contains('VOLCANIC')]
    # df = df[df['t'] <= 1000000]
    
    # Preprocess data
    # NOTE(review): `df` is rebound to the preprocessed frame, so running this
    # block a second time feeds already-preprocessed data back into
    # preprocess_data - re-create `df` from price_df before re-running.
    df = preprocess_data(df)
    
    # Calculate implied volatility
    # Row-by-row numerical inversion; this is the slow step of the block.
    df['implied_vol'] = calculate_implied_volatility(df)
    
    # Plot results
    plot_volatility_smile(df)

KeyboardInterrupt: 