In [1]:
import pandas as pd
import os
from pathlib import Path
import re

def load_trading_data(data_folder: str) -> dict[str, pd.DataFrame]:
    """
    Load trading data from CSV files into pandas DataFrames, agnostic to round and day.

    Files directly inside ``data_folder`` whose names look like
    ``<type>_round_<round>_day_<day>.csv`` (``<type>`` being ``prices``,
    ``trades`` or ``observations``) are read. ``prices`` and ``trades`` files
    are parsed with ``;`` as separator, ``observations`` files with ``,``.
    Every loaded frame gets ``day`` and ``round`` columns taken from the file
    name (overwriting same-named columns from the CSV, if any).

    Args:
        data_folder (str): Path to the folder containing the CSV files

    Returns:
        Dict[str, pd.DataFrame]: Dictionary containing the following DataFrames
        (a key is present only if at least one matching file was found):
            - 'prices': Price data for all days/rounds
            - 'trades': Trade data for all days/rounds
            - 'observations': Observation data for all days/rounds
        Each frame is sorted by ``day`` and, when the column exists,
        ``timestamp``. Rows that tie on those keys keep their file order
        (files are visited in sorted-name order). The index holds the
        pre-sort concatenation positions; it is not reset.
    """
    # Regex to match files and extract type, round, and day. An optional
    # leading minus sign is accepted so that files for negative day numbers
    # are not silently skipped.
    pattern = re.compile(r'^(prices|trades|observations)_round_(\d+)_day_(-?\d+)\.csv$')

    frames_by_type = {
        'prices': [],
        'trades': [],
        'observations': []
    }

    # sorted() makes the load order independent of filesystem enumeration order.
    for file in sorted(Path(data_folder).iterdir()):
        if not file.is_file():
            continue
        match = pattern.match(file.name)
        if match is None:
            continue
        data_type, round_num, day = match.groups()
        sep = ';' if data_type in ('prices', 'trades') else ','
        df = pd.read_csv(file, sep=sep)
        df['day'] = int(day)
        df['round'] = int(round_num)
        frames_by_type[data_type].append(df)

    # Concatenate all days' data
    result = {}
    for key, dfs in frames_by_type.items():
        if not dfs:  # Only add if we found any data
            continue
        combined = pd.concat(dfs, ignore_index=True)
        # 'day' is always present (added above); 'timestamp' only if the CSVs had it.
        sort_keys = [col for col in ('day', 'timestamp') if col in combined.columns]
        # A stable sort keeps the within-file row order for rows sharing the same keys.
        result[key] = combined.sort_values(sort_keys, kind='stable')

    return result

def get_product_data(df: pd.DataFrame, product: str) -> pd.DataFrame:
    """
    Select the rows of a frame that belong to one product.

    Args:
        df (pd.DataFrame): Frame with a 'product' column
        product (str): Value the 'product' column must equal

    Returns:
        pd.DataFrame: The rows of ``df`` where 'product' equals ``product``,
        in their original order and with their original index labels
    """
    is_requested_product = df['product'] == product
    return df.loc[is_requested_product]

def get_day_data(df: pd.DataFrame, day: int) -> pd.DataFrame:
    """
    Select the rows of a frame that belong to one day.

    Args:
        df (pd.DataFrame): Frame with a 'day' column
        day (int): Value the 'day' column must equal

    Returns:
        pd.DataFrame: The rows of ``df`` where 'day' equals ``day``,
        in their original order and with their original index labels
    """
    is_requested_day = df['day'] == day
    return df.loc[is_requested_day]

def get_product_day_data(df: pd.DataFrame, product: str, day: int) -> pd.DataFrame:
    """
    Select the rows of a frame that match both a product and a day.

    Args:
        df (pd.DataFrame): Frame with 'product' and 'day' columns
        product (str): Value the 'product' column must equal
        day (int): Value the 'day' column must equal

    Returns:
        pd.DataFrame: The rows of ``df`` satisfying both conditions,
        in their original order and with their original index labels
    """
    is_requested_product = df['product'] == product
    is_requested_day = df['day'] == day
    return df.loc[is_requested_product & is_requested_day]

def get_price_data(df: pd.DataFrame, product: str = None, day: int = None) -> pd.DataFrame:
    """
    Get price data with optional filtering by product and/or day.

    Args:
        df (pd.DataFrame): DataFrame containing price data
        product (str, optional): Product name to filter for. ``None`` or an
            empty string means "all products".
        day (int, optional): Day number to filter for. Only ``None`` means
            "all days"; ``0`` is a valid day and is filtered on.

    Returns:
        pd.DataFrame: Filtered price data as a new DataFrame (``df`` is never
        modified), sorted by 'timestamp' when that column exists. Rows with
        equal timestamps keep their input order.
    """
    result = df
    if product:
        result = result[result['product'] == product]
    # Compare against None explicitly: a truthiness test would treat day 0 as
    # "no filter" and return every day.
    if day is not None:
        result = result[result['day'] == day]
    if 'timestamp' in result.columns:
        result = result.sort_values('timestamp', kind='stable')
    elif result is df:
        # No filter and no sort ran, so nothing has produced a new object yet;
        # copy so the caller never receives the input frame itself.
        result = df.copy()
    return result

def get_order_book_data(df: pd.DataFrame, product: str = None, day: int = None) -> pd.DataFrame:
    """
    Get order book data with optional filtering by product and/or day.

    Args:
        df (pd.DataFrame): DataFrame containing price data
        product (str, optional): Product name to filter for. ``None`` or an
            empty string means "all products".
        day (int, optional): Day number to filter for. Only ``None`` means
            "all days"; ``0`` is a valid day and is filtered on.

    Returns:
        pd.DataFrame: Filtered order book data as a new DataFrame (``df`` is
        never modified), sorted by 'timestamp' when that column exists. Rows
        with equal timestamps keep their input order.
    """
    result = df
    if product:
        result = result[result['product'] == product]
    # Compare against None explicitly: a truthiness test would treat day 0 as
    # "no filter" and return every day.
    if day is not None:
        result = result[result['day'] == day]
    if 'timestamp' in result.columns:
        result = result.sort_values('timestamp', kind='stable')
    elif result is df:
        # No filter and no sort ran, so nothing has produced a new object yet;
        # copy so the caller never receives the input frame itself.
        result = df.copy()
    return result

def get_volume_data(df: pd.DataFrame, product: str = None, day: int = None) -> pd.DataFrame:
    """
    Get volume data with optional filtering by product and/or day.

    Args:
        df (pd.DataFrame): DataFrame containing trade data
        product (str, optional): Product name to filter for. ``None`` or an
            empty string means "all products".
        day (int, optional): Day number to filter for. Only ``None`` means
            "all days"; ``0`` is a valid day and is filtered on.

    Returns:
        pd.DataFrame: Filtered volume data as a new DataFrame (``df`` is never
        modified), sorted by 'timestamp' when that column exists. Rows with
        equal timestamps keep their input order.
    """
    result = df
    if product:
        result = result[result['product'] == product]
    # Compare against None explicitly: a truthiness test would treat day 0 as
    # "no filter" and return every day.
    if day is not None:
        result = result[result['day'] == day]
    if 'timestamp' in result.columns:
        result = result.sort_values('timestamp', kind='stable')
    elif result is df:
        # No filter and no sort ran, so nothing has produced a new object yet;
        # copy so the caller never receives the input frame itself.
        result = df.copy()
    return result

def convert_timestamp(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the 'day' and 'timestamp' columns into a single column 't'.

    ``t`` is computed as ``(day - 1) * 1_000_000 + timestamp``, so day 1
    contributes an offset of 0 and each further day adds 1,000,000.

    Args:
        df (pd.DataFrame): Frame with 'day' and 'timestamp' columns

    Returns:
        pd.DataFrame: A new frame with 't' added and 'day' and 'timestamp'
        removed; ``df`` itself is left unchanged
    """
    day_offset = (df['day'] - 1) * 1_000_000
    with_t = df.assign(t=day_offset + df['timestamp'])
    return with_t.drop(columns=['day', 'timestamp'])

In [2]:
# round 6 is for submission rounds, round 7 is for end of round runs

# Load every matching CSV from the Prosperity 2 backtester's round6 resources.
# The path is relative, so it resolves against the notebook's working directory.
prosperity2 = load_trading_data('../imc-prosperity-2-backtester/prosperity2bt/resources/round6')
# load_trading_data omits keys with no files, so this raises KeyError if no
# prices_*.csv files were found in that folder.
prosperity2_prices = prosperity2['prices']
prosperity2_prices

Unnamed: 0,day,timestamp,product,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,mid_price,profit_and_loss,round
0,0,0,AMETHYSTS,10002,1,9996.0,2.0,9995.0,29.0,10004,2,10005.0,29.0,,,10003.0,0.0,6
1,0,0,STARFRUIT,5002,1,4997.0,31.0,,,5003,31,,,,,5002.5,0.0,6
2,0,100,AMETHYSTS,9996,2,9995.0,22.0,,,10004,2,10005.0,22.0,,,10000.0,0.0,6
3,0,100,STARFRUIT,4997,24,,,,,5003,2,5004.0,22.0,,,5000.0,0.0,6
4,0,200,STARFRUIT,4997,2,4996.0,20.0,,,5003,20,,,,,5000.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,5,99900,STRAWBERRIES,3980,231,,,,,3981,240,,,,,3980.5,0.0,6
24996,5,99900,ROSES,13677,74,,,,,13678,28,13679.0,46.0,,,13677.5,0.0,6
24997,5,99900,AMETHYSTS,9996,2,9995.0,29.0,,,10004,2,10005.0,29.0,,,10000.0,0.0,6
24998,5,99900,STARFRUIT,4898,4,4897.0,31.0,,,4903,2,4904.0,29.0,,,4900.5,0.0,6


In [3]:
# Load the Prosperity 3 round6 resources and keep every price row except day 6.
prosperity3 = load_trading_data('../imc-prosperity-3-backtester/prosperity3bt/resources/round6')
prosperity3_prices = prosperity3['prices'].loc[lambda prices: prices['day'] != 6]
prosperity3_prices

Unnamed: 0,day,timestamp,product,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,mid_price,profit_and_loss,round
0,0,0,RAINFOREST_RESIN,10002.0,1.0,9996.0,2.0,9995.0,29.0,10004,2,10005.0,29.0,,,10003.0,0.0,6
1,0,0,KELP,2028.0,1.0,2026.0,2.0,2025.0,29.0,2029,31,,,,,2028.5,0.0,6
2,0,100,KELP,2025.0,24.0,,,,,2028,2,2029.0,22.0,,,2026.5,0.0,6
3,0,100,RAINFOREST_RESIN,9996.0,2.0,9995.0,22.0,,,10004,2,10005.0,22.0,,,10000.0,0.0,6
4,0,200,RAINFOREST_RESIN,9995.0,20.0,,,,,10005,20,,,,,10000.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51995,5,99900,VOLCANIC_ROCK,9914.0,144.0,9913.0,76.0,,,9916,220,,,,,9915.0,0.0,6
51996,5,99900,VOLCANIC_ROCK_VOUCHER_10500,0.0,17.0,,,,,2,17,,,,,1.0,0.0,6
51997,5,99900,PICNIC_BASKET2,30157.0,4.0,30153.0,2.0,30152.0,43.0,30158,17,30159.0,28.0,,,30157.5,0.0,6
51998,5,99900,VOLCANIC_ROCK_VOUCHER_10000,31.0,17.0,,,,,32,17,,,,,31.5,0.0,6


In [4]:
# Day 5 of the Prosperity 2 prices; 'day' is constant after the filter, so drop it.
r5_past = prosperity2_prices.loc[lambda prices: prices['day'] == 5].drop(columns=['day'])
r5_past

Unnamed: 0,timestamp,product,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,mid_price,profit_and_loss,round
16000,0,STARFRUIT,4906,2,4905.0,20.0,,,4912,22,,,,,4909.0,0.0,6
16001,0,STRAWBERRIES,3957,199,,,,,3959,199,,,,,3958.0,0.0,6
16002,0,COCONUT,9883,133,9882.0,47.0,,,9884,133,9885.0,47.0,,,9883.5,0.0,6
16003,0,COCONUT_COUPON,574,45,,,,,575,45,,,,,574.5,0.0,6
16004,0,ROSES,13712,63,,,,,13714,63,,,,,13713.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,99900,STRAWBERRIES,3980,231,,,,,3981,240,,,,,3980.5,0.0,6
24996,99900,ROSES,13677,74,,,,,13678,28,13679.0,46.0,,,13677.5,0.0,6
24997,99900,AMETHYSTS,9996,2,9995.0,29.0,,,10004,2,10005.0,29.0,,,10000.0,0.0,6
24998,99900,STARFRUIT,4898,4,4897.0,31.0,,,4903,2,4904.0,29.0,,,4900.5,0.0,6


In [5]:
# Day 5 of the Prosperity 3 prices; 'day' is constant after the filter, so drop it.
r5_present = prosperity3_prices.loc[lambda prices: prices['day'] == 5].drop(columns=['day'])
r5_present

Unnamed: 0,timestamp,product,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,mid_price,profit_and_loss,round
37000,0,SQUID_INK,1928.0,22.0,,,,,1931,2,1932.0,20.0,,,1929.5,0.0,6
37001,0,VOLCANIC_ROCK,9920.0,133.0,9919.0,47.0,,,9922,180,,,,,9921.0,0.0,6
37002,0,MAGNIFICENT_MACARONS,795.0,12.0,793.0,19.0,787.0,20.0,803,6,812.0,20.0,818.0,19.0,799.0,0.0,6
37003,0,RAINFOREST_RESIN,9992.0,35.0,,,,,10008,35,,,,,10000.0,0.0,6
37004,0,VOLCANIC_ROCK_VOUCHER_9500,421.0,21.0,,,,,422,21,,,,,421.5,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51995,99900,VOLCANIC_ROCK,9914.0,144.0,9913.0,76.0,,,9916,220,,,,,9915.0,0.0,6
51996,99900,VOLCANIC_ROCK_VOUCHER_10500,0.0,17.0,,,,,2,17,,,,,1.0,0.0,6
51997,99900,PICNIC_BASKET2,30157.0,4.0,30153.0,2.0,30152.0,43.0,30158,17,30159.0,28.0,,,30157.5,0.0,6
51998,99900,VOLCANIC_ROCK_VOUCHER_10000,31.0,17.0,,,,,32,17,,,,,31.5,0.0,6


In [6]:
# Reshape to wide form: one row per timestamp, one column per product, mid prices as values.
mid_r5_past = pd.pivot(
    r5_past,
    index='timestamp',
    columns='product',
    values='mid_price',
)
mid_r5_past

product,AMETHYSTS,CHOCOLATE,COCONUT,COCONUT_COUPON,GIFT_BASKET,ORCHIDS,ROSES,STARFRUIT,STRAWBERRIES
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,10001.0,7832.5,9883.5,574.5,69146.5,1103.0,13713.0,4909.0,3958.0
100,9999.0,7831.5,9885.5,575.5,69133.0,1103.0,13712.5,4907.0,3958.0
200,10000.0,7834.0,9885.0,575.5,69157.5,1103.0,13713.5,4909.0,3958.0
300,10000.0,7834.0,9886.5,577.0,69141.0,1102.0,13711.5,4909.0,3957.5
400,10000.0,7834.5,9885.0,576.5,69137.0,1102.0,13707.5,4909.5,3958.0
...,...,...,...,...,...,...,...,...,...
99500,10001.0,7831.5,9876.0,589.5,69164.5,1117.0,13680.5,4899.0,3980.5
99600,10000.0,7830.5,9877.0,590.5,69169.5,1116.5,13679.0,4899.0,3980.5
99700,9998.5,7830.5,9875.5,589.5,69163.0,1116.0,13680.5,4899.0,3980.5
99800,10003.5,7829.5,9875.5,589.0,69153.5,1115.5,13676.0,4900.0,3980.5


In [7]:
# Reshape to wide form: one row per timestamp, one column per product, mid prices as values.
mid_r5_present = pd.pivot(
    r5_present,
    index='timestamp',
    columns='product',
    values='mid_price',
)
mid_r5_present

product,CROISSANTS,DJEMBES,JAMS,KELP,MAGNIFICENT_MACARONS,PICNIC_BASKET1,PICNIC_BASKET2,RAINFOREST_RESIN,SQUID_INK,VOLCANIC_ROCK,VOLCANIC_ROCK_VOUCHER_10000,VOLCANIC_ROCK_VOUCHER_10250,VOLCANIC_ROCK_VOUCHER_10500,VOLCANIC_ROCK_VOUCHER_9500,VOLCANIC_ROCK_VOUCHER_9750
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,4296.5,13372.5,6446.5,2038.0,799.0,58334.5,30131.5,10000.0,1929.5,9921.0,34.5,2.5,1.0,421.5,186.5
100,4296.5,13373.5,6446.0,2036.5,793.0,58335.5,30132.5,10000.0,1924.5,9919.5,32.5,2.5,1.0,420.5,184.5
200,4296.5,13372.5,6446.0,2038.0,791.0,58334.5,30133.5,10000.0,1928.5,9919.5,34.5,2.5,1.0,420.5,183.5
300,4296.5,13373.0,6446.5,2037.5,789.0,58340.5,30127.5,10000.0,1927.5,9919.0,33.5,2.5,1.5,420.0,182.5
400,4296.0,13372.5,6446.5,2037.5,784.0,58335.5,30126.0,10000.0,1928.5,9917.5,30.5,2.5,1.0,418.5,183.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99500,4293.0,13354.0,6447.5,2033.5,720.0,58350.5,30158.5,10000.0,2030.0,9911.0,30.5,2.5,1.0,412.5,175.5
99600,4292.5,13354.5,6447.0,2033.5,723.0,58352.5,30159.0,9997.5,2031.5,9912.5,29.5,2.5,1.0,414.5,176.5
99700,4292.5,13354.5,6447.0,2033.5,719.0,58352.5,30157.0,9997.5,2029.5,9914.0,30.0,2.0,0.5,414.0,177.0
99800,4292.5,13354.0,6447.0,2033.5,721.0,58358.5,30158.0,10000.0,2030.5,9914.5,32.5,2.5,1.0,415.5,178.5


In [8]:
import plotly.graph_objects as go

def compare_products(past_df, present_df, product1, product2, plot=False):
    """
    Correlate one column of ``past_df`` with one column of ``present_df``.

    When ``plot`` is true, both series are first drawn on one Plotly figure
    with independent y-axes (``product1`` on the left axis in blue,
    ``product2`` on the right axis in red) and the figure is shown.

    Args:
        past_df: Frame holding the ``product1`` column
        present_df: Frame holding the ``product2`` column
        product1: Column of ``past_df`` to compare
        product2: Column of ``present_df`` to compare
        plot: If true, display the dual-axis chart before returning

    Returns:
        The value of ``past_df[product1].corr(present_df[product2])``
    """
    if plot:
        # Left-axis series: the past product, plotted against its own index.
        past_trace = go.Scatter(
            x=past_df.index,
            y=past_df[product1],
            name=f'{product1} (Past)',
            yaxis='y1',
            line={'color': 'blue'},
        )
        # Right-axis series: the present product, plotted against its own index.
        present_trace = go.Scatter(
            x=present_df.index,
            y=present_df[product2],
            name=f'{product2} (Present)',
            yaxis='y2',
            line={'color': 'red'},
        )

        fig = go.Figure()
        fig.add_trace(past_trace)
        fig.add_trace(present_trace)

        # Each axis is coloured like its trace; the second axis overlays the
        # first and sits on the right-hand side.
        left_axis = {
            'title': {'text': product1, 'font': {'color': 'blue'}},
            'tickfont': {'color': 'blue'},
        }
        right_axis = {
            'title': {'text': product2, 'font': {'color': 'red'}},
            'tickfont': {'color': 'red'},
            'anchor': 'x',
            'overlaying': 'y',
            'side': 'right',
        }
        fig.update_layout(
            title=f'{product1} (Past) vs {product2} (Present)',
            xaxis={'title': 'Index or Time'},
            yaxis=left_axis,
            yaxis2=right_axis,
            legend={'x': 0.01, 'y': 0.99},
        )

        fig.show()

    past_series = past_df[product1]
    return past_series.corr(present_df[product2])

In [9]:
# Correlation only (plot defaults to False).
compare_products(
    past_df=mid_r5_past,
    present_df=mid_r5_present,
    product1='ORCHIDS',
    product2='VOLCANIC_ROCK',
)

np.float64(-0.43432771110232227)

In [10]:
# Correlation only (plot defaults to False).
compare_products(
    past_df=mid_r5_past,
    present_df=mid_r5_present,
    product1='COCONUT_COUPON',
    product2='VOLCANIC_ROCK_VOUCHER_10000',
)

np.float64(0.3885109057198476)

In [11]:
# Correlation for every (past product, present product) pair, highest first.
# The tuple keys become a two-level index on the resulting Series.
corrs = pd.Series({
    (past_product, present_product): compare_products(
        mid_r5_past, mid_r5_present, past_product, present_product
    )
    for past_product in mid_r5_past.columns
    for present_product in mid_r5_present.columns
}).sort_values(ascending=False)
corrs

ROSES         CROISSANTS              0.935200
              PICNIC_BASKET1          0.882239
              PICNIC_BASKET2          0.881745
COCONUT       PICNIC_BASKET1          0.862427
              PICNIC_BASKET2          0.835222
                                        ...   
STRAWBERRIES  MAGNIFICENT_MACARONS   -0.703395
              CROISSANTS             -0.774921
ORCHIDS       JAMS                   -0.798591
STARFRUIT     SQUID_INK              -0.810907
STRAWBERRIES  DJEMBES                -0.825766
Length: 135, dtype: float64

In [12]:
# Rank pairs by absolute correlation so strongly negative pairs are included,
# then draw the dual-axis chart for the 10 strongest pairs. The correlation
# returned by compare_products is discarded here; only the plots are wanted.
for past_product, present_product in corrs.abs().sort_values(ascending=False).index[:10]:
    compare_products(mid_r5_past, mid_r5_present, past_product, present_product, plot=True)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

def linear_regression_r2(past_df: pd.DataFrame, present_df: pd.DataFrame, product1: str, product2: str) -> float:
    """
    Perform linear regression between two product series from past and present DataFrames.
    Returns the R^2 (coefficient of determination) of the fit.

    The two series are paired by index label (inner join), and any pair with a
    missing value on either side is dropped before fitting. For two frames
    that share the same index and contain no NaN in the selected columns this
    is the same data the regression would see if rows were paired by position.

    Args:
        past_df (pd.DataFrame): DataFrame containing past product data.
        present_df (pd.DataFrame): DataFrame containing present product data.
        product1 (str): Column name for the product in past_df (independent variable).
        product2 (str): Column name for the product in present_df (dependent variable).

    Returns:
        float: R^2 score of the linear regression, evaluated on the same
        points the model was fitted on.

    Raises:
        ValueError: If no index label has a non-missing value in both series.
    """
    # Align on index labels rather than row position, so frames of different
    # length or with gaps are compared at matching labels only.
    paired = pd.concat(
        [past_df[product1].rename('x'), present_df[product2].rename('y')],
        axis=1,
        join='inner',
    ).dropna()  # the regression cannot be fitted on NaN values
    if paired.empty:
        raise ValueError(
            f'No overlapping non-missing observations for {product1!r} and {product2!r}'
        )

    # The model expects a 2-D feature matrix, hence the single-column frame.
    x = paired[['x']].to_numpy()
    y = paired['y'].to_numpy()

    model = LinearRegression()
    model.fit(x, y)
    y_pred = model.predict(x)
    return r2_score(y, y_pred)

In [14]:
# R^2 of a one-variable linear fit for every (past product, present product) pair, highest first.
# The tuple keys become a two-level index on the resulting Series.
r2s = pd.Series({
    (past_product, present_product): linear_regression_r2(
        mid_r5_past, mid_r5_present, past_product, present_product
    )
    for past_product in mid_r5_past.columns
    for present_product in mid_r5_present.columns
}).sort_values(ascending=False)
r2s

ROSES         CROISSANTS                     8.745989e-01
              PICNIC_BASKET1                 7.783457e-01
              PICNIC_BASKET2                 7.774739e-01
COCONUT       PICNIC_BASKET1                 7.437797e-01
              PICNIC_BASKET2                 6.975957e-01
                                                 ...     
STARFRUIT     RAINFOREST_RESIN               3.725343e-06
CHOCOLATE     DJEMBES                        1.685805e-06
STRAWBERRIES  RAINFOREST_RESIN               1.241556e-06
STARFRUIT     VOLCANIC_ROCK_VOUCHER_10500    4.117380e-07
AMETHYSTS     CROISSANTS                     2.228742e-07
Length: 135, dtype: float64