In [10]:
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from typing import Final

# Directory (relative path) under which the round-1 price and trade CSV files are looked up
DATA_DIR: Final[Path] = Path('data')

def load_price_data(day: int) -> pd.DataFrame:
    """Read the round-1 price CSV for ``day`` into a DataFrame.

    The file is looked up as ``prices_round_1_day_<day>.csv`` under
    ``DATA_DIR`` and parsed with ``;`` as the field separator.
    """
    return pd.read_csv(DATA_DIR / f'prices_round_1_day_{day}.csv', sep=';')

def load_trade_data(day: int) -> pd.DataFrame:
    """Read the round-1 trade CSV for ``day`` into a DataFrame.

    The file is looked up as ``trades_round_1_day_<day>.csv`` under
    ``DATA_DIR`` and parsed with ``;`` as the field separator.
    """
    return pd.read_csv(DATA_DIR / f'trades_round_1_day_{day}.csv', sep=';')

# Days to load and plot; defined once so the loaders and the plot loop
# below cannot drift apart.
DAYS: Final[tuple[int, ...]] = (-2, -1, 0)

# Load price and trade data for every day, keyed by day number
price_data = {day: load_price_data(day) for day in DAYS}
trade_data = {day: load_trade_data(day) for day in DAYS}

# Draw one figure per day
for day in DAYS:
    prices = price_data[day]

    # Two vertically stacked panels, each with its own y-axis:
    # KELP and SQUID_INK share the top panel, RAINFOREST_RESIN gets the
    # bottom one (the fixed y-ranges set below differ per panel).
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=(
            f'KELP and SQUID_INK Prices - Day {day}',
            f'RAINFOREST_RESIN Prices - Day {day}',
        ),
        vertical_spacing=0.15,
    )

    # Top panel: mid price of KELP and SQUID_INK
    for product in ['KELP', 'SQUID_INK']:
        product_data = prices[prices['product'] == product]
        fig.add_trace(
            go.Scatter(
                x=product_data['timestamp'],
                y=product_data['mid_price'],
                name=product,
                mode='lines',
            ),
            row=1, col=1
        )

    # Bottom panel: mid price of RAINFOREST_RESIN
    resin_data = prices[prices['product'] == 'RAINFOREST_RESIN']
    fig.add_trace(
        go.Scatter(
            x=resin_data['timestamp'],
            y=resin_data['mid_price'],
            name='RAINFOREST_RESIN',
            mode='lines',
        ),
        row=2, col=1
    )

    # Overall figure layout
    fig.update_layout(
        height=800,
        showlegend=True,
        title_text=f"Product Prices - Day {day}"
    )

    # Fixed y-axis windows per panel; prices outside these windows are
    # not visible at the initial zoom level.
    fig.update_yaxes(range=[1800, 2200], row=1, col=1)  # Range for KELP and SQUID_INK
    fig.update_yaxes(range=[9995, 10005], row=2, col=1)  # Range for RAINFOREST_RESIN

    fig.show()

In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import plotly.graph_objects as go

def prepare_regression_data(prices_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """
    Build the regression inputs from long-format price data.

    The frame is reshaped so that every timestamp becomes one row and
    every product becomes one column of ``mid_price`` values.

    Args:
        prices_df: Long-format prices with ``timestamp``, ``product`` and
            ``mid_price`` columns

    Returns:
        X: KELP and RAINFOREST_RESIN mid prices (features)
        y: SQUID_INK mid prices (target)
    """
    # One row per timestamp, one column per product
    wide = prices_df.pivot_table(
        index='timestamp',
        columns='product',
        values='mid_price',
    ).reset_index()

    feature_names = ['KELP', 'RAINFOREST_RESIN']
    return wide[feature_names], wide['SQUID_INK']

def analyze_price_relationships(day: int) -> None:
    """
    Regress SQUID_INK on KELP and RAINFOREST_RESIN for a single day.

    Prints the fitted coefficients together with R² and RMSE computed on
    the same rows the model was fitted on, shows a plot of actual versus
    predicted SQUID_INK prices, and finally prints the correlation of
    each price series with SQUID_INK.

    Args:
        day: Key into the module-level ``price_data`` dict
    """
    day_prices = price_data[day]
    X, y = prepare_regression_data(day_prices)

    # Fit on the whole day and predict the very same rows
    regressor = LinearRegression().fit(X, y)
    fitted = regressor.predict(X)

    r2 = r2_score(y, fitted)
    rmse = np.sqrt(mean_squared_error(y, fitted))

    # Text report
    kelp_coef, resin_coef = regressor.coef_
    print(f"\nRegression Analysis for Day {day}:")
    print("Coefficients:")
    print(f"KELP: {kelp_coef:.4f}")
    print(f"RAINFOREST_RESIN: {resin_coef:.4f}")
    print(f"Intercept: {regressor.intercept_:.4f}")
    print("\nModel Performance:")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

    # Both traces share the SQUID_INK timestamps as x values
    squid_timestamps = day_prices.loc[day_prices['product'] == 'SQUID_INK', 'timestamp']
    trace_specs = [
        (y, 'Actual SQUID_INK Price', dict(color='blue')),
        (fitted, 'Predicted SQUID_INK Price', dict(color='red', dash='dash')),
    ]

    fig = go.Figure()
    for series, label, line_style in trace_specs:
        fig.add_trace(
            go.Scatter(
                x=squid_timestamps,
                y=series,
                name=label,
                mode='lines',
                line=line_style,
            )
        )

    fig.update_layout(
        title=f'SQUID_INK Price Prediction - Day {day}',
        xaxis_title='Timestamp',
        yaxis_title='Price',
        height=600
    )
    fig.show()

    # Pairwise correlations; only the SQUID_INK column is reported
    aligned_prices = pd.DataFrame({
        'SQUID_INK': y,
        'KELP': X['KELP'],
        'RAINFOREST_RESIN': X['RAINFOREST_RESIN'],
    })
    correlation_matrix = aligned_prices.corr()

    print("\nCorrelation Matrix:")
    print(correlation_matrix['SQUID_INK'])

# Baseline regression report for every loaded day, oldest first
for day in (-2, -1, 0):
    analyze_price_relationships(day)


Regression Analysis for Day -2:
Coefficients:
KELP: 1.9397
RAINFOREST_RESIN: 0.2759
Intercept: -4627.2137

Model Performance:
R² Score: 0.0793
RMSE: 55.8712



Correlation Matrix:
SQUID_INK           1.000000
KELP                0.281494
RAINFOREST_RESIN    0.008457
Name: SQUID_INK, dtype: float64

Regression Analysis for Day -1:
Coefficients:
KELP: 0.8994
RAINFOREST_RESIN: 0.2446
Intercept: -2294.9242

Model Performance:
R² Score: 0.0230
RMSE: 25.5328



Correlation Matrix:
SQUID_INK           1.000000
KELP                0.150879
RAINFOREST_RESIN    0.014995
Name: SQUID_INK, dtype: float64

Regression Analysis for Day 0:
Coefficients:
KELP: -5.4195
RAINFOREST_RESIN: -0.0458
Intercept: 13392.1581

Model Performance:
R² Score: 0.0946
RMSE: 40.5885



Correlation Matrix:
SQUID_INK           1.000000
KELP               -0.307517
RAINFOREST_RESIN   -0.006085
Name: SQUID_INK, dtype: float64


In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import plotly.graph_objects as go

def create_features(price_matrix: pd.DataFrame, 
                   rolling_windows: list[int] = [5, 10, 20],
                   product_columns: list[str] = ['KELP', 'RAINFOREST_RESIN']) -> pd.DataFrame:
    """
    Create time series features for the price data.

    For every product in ``product_columns`` the following columns are
    added: percentage change, first difference, lags 1 and 2, rolling
    mean/std/min/max/range for each window, fast (span 5) and slow
    (span 20) exponential moving averages, and a 10-row rolling standard
    deviation of the percentage change. Two cross-product columns
    (KELP / RAINFOREST_RESIN ratio and RAINFOREST_RESIN - KELP spread)
    are always added, independent of ``product_columns``, so the input
    must contain both of those columns.

    Args:
        price_matrix: DataFrame with timestamp-aligned prices for all products
        rolling_windows: List of window sizes for rolling statistics
            (the default list is only read, never modified)
        product_columns: List of product names to create features for
            (the default list is only read, never modified)

    Returns:
        A copy of ``price_matrix`` with the feature columns appended; the
        input frame itself is left untouched.

    Note:
        Gaps at the start of a column (from diff/shift/rolling) are filled
        with the first later valid value (back-fill), so those leading rows
        contain information from later timestamps. A column that has no
        valid value at all stays NaN.
    """
    features = price_matrix.copy()

    for product in product_columns:
        prices = features[product]

        # Price changes
        features[f'{product}_pct_change'] = prices.pct_change()
        features[f'{product}_diff'] = prices.diff()

        # Lagged prices (t-1, t-2)
        features[f'{product}_lag1'] = prices.shift(1)
        features[f'{product}_lag2'] = prices.shift(2)

        # Rolling statistics; min_periods=1 lets the window start at row 0
        for window in rolling_windows:
            rolling = prices.rolling(window=window, min_periods=1)

            features[f'{product}_rolling_mean_{window}'] = rolling.mean()
            features[f'{product}_rolling_std_{window}'] = rolling.std()
            features[f'{product}_rolling_min_{window}'] = rolling.min()
            features[f'{product}_rolling_max_{window}'] = rolling.max()

            # Rolling range = rolling max - rolling min
            features[f'{product}_rolling_range_{window}'] = (
                features[f'{product}_rolling_max_{window}'] -
                features[f'{product}_rolling_min_{window}']
            )

        # Exponential moving averages
        features[f'{product}_ema_fast'] = prices.ewm(span=5).mean()
        features[f'{product}_ema_slow'] = prices.ewm(span=20).mean()

        # Volatility: rolling std of the percentage change over 10 rows
        features[f'{product}_volatility'] = (
            features[f'{product}_pct_change'].rolling(window=10).std()
        )

    # Cross-product features (always KELP vs RAINFOREST_RESIN)
    features['price_ratio_kelp_resin'] = features['KELP'] / features['RAINFOREST_RESIN']
    features['price_spread'] = features['RAINFOREST_RESIN'] - features['KELP']

    # Fill NaN values created by rolling operations and differences.
    # bfill()/ffill() replace the deprecated fillna(method=...) spelling
    # and produce the same result.
    features = features.bfill().ffill()

    return features

def prepare_regression_data(prices_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, list[str]]:
    """
    Prepare data for regression analysis with enhanced features.

    The long-format prices are pivoted to one row per timestamp and one
    column per product, then passed through ``create_features``.

    Args:
        prices_df: DataFrame containing price data with ``timestamp``,
            ``product`` and ``mid_price`` columns

    Returns:
        X: Features DataFrame
        y: Target Series (SQUID_INK prices)
        feature_columns: Names of the columns in ``X``, in column order
    """
    # Pivot the data to get separate columns for each product
    price_matrix = pd.pivot_table(
        prices_df,
        values='mid_price',
        index='timestamp',
        columns='product'
    ).reset_index()

    # Create features
    features_df = create_features(price_matrix)

    # Select features for regression: everything except the timestamp and
    # the target itself, and skipping any column that still contains NaN
    feature_columns = [col for col in features_df.columns
                      if col not in ['timestamp', 'SQUID_INK']
                      and not pd.isna(features_df[col]).any()]

    X = features_df[feature_columns]
    y = features_df['SQUID_INK']

    return X, y, feature_columns

def analyze_price_relationships(day: int) -> None:
    """
    Regress SQUID_INK on the engineered feature set for a single day.

    Prints the ten features with the largest absolute coefficients and
    the R² / RMSE computed on the same rows the model was fitted on, then
    shows a plot of actual prices, predicted prices and the prediction
    error (the error is drawn against a secondary y-axis on the right).

    Args:
        day: Key into the module-level ``price_data`` dict
    """
    day_prices = price_data[day]
    X, y, feature_columns = prepare_regression_data(day_prices)

    # Fit on the whole day and predict the very same rows
    regressor = LinearRegression().fit(X, y)
    fitted = regressor.predict(X)

    r2 = r2_score(y, fitted)
    rmse = np.sqrt(mean_squared_error(y, fitted))

    print(f"\nEnhanced Regression Analysis for Day {day}:")
    print("\nTop 10 Most Important Features:")

    # Rank features by the magnitude of their raw coefficient
    feature_importance = (
        pd.DataFrame({'Feature': feature_columns, 'Coefficient': regressor.coef_})
        .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
        .sort_values('Abs_Coefficient', ascending=False)
    )
    print(feature_importance.head(10).to_string(index=False))

    print("\nModel Performance:")
    print(f"R² Score: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")

    # All traces share the SQUID_INK timestamps as x values
    squid_timestamps = day_prices.loc[day_prices['product'] == 'SQUID_INK', 'timestamp']
    prediction_error = fitted - y

    # (y values, legend name, line style, extra Scatter keyword arguments)
    trace_specs = [
        (y, 'Actual SQUID_INK Price', dict(color='blue'), {}),
        (fitted, 'Predicted SQUID_INK Price', dict(color='red', dash='dash'), {}),
        (prediction_error, 'Prediction Error', dict(color='green', dash='dot'), {'yaxis': 'y2'}),
    ]

    fig = go.Figure()
    for series, label, line_style, extra in trace_specs:
        fig.add_trace(
            go.Scatter(
                x=squid_timestamps,
                y=series,
                name=label,
                mode='lines',
                line=line_style,
                **extra,
            )
        )

    # The second y-axis overlays the first and sits on the right-hand side
    error_axis = dict(title='Prediction Error', overlaying='y', side='right')
    fig.update_layout(
        title=f'Enhanced SQUID_INK Price Prediction - Day {day}',
        xaxis_title='Timestamp',
        yaxis_title='Price',
        yaxis2=error_axis,
        height=600
    )

    fig.show()

# Enhanced-feature regression report for every loaded day, oldest first
for day in (-2, -1, 0):
    analyze_price_relationships(day)


Enhanced Regression Analysis for Day -2:

Top 10 Most Important Features:
                    Feature   Coefficient  Abs_Coefficient
     price_ratio_kelp_resin  6.126174e+06     6.126174e+06
RAINFOREST_RESIN_pct_change -3.902806e+06     3.902806e+06
            KELP_pct_change -1.135594e+05     1.135594e+05
RAINFOREST_RESIN_volatility  4.415141e+04     4.415141e+04
            KELP_volatility  9.590843e+03     9.590843e+03
      RAINFOREST_RESIN_diff  4.306744e+02     4.306744e+02
                       KELP -3.304474e+02     3.304474e+02
               price_spread  2.081388e+02     2.081388e+02
           RAINFOREST_RESIN -1.222971e+02     1.222971e+02
                  KELP_lag1 -7.475252e+01     7.475252e+01

Model Performance:
R² Score: 0.1005
RMSE: 55.2235



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.




Enhanced Regression Analysis for Day -1:

Top 10 Most Important Features:
                    Feature   Coefficient  Abs_Coefficient
RAINFOREST_RESIN_pct_change  2.947560e+06     2.947560e+06
     price_ratio_kelp_resin -1.923825e+05     1.923825e+05
            KELP_pct_change  6.107385e+04     6.107385e+04
RAINFOREST_RESIN_volatility -2.923104e+04     2.923104e+04
            KELP_volatility -4.850886e+02     4.850886e+02
      RAINFOREST_RESIN_diff -2.602066e+02     2.602066e+02
                  KELP_diff -7.374471e+01     7.374471e+01
                  KELP_lag1 -4.277169e+01     4.277169e+01
      RAINFOREST_RESIN_lag1  3.603357e+01     3.603357e+01
               price_spread -3.360349e+01     3.360349e+01

Model Performance:
R² Score: 0.0568
RMSE: 25.0863



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.




Enhanced Regression Analysis for Day 0:

Top 10 Most Important Features:
                    Feature   Coefficient  Abs_Coefficient
     price_ratio_kelp_resin  1.151421e+07     1.151421e+07
RAINFOREST_RESIN_pct_change  5.535626e+06     5.535626e+06
            KELP_pct_change -7.191681e+04     7.191681e+04
RAINFOREST_RESIN_volatility -4.132659e+04     4.132659e+04
            KELP_volatility  7.201393e+03     7.201393e+03
      RAINFOREST_RESIN_diff -6.374098e+02     6.374098e+02
                       KELP -5.507452e+02     5.507452e+02
               price_spread  4.332751e+02     4.332751e+02
                  KELP_lag1 -1.731701e+02     1.731701e+02
                  KELP_diff -1.355893e+02     1.355893e+02

Model Performance:
R² Score: 0.1358
RMSE: 39.6534



DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.

