In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jane-street-real-time-market-data-forecasting/responders.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/sample_submission.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/features.csv
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=4/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=3/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet
/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=2/part-0.parquet
/kaggle/input/jane-street-real-time-market

In [2]:
import pickle
import polars as pl
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from glob import glob
import numpy as np
import cudf
# Load the cudf.pandas extension for pandas-like GPU acceleration
%load_ext cudf.pandas

In [3]:
    k = 4
    data_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet"
    
    # Collect all Parquet file paths
    parquet_files = glob(f"{data_path}/partition_id={k}/part-0.parquet")
    
    print(f"Processing partition {k}")

    # Process each Parquet file
    final_df = pl.concat([pl.read_parquet(file) for file in parquet_files])
    
    # Print summary
    print(f"Final DataFrame shape: {final_df.shape}")
    
    # Fill NaNs with forward fill
    final_df = final_df.fill_null(strategy="forward").fill_null(strategy="backward")
    
    # List of responder columns
    responder_columns = [f"responder_{i}" for i in range(9)]
    
    # Apply the lagging for each responder column
    for responder in responder_columns:
        lagged_column_name = f"{responder}_lag_1"
        final_df = final_df.with_columns(
            pl.col(responder)
            .shift(1)
            .over(["time_id", "symbol_id"])
            .alias(lagged_column_name)
        )

Processing partition 4


Final DataFrame shape: (5022952, 92)


In [4]:
final_df = final_df.fill_null(strategy="forward").fill_null(strategy="backward")
final_df = final_df.fill_null(0)
df = final_df.to_pandas()


In [5]:
print (df.head())

   date_id  time_id  symbol_id    weight  feature_00  feature_01  feature_02  \
0      680        0          0  2.298160    0.851814    1.197591    0.219422   
1      680        0          1  3.928745    0.534441    1.079740    0.038748   
2      680        0          2  1.340433   -0.227643    0.764146   -0.243349   
3      680        0          3  1.695526    0.267686    1.193612   -0.388798   
4      680        0          5  2.700766    0.952372    0.861269   -0.375405   

   feature_03  feature_04  feature_05  ...  responder_8  responder_0_lag_1  \
0    0.411698    2.057359   -0.542597  ...     1.101371          -0.304665   
1    0.275343    2.135057   -0.541966  ...     1.986971          -0.304665   
2    0.247027    2.347248   -0.478477  ...    -0.049303          -0.304665   
3    0.030673    2.175273   -0.408371  ...     3.031337          -0.304665   
4    0.259099    2.497325   -0.618828  ...     2.073280          -0.304665   

   responder_1_lag_1  responder_2_lag_1  responder

In [None]:
import pandas as pd
import numpy as np
import warnings

# Suppress warnings
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

# Placeholder for cross-correlation results
cross_correlation_results = []

# Subset the DataFrame to the first 1 million rows
df = df.head(100_000)

# Define features to lag and the lag range
features = [col for col in df.columns if col.startswith('feature_')]
print(features)

time_lags = range(1, 11)  # Corresponds to -1 to -10 lags

# Sort data by symbol_id, date_id, and time_id
df = df.sort_values(by=['symbol_id', 'date_id', 'time_id'])

# Backpropagate lagged features directly in the DataFrame
for lag in time_lags:
    for feature in features:
        df[f"{feature}_lag_{lag}"] = np.nan  # Initialize new columns

# Fill lagged values by propagating values from previous time_id or date_id
for symbol_id in df['symbol_id'].unique():
    symbol_data = df[df['symbol_id'] == symbol_id]
    for lag in time_lags:
        for feature in features:
            lagged_col = f"{feature}_lag_{lag}"
            # Shift values by `lag` within each group of `date_id`
            df.loc[df['symbol_id'] == symbol_id, lagged_col] = symbol_data.groupby('date_id')[feature].shift(lag)

# Compute cross-correlation for each feature and lag
for lag in time_lags:
    for feature in features:
        lagged_col = f"{feature}_lag_{lag}"
        # Calculate correlation only for rows where lagged values are not NaN
        valid_rows = df[~df[lagged_col].isna()]
        correlation = valid_rows[lagged_col].corr(valid_rows['responder_6'])
        cross_correlation_results.append({
            'lag': -lag,  # Negative lag for clarity
            'feature': feature,
            'cross_correlation': correlation
        })

# Convert results to DataFrame for easier analysis
cross_correlation_df = pd.DataFrame(cross_correlation_results)

# Sort results for better readability
cross_correlation_df = cross_correlation_df.sort_values(by=['feature', 'lag'], ascending=[True, False])

# Display results
print(cross_correlation_df)


In [None]:
# Display the top 50 cross-correlation results in descending order
top_50_cross_correlations = cross_correlation_df.sort_values(by='cross_correlation', ascending=False).head(20)

# Display the top 50 results
print(top_50_cross_correlations)


In [None]:
# Group by 'feature' and 'lag', then calculate the mean cross-correlation
mean_cross_correlation = (
    cross_correlation_df
    .groupby(['feature', 'lag'])
    .agg(mean_correlation=('cross_correlation', 'mean'))
    .reset_index()
)

# Sort the results for better readability
mean_cross_correlation = mean_cross_correlation.sort_values(by='mean_correlation', ascending=False)

# Display the results
print(mean_cross_correlation)

In [None]:
mean_cross_correlation =mean_cross_correlation.dropna()

In [None]:
print(mean_cross_correlation.head(20))


In [None]:
print(mean_cross_correlation.tail(20))


In [None]:
correlation_matrix = df.corr()


In [None]:
# Get the correlation of all features with 'responder_1'
correlations_with_responder_1 = correlation_matrix["responder_6"].drop("responder_6")
# Sort correlations in descending order and get the top 10 features
top_10_features = correlations_with_responder_1.abs().sort_values(ascending=False).head(100)

# Display the result
print("Top 10 Features Correlated with Responder_6:")
print(top_10_features)