In [1]:
# import packages
import random
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)



# Data loading and Preprocessing

In [2]:
# Load the competition's train and test tables from the Kaggle input directory.
# Both frames are read fully into memory as pandas DataFrames.
train = pd.read_parquet("/kaggle/input/drw-crypto-market-prediction/train.parquet")
test = pd.read_parquet("/kaggle/input/drw-crypto-market-prediction/test.parquet")

In [3]:
# Summarise the training frame: index type, column count, dtypes and memory footprint.
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 525887 entries, 2023-03-01 00:00:00 to 2024-02-29 23:59:00
Columns: 896 entries, bid_qty to label
dtypes: float64(896)
memory usage: 3.5 GB


In [4]:
# Same summary for the test frame, to compare its index and columns with train.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538150 entries, 1 to 538150
Columns: 896 entries, bid_qty to label
dtypes: float64(896)
memory usage: 3.6 GB


In [5]:
#train = train.reset_index(drop=True)

In [6]:
# Preview the first rows of the raw training frame (all original columns).
train.head()

Unnamed: 0_level_0,bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,...,X882,X883,X884,X885,X886,X887,X888,X889,X890,label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-01 00:00:00,15.283,8.425,176.405,44.984,221.389,0.121263,-0.41769,0.005399,0.125948,0.058359,...,1.925423,1.847943,0.005676,0.190791,0.369691,0.37763,0.210153,0.159183,0.530636,0.562539
2023-03-01 00:01:00,38.59,2.336,525.846,321.95,847.796,0.302841,-0.049576,0.356667,0.481087,0.237954,...,1.928569,1.849468,0.005227,0.18466,0.363642,0.374515,0.209573,0.158963,0.530269,0.533686
2023-03-01 00:02:00,0.442,60.25,159.227,136.369,295.596,0.167462,-0.291212,0.083138,0.206881,0.101727,...,1.928047,1.849282,0.004796,0.178719,0.357689,0.371424,0.208993,0.158744,0.529901,0.546505
2023-03-01 00:03:00,4.865,21.016,335.742,124.963,460.705,0.072944,-0.43659,-0.102483,0.017551,0.007149,...,1.928621,1.849608,0.004398,0.172967,0.351832,0.368358,0.208416,0.158524,0.529534,0.357703
2023-03-01 00:04:00,27.158,3.451,98.411,44.407,142.818,0.17382,-0.213489,0.096067,0.215709,0.107133,...,1.927084,1.84895,0.004008,0.167391,0.346066,0.365314,0.207839,0.158304,0.529167,0.362452


In [7]:
# Drop constant columns: a feature with exactly one distinct value in train
# cannot help the model. `nunique()` ignores NaN by default, so an all-NaN
# column (0 distinct values) is NOT selected here.
# `label` is excluded from the scan so the target can never be dropped from train.
NUNIQUE1 = [c for c in train.columns if c != "label" and train[c].nunique() == 1]

# Reassign rather than mutate in place, and tolerate columns that are already
# gone, so that re-running this cell is a no-op instead of raising
# KeyError (the original failed on the second run because `label` had already
# been removed from `test`).
train = train.drop(columns=NUNIQUE1)
test = test.drop(columns=NUNIQUE1 + ["label"], errors="ignore")

# Feature Importance by XGBoost

In [8]:
# Hard-coded feature subset used for modelling.
# NOTE(review): presumably the highest-ranked anonymised features from an
# XGBoost importance run (per the section heading) -- that run is not shown
# in this notebook, so confirm before changing the list.
# The order is kept exactly as given because it fixes the column order of
# the frames handed to the model.
_anonymised_features = [
    "X344", "X598", "X863", "X862", "X856", "X137", "X174", "X425", "X612", "X167",
    "X852", "X168", "X27", "X422", "X342", "X427", "X532", "X178", "X539", "X881",
    "X889", "X421", "X341", "X875", "X465", "X97", "X603", "X138", "X855", "X572",
    "X338", "X890", "X95", "X161", "X533", "X271", "X861", "X279", "X424", "X888",
    "X866", "X169", "X879", "X283", "X332", "X854", "X574", "X28", "X281", "X757",
    "X754", "X445", "X180", "X94", "X88", "X525", "X285", "X181", "X429", "X343",
    "X688", "X692", "X680", "X832", "X755", "X860", "X695", "X345", "X611", "X689",
    "X387", "X588", "X686", "X140", "X530", "X878", "X753", "X98", "X24", "X880",
    "X756", "X540", "X531", "X340", "X383", "X331", "X873", "X385", "X277", "X602",
    "X136", "X586", "X786", "X887", "X300", "X284", "X91", "X379", "X685", "X177",
]

# The named quantity/volume columns are always kept alongside the X-features.
_market_features = ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]

top_features = _anonymised_features + _market_features

# Train keeps the target as its last column; test has features only.
_train_columns = top_features + ["label"]
train = train[_train_columns]
test = test[top_features]

In [9]:
# def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
#     start_mem = df.memory_usage(deep=True).sum() / 1024**2
#     print(f"Memory usage before: {start_mem:.2f} MB")

#     for col in df.columns:
#         col_type = df[col].dtype

#         if col_type == 'float64':
#             try:
#                 df[col] = df[col].astype('float16')
#             except ValueError:
#                 pass  

#         elif col_type == 'int64':
#             min_val = df[col].min()
#             max_val = df[col].max()
#             if min_val >= -128 and max_val <= 127:
#                 df[col] = df[col].astype('int8')
#             else:
#                 # optionally handle other int downcasts (int16, int32)
#                 df[col] = pd.to_numeric(df[col], downcast='integer')

#     end_mem = df.memory_usage(deep=True).sum() / 1024**2
#     print(f"Memory usage after: {end_mem:.2f} MB")
#     print(f"Reduced by {(start_mem - end_mem) / start_mem * 100:.1f}%")

#     return df


# train = reduce_memory_usage(train)
# test = reduce_memory_usage(test)


In [10]:
# def add_interaction_features(df):
#     eps = 1e-6

#     df['bid_ask_spread_ratio'] = df['bid_qty'] / (df['ask_qty'] + eps)
#     df['buy_sell_ratio'] = df['buy_qty'] / (df['sell_qty'] + eps)
#     df['net_qty'] = df['buy_qty'] - df['sell_qty']
#     df['total_liquidity'] = df['bid_qty'] + df['ask_qty']
#     df['liquidity_per_volume'] = df['total_liquidity'] / (df['volume'] + eps)
#     df['trade_density'] = (df['buy_qty'] + df['sell_qty']) / (df['volume'] + eps)
#     df['volume_per_order'] = df['volume'] / (df['buy_qty'] + df['sell_qty'] + eps)

#     return df


# train = add_interaction_features(train)
# test = add_interaction_features(test)


In [11]:
# Preview the training frame again now that it holds only the selected features plus `label`.
train.head()

Unnamed: 0_level_0,X344,X598,X863,X862,X856,X137,X174,X425,X612,X167,...,X91,X379,X685,X177,bid_qty,ask_qty,buy_qty,sell_qty,volume,label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-01 00:00:00,-0.362607,0.075641,0.21857,-1.027483,-0.216525,0.656033,-0.167949,-0.267053,0.817685,1.271148,...,-0.072709,-0.97,-0.133024,0.077878,15.283,8.425,176.405,44.984,221.389,0.562539
2023-03-01 00:01:00,-0.376922,0.067653,0.088014,-1.024055,-0.180112,0.655122,-0.167483,-0.266682,0.817685,1.264105,...,-0.048671,-0.96731,-0.13284,0.288646,38.59,2.336,525.846,321.95,847.796,0.533686
2023-03-01 00:02:00,-0.368205,0.067288,-0.147363,-1.024056,-0.265966,0.654213,-0.167019,-0.270686,0.809375,1.257102,...,-0.048536,-0.964626,-0.132655,0.270661,0.442,60.25,159.227,136.369,295.596,0.546505
2023-03-01 00:03:00,-0.356326,0.069881,-0.09459,-1.024058,-0.322244,0.653305,-0.18045,-0.270311,0.807937,1.250137,...,-0.033184,-1.083187,-0.132471,0.363641,4.865,21.016,335.742,124.963,460.705,0.357703
2023-03-01 00:04:00,-0.347715,0.072288,0.162221,-1.02406,-0.369625,0.652398,-0.179949,-0.255023,0.795718,1.243211,...,-0.028631,-1.080183,-0.132287,0.389574,27.158,3.451,98.411,44.407,142.818,0.362452


# Model evaluation

In [12]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import gc

# Walk-forward (expanding-window) cross-validation of an XGBoost regressor.
# Each fold is scored with the Pearson correlation between the fold's labels
# and the model's predictions; the mean over folds is printed at the end.

X = train.drop('label', axis=1)
y = train['label']

N_SPLITS = 50                 # number of TimeSeriesSplit folds
EARLY_STOPPING_ROUNDS = 100   # patience (boosting rounds) for early stopping

#Params
xgb_params = {
    "colsample_bylevel": 0.477,
    "colsample_bynode": 0.362,
    "colsample_bytree": 0.710,
    "gamma": 1.709,
    "learning_rate": 0.03,
    "max_depth": 20,
    "max_leaves": 12,
    "min_child_weight": 16,
    "n_estimators": 10000,  # upper bound only; early stopping decides the actual count
    "n_jobs": -1,
    "random_state": 42,
    "reg_alpha": 39.354,
    "reg_lambda": 65.44,
    "subsample": 0.065,
    #"tree_method": "gpu_hist", 
    "verbosity": 0
}


split = TimeSeriesSplit(n_splits=N_SPLITS).split(X, y)

# Out-of-fold predictions. Initialised to NaN rather than 0.0: TimeSeriesSplit
# never uses the earliest block of rows as a validation fold, so those rows are
# never written below and must not be mistaken for genuine zero predictions.
xgb_oof_preds = np.full(len(X), np.nan)
xgb_scores = []

for fold, (train_idx, val_idx) in enumerate(split, start=1):
    print(f"Fold {fold}:")

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # NOTE(review): the validation fold is used both to drive early stopping
    # and to compute the reported score, so the per-fold correlations are
    # optimistic. Consider holding out a separate slice for early stopping.
    model = xgb.XGBRegressor(**xgb_params, early_stopping_rounds=EARLY_STOPPING_ROUNDS)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    preds = model.predict(X_val)
    xgb_oof_preds[val_idx] = preds

    pearson_corr = np.corrcoef(y_val, preds)[0,1]
    print(f"Pearson Correlation: {pearson_corr:.5f}")
    xgb_scores.append(pearson_corr)

    # Release the fold's slices before the next (larger) training window is built.
    # `model` is deliberately NOT deleted: the submission cell below predicts
    # with whichever model was fitted last.
    del X_train, X_val, y_train, y_val, preds
    gc.collect()

print(f"\nAverage Pearson Correlation: {np.mean(xgb_scores):.5f}")


Fold 1:
Pearson Correlation: -0.13542
Fold 2:
Pearson Correlation: 0.08920
Fold 3:
Pearson Correlation: 0.15840
Fold 4:
Pearson Correlation: 0.17336
Fold 5:
Pearson Correlation: 0.25220
Fold 6:
Pearson Correlation: 0.09650
Fold 7:
Pearson Correlation: 0.35234
Fold 8:
Pearson Correlation: 0.11895
Fold 9:
Pearson Correlation: 0.12540
Fold 10:
Pearson Correlation: 0.06553
Fold 11:
Pearson Correlation: 0.08450
Fold 12:
Pearson Correlation: 0.07826
Fold 13:
Pearson Correlation: 0.29735
Fold 14:
Pearson Correlation: 0.20408
Fold 15:
Pearson Correlation: 0.27441
Fold 16:
Pearson Correlation: 0.17625
Fold 17:
Pearson Correlation: -0.10301
Fold 18:
Pearson Correlation: 0.25323
Fold 19:
Pearson Correlation: 0.22811
Fold 20:
Pearson Correlation: 0.20769
Fold 21:
Pearson Correlation: 0.12225
Fold 22:
Pearson Correlation: 0.24717
Fold 23:
Pearson Correlation: 0.02552
Fold 24:
Pearson Correlation: 0.13209
Fold 25:
Pearson Correlation: 0.10867
Fold 26:
Pearson Correlation: 0.07925
Fold 27:
Pearson Co

In [13]:
# Score the test features with the `model` left in scope by the CV loop,
# i.e. the estimator fitted in the last fold that ran -- it is not refitted
# on the full training set here.
final_preds = model.predict(test)

# Fill the competition's sample-submission template with the predictions
# (row order of `test` is assumed to match the template -- TODO confirm),
# write it to the working directory, and preview the first rows.
submission = pd.read_csv(
    "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
).assign(prediction=final_preds)
submission.to_csv("submission.csv", index=False)
submission.head()

Unnamed: 0,ID,prediction
0,1,0.01427
1,2,0.052564
2,3,0.090698
3,4,0.050749
4,5,0.104866
