In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/jane-street-real-time-market-data-forecasting'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
from sklearn.model_selection import KFold
import optuna
from optuna import Trial, create_study
import gc
import json
import os

ModuleNotFoundError: No module named 'lightgbm'

In [7]:
class PurgedKFold:
    def __init__(self, n_splits=5, purge_window=20, embargo_pct=0.02):
        self.n_splits = n_splits
        self.purge_window = purge_window
        self.embargo_pct = embargo_pct
    
    def get_n_splits(self, X, y=None, groups=None):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        indices = np.arange(n_samples)
        embargo_size = int(self.embargo_pct * n_samples)
        
        kf = KFold(n_splits=self.n_splits)
        for train_idx, test_idx in kf.split(X):
            test_start = test_idx[0]
            test_end = test_idx[-1]
            
            # Remove samples within purge window
            purge_start = max(0, test_start - self.purge_window)
            purge_end = min(n_samples, test_end + self.purge_window + embargo_size)
            
            train_mask = np.ones(n_samples, dtype=bool)
            train_mask[purge_start:purge_end] = False
            
            final_train_idx = indices[train_mask & np.isin(indices, train_idx)]
            
            yield final_train_idx, test_idx

def load_data(input_path, n_partitions=3):
    """Load and combine data from multiple partitions"""
    df_list = []
    
    for partition_id in range(n_partitions):
        partition_path = f'{input_path}/train.parquet/partition_id={partition_id}/part-0.parquet'
        df = pd.read_parquet(partition_path)
        df_list.append(df)
        gc.collect()
    
    final_df = pd.concat(df_list, ignore_index=True)
    del df_list
    gc.collect()
    
    return final_df

def objective_lgb(trial: Trial, X_train, y_train, X_valid, y_valid, w_train, w_valid):
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),  # broader range
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),  # increased upper bound
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'max_depth': trial.suggest_int('max_depth', 5, 8),  # narrower, more reasonable range
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),  # lower range
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0, log=True),  # less extreme range
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0, log=True),  # less extreme range
        'device': 'gpu',
        'gpu_use_dp': True,
        'n_estimators': 1000
    }

    model = lgb.LGBMRegressor(**param)
    
    # Use callbacks without verbose parameter
    model.fit(
        X_train, y_train,
        sample_weight=w_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='rmse',
        eval_sample_weight=[w_valid],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )
    
    preds = model.predict(X_valid)
    rmse = np.sqrt(np.average((preds - y_valid) ** 2, weights=w_valid))
    
    return rmse

def objective_xgb(trial: Trial, X_train, y_train, X_valid, y_valid, w_train, w_valid):
    param = {
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'min_child_weight': trial.suggest_int('min_child_weight', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'tree_method': 'hist',  # Changed from 'hist' to 'gpu_hist'
        'device': 'cuda',  # Explicitly specify CUDA device
        'early_stopping_rounds': 100  # Added here instead of in fit
    }
    
    model = xgb.XGBRegressor(**param)
    
    # Convert data to GPU
    dtrain = xgb.DMatrix(X_train, y_train, weight=w_train)
    dvalid = xgb.DMatrix(X_valid, y_valid, weight=w_valid)
    
    model.fit(
        X_train, y_train,
        sample_weight=w_train,
        eval_set=[(X_valid, y_valid)],
        sample_weight_eval_set=[w_valid],
        verbose=False
    )
    
    preds = model.predict(X_valid)
    rmse = np.sqrt(np.average((preds - y_valid) ** 2, weights=w_valid))
    
    return rmse

def objective_catboost(trial: Trial, X_train, y_train, X_valid, y_valid, w_train, w_valid):
    param = {
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'bootstrap_type': 'Bernoulli',
        'task_type': 'GPU',
        'loss_function': 'RMSE'
    }
    
    model = cbt.CatBoostRegressor(**param)
    eval_pool = cbt.Pool(X_valid, y_valid, weight=w_valid)
    
    model.fit(
        X_train, y_train,
        sample_weight=w_train,
        eval_set=[eval_pool],
        early_stopping_rounds=50,
        verbose=False
    )
    
    preds = model.predict(X_valid)
    rmse = np.sqrt(np.average((preds - y_valid) ** 2, weights=w_valid))
    
    return rmse

In [None]:
if __name__ == "__main__":
    # Set paths
    input_path = '/kaggle/input/jane-street-real-time-market-data-forecasting'
    output_path = 'model_params'
    os.makedirs(output_path, exist_ok=True)
    
    # Feature names
    feature_names = [f"feature_{i:02d}" for i in range(79)]
    
    # Load data
    print("Loading data...")
    df = load_data(input_path, n_partitions=3)  # Adjust n_partitions as needed
    
    # Initialize PurgedKFold
    cv = PurgedKFold(n_splits=5, purge_window=20, embargo_pct=0.02)
    
    # Dictionary to store results for each model
    results = {}
    
    # For each model type
    #for model_type in ['lgb', 'xgb', 'catboost']:
    for model_type in [ 'xgb', 'catboost']:
        print(f"\nOptimizing {model_type}...")
        
        # Create study
        study = create_study(direction='minimize')
        fold_scores = []
        
        # Select objective function
        if model_type == 'lgb':
            objective = objective_lgb
        elif model_type == 'xgb':
            objective = objective_xgb
        else:
            objective = objective_catboost
        
        # Run optimization for each fold
        for fold, (train_idx, valid_idx) in enumerate(cv.split(df[feature_names])):
            print(f"Processing fold {fold + 1}")
            
            # Split data
            X_train = df[feature_names].iloc[train_idx]
            X_valid = df[feature_names].iloc[valid_idx]
            y_train = df['responder_6'].iloc[train_idx]
            y_valid = df['responder_6'].iloc[valid_idx]
            w_train = df['weight'].iloc[train_idx]
            w_valid = df['weight'].iloc[valid_idx]
            
            # Create fold-specific objective
            fold_objective = lambda trial: objective(
                trial, X_train, y_train, X_valid, y_valid, w_train, w_valid
            )
            
            # Optimize
            study.optimize(fold_objective, n_trials=20)  # Adjust n_trials as needed
            fold_scores.append(study.best_value)
            
            # Clean up memory
            gc.collect()
        
        # Store results
        results[model_type] = {
            'best_params': study.best_params,
            'mean_rmse': float(np.mean(fold_scores)),
            'std_rmse': float(np.std(fold_scores)),
            'fold_scores': [float(score) for score in fold_scores]
        }
        
        # Print results
        print(f"\n{model_type.upper()} Results:")
        print(f"Best parameters: {study.best_params}")
        print(f"Mean RMSE: {np.mean(fold_scores):.6f}")
        print(f"Std RMSE: {np.std(fold_scores):.6f}")
        
        # Save individual model results
        with open(f'{output_path}/{model_type}_results.json', 'w') as f:
            json.dump(results[model_type], f, indent=4)
    
    # Save all results
    with open(f'{output_path}/all_results.json', 'w') as f:
        json.dump(results, f, indent=4)
    
    print("\nOptimization completed. Results saved in 'model_params' directory.")

Loading data...


[I 2025-01-15 21:16:22,707] A new study created in memory with name: no-name-c5e4494f-f99b-4a43-b89a-9aadc49bd841



Optimizing xgb...
Processing fold 1


[I 2025-01-15 21:17:50,654] Trial 0 finished with value: 0.7839458584785461 and parameters: {'max_depth': 5, 'learning_rate': 0.04316606815265267, 'min_child_weight': 90, 'subsample': 0.9552442007331393, 'colsample_bytree': 0.6593456004656606, 'gamma': 0.010853096969235122, 'reg_alpha': 5.532685262070338e-06, 'reg_lambda': 2.7406194109816155}. Best is trial 0 with value: 0.7839458584785461.
[I 2025-01-15 21:19:16,076] Trial 1 finished with value: 0.783700168132782 and parameters: {'max_depth': 7, 'learning_rate': 0.031202646654055047, 'min_child_weight': 13, 'subsample': 0.6937766181023162, 'colsample_bytree': 0.8617708545910219, 'gamma': 0.10433095804462972, 'reg_alpha': 0.06296985190532312, 'reg_lambda': 1.7426482313531487e-08}. Best is trial 1 with value: 0.783700168132782.
[I 2025-01-15 21:20:38,150] Trial 2 finished with value: 0.7837851047515869 and parameters: {'max_depth': 5, 'learning_rate': 0.05188594450434426, 'min_child_weight': 10, 'subsample': 0.9993476725724016, 'colsamp