In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import gc
import warnings
warnings.filterwarnings('ignore')

# ===================== MEMORY OPTIMIZATION =====================
def reduce_mem_usage(df):
    """Reduce dataframe memory usage"""
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Initial memory usage: {start_mem:.2f} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory usage after optimization: {end_mem:.2f} MB")
    print(f"Reduced by {100 * (start_mem - end_mem) / start_mem:.1f}%")
    return df

# ===================== DATA LOADING =====================
def load_data(path, sample_frac=1.0):
    """Load data with memory optimization"""
    df = pd.read_parquet(path)
    df = reduce_mem_usage(df)
    
    if sample_frac < 1.0:
        df = df.sample(frac=sample_frac, random_state=42)
    
    return df

print("Loading train data...")
train = load_data('/kaggle/input/drw-crypto-market-prediction/train.parquet', sample_frac=0.5)
print("\nLoading test data...")
test = load_data('/kaggle/input/drw-crypto-market-prediction/test.parquet')

# ===================== FEATURE ENGINEERING =====================
def create_features(df):
    """Create memory-efficient features"""
    df['order_imbalance'] = ((df['buy_qty'] - df['sell_qty']) / 
                           (df['buy_qty'] + df['sell_qty'] + 1e-6)).astype(np.float16)
    df['liquidity_ratio'] = ((df['bid_qty'] + df['ask_qty']) / 
                            (df['volume'] + 1e-6)).astype(np.float16)
    return df

print("\nCreating features...")
train = create_features(train)
test = create_features(test)

# Select features
feature_cols = [col for col in train.columns if col != 'label' and 
               train[col].dtype in [np.float16, np.float32, np.float64, 
                                   np.int8, np.int16, np.int32]]

# ===================== MODEL TRAINING =====================
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity': -1,
    'seed': 42,
    'max_depth': -1,
    'min_data_in_leaf': 20
}

predictions = np.zeros(len(test))
oof_preds = np.zeros(len(train))

for fold, (train_idx, val_idx) in enumerate(KFold(n_splits=3, shuffle=False).split(train)):
    print(f"\n=== Fold {fold+1} ===")
    
    X_train, X_val = train[feature_cols].iloc[train_idx], train[feature_cols].iloc[val_idx]
    y_train, y_val = train['label'].iloc[train_idx], train['label'].iloc[val_idx]
    
    # Correct LightGBM training with callbacks
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    model = lgb.train(
        params,
        train_data,
        num_boost_round=800,
        valid_sets=[val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=True),
            lgb.log_evaluation(50)
        ]
    )
    
    oof_preds[val_idx] = model.predict(X_val)
    predictions += model.predict(test[feature_cols]) / 3
    
    del X_train, X_val, y_train, y_val, train_data, val_data, model
    gc.collect()

# Calculate OOF score
oof_mae = mean_absolute_error(train['label'], oof_preds)
print(f"\nOut-of-Fold MAE: {oof_mae:.4f}")

# ===================== SUBMISSION =====================
submission = pd.DataFrame({
    'ID': test.index,      # ✅ MUST be "ID" (exactly!)
    'prediction': predictions  # ✅ MUST be "prediction" (lowercase!)
})
submission.to_csv('submission.csv', index=False)
print("\n✅ Submission saved successfully!")

Loading train data...
Initial memory usage: 3157.59 MB
Memory usage after optimization: 792.41 MB
Reduced by 74.9%

Loading test data...
Initial memory usage: 3227.13 MB
Memory usage after optimization: 806.27 MB
Reduced by 75.0%

Creating features...

=== Fold 1 ===
Training until validation scores don't improve for 50 rounds
[50]	valid_0's l1: 0.582443
[100]	valid_0's l1: 0.539266
[150]	valid_0's l1: 0.506634
[200]	valid_0's l1: 0.479167
[250]	valid_0's l1: 0.456479
[300]	valid_0's l1: 0.435959
[350]	valid_0's l1: 0.418391
[400]	valid_0's l1: 0.402686
[450]	valid_0's l1: 0.388114
[500]	valid_0's l1: 0.37472
[550]	valid_0's l1: 0.362385
[600]	valid_0's l1: 0.35055
[650]	valid_0's l1: 0.339908
[700]	valid_0's l1: 0.32988
[750]	valid_0's l1: 0.319752
[800]	valid_0's l1: 0.310843
Did not meet early stopping. Best iteration is:
[800]	valid_0's l1: 0.310843

=== Fold 2 ===
Training until validation scores don't improve for 50 rounds
[50]	valid_0's l1: 0.578772
[100]	valid_0's l1: 0.535765
