# Create Cross-Validation Folds

This notebook creates a static, reusable cross-validation file so all models (ResNet, XGBoost, LightGBM, CatBoost) are trained on the **exact same splits**.

**Configuration**:
- `n_splits = 5` (gives ~30 TDEs per validation fold, from the 148 TDEs in the training set)
- `shuffle = True`
- `random_state = 15` (fixed seed for reproducibility)

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import warnings

# Silence every warning category for the rest of the session so cell output stays compact.
# NOTE(review): this is a blanket filter and also hides pandas/sklearn deprecation notices.
warnings.simplefilter('ignore')

In [2]:
# --- Fold-generation settings (passed to StratifiedKFold below) ---
N_SPLITS = 5        # number of cross-validation folds
SHUFFLE = True      # shuffle rows before splitting
RANDOM_STATE = 15   # fixed seed so the split is reproducible

# --- File locations (relative paths) ---
DATA_DIR = os.path.join('..', 'data', 'processed')
# Input feature table and output fold-assignment file both live in DATA_DIR.
INPUT_PATH, OUTPUT_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ('2dgp_train_features.parquet', 'train_folds.csv')
)

In [3]:
# Read the training feature table and report its size and class balance.
print("Loading training data...")
df = pd.read_parquet(INPUT_PATH)
frame_shape = df.shape
print(f"Data shape: {frame_shape}")

# Row count per distinct value of the 'target' column.
class_counts = df['target'].value_counts()
print(f"\nClass distribution:", class_counts, sep="\n")

Loading training data...
Data shape: (3043, 290)

Class distribution:
target
0    2895
1     148
Name: count, dtype: int64


In [4]:
# Build the stratified splitter from the configuration constants.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=SHUFFLE, random_state=RANDOM_STATE)

# skf.split() yields *positional* row indices, so collect the assignments in a
# positional array instead of writing through df.loc (which is label-based and
# would mis-assign or raise if df's index is not the default 0..n-1 range).
fold_assignments = np.full(len(df), -1, dtype=np.int64)  # -1 marks "not yet assigned"

# Each row appears in exactly one validation split; record that split's number.
for fold_idx, (_train_idx, val_idx) in enumerate(skf.split(df, df['target'])):
    fold_assignments[val_idx] = fold_idx

# Guard against any row being left without a fold before attaching the column.
assert (fold_assignments >= 0).all(), "Some rows were not assigned to a fold"
df['kfold'] = fold_assignments

print(f"Created {N_SPLITS} folds.")

Created 5 folds.


In [5]:
# Sanity check: show how the positive class is spread across the folds.
print("\n=== Sanity Check: Target=1 (TDE) count per fold ===")
print("Expected: ~29-30 TDEs per fold\n")

for fold in range(N_SPLITS):
    # 'target' values of the rows assigned to this fold.
    fold_targets = df.loc[df['kfold'] == fold, 'target']
    tde_count = fold_targets.sum()  # equals the number of positives for a 0/1 target
    total_count = len(fold_targets)
    tde_pct = tde_count / total_count * 100
    print(f"Fold {fold}: {tde_count} TDEs / {total_count} total samples ({tde_pct:.2f}%)")

print(f"\nTotal TDEs: {df['target'].sum()}")


=== Sanity Check: Target=1 (TDE) count per fold ===
Expected: ~29-30 TDEs per fold

Fold 0: 30 TDEs / 609 total samples (4.93%)
Fold 1: 30 TDEs / 609 total samples (4.93%)
Fold 2: 30 TDEs / 609 total samples (4.93%)
Fold 3: 29 TDEs / 608 total samples (4.77%)
Fold 4: 29 TDEs / 608 total samples (4.77%)

Total TDEs: 148


In [6]:
# Persist only the identifier, label and fold assignment; the feature columns
# are left out to keep the file small.
fold_columns = ['object_id', 'target', 'kfold']
output_df = df.loc[:, fold_columns]
output_df.to_csv(OUTPUT_PATH, index=False)

print(f"\nFolds saved to: {OUTPUT_PATH}")
print(f"Columns: {list(output_df.columns)}")
print(f"Shape: {output_df.shape}")


Folds saved to: ..\data\processed\train_folds.csv
Columns: ['object_id', 'target', 'kfold']
Shape: (3043, 3)


In [7]:
# Verification: read the saved CSV back and display its structure.
print("\n=== Verification ===")
saved_df = pd.read_csv(OUTPUT_PATH)
print(f"File loaded successfully: {OUTPUT_PATH}")

column_names = saved_df.columns.tolist()
print(f"Columns: {column_names}")

# Distinct fold labels present in the file, in ascending order.
fold_values = sorted(saved_df['kfold'].unique())
print(f"Unique fold values: {fold_values}")

print(f"\nFirst few rows:", saved_df.head(10), sep="\n")


=== Verification ===
File loaded successfully: ..\data\processed\train_folds.csv
Columns: ['object_id', 'target', 'kfold']
Unique fold values: [0, 1, 2, 3, 4]

First few rows:
                  object_id  target  kfold
0  Dornhoth_anwar_melethron       0      2
1     Dornhoth_archam_grond       0      0
2       Dornhoth_certh_iaun       0      0
3      Dornhoth_drafn_celon       0      2
4  Dornhoth_fervain_onodrim       0      1
5       Dornhoth_galadh_ylf       0      3
6      Dornhoth_gwend_nagol       0      2
7   Dornhoth_hervenn_tathar       0      0
8       Dornhoth_inias_gond       0      1
9        Dornhoth_lavan_ank       0      3
