In [3]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split


In [4]:
# Seed handed to every randomized step further down (balancing, train/test
# split, k-fold shuffling) so that the generated splits are reproducible.
seed=43
# Ground-truth archive, resolved relative to the notebook's working directory.
local_gt_path = "./training_data/ground_truth.npz"

# Only the 'y' entry of the archive is read in this cell. It is cast to int so
# the 0/1 comparisons and np.bincount calls in the helpers below work on it.
# NOTE(review): the object returned by np.load is never closed here -- confirm
# whether later cells still read other entries from gt_dict before changing that.
gt_dict = np.load(local_gt_path)
y_full = gt_dict['y'].astype(int)

In [5]:
def balance_dataset(y_full, seed):
    """
    Undersample the larger class so both classes are equally represented.

    Samples labelled 1 are treated as positives and samples labelled 0 as
    negatives. From each class, as many samples as the smaller class contains
    are drawn without replacement; the selection is then shuffled.

    Parameters
    ----------
    y_full : np.ndarray
        Label array containing 0/1 values.
    seed : int
        Seed for the ``np.random.RandomState`` that drives sampling and
        shuffling.

    Returns
    -------
    np.ndarray
        Shuffled indices into ``y_full`` of the samples to keep.
    """
    print("  Original distribution:")
    positive_mask = y_full == 1
    negative_mask = y_full == 0
    print(f"    Positives: {positive_mask.sum()}")
    print(f"    Negatives: {negative_mask.sum()}")

    positive_members = np.nonzero(positive_mask)[0]
    negative_members = np.nonzero(negative_mask)[0]
    per_class = min(len(positive_members), len(negative_members))

    # Draw order matters for reproducibility: positives first, then negatives,
    # then one shuffle of the combined selection.
    sampler = np.random.RandomState(seed)
    drawn = [
        sampler.choice(members, size=per_class, replace=False)
        for members in (positive_members, negative_members)
    ]
    keep_idx = np.concatenate(drawn)
    sampler.shuffle(keep_idx)

    kept_labels = y_full[keep_idx]
    print(f"  After balancing: N = {len(keep_idx)}, positives = {(kept_labels==1).sum()}, negatives = {(kept_labels==0).sum()}")

    return keep_idx

def make_stratified_train_test_split(y_full, filtered_indices, seed, test_size=0.15,
                                     split_names=("Train+Val", "Test")):
    """
    Split a subset of the dataset into two class-stratified parts.

    Parameters
    ----------
    y_full : array-like of int
        Class labels for the complete dataset (0/1 labels are assumed by the
        printed class counts).
    filtered_indices : array-like of int
        Indices into ``y_full`` of the samples eligible for splitting. Lists
        are accepted and converted to an array.
    seed : int
        Forwarded to ``train_test_split`` as ``random_state``.
    test_size : float, default 0.15
        Forwarded unchanged to ``train_test_split``; size of the second part.
    split_names : tuple of (str, str), default ("Train+Val", "Test")
        Labels used only in the printed summary, so the function can be reused
        for other two-way splits (e.g. ``("Train", "Val")``) without
        misleading output.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Indices into ``y_full`` for the first and second part, respectively.
    """
    # Coerce up front: the fancy indexing on the return line needs ndarrays.
    y_full = np.asarray(y_full)
    filtered_indices = np.asarray(filtered_indices)
    first_name, second_name = split_names

    y_filtered = y_full[filtered_indices]
    n_kept = y_filtered.shape[0]

    # The split is done on positions 0..n_kept-1 inside the filtered subset;
    # they are mapped back to dataset indices on return.
    first_pos, second_pos = train_test_split(
        np.arange(n_kept), test_size=test_size, stratify=y_filtered, random_state=seed
    )

    print(f"  {first_name} size: {len(first_pos)}, {second_name} size: {len(second_pos)}")
    # minlength=2 keeps both class slots visible even if one class is absent,
    # matching the per-fold report in make_stratified_folds.
    print(f"    Class distribution ({first_name.lower()}):",
          np.bincount(y_filtered[first_pos], minlength=2))
    print(f"    Class distribution ({second_name.lower()}):",
          np.bincount(y_filtered[second_pos], minlength=2))

    return filtered_indices[first_pos], filtered_indices[second_pos]

def make_stratified_folds(train_val_indices, y_full, n_folds, seed):
    """
    Partition the train+val samples into stratified cross-validation folds.

    Parameters
    ----------
    train_val_indices : np.ndarray
        Indices into ``y_full`` of the samples to distribute over the folds.
    y_full : np.ndarray
        Integer class labels for the complete dataset.
    n_folds : int
        Number of folds, passed to ``StratifiedKFold`` as ``n_splits``.
    seed : int
        Passed to ``StratifiedKFold`` as ``random_state`` (shuffling is on).

    Returns
    -------
    list of dict
        One dict per fold with the keys ``"train_idx"`` and ``"val_idx"``,
        each holding indices into ``y_full``.
    """
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    labels = y_full[train_val_indices]

    # The splitter yields positions within train_val_indices; translate them
    # back into dataset indices right away.
    folds = [
        {
            "train_idx": train_val_indices[train_positions],
            "val_idx": train_val_indices[val_positions],
        }
        for train_positions, val_positions in splitter.split(train_val_indices, labels)
    ]

    per_fold_val_counts = np.array(
        [np.bincount(y_full[fold["val_idx"]], minlength=2) for fold in folds]
    )
    mean_val_counts = per_fold_val_counts.mean(axis=0)

    # On a balanced input the two classes should come out evenly represented.
    print(f"  Average val class distribution across folds: {mean_val_counts.astype(int)}")
    print(f"  Average val class proportions: {mean_val_counts / mean_val_counts.sum()}")

    return folds

def _stack_fold_indices(index_arrays):
    """Pack per-fold index arrays into one array that np.savez can store."""
    index_arrays = [np.asarray(a) for a in index_arrays]
    if len({a.shape for a in index_arrays}) <= 1:
        # Equal-sized folds (or no folds): a regular array, exactly what
        # np.savez would have built from the plain list.
        return np.asarray(index_arrays)
    # Fold sizes differ whenever the sample count is not divisible by the
    # number of folds. Recent NumPy refuses to build an array from such a
    # ragged list, so store one index array per slot of an object array.
    ragged = np.empty(len(index_arrays), dtype=object)
    for slot, indices in enumerate(index_arrays):
        ragged[slot] = indices
    return ragged


def save_splits_and_folds(file_name, filter_indices, train_val_indices, test_indices, final_train_pos, final_val_pos, folds):
    """
    Write all split index arrays to a single ``.npz`` archive.

    Parameters
    ----------
    file_name : str or file-like
        Destination handed to ``np.savez``.
    filter_indices, train_val_indices, test_indices : array-like of int
        Stored under the keys of the same name.
    final_train_pos, final_val_pos : array-like of int
        Stored under the keys of the same name.
    folds : list of dict
        Each dict must provide ``"train_idx"`` and ``"val_idx"``. They are
        stored as ``fold_train_indices`` / ``fold_val_indices`` with one entry
        per fold.

    Notes
    -----
    When all folds have the same size the fold entries are regular 2-D arrays.
    When sizes differ they are object arrays, and the archive must be read
    with ``np.load(..., allow_pickle=True)``.
    """
    np.savez(
        file_name,
        filter_indices=filter_indices,
        train_val_indices=train_val_indices,
        test_indices=test_indices,
        final_train_pos=final_train_pos,
        final_val_pos=final_val_pos,
        fold_train_indices=_stack_fold_indices([f["train_idx"] for f in folds]),
        fold_val_indices=_stack_fold_indices([f["val_idx"] for f in folds]),
    )
    print("  Successfully saved everything to file.")

In [7]:

# Pipeline: balance classes -> hold out a test set -> carve a final train/val
# split out of the remainder -> build k folds over the same remainder -> save.
HOLDOUT_FRACTION = 0.15  # used for the test split and for the final validation split
N_FOLDS = 10

print("Balancing datasets...")
filter_indices = balance_dataset(y_full, seed)

print("Creating train/test split...")
train_val_pos, test_pos = make_stratified_train_test_split(y_full, filter_indices, seed, test_size=HOLDOUT_FRACTION)
# Leakage guards: the test set must be disjoint from everything used for training.
assert np.intersect1d(train_val_pos, test_pos).size == 0, "train+val and test overlap"
assert len(train_val_pos) + len(test_pos) == len(filter_indices), "split lost or duplicated samples"

print("Making final training train/val split...")
# The helper's printed "Train+Val"/"Test" labels refer to final train / final val here.
final_train_pos, final_val_pos = make_stratified_train_test_split(y_full, train_val_pos, seed, test_size=HOLDOUT_FRACTION)
assert np.intersect1d(final_train_pos, final_val_pos).size == 0, "final train and val overlap"
assert np.array_equal(
    np.sort(np.concatenate([final_train_pos, final_val_pos])), np.sort(train_val_pos)
), "final train/val split does not cover train+val"

print("Making k-folds...")
folds = make_stratified_folds(train_val_pos, y_full, N_FOLDS, seed)
for fold in folds:
    assert np.intersect1d(fold["train_idx"], fold["val_idx"]).size == 0, "fold train and val overlap"
    assert np.intersect1d(fold["val_idx"], test_pos).size == 0, "fold uses test samples"
    assert len(fold["train_idx"]) + len(fold["val_idx"]) == len(train_val_pos), "fold does not cover train+val"

print("Saving...")
save_splits_and_folds("./training_data/data_split_indices.npz", filter_indices, train_val_pos, test_pos, final_train_pos, final_val_pos, folds)

Balancing datasets...
  Original distribution:
    Positives: 665
    Negatives: 2140
  After balancing: N = 1330, positives = 665, negatives = 665
Creating train/test split...
  Train+Val size: 1130, Test size: 200
    Class distribution (train+val): [565 565]
    Class distribution (test): [100 100]
Making final training train/val split...
  Train+Val size: 960, Test size: 170
    Class distribution (train+val): [480 480]
    Class distribution (test): [85 85]
Making k-folds...
  Average val class distribution across folds: [56 56]
  Average val class proportions: [0.5 0.5]
Saving...
  Successfully saved everything to file.
