Import the data and get it to the required format

In [1]:
import os
import pickle
import pandas as pd

# Accumulator filled by the loading loop: one entry per dataset name
all_data = dict()

# Input locations: the AutoFeat pickles sit in a subfolder of the base folder
base_folder = "Data"
autofeat_folder = os.path.join(base_folder, "AutoFeat_Data")

def align_X_to_y_index(X, y, where="train"):
    """
    Return X relabelled with y's index (rows are not reordered).

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, or any other object
        Independent data. Only DataFrame / Series inputs are relabelled;
        anything else (e.g. a NumPy array) is returned unchanged.
    y : any
        Dependent data. Its ``index`` attribute supplies the new labels.
        If y has no such attribute, or the attribute is a method (as on
        lists and tuples), X is returned unchanged.
    where : str, default "train"
        Name of the split being aligned; used only in the error message.

    Returns
    -------
    A relabelled copy of X, or X itself when no alignment is possible.

    Raises
    ------
    ValueError
        If X and y's index do not have the same length.
    """
    y_idx = getattr(y, "index", None)
    # Built-in sequences (list, tuple) expose ``index`` as a *method*, so a
    # plain "attribute exists" check would try to use that method as labels.
    if y_idx is None or callable(y_idx):
        return X

    # Only pandas containers carry an index that can be replaced; other
    # inputs (e.g. NumPy arrays) are returned as-is rather than converted.
    if not isinstance(X, (pd.DataFrame, pd.Series)):
        return X

    # Fail early with a message that says which split is inconsistent.
    if len(X) != len(y_idx):
        raise ValueError(
            f"Cannot align {where} X to y: X has {len(X)} rows "
            f"but y's index has {len(y_idx)} entries."
        )

    # Work on a copy so the caller's object keeps its original index.
    X = X.copy()
    X.index = y_idx
    return X

# Loop through all pickle files in the folder.
# os.listdir returns entries in arbitrary order, so sort them to keep the key
# order of all_data (and therefore the saved pickle) reproducible across runs.
fold_suffix = "_fold_data.pkl"
for file_name in sorted(os.listdir(autofeat_folder)):
    if not file_name.endswith(".pkl"):
        continue

    file_path = os.path.join(autofeat_folder, file_name)

    # Dataset name is everything before a trailing '_fold_data.pkl'.
    # Only a suffix at the very end is stripped; other names are kept as-is.
    if file_name.endswith(fold_suffix):
        dataset_name = file_name[: -len(fold_suffix)]
    else:
        dataset_name = file_name

    # Load the pickle file.
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only run this on pickles from a trusted source.
    with open(file_path, "rb") as f:
        dataset_dict = pickle.load(f)

    # Initialize dataset entry
    all_data[dataset_name] = {}

    # Loop through folds
    for fold_key, fold_data in dataset_dict.items():
        X_train = fold_data['Training_Independent']
        y_train = fold_data['Training_Dependent']
        X_test  = fold_data['Testing_Independent']
        y_test  = fold_data['Testing_Dependent']

        # Align independent indices to dependent indices (order unchanged)
        X_train = align_X_to_y_index(X_train, y_train, where="train")
        X_test  = align_X_to_y_index(X_test, y_test, where="test")

        all_data[dataset_name][fold_key] = {
            'Training_Independent': X_train,
            'Training_Dependent': y_train,
            'Testing_Independent': X_test,
            'Testing_Dependent': y_test
        }

# Save the combined dictionary
os.makedirs(base_folder, exist_ok=True)
output_path = os.path.join(base_folder, "Data.pkl")
with open(output_path, "wb") as f:
    pickle.dump(all_data, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Saved combined data (X indices matched to y) to {output_path}")


Saved combined data (X indices matched to y) to Data\Data.pkl


Breaking the training set into training and validation subsets

In [3]:
from sklearn.model_selection import train_test_split

# Read the combined dictionary back from disk
with open('Data/Data.pkl', 'rb') as f:
    Data = pickle.load(f)

# Rebuild the entries of every fold of every dataset in place
for dataset_name, folds in Data.items():
    for fold_name, fold_data in folds.items():
        # Remove the four original splits from the fold dict
        X_train = fold_data.pop('Training_Independent')
        y_train = fold_data.pop('Training_Dependent')
        X_test = fold_data.pop('Testing_Independent')
        y_test = fold_data.pop('Testing_Dependent')

        # Re-insert them untouched under the "_Full" keys
        fold_data.update({
            'Training_Independent_Full': X_train,
            'Training_Dependent_Full': y_train,
            'Testing_Independent_Full': X_test,
            'Testing_Dependent_Full': y_test,
        })

        # Hold out 20% of the training rows as a validation set (fixed seed)
        X_train_broken, X_val_broken, y_train_broken, y_val_broken = train_test_split(
            X_train,
            y_train,
            random_state=42,
            test_size=0.2,
        )

        # Store the reduced training set and the validation set under "_Broken" keys
        fold_data.update({
            'Training_Independent_Broken': X_train_broken,
            'Training_Dependent_Broken': y_train_broken,
            'Validation_Independent_Broken': X_val_broken,
            'Validation_Dependent_Broken': y_val_broken,
        })




In [4]:
# Optional: save the modified Data.
# Uses the highest pickle protocol, the same setting used when Data.pkl is written.
with open('Data/Validation_Data.pkl', 'wb') as f:
    pickle.dump(Data, f, protocol=pickle.HIGHEST_PROTOCOL)