# Dataset Preprocessing for H5 Format

This notebook allows you to:
1. Load datasets from the filesystem (CSV, Parquet, Excel, or JSON) or from OpenML
2. Select and configure target variables (including additional targets)
3. Preprocess features (categorical/continuous separation)
4. Save in H5 format compatible with the modular dataset system
5. Validate the created H5 dataset

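## Requirements
- pandas
- numpy
- h5py
- scikit-learn
- matplotlib and seaborn (imported in the setup cell)
- pyyaml (used to write `config.yaml` in Step 5)
- openml (optional; only needed for OpenML datasets)
- `data.dataset_utils` (our custom module; provides `create_h5_from_dataframe`, `validate_h5_dataset`, and `inspect_h5_dataset`)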

In [None]:
# Standard library imports
import os
import sys
import json
import logging

# Data science imports
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Import our custom utilities
from data.dataset_utils import create_h5_from_dataframe, validate_h5_dataset, inspect_h5_dataset

# Optional: Import OpenML if available
try:
    import openml
    OPENML_AVAILABLE = True
    print("OpenML available")
except ImportError:
    OPENML_AVAILABLE = False
    print("OpenML not available. Install with: pip install openml")

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## Step 0: Configuration

Set up the basic configuration for your dataset preprocessing.

In [None]:
# Active configuration: Higgs dataset.
# Every later cell reads its settings from this dictionary.
CONFIG = dict(
    dataset_name='higgs',         # Name for your dataset
    short_name='hig',             # Short name for the dataset
    output_dir='/path/to/data',   # Where to save the H5 file (must match DATA_ROOT in run.sh)
    data_source='openml',         # 'file' or 'openml'
    file_path=None,               # Path to your data file (if using 'file')
    openml_id=45570,              # OpenML dataset ID (if using 'openml')
    train_split=0.90,
    val_split=0.05,
    test_split=0.05,
    random_seed=42,
    categorical_threshold=4,      # Max unique values to consider as categorical
    force_categorical=[],         # Columns to force as categorical
    force_continuous=[],          # Columns to force as continuous
)

# Alternative configuration: ACS Income dataset.
# Uncomment this block (and comment out the one above) to use it.
# CONFIG = dict(
#     dataset_name='acs_income',    # Name for your dataset
#     short_name='inc',             # Short name for the dataset
#     output_dir='/path/to/data',   # Where to save the H5 file (must match DATA_ROOT in run.sh)
#     data_source='openml',         # 'file' or 'openml'
#     file_path=None,               # Path to your data file (if using 'file')
#     openml_id=43137,              # OpenML dataset ID (if using 'openml')
#     train_split=0.70,
#     val_split=0.15,
#     test_split=0.15,
#     random_seed=42,
#     categorical_threshold=530,    # Max unique values to consider as categorical
#     force_categorical=[],         # Columns to force as categorical
#     force_continuous=[],          # Columns to force as continuous
# )

print("Configuration set up. Modify the CONFIG dictionary above as needed.")

## Step 1: Load Dataset

Choose your data source and load the dataset.

In [None]:
from pathlib import Path

def load_dataset_from_file(file_path):
    """
    Read a tabular data file into a pandas DataFrame.

    The reader is chosen from the file extension (case-insensitive):
    ``.csv``, ``.parquet``/``.pq``, ``.xlsx``/``.xls`` or ``.json``.
    The source path and the resulting shape are printed.

    Raises:
        FileNotFoundError: the path does not exist.
        ValueError: the extension is not one of the supported formats.
    """
    source = Path(file_path)

    if not source.exists():
        raise FileNotFoundError(f"File not found: {source}")

    # Extension -> pandas reader lookup.
    readers_by_suffix = {
        '.csv': pd.read_csv,
        '.parquet': pd.read_parquet,
        '.pq': pd.read_parquet,
        '.xlsx': pd.read_excel,
        '.xls': pd.read_excel,
        '.json': pd.read_json,
    }
    reader = readers_by_suffix.get(source.suffix.lower())
    if reader is None:
        raise ValueError(f"Unsupported file format: {source.suffix}")

    df = reader(source)

    print(f"Loaded dataset from {source}")
    print(f"Shape: {df.shape}")

    return df

def load_dataset_from_openml(dataset_id):
    """
    Fetch an OpenML dataset and return it as one combined DataFrame.

    The dataset's default target attribute is requested as the target and,
    when a target is returned, appended to the feature frame as a column.

    Returns:
        (df, target_name, categorical_indicator), where ``target_name`` is
        the dataset's default target attribute and ``categorical_indicator``
        is passed through unchanged from ``dataset.get_data``.

    Raises:
        ImportError: the ``openml`` package could not be imported.
    """
    if not OPENML_AVAILABLE:
        raise ImportError("OpenML is not available. Install with: pip install openml")

    print(f"Loading dataset {dataset_id} from OpenML...")
    dataset = openml.datasets.get_dataset(dataset_id)
    target_name = dataset.default_target_attribute

    # The fourth return value (attribute names) is not needed here.
    features, labels, categorical_indicator, _ = dataset.get_data(
        dataset_format="dataframe", target=target_name
    )

    # Work on a copy so the frame handed back by OpenML is left untouched.
    df = features.copy()
    if labels is not None:
        df[target_name] = labels

    print(f"Loaded OpenML dataset: {dataset.name}")
    print(f"Shape: {df.shape}")
    print(f"Default target: {target_name}")

    return df, target_name, categorical_indicator

# Load the dataset selected by CONFIG['data_source'].
# All three names stay None when nothing could be loaded.
df = original_target = openml_categorical_indicator = None

if CONFIG['data_source'] == 'openml':
    if CONFIG['openml_id'] is not None:
        df, original_target, openml_categorical_indicator = load_dataset_from_openml(CONFIG['openml_id'])
        print(f"Original target column: {original_target}")
    else:
        print("Please set CONFIG['openml_id'] to the OpenML dataset ID")

elif CONFIG['data_source'] == 'file':
    if CONFIG['file_path'] is not None:
        df = load_dataset_from_file(CONFIG['file_path'])
    else:
        print("Please set CONFIG['file_path'] to your data file path")

# ACS Income only: turn the raw income column into a 0/1 indicator.
if df is not None and CONFIG['dataset_name'] == 'acs_income' and 'PINCP' in df.columns:
    df['PINCP'] = (df['PINCP'] > 50000).astype(int)
    print("\nBinarized 'PINCP' column to indicate income > 50K")

if df is None:
    print("Please configure the data source properly")
else:
    print("\nDataset loaded successfully!")
    print(f"Columns: {list(df.columns)}")
    print(f"Data types:\n{df.dtypes}")

## Step 2: Explore Dataset

Get familiar with your dataset structure and identify potential issues.

In [None]:
# Print a quick overview of the loaded frame; does nothing when no data was loaded.
if df is not None:
    print("=== Dataset Overview ===")
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Only columns that actually contain missing values are listed, largest first.
    print("\n=== Missing Values ===")
    missing_info = df.isna().sum()
    missing_info = missing_info[missing_info > 0].sort_values(ascending=False)
    if missing_info.empty:
        print("No missing values found!")
    else:
        print(missing_info)

    print("\n=== Data Types ===")
    print(df.dtypes.value_counts())

    # Cardinality per column, highest first.
    print("\n=== Unique Values per Column ===")
    unique_counts = df.nunique().sort_values(ascending=False)
    print(unique_counts)

    print("\n=== Sample Data ===")
    display(df.head())

    print("\n=== Statistical Summary ===")
    display(df.describe(include='all'))

## Step 3: Configure Target Variables

Select primary target and any additional targets you want to include.

In [None]:
# List every column with its position so targets can be chosen by name or index.
if df is not None:
    print("Available columns for target selection:")
    for i, col in enumerate(df.columns):
        print(f"{i:2d}: {col} (dtype: {df[col].dtype}, unique: {df[col].nunique()})")

    print("\nConfigure your targets below:")

# Higgs targets: one binary label plus seven regression targets.
TARGET_CONFIG = {
    # e.g., 'target' or column index
    'primary_target': 'Target',
    'additional_targets': [
        'm_jj',
        'm_jjj',
        'm_lv',
        'm_jlv',
        'm_bb',
        'm_wbb',
        'm_wwbb',
    ],
    # e.g., {'target': 'binary', 'income': 'classification'}
    'target_types': {
        'Target': 'binary',
        'm_jj': 'regression',
        'm_jjj': 'regression',
        'm_lv': 'regression',
        'm_jlv': 'regression',
        'm_bb': 'regression',
        'm_wbb': 'regression',
        'm_wwbb': 'regression',
    },
}

# ACS Income targets (uncomment to use instead of the Higgs block above).
# TARGET_CONFIG = {
#     'primary_target': 'PINCP',
#     'additional_targets': ['MAR'],
#     'target_types': {
#         'PINCP': 'binary',
#         'MAR': 'classification'
#     }
# }

print("\nPlease configure TARGET_CONFIG above with your target columns.")
print("Target types can be: 'binary', 'classification', 'regression'")

In [None]:
def process_targets(df, target_config):
    """
    Resolve, validate and summarise the configured target columns.

    Args:
        df: pandas DataFrame that contains the target columns.
        target_config: dict with the keys
            'primary_target'     - column name or positional index (required);
            'additional_targets' - list of names/indices; a single name/index,
                                   None, or a missing key are also accepted;
            'target_types'       - mapping of column name to 'binary',
                                   'classification' or 'regression'; None or a
                                   missing key means "auto-detect everything".

    Returns:
        (all_targets, target_info): ``all_targets`` lists the resolved column
        names, primary first, with duplicates removed; ``target_info`` maps
        each name to a dict with 'unique_values', 'null_count', 'dtype',
        'type' and 'sample_values' (up to 10 distinct non-null values).

    Raises:
        ValueError: no primary target is configured, or a target column is
            not present in ``df``.
    """
    primary_ref = target_config.get('primary_target')
    if primary_ref is None:
        raise ValueError("Please specify a primary target")

    additional_refs = target_config.get('additional_targets') or []
    # A bare name/index would otherwise be iterated character by character.
    if isinstance(additional_refs, (str, int, np.integer)):
        additional_refs = [additional_refs]

    declared_types = target_config.get('target_types') or {}

    def get_column_name(col_ref):
        # Integers (including numpy integers) are positions into df.columns.
        # bool is an int subclass, so exclude it explicitly.
        if isinstance(col_ref, (int, np.integer)) and not isinstance(col_ref, bool):
            return df.columns[col_ref]
        return col_ref

    primary_target = get_column_name(primary_ref)
    additional_targets = [get_column_name(ref) for ref in additional_refs]

    # dict.fromkeys drops repeated targets while keeping first-seen order.
    all_targets = list(dict.fromkeys([primary_target] + additional_targets))

    # Validate targets exist
    for target in all_targets:
        if target not in df.columns:
            raise ValueError(f"Target column '{target}' not found in dataset")

    # Analyze targets
    target_info = {}
    for target in all_targets:
        column = df[target]
        unique_vals = int(column.nunique())

        if target in declared_types:
            resolved_type = declared_types[target]
        elif unique_vals == 2:
            resolved_type = 'binary'
        elif unique_vals <= 20:
            # Low-cardinality columns are treated as classes regardless of dtype.
            resolved_type = 'classification'
        else:
            resolved_type = 'regression'

        target_info[target] = {
            'unique_values': unique_vals,
            # Plain int so the value survives JSON/YAML serialisation.
            'null_count': int(column.isnull().sum()),
            'dtype': str(column.dtype),
            'type': resolved_type,
            'sample_values': column.dropna().unique()[:10].tolist(),
        }

    return all_targets, target_info

# Resolve the configured targets and print a short report for each one.
# target_columns is left as None whenever the targets could not be processed.
if df is None or TARGET_CONFIG['primary_target'] is None:
    print("Please configure targets first")
    target_columns = None
else:
    try:
        target_columns, target_info = process_targets(df, TARGET_CONFIG)

        print("=== Target Analysis ===")
        for target, info in target_info.items():
            print(f"\nTarget: {target}")
            print(f"  Type: {info['type']}")
            print(f"  Unique values: {info['unique_values']}")
            print(f"  Null count: {info['null_count']}")
            print(f"  Data type: {info['dtype']}")
            print(f"  Sample values: {info['sample_values']}")

            # Regression and high-cardinality targets get no value distribution.
            if info['type'] not in ('binary', 'classification') or info['unique_values'] > 20:
                continue

            print("  Value distribution:")
            value_counts = df[target].value_counts()
            for val, count in value_counts.head(10).items():
                print(f"    {val}: {count} ({count/len(df)*100:.1f}%)")

        print(f"\nTotal targets configured: {len(target_columns)}")

    except Exception as e:
        print(f"Error processing targets: {e}")
        target_columns = None

## Step 4: Feature Processing

Separate features into categorical and continuous, handle missing values, label-encode the categorical features, and standardize the continuous ones.

In [None]:
def analyze_features(df, target_columns, config):
    """
    Split the non-target columns of ``df`` into categorical and continuous.

    A column is categorical when, in this order of precedence:
      1. it is listed in ``config['force_categorical']``;
      2. (it is continuous if listed in ``config['force_continuous']``);
      3. its dtype is object, string, category or boolean;
      4. it has at most ``config['categorical_threshold']`` unique values.
    Everything else is continuous. One summary line is printed per feature.

    Args:
        df: pandas DataFrame with features and targets.
        target_columns: column names to exclude from the feature list.
        config: dict providing 'categorical_threshold' and, optionally,
            'force_categorical' / 'force_continuous' lists.

    Returns:
        (categorical_cols, continuous_cols): two lists of column names, each
        in the order the columns appear in ``df``.
    """
    excluded = set(target_columns)
    feature_columns = [col for col in df.columns if col not in excluded]

    forced_categorical = set(config.get('force_categorical') or [])
    forced_continuous = set(config.get('force_continuous') or [])

    def has_label_dtype(dtype):
        # Use pandas' dtype predicates instead of comparing the dtype with
        # strings: the string comparison misses pandas' dedicated string
        # dtype, which would then be sent down the numeric (median / scaling)
        # path.
        return (
            isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_bool_dtype(dtype)
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )

    categorical_cols = []
    continuous_cols = []

    print("=== Feature Analysis ===")
    print(f"Total features: {len(feature_columns)}")

    for col in feature_columns:
        unique_vals = int(df[col].nunique())
        dtype = df[col].dtype
        null_count = int(df[col].isnull().sum())

        # Determine if categorical or continuous
        if col in forced_categorical:
            feature_type = 'categorical'
        elif col in forced_continuous:
            feature_type = 'continuous'
        elif has_label_dtype(dtype):
            feature_type = 'categorical'
        elif unique_vals <= config['categorical_threshold']:
            feature_type = 'categorical'
        else:
            feature_type = 'continuous'

        if feature_type == 'categorical':
            categorical_cols.append(col)
        else:
            continuous_cols.append(col)

        # str(col): the 's' format code rejects non-string column labels.
        print(f"{str(col):30s} | {feature_type:12s} | unique: {unique_vals:6d} | nulls: {null_count:6d} | dtype: {dtype}")

    print(f"\nCategorical features: {len(categorical_cols)}")
    print(f"Continuous features: {len(continuous_cols)}")

    return categorical_cols, continuous_cols

# Classify the features once both the data and the targets are available.
if df is None or target_columns is None:
    print("Please complete previous steps first")
else:
    categorical_features, continuous_features = analyze_features(df, target_columns, CONFIG)

In [None]:
def preprocess_features(df, categorical_cols, continuous_cols, target_columns, target_info=None):
    """
    Preprocess features and targets on a copy of ``df``.

    Steps, in order:
      1. fill missing categorical values with 'Unknown' and missing
         continuous values with the column median;
      2. label-encode every categorical feature (values are cast to str first);
      3. standardize every continuous feature with StandardScaler;
      4. for 'binary'/'classification' targets, drop rows whose target is
         missing and label-encode the rest; for 'regression' targets, coerce
         to numeric and drop rows that are missing after coercion.

    Args:
        df: input DataFrame; it is not modified.
        categorical_cols: names of the categorical feature columns.
        continuous_cols: names of the continuous feature columns.
        target_columns: names of the target columns.
        target_info: mapping of target name to a dict with a 'type' entry.
            When omitted, the module-level ``target_info`` is used, which
            keeps existing four-argument calls working.

    Returns:
        (df_processed, label_encoders, target_encoders): the processed frame,
        the fitted LabelEncoder per categorical feature, and the fitted
        LabelEncoder per binary/classification target.

    Raises:
        ValueError: no ``target_info`` was passed and none exists at module level.
    """
    if target_info is None:
        target_info = globals().get('target_info')
    if target_info is None:
        raise ValueError(
            "target_info is required: pass it explicitly or run the target-processing step first"
        )

    df_processed = df.copy()

    print("=== Preprocessing Features ===")

    # Handle missing values
    print("\nHandling missing values...")

    # For categorical features: fill with 'Unknown'
    for col in categorical_cols:
        n_missing = int(df_processed[col].isnull().sum())
        if n_missing > 0:
            # Cast to object first: fillna on a category-dtype column raises
            # when the fill value is not already one of its categories.
            df_processed[col] = df_processed[col].astype(object).fillna('Unknown')
            print(f"  {col}: filled {n_missing} missing values with 'Unknown'")

    # For continuous features: fill with median
    for col in continuous_cols:
        n_missing = int(df_processed[col].isnull().sum())
        if n_missing > 0:
            median_val = df_processed[col].median()
            df_processed[col] = df_processed[col].fillna(median_val)
            print(f"  {col}: filled {n_missing} missing values with median ({median_val})")

    # Encode categorical features
    print("\nEncoding categorical features...")
    label_encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le
        print(f"  {col}: encoded {len(le.classes_)} unique values")

    # Scale continuous features
    # NOTE(review): the scaler is fitted on the full dataset, i.e. before the
    # train/val/test split is made, so validation and test rows influence the
    # scaling statistics. Left unchanged here because the split happens
    # outside this function.
    print("\nScaling continuous features...")
    scaler = StandardScaler()
    for col in continuous_cols:
        df_processed[col] = scaler.fit_transform(df_processed[[col]])
        print(f"  {col}: scaled with StandardScaler")

    # Handle target encoding
    print("\nProcessing targets...")
    target_encoders = {}

    for target in target_columns:
        target_type = target_info[target]['type']

        if target_type in ('binary', 'classification'):
            n_missing = int(df_processed[target].isnull().sum())
            if n_missing > 0:
                print(f"  Warning: {target} has {n_missing} missing values")
                # Rows without a label cannot be used for this task.
                df_processed = df_processed.dropna(subset=[target])
                print(f"  Dropped rows with missing {target} values")

            le = LabelEncoder()
            df_processed[target] = le.fit_transform(df_processed[target].astype(str))
            target_encoders[target] = le
            print(f"  {target}: encoded as {target_type} with {len(le.classes_)} classes")

        elif target_type == 'regression':
            # Non-numeric entries become NaN and are dropped just below.
            df_processed[target] = pd.to_numeric(df_processed[target], errors='coerce')
            n_missing = int(df_processed[target].isnull().sum())
            if n_missing > 0:
                print(f"  Warning: {target} has {n_missing} missing values after conversion")
                df_processed = df_processed.dropna(subset=[target])
            print(f"  {target}: processed as regression target")

        else:
            # Previously skipped silently; the column is still left untouched.
            print(f"  Warning: {target} has unrecognized type '{target_type}'; left unchanged")

    print(f"\nFinal dataset shape: {df_processed.shape}")

    return df_processed, label_encoders, target_encoders

# Run preprocessing; df_processed stays None when it was skipped or failed.
df_processed = None

if df is None or target_columns is None:
    print("Please complete previous steps first")
else:
    try:
        df_processed, feature_encoders, target_encoders = preprocess_features(
            df, categorical_features, continuous_features, target_columns
        )
        print("\nPreprocessing completed successfully!")
    except Exception as e:
        print(f"Error during preprocessing: {e}")
        df_processed = None

## Step 5: Create H5 Dataset

Save the processed dataset in H5 format compatible with our modular dataset system.

In [None]:
def create_dataset_config(dataset_name, short_name, target_columns, target_info, categorical_features, continuous_features):
    """
    Build the dataset configuration dictionary.

    Args:
        dataset_name: full dataset name; also used for the 'path' entry.
        short_name: short dataset name.
        target_columns: task (target column) names, in order.
        target_info: mapping of target name to a dict with 'type' and, for
            classification targets, 'unique_values'.
        categorical_features: categorical feature names.
        continuous_features: continuous feature names.

    Returns:
        ``{'data': {...}}`` with the entries 'name', 'short_name', 'format',
        'path', 'tasks', 'task_type', 'task_out_dim' and 'num_features'.
        Output dimension is 1 for binary/regression tasks and the number of
        unique values for classification tasks.

    Raises:
        ValueError: a target has a type other than 'binary',
            'classification' or 'regression' (such a task would otherwise
            end up without a 'task_out_dim' entry).
    """
    task_types = {}
    task_out_dims = {}

    for target in target_columns:
        task_type = target_info[target]['type']

        if task_type in ('binary', 'regression'):
            out_dim = 1
        elif task_type == 'classification':
            # Plain int keeps numpy scalars out of the serialized config.
            out_dim = int(target_info[target]['unique_values'])
        else:
            raise ValueError(
                f"Unknown task type '{task_type}' for target '{target}'; "
                "expected 'binary', 'classification' or 'regression'"
            )

        task_types[target] = task_type
        task_out_dims[target] = out_dim

    return {
        'data': {
            'name': dataset_name,
            'short_name': short_name,
            'format': 'h5',
            'path': f'/{dataset_name}/',
            # Copy so later changes to the caller's list do not alter the config.
            'tasks': list(target_columns),
            'task_type': task_types,
            'task_out_dim': task_out_dims,
            'num_features': len(categorical_features) + len(continuous_features),
        }
    }

# Write config.yaml and the H5 file; h5_created records whether both succeeded.
h5_created = False

if df_processed is None:
    print("Please complete preprocessing first")
else:
    # Output directory: <output_dir>/<dataset_name>
    output_dir = Path(f"{CONFIG['output_dir']}/{CONFIG['dataset_name']}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # All three splits are stored in one file.
    h5_filename = "train_val_test.h5"
    h5_path = output_dir / h5_filename

    print(f"Creating H5 dataset: {h5_path}")

    try:
        # Build the configuration first; its task-type mapping is reused below.
        dataset_config = create_dataset_config(
            CONFIG['dataset_name'],
            CONFIG['short_name'],
            target_columns,
            target_info,
            categorical_features,
            continuous_features,
        )

        # Imported inside the try so a missing PyYAML is reported by the handler below.
        import yaml

        config_path = output_dir / "config.yaml"
        with open(config_path, 'w') as f:
            yaml.dump(dataset_config, f, default_flow_style=False)

        print(f"Configuration saved: {config_path}")

        # Split and write the processed frame.
        h5_kwargs = dict(
            df=df_processed,
            output_path=str(h5_path),
            tasks=target_columns,
            task_types=dataset_config['data']['task_type'],
            categorical_cols=categorical_features,
            continuous_cols=continuous_features,
            train_split=CONFIG['train_split'],
            val_split=CONFIG['val_split'],
            test_split=CONFIG['test_split'],
            random_seed=CONFIG['random_seed'],
            use_stratified=True,
        )
        create_h5_from_dataframe(**h5_kwargs)

        print(f"\nH5 dataset created successfully: {h5_path}")

        h5_created = True

    except Exception as e:
        print(f"Error creating H5 dataset: {e}")
        h5_created = False

## Step 6: Validation

Validate the created H5 dataset to ensure it's properly formatted.

In [None]:
# Step 6 driver: inspect, validate and smoke-test the H5 file written in Step 5.
# Nothing is checked unless the previous cell set h5_created to True.
if h5_created:
    print("=== H5 Dataset Validation ===")

    # Inspect the H5 structure
    print("\n--- H5 File Structure ---")
    inspect_h5_dataset(str(h5_path))

    # Validate the dataset. The result is read below as a dict with the keys
    # 'valid', 'splits', 'tasks_found', 'errors', 'num_samples' and
    # 'feature_shapes'.
    print("\n--- Validation Results ---")
    validation_result = validate_h5_dataset(str(h5_path), target_columns)

    print(f"Valid: {validation_result['valid']}")
    print(f"Splits found: {validation_result['splits']}")
    print(f"Tasks found: {validation_result['tasks_found']}")

    if validation_result['errors']:
        print("\nErrors found:")
        for error in validation_result['errors']:
            print(f"  - {error}")

    print("\n--- Sample Counts ---")
    for split, count in validation_result['num_samples'].items():
        print(f"  {split}: {count:,} samples")

    print("\n--- Feature Shapes ---")
    for key, shape in validation_result['feature_shapes'].items():
        print(f"  {key}: {shape}")

    # Test loading with our dataset class
    print("\n--- Testing Dataset Loading ---")
    try:
        # Imported here so a missing or broken module is reported by the
        # handler below instead of aborting the cell.
        from data.dataset import H5Dataset

        # Test loading train split.
        # NOTE(review): 'seperate_ft_types' is spelled exactly as in the
        # original call; presumably it matches H5Dataset's parameter name -
        # confirm before renaming.
        test_dataset = H5Dataset(
            h5_path=str(h5_path),
            split='train',
            tasks=target_columns,
            task_types={target: target_info[target]['type'] for target in target_columns},
            seperate_ft_types=True
        )

        print("Dataset loaded successfully!")
        print(f"Length: {len(test_dataset)}")
        print(f"Field dimensions: {test_dataset.field_dims}")

        # Test getting a sample
        sample = test_dataset[0]
        print(f"Sample keys: {list(sample.keys())}")
        print(f"Feature keys: {list(sample['features'].keys()) if isinstance(sample['features'], dict) else 'tensor'}")
        print(f"Target keys: {list(sample['targets'].keys())}")

        # Features come back either as a dict with 'categorical' and
        # 'continuous' entries or as a single object with a .shape.
        if isinstance(sample['features'], dict):
            print(f"Categorical features shape: {sample['features']['categorical'].shape}")
            print(f"Continuous features shape: {sample['features']['continuous'].shape}")
        else:
            print(f"Combined features shape: {sample['features'].shape}")

        # The original literal was a mis-decoded check-mark emoji.
        print("\nAll validation tests passed! ✅")

    except Exception as e:
        print(f"Error loading dataset: {e}")

else:
    print("No H5 dataset to validate")