# EpiLearn Dataset Loading Test

This notebook tests the data-loading functionality of the EpiLearn framework. It covers the following two tests:

1. **CSV Dataset Test** - Test loading custom data from CSV files
2. **Toy Dataset Test** - Test built-in toy datasets

## Environment Setup


In [1]:
import os
import sys
import torch
import numpy as np
from datetime import datetime
import pandas as pd

# Add EpiLearn to Python path.
# NOTE: despite its name, `current_dir` holds the PARENT of the working
# directory (dirname of the absolute path of '.'). Presumably the notebook is
# run from a sub-folder of the EpiLearn checkout -- TODO confirm.
current_dir = os.path.dirname(os.path.abspath('.'))
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if current_dir not in sys.path:
    sys.path.append(current_dir)

try:
    from epilearn.data import UniversalDataset
    from epilearn.utils import transforms
    print("‚úì Successfully imported EpiLearn modules")
except ImportError as e:
    # Report the failure in a readable form, then re-raise so that the
    # remaining cells do not run against a missing package.
    print(f"‚ùå Import failed: {e}")
    print("Please ensure EpiLearn is properly installed")
    raise


  from .autonotebook import tqdm as notebook_tqdm


‚úì Successfully imported EpiLearn modules


In [2]:
# Utility functions
def print_separator(title):
    """Print a section banner: a blank line, then `title` (indented by two
    spaces) framed above and below by a rule of 60 '=' characters."""
    rule = "=" * 60
    # The leading empty string produces the blank line before the first rule.
    print("", rule, f"  {title}", rule, sep="\n")

def print_tensor_info(name, tensor):
    """Print a one- or two-line summary of `tensor`, labelled with `name`.

    Parameters:
    - name: Label shown in the first column (padded to 15 characters).
    - tensor: Object exposing shape, dtype, numel(), element_size(), min()
      and max() (torch tensors in this notebook), or None.

    Output:
    - For None, a single "<name>: None" line.
    - Otherwise shape, dtype and memory footprint in KB
      (numel * element_size / 1024), followed by a [min, max] range line
      when the tensor has at least one element.
    """
    if tensor is None:
        print(f"  {name:15}: None")
        return

    size_kb = tensor.numel() * tensor.element_size() / 1024
    print(f"  {name:15}: {tensor.shape} | {tensor.dtype} | Memory: {size_kb:.1f} KB")

    # min()/max() are only evaluated for tensors that contain elements.
    if tensor.numel() == 0:
        return
    lowest = tensor.min().item()
    highest = tensor.max().item()
    print(f"  {'':<15}  Range: [{lowest:.4f}, {highest:.4f}]")


## 1. CSV Dataset Test

This test demonstrates how to load custom data from CSV files, with feature and target columns either detected automatically or specified manually.


In [3]:
def infer_cols(csv_path, node_col='node', time_col='time',
                target_hints=('y','target','label'), target_regex=None, sample_rows=None,
                manual_feature_cols=None, manual_target_cols=None):
    """
    Automatically infer feature_cols / target_cols from CSV, with manual override options.

    A column is a candidate when it is neither `node_col` nor `time_col` and at
    least one of its values converts to a number. A candidate is treated as a
    target when its lower-cased name equals one of `target_hints`, starts with
    'y_', or fully matches `target_regex`. Remaining candidates are features.

    Parameters:
    - csv_path: Path of the CSV file to inspect
    - node_col / time_col: Key columns that must be present; never used as features or targets
    - target_hints: Common target column names (case insensitive on both sides)
    - target_regex: Optional regex pattern (e.g., r'^y(_.*)?$'), matched against the
      original (not lower-cased) column name with fullmatch
    - sample_rows: Only read first N rows for quick inference; None for full table
    - manual_feature_cols: Manually specified feature column list, overrides auto inference if provided
    - manual_target_cols: Manually specified target column list, overrides auto inference if provided

    Returns:
    - feature_cols: Final feature columns to use
    - target_cols: Final target columns to use (None when nothing was inferred
      and no manual list was given)

    Raises:
    - ValueError: If `node_col` or `time_col` is missing from the CSV
    """
    import re  # local import: `re` is not imported at the top of the notebook

    df = pd.read_csv(csv_path, nrows=sample_rows)
    cols = df.columns.tolist()
    if node_col not in cols or time_col not in cols:
        raise ValueError(f"Missing key columns: node_col='{node_col}', time_col='{time_col}'")

    # Candidate columns = all columns except node/time
    user_cols = [c for c in cols if c not in (node_col, time_col)]

    # Keep only "numeric-like" columns (at least one value can be converted to numeric)
    numeric_candidates = [
        c for c in user_cols
        if pd.to_numeric(df[c], errors='coerce').notna().any()
    ]

    # Normalise the hints so the comparison really is case-insensitive
    # (previously only the column name was lower-cased, so a hint such as 'Y'
    # could never match). A bare string is treated as a single hint.
    if target_hints is None:
        target_hints = ()
    elif isinstance(target_hints, str):
        target_hints = (target_hints,)
    hints = {str(h).lower() for h in target_hints}

    # Auto-infer target columns: hint / 'y_' prefix matches first, then regex matches
    t_candidates = [c for c in numeric_candidates
                    if c.lower() in hints or c.lower().startswith('y_')]
    if target_regex:
        pat = re.compile(target_regex)
        t_candidates += [c for c in numeric_candidates if pat.fullmatch(c)]

    # Remove duplicates while preserving original order
    auto_target_cols = list(dict.fromkeys(t_candidates)) or None

    # Auto-infer feature columns = numeric candidates - target columns
    auto_target_set = set(auto_target_cols or [])
    auto_feature_cols = [c for c in numeric_candidates if c not in auto_target_set]

    # Decide final columns: manual > auto
    final_target_cols = manual_target_cols if manual_target_cols is not None else auto_target_cols
    if manual_feature_cols is not None:
        final_feature_cols = manual_feature_cols
    else:
        # Hybrid mode: when only the targets are given manually, make sure the
        # chosen target columns do not also remain in the inferred features.
        if isinstance(final_target_cols, str):
            chosen_targets = {final_target_cols}
        else:
            chosen_targets = set(final_target_cols or [])
        final_feature_cols = [c for c in auto_feature_cols if c not in chosen_targets]

    # Print results
    print("üîé [Column Selection Results]")
    print(f"  All columns: {cols}")
    print(f"  Numeric candidates: {numeric_candidates}")
    print(f"  Auto-inferred target_cols: {auto_target_cols}")
    print(f"  Auto-inferred feature_cols: {auto_feature_cols}")

    if manual_target_cols is not None:
        print(f"  üéØ Manually specified target_cols: {manual_target_cols}")
    if manual_feature_cols is not None:
        print(f"  üéØ Manually specified feature_cols: {manual_feature_cols}")

    print(f"  ‚úÖ Final target_cols: {final_target_cols}")
    print(f"  ‚úÖ Final feature_cols: {final_feature_cols}")

    return final_feature_cols, final_target_cols


In [None]:
def test_csv_dataset(manual_feature_cols=None, manual_target_cols=None,
                     feat_path='toy_features.csv', edge_path='toy_edges.csv'):
    """
    CSV Test: Read existing CSV files -> from_csv loading -> print data info

    Parameters:
    - manual_feature_cols: Manually specified feature columns, e.g., ['f1', 'f2']
    - manual_target_cols: Manually specified target columns, e.g., ['y']
    - feat_path: Path of the feature CSV (must contain 'node' and 'time' columns)
    - edge_path: Path of the edge CSV passed to from_csv as `edge_csv`.
      Defaults to a path relative to the working directory, like `feat_path`;
      it was previously hard-coded to the filesystem root ('/toy_edges.csv').

    If manual columns are not provided, auto-inferred results will be used

    Returns:
    - The loaded UniversalDataset, or None if any step raised (the error is
      printed rather than propagated so the notebook keeps running).
    """
    print_separator("Test CSV Dataset (Manual/Auto Column Selection)")

    try:
        # NOTE(review): nothing in this function reads or writes ./datasets;
        # kept in case the loader relies on it -- confirm before removing.
        os.makedirs('./datasets', exist_ok=True)

        # Smart column selection: support manual override of auto inference
        feature_cols, target_cols = infer_cols(
            feat_path,
            node_col='node',
            time_col='time',
            target_hints=('y','target','label'),
            target_regex=None,
            sample_rows=None,
            manual_feature_cols=manual_feature_cols,
            manual_target_cols=manual_target_cols
        )

        # Load data using finalized columns
        ds = UniversalDataset.from_csv(
            feature_csv=feat_path,
            node_id_col="node",
            time_col="time",
            feature_cols=feature_cols,
            target_cols=target_cols,
            edge_csv=edge_path
        )

        # Simple print verification
        print("\n‚úÖ CSV loaded:")
        print("  x:", ds.x.shape, "| y:", None if ds.y is None else ds.y.shape)
        print("  graph:", None if ds.graph is None else ds.graph.shape,
              "| edge_index:", None if ds.edge_index is None else ds.edge_index.shape)

        # Print basic information
        print("\nüìä CSV Data Dimension Info:")
        print_tensor_info("x (features)", ds.x)
        print_tensor_info("y (targets)", ds.y)
        print_tensor_info("graph (static)", ds.graph)
        print_tensor_info("edge_index", ds.edge_index)

        print("‚úÖ CSV data loading test passed!")
        return ds

    except Exception as e:
        # Deliberately best-effort: report the failure and signal it with None.
        print(f"‚ùå CSV dataset test failed: {e}")
        return None

# Demo 1: Use auto inference (default behavior)
# Calling with no arguments leaves both manual overrides as None, so the
# feature and target columns come from infer_cols(). The result is the loaded
# dataset, or None if loading failed.
print("üîÑ Demo 1: Auto-infer columns")
csv_dataset_auto = test_csv_dataset()


üîÑ Demo 1: Auto-infer columns

  Test CSV Dataset (Manual/Auto Column Selection)
üîé [Column Selection Results]
  All columns: ['time', 'node', 'f0', 'f1', 'f2', 'f3', 'y']
  Numeric candidates: ['f0', 'f1', 'f2', 'f3', 'y']
  Auto-inferred target_cols: ['y']
  Auto-inferred feature_cols: ['f0', 'f1', 'f2', 'f3']
  ‚úÖ Final target_cols: ['y']
  ‚úÖ Final feature_cols: ['f0', 'f1', 'f2', 'f3']

‚úÖ CSV loaded:
  x: torch.Size([539, 47, 4]) | y: torch.Size([539, 47])
  graph: torch.Size([47, 47]) | edge_index: torch.Size([2, 2189])

üìä CSV Data Dimension Info:
  x (features)   : torch.Size([539, 47, 4]) | torch.float32 | Memory: 395.8 KB
                   Range: [-0.6547, 5908.0000]
  y (targets)    : torch.Size([539, 47]) | torch.float32 | Memory: 99.0 KB
                   Range: [0.0000, 5908.0000]
  graph (static) : torch.Size([47, 47]) | torch.float32 | Memory: 8.6 KB
                   Range: [0.0000, 1.0000]
  edge_index     : torch.Size([2, 2189]) | torch.int64 | Memory: 3

In [5]:
# Demo 2: Manual column specification (override auto inference)
# Both lists are given explicitly, so the auto-inferred columns are printed
# for reference but not used. The result is the loaded dataset, or None if
# loading failed.
print("\nüîÑ Demo 2: Manual column specification")
csv_dataset_manual = test_csv_dataset(
    manual_feature_cols=['f1', 'f2'],  # Use only these two feature columns
    manual_target_cols=['y']           # Use 'y' as the single target column
)



üîÑ Demo 2: Manual column specification

  Test CSV Dataset (Manual/Auto Column Selection)
üîé [Column Selection Results]
  All columns: ['time', 'node', 'f0', 'f1', 'f2', 'f3', 'y']
  Numeric candidates: ['f0', 'f1', 'f2', 'f3', 'y']
  Auto-inferred target_cols: ['y']
  Auto-inferred feature_cols: ['f0', 'f1', 'f2', 'f3']
  üéØ Manually specified target_cols: ['y']
  üéØ Manually specified feature_cols: ['f1', 'f2']
  ‚úÖ Final target_cols: ['y']
  ‚úÖ Final feature_cols: ['f1', 'f2']

‚úÖ CSV loaded:
  x: torch.Size([539, 47, 2]) | y: torch.Size([539, 47])
  graph: torch.Size([47, 47]) | edge_index: torch.Size([2, 2189])

üìä CSV Data Dimension Info:
  x (features)   : torch.Size([539, 47, 2]) | torch.float32 | Memory: 197.9 KB
                   Range: [-0.6547, 6.0000]
  y (targets)    : torch.Size([539, 47]) | torch.float32 | Memory: 99.0 KB
                   Range: [0.0000, 5908.0000]
  graph (static) : torch.Size([47, 47]) | torch.float32 | Memory: 8.6 KB
                

## 2. Toy Dataset Test

This test loads EpiLearn's built-in toy dataset as a quick check that the framework works.


In [6]:
def test_toy_dataset():
    """Load the built-in toy dataset and print a summary of what it contains.

    Creates an empty UniversalDataset, calls load_toy_dataset() on it, prints
    shape/dtype/memory/range for each tensor attribute, the dataset length,
    and the type and x/y shapes of the first sample when there is one.

    Returns:
    - The loaded dataset, or None if any step raised (the error is printed
      rather than propagated).
    """
    print_separator("Test Toy Dataset")

    # (display label, attribute name) pairs, in the order they are reported
    tensor_fields = (
        ("x (features)", "x"),
        ("y (targets)", "y"),
        ("states", "states"),
        ("graph (static)", "graph"),
        ("dynamic_graph", "dynamic_graph"),
        ("edge_index", "edge_index"),
        ("edge_weight", "edge_weight"),
    )

    try:
        dataset = UniversalDataset()
        print("üì¶ Starting to download toy dataset...")
        dataset.load_toy_dataset()
        print("‚úì Toy dataset loaded successfully!")

        print("\nüìä Data Dimension Info:")
        for label, attr in tensor_fields:
            print_tensor_info(label, getattr(dataset, attr))

        n_samples = len(dataset)
        print(f"\nüìè Dataset length: {n_samples}")

        if n_samples > 0:
            sample = dataset[0]
            print("\nüîç First sample:")
            print(f"  Type: {type(sample)}")
            # Only report the fields the sample object actually exposes.
            for field in ("x", "y"):
                if not hasattr(sample, field):
                    continue
                value = getattr(sample, field)
                shape_text = value.shape if value is not None else 'None'
                print(f"  sample.{field}: {shape_text}")

        return dataset

    except Exception as e:
        print(f"‚ùå Toy dataset test failed: {e}")
        return None

# Execute toy dataset test
# `toy_dataset` is the loaded dataset, or None if the test failed; the
# summary cell below checks it for None.
toy_dataset = test_toy_dataset()



  Test Toy Dataset
üì¶ Starting to download toy dataset...
‚úì Toy dataset loaded successfully!

üìä Data Dimension Info:
  x (features)   : torch.Size([539, 47, 4]) | torch.float32 | Memory: 395.8 KB
                   Range: [-0.6547, 5908.0000]
  y (targets)    : torch.Size([539, 47]) | torch.float32 | Memory: 99.0 KB
                   Range: [0.0000, 5908.0000]
  states         : torch.Size([539, 47, 3]) | torch.float32 | Memory: 296.9 KB
                   Range: [0.0000, 14047001.0000]
  graph (static) : torch.Size([47, 47]) | torch.float32 | Memory: 8.6 KB
                   Range: [0.0000, 5424283.0000]
  dynamic_graph  : torch.Size([539, 47, 47, 1]) | torch.float32 | Memory: 4651.0 KB
                   Range: [0.0000, 969939.0000]
  edge_index     : torch.Size([2, 2189]) | torch.int64 | Memory: 34.2 KB
                   Range: [0.0000, 46.0000]
  edge_weight    : torch.Size([2189]) | torch.float32 | Memory: 8.6 KB
                   Range: [1.0000, 5424283.0000]

üìè Da

## Test Summary

The following summarizes test results and displays basic dataset information.


In [7]:
# Summarize test results
print_separator("Test Summary")

# Display label -> name of the notebook variable expected to hold that dataset.
_expected = {
    "CSV Auto-inference": "csv_dataset_auto",
    "CSV Manual Specification": "csv_dataset_manual",
    "Toy Dataset": "toy_dataset",
}

# A variable counts as loaded only if its cell was run (the name exists) and
# the test returned something other than None.
_scope = locals()
datasets = []
for _label, _var in _expected.items():
    _loaded = _scope.get(_var)
    if _loaded is not None:
        datasets.append((_label, _loaded))

print(f"‚úÖ Successfully loaded datasets: {len(datasets)}")
for name, dataset in datasets:
    print(f"  - {name}")

if datasets:
    _finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"\n‚è∞ Test completion time: {_finished_at}")
    print("üéâ Dataset testing completed! You can now start using EpiLearn for epidemiological modeling.")
else:
    print("‚ùå No datasets loaded successfully")

# The usage notes are printed regardless of whether any dataset loaded.
print("\n".join([
    "\nüìù Usage Instructions:",
    "  1. Auto-inference mode: test_csv_dataset() - automatically detect feature and target columns",
    "  2. Manual specification mode: test_csv_dataset(manual_feature_cols=['col1', 'col2'], manual_target_cols=['target'])",
    "  3. Hybrid mode: manually specify only one type, use auto-inference for the other",
]))



  Test Summary
‚úÖ Successfully loaded datasets: 3
  - CSV Auto-inference
  - CSV Manual Specification
  - Toy Dataset

‚è∞ Test completion time: 2025-08-26 12:45:06
üéâ Dataset testing completed! You can now start using EpiLearn for epidemiological modeling.

üìù Usage Instructions:
  1. Auto-inference mode: test_csv_dataset() - automatically detect feature and target columns
  2. Manual specification mode: test_csv_dataset(manual_feature_cols=['col1', 'col2'], manual_target_cols=['target'])
  3. Hybrid mode: manually specify only one type, use auto-inference for the other
