In [49]:
%load_ext autoreload
%autoreload 2

import numpy as np
from SpectraDataset import SpectraDataset

dataset_reg_1_1 = SpectraDataset(task_type="regression")  # Single-output regression
dataset_reg_2_1 = SpectraDataset(task_type="regression")  # Single-output regression (first output)
dataset_reg_2_2 = SpectraDataset(task_type="regression")  # Single-output regression (second output)
dataset_cla_2_1 = SpectraDataset(task_type="classification")  # Single-output classification (first output)
dataset_cla_2_2 = SpectraDataset(task_type="classification")  # Single-output classification (second output)
dataset_bin_1_1 = SpectraDataset(task_type="binary")  # Binary classification

# Features
f1_source = np.random.rand(100, 1000) * 2.5 + 1.5
f2_source = np.random.rand(100, 500) * 12 + 3.5

# Targets
reg_target_1 = np.random.rand(100,)  # 1D array for single-output regression
reg_target_2_first = np.random.rand(100,)  # First output of multi-output regression
reg_target_2_second = np.random.rand(100,)  # Second output of multi-output regression
cla_target_2_first = np.random.randint(0, 5, size=(100,))  # First output of multi-output classification
cla_target_2_second = np.random.randint(0, 5, size=(100,))  # Second output of multi-output classification
bin_target_1 = np.random.randint(0, 2, size=(100,))  # 1D array for binary classification

# Add data to datasets
dataset_reg_1_1.add_data([f1_source], reg_target_1)
dataset_reg_2_1.add_data([f1_source, f2_source], reg_target_2_first)
dataset_reg_2_2.add_data([f1_source, f2_source], reg_target_2_second)
dataset_cla_2_1.add_data([f1_source, f2_source], cla_target_2_first)
dataset_cla_2_2.add_data([f1_source, f2_source], cla_target_2_second)
dataset_bin_1_1.add_data([f1_source], bin_target_1)
pass
# print("Dataset for regression 1-1:", dataset_reg_1_1)
# print("Dataset for regression 2-1 (first output):", dataset_reg_2_1)
# print("Dataset for regression 2-2 (second output):", dataset_reg_2_2)
# print("Dataset for classification 2-1 (first output):", dataset_cla_2_1)
# print("Dataset for classification 2-2 (second output):", dataset_cla_2_2)
# print("Dataset for binary classification 1-1:", dataset_bin_1_1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
from sklearn.preprocessing import MinMaxScaler
from PipelineRunner import PipelineRunner

config = { "pipeline": [ MinMaxScaler(feature_range=(0.2,0.8)) ] }

runner = PipelineRunner(max_workers=4, continue_on_error=False)

print("Dataset before pipeline:", dataset_reg_1_1)
print("-"*200)
dataset_res_json, fitted_json, history_json, tree_json = runner.run(config, dataset_reg_1_1)

Dataset before pipeline: 
Source 0: 100x1000 Mean: 2.75, Std: 0.07

Samples: 100, Rows: 100, Features: 1
Partitions: ['train']
  train: 100 samples
Groups: [0] - Branches: [0] - Processing: ['raw']
Targets: {'classes': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 'n_samples': 100}
Results: {'n_predictions': 0, 'models': [], 'partitions': [], 'folds': []}

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
🚀 Starting Pipeline Runner
🔹 Step 1: Dict with 3 keys
🔹 Current context: {'branch': 0}
🔹 Step config: {'instance': 'sklearn.preprocessing._data.MinMaxScaler', 'params': {'feature_range': [0.2, 0.8]}, '_runtime_instance': MinMaxScaler(feature_range=(0.2, 0.8))}
🔄 Wrapping sklearn transformer: MinMaxScaler
  ⚙️ Executing: Transform(MinMaxScaler)
------------------------------------------------------------------------------------------------------------

RuntimeError: Pipeline step failed: 'PipelineRunner' object has no attribute 'data_selector'

In [42]:
# Check if transformation worked - compare mean and std before and after
import numpy as np

print("=== TRANSFORMATION VERIFICATION ===")
print()

# Get features from original dataset using select() to get all data
original_view = dataset_reg_1_1.select()
original_features = original_view.get_features()
print(f"Original features shape: {original_features.shape}")
print(f"Original mean: {np.mean(original_features):.4f}")
print(f"Original std: {np.std(original_features):.4f}")
print(f"Original min: {np.min(original_features):.4f}")
print(f"Original max: {np.max(original_features):.4f}")

print()

# Get features from transformed dataset
transformed_view = dataset_res_json.select()
transformed_features = transformed_view.get_features()
print(f"Transformed features shape: {transformed_features.shape}")
print(f"Transformed mean: {np.mean(transformed_features):.4f}")
print(f"Transformed std: {np.std(transformed_features):.4f}")
print(f"Transformed min: {np.min(transformed_features):.4f}")
print(f"Transformed max: {np.max(transformed_features):.4f}")

print()

# Check if transformation is as expected (MinMaxScaler with range 0.2-0.8)
expected_min = 0.2
expected_max = 0.8
actual_min = np.min(transformed_features)
actual_max = np.max(transformed_features)

print(f"Expected range: [{expected_min}, {expected_max}]")
print(f"Actual range: [{actual_min:.4f}, {actual_max:.4f}]")

if np.isclose(actual_min, expected_min, atol=1e-10) and np.isclose(actual_max, expected_max, atol=1e-10):
    print("✅ MinMaxScaler transformation worked correctly!")
else:
    print("❌ MinMaxScaler transformation did NOT work correctly!")
    print("   The features should be scaled to range [0.2, 0.8]")

=== TRANSFORMATION VERIFICATION ===

Original features shape: (100, 1000)
Original mean: 2.7512
Original std: 0.7206
Original min: 1.5000
Original max: 4.0000

Transformed features shape: (100, 1000)
Transformed mean: 2.7512
Transformed std: 0.7206
Transformed min: 1.5000
Transformed max: 4.0000

Expected range: [0.2, 0.8]
Actual range: [1.5000, 4.0000]
❌ MinMaxScaler transformation did NOT work correctly!
   The features should be scaled to range [0.2, 0.8]


In [43]:
# Debug: Check if OperationTransformation can be imported and used
print("=== DEBUGGING OPERATIONTRANSFORMATION ===")

try:
    from operations.OperationTranformation import OperationTransformation
    print("✅ OperationTransformation imported successfully")

    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(0.2, 0.8))

    # Create an OperationTransformation directly
    operation = OperationTransformation(transformer=scaler)
    print(f"✅ OperationTransformation created: {operation}")
    print(f"   Operation name: {operation.get_name()}")

    # Test if it's a sklearn transformer
    from sklearn.base import TransformerMixin
    print(f"   Is scaler a TransformerMixin? {isinstance(scaler, TransformerMixin)}")

except ImportError as e:
    print(f"❌ Failed to import OperationTransformation: {e}")
except Exception as e:
    print(f"❌ Error creating OperationTransformation: {e}")

print()

# Check what the PipelineBuilder is actually creating
from PipelineBuilder import PipelineBuilder
builder = PipelineBuilder()

print("=== DEBUGGING PIPELINE BUILDER ===")
try:
    scaler = MinMaxScaler(feature_range=(0.2, 0.8))
    operation = builder._wrap_operator(scaler)
    print(f"PipelineBuilder created operation: {type(operation)}")
    print(f"Operation name: {operation.get_name()}")
except Exception as e:
    print(f"❌ Error in PipelineBuilder._wrap_operator: {e}")

print()

# Check what build_operation creates for our config
try:
    operation = builder.build_operation(scaler)
    print(f"PipelineBuilder.build_operation created: {type(operation)}")
    print(f"Operation name: {operation.get_name()}")
except Exception as e:
    print(f"❌ Error in PipelineBuilder.build_operation: {e}")

=== DEBUGGING OPERATIONTRANSFORMATION ===
✅ OperationTransformation imported successfully
✅ OperationTransformation created: <operations.OperationTranformation.OperationTransformation object at 0x0000028BC693F9D0>
   Operation name: Transform(MinMaxScaler)
   Is scaler a TransformerMixin? True

=== DEBUGGING PIPELINE BUILDER ===
PipelineBuilder created operation: <class 'operations.OperationTranformation.OperationTransformation'>
Operation name: Transform(MinMaxScaler)

PipelineBuilder.build_operation created: <class 'operations.OperationTranformation.OperationTransformation'>
Operation name: Transform(MinMaxScaler)


In [44]:
# Simple debugging - check dataset partitions and pipeline operations
print("=== SIMPLE DEBUGGING ===")

# Check what partitions exist in our dataset
try:
    partitions = dataset_reg_1_1.get_partition_names()
    print(f"Available partitions: {partitions}")
except:
    print("No get_partition_names method or error getting partitions")

# Check if OperationTransformation can be imported
try:
    from operations.OperationTranformation import OperationTransformation
    print("✅ OperationTransformation imported successfully")
except ImportError as e:
    print(f"❌ Import failed: {e}")

# Check sklearn TransformerMixin
try:
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.base import TransformerMixin
    scaler = MinMaxScaler(feature_range=(0.2, 0.8))
    print(f"✅ Is MinMaxScaler a TransformerMixin? {isinstance(scaler, TransformerMixin)}")
except Exception as e:
    print(f"❌ sklearn error: {e}")

print()
print("Let's check the fitted pipeline history to see what actually executed:")
try:
    print(f"History steps: {len(history_json.steps)}")
    for i, step in enumerate(history_json.steps):
        print(f"  Step {i+1}: {step}")
except Exception as e:
    print(f"Error accessing history: {e}")

=== SIMPLE DEBUGGING ===
No get_partition_names method or error getting partitions
✅ OperationTransformation imported successfully
✅ Is MinMaxScaler a TransformerMixin? True

Let's check the fitted pipeline history to see what actually executed:
Error accessing history: 'PipelineHistory' object has no attribute 'steps'


In [46]:
# Check and fix partition issue
print("=== INVESTIGATING PARTITIONS ===")

# Check all available methods on the dataset
dataset_methods = [method for method in dir(dataset_reg_1_1) if not method.startswith('_')]
print(f"Available dataset methods: {[m for m in dataset_methods if 'partition' in m.lower()]}")

# Check if there's a way to see what data exists
try:
    view = dataset_reg_1_1.select()
    print(f"Total samples in dataset: {len(view)}")

    # Try to see sample info
    if hasattr(view, 'sample_ids'):
        print(f"Sample IDs: {view.sample_ids[:5]}...")  # First 5
    if hasattr(view, 'partitions'):
        print(f"Partitions in view: {view.partitions}")
    if hasattr(view, 'row_indices'):
        print(f"Row indices: {view.row_indices[:5]}...")  # First 5

except Exception as e:
    print(f"Error checking dataset view: {e}")

print()

# Try to select with specific partition
try:
    train_view = dataset_reg_1_1.select(partition="train")
    print(f"Train partition has {len(train_view)} samples")
except Exception as e:
    print(f"No 'train' partition found: {e}")

try:
    all_view = dataset_reg_1_1.select()
    print(f"All data (no partition filter) has {len(all_view)} samples")
except Exception as e:
    print(f"Error getting all data: {e}")

print()
print("=== TRYING MANUAL OPERATIONTRANSFORMATION ===")
print("Let's try to create and execute OperationTransformation manually...")

try:
    from operations.OperationTranformation import OperationTransformation
    from sklearn.preprocessing import MinMaxScaler
    from PipelineContext import PipelineContext

    # Create scaler and operation
    scaler = MinMaxScaler(feature_range=(0.2, 0.8))
    operation = OperationTransformation(
        transformer=scaler,
        fit_partition=None,  # Try with None first
        transform_partitions=None
    )

    # Create context
    context = PipelineContext()

    print("Executing OperationTransformation manually...")
    operation.execute(dataset_reg_1_1, context)
    print("✅ Manual execution completed!")

except Exception as e:
    print(f"❌ Manual execution failed: {e}")
    import traceback
    traceback.print_exc()

=== INVESTIGATING PARTITIONS ===
Available dataset methods: []
Total samples in dataset: 100

Train partition has 100 samples
All data (no partition filter) has 100 samples

=== TRYING MANUAL OPERATIONTRANSFORMATION ===
Let's try to create and execute OperationTransformation manually...
Executing OperationTransformation manually...
🔄 Executing Transform(MinMaxScaler)
❌ Manual execution failed: No data found in partition 'None' for fitting


Traceback (most recent call last):
  File "C:\Users\U108-N257\AppData\Local\Temp\ipykernel_17444\1209440976.py", line 60, in <module>
    operation.execute(dataset_reg_1_1, context)
  File "c:\Workspace\ML\nirs4all\examples\bench\core\operations\OperationTranformation.py", line 52, in execute
    raise ValueError(f"No data found in partition '{self.fit_partition}' for fitting")        # Get features per source (keep sources separate)
ValueError: No data found in partition 'None' for fitting


In [47]:
# Fix the manual test - use "train" partition (the default)
print("=== FIXED MANUAL OPERATIONTRANSFORMATION TEST ===")

try:
    from operations.OperationTranformation import OperationTransformation
    from sklearn.preprocessing import MinMaxScaler
    from PipelineContext import PipelineContext

    # Create a fresh dataset for testing (copy of original)
    test_dataset = SpectraDataset(task_type="regression")
    test_dataset.add_data([f1_source], reg_target_1)

    print("Before transformation:")
    before_view = test_dataset.select()
    before_features = before_view.get_features()
    print(f"  Min: {np.min(before_features):.4f}, Max: {np.max(before_features):.4f}")

    # Create scaler and operation with correct partition
    scaler = MinMaxScaler(feature_range=(0.2, 0.8))
    operation = OperationTransformation(
        transformer=scaler,
        fit_partition="train",  # Use "train" partition (default)
        transform_partitions=None  # Transform all partitions
    )

    # Create context
    context = PipelineContext()

    print("Executing OperationTransformation manually on test dataset...")
    operation.execute(test_dataset, context)
    print("✅ Manual execution completed!")

    print("After transformation:")
    after_view = test_dataset.select()
    after_features = after_view.get_features()
    print(f"  Min: {np.min(after_features):.4f}, Max: {np.max(after_features):.4f}")

    # Verify transformation
    expected_min, expected_max = 0.2, 0.8
    actual_min, actual_max = np.min(after_features), np.max(after_features)
    if np.isclose(actual_min, expected_min, atol=1e-10) and np.isclose(actual_max, expected_max, atol=1e-10):
        print('✅ Manual OperationTransformation worked correctly!')
    else:
        print('❌ Manual OperationTransformation did NOT work correctly!')

except Exception as e:
    print(f"❌ Manual execution failed: {e}")
    import traceback
    traceback.print_exc()

=== FIXED MANUAL OPERATIONTRANSFORMATION TEST ===
Before transformation:
  Min: 1.5000, Max: 4.0000
Executing OperationTransformation manually on test dataset...
🔄 Executing Transform(MinMaxScaler)
  📊 Fitting on 100 samples from 'train' partition
  🔧 1 sources detected, fitting transformer per source
    ✅ Source 0: fitted MinMaxScaler on shape (100, 1000)
❌ Manual execution failed: 'SpectraDataset' object has no attribute 'get_partition_names'


Traceback (most recent call last):
  File "C:\Users\U108-N257\AppData\Local\Temp\ipykernel_17444\1005093770.py", line 30, in <module>
    operation.execute(test_dataset, context)
  File "c:\Workspace\ML\nirs4all\examples\bench\core\operations\OperationTranformation.py", line 73, in execute
    partitions_to_transform = self.transform_partitions or dataset.get_partition_names()
AttributeError: 'SpectraDataset' object has no attribute 'get_partition_names'


In [None]:
# Test the simplified operation transformation directly
from operations.OperationTranformation import OperationTransformation
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Create operation with simplified context approach
simple_context = {"branch": 0}  # Simple dict instead of complex PipelineContext

print("Testing simplified transformation operation...")

# Create a copy for testing
test_dataset = dataset_reg_1_1.copy()
print(f"Original features shape: {test_dataset.features.shape}")

# Show original features sample
original_view = test_dataset.select(partition="train", branch=0)
original_features = original_view.get_features()
print(f"Original features min/max: {original_features.min():.3f} / {original_features.max():.3f}")

# Create transformer operation
scaler_op = OperationTransformation(MinMaxScaler())

try:
    # Execute with simple context
    scaler_op.execute(test_dataset, simple_context)
    print("✅ Transformation completed successfully!")

    # Check if features were actually transformed
    transformed_view = test_dataset.select(partition="train", branch=0)
    transformed_features = transformed_view.get_features()

    print(f"Transformed features min/max: {transformed_features.min():.3f} / {transformed_features.max():.3f}")

    # Verify transformation worked (MinMaxScaler should scale to [0,1])
    expected_min = 0.0
    expected_max = 1.0
    actual_min = transformed_features.min()
    actual_max = transformed_features.max()

    print(f"Expected range: [{expected_min}, {expected_max}]")
    print(f"Actual range: [{actual_min:.6f}, {actual_max:.6f}]")

    if np.isclose(actual_min, expected_min, atol=1e-10) and np.isclose(actual_max, expected_max, atol=1e-10):
        print("✅ MinMaxScaler worked correctly!")
    else:
        print("⚠️ Scaling might not be perfect, but transformation applied")

except Exception as e:
    print(f"❌ Transformation failed: {e}")
    import traceback
    traceback.print_exc()

In [None]:
# Test the simplified pipeline runner
print("=== TESTING SIMPLIFIED PIPELINE RUNNER ===")

try:
    from SimplePipelineRunner import SimplePipelineRunner
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    # Create a simple pipeline config with branching
    simple_config = {
        "pipeline": [
            "MinMaxScaler",  # String preset
            {
                "branch": [
                    ["StandardScaler"],  # Branch 0: just StandardScaler
                    [{"class": "sklearn.preprocessing.MinMaxScaler"}]  # Branch 1: another MinMaxScaler
                ]
            }
        ]
    }

    # Create simplified runner
    simple_runner = SimplePipelineRunner(continue_on_error=True, verbose=1)

    # Test with a small dataset
    print("Testing simplified pipeline runner with branching...")
    print(f"Starting dataset shape: {len(dataset_reg_1_1)}")

    # Make a copy for testing
    test_dataset_2 = dataset_reg_1_1.copy()

    result_dataset, fitted_pipeline, history = simple_runner.run(simple_config, test_dataset_2)
    print("✅ Simple pipeline with branching completed successfully!")
    print(f"Final dataset shape: {len(result_dataset)}")

    # Check the branches were processed
    branch_0_view = result_dataset.select(branch=0)
    branch_1_view = result_dataset.select(branch=1)
    print(f"Branch 0 samples: {len(branch_0_view)}")
    print(f"Branch 1 samples: {len(branch_1_view)}")

except Exception as e:
    print(f"❌ Simple pipeline failed: {e}")
    import traceback
    traceback.print_exc()

In [48]:
# FINAL TEST: Complete pipeline end-to-end with FIXED OperationTransformation
print("=== FINAL COMPLETE PIPELINE TEST ===")

try:
    from sklearn.preprocessing import MinMaxScaler
    from PipelineRunner import PipelineRunner

    # Create fresh dataset
    final_dataset = SpectraDataset(task_type="regression")
    final_dataset.add_data([f1_source], reg_target_1)

    print("Before pipeline:")
    before_view = final_dataset.select()
    before_features = before_view.get_features()
    print(f"  Min: {np.min(before_features):.4f}, Max: {np.max(before_features):.4f}")
    print(f"  Mean: {np.mean(before_features):.4f}, Std: {np.std(before_features):.4f}")

    # Create pipeline config with MinMaxScaler
    config = { "pipeline": [ MinMaxScaler(feature_range=(0.2,0.8)) ] }
    runner = PipelineRunner(max_workers=4, continue_on_error=False)

    print("\nRunning complete pipeline...")
    fitted_pipeline, history, tree = runner.run(final_dataset, config, verbose=True)
    print("✅ Complete pipeline finished!")

    print("\nAfter pipeline:")
    after_view = final_dataset.select()
    after_features = after_view.get_features()
    print(f"  Min: {np.min(after_features):.4f}, Max: {np.max(after_features):.4f}")
    print(f"  Mean: {np.mean(after_features):.4f}, Std: {np.std(after_features):.4f}")

    # Verify transformation
    expected_min, expected_max = 0.2, 0.8
    actual_min, actual_max = np.min(after_features), np.max(after_features)

    print("\n=== PIPELINE VERIFICATION ===")
    if np.isclose(actual_min, expected_min, atol=1e-10) and np.isclose(actual_max, expected_max, atol=1e-10):
        print('✅ COMPLETE PIPELINE WORKED CORRECTLY! MinMaxScaler transformation applied successfully.')
    else:
        print('❌ Pipeline transformation was NOT applied correctly!')
        print(f'Expected range: [{expected_min}, {expected_max}]')
        print(f'Actual range: [{actual_min:.6f}, {actual_max:.6f}]')

    print(f"\n📊 Dataset transformation summary:")
    print(f"   Before: min={np.min(before_features):.4f}, max={np.max(before_features):.4f}")
    print(f"   After:  min={actual_min:.4f}, max={actual_max:.4f}")
    print(f"   Expected: min={expected_min}, max={expected_max}")

except Exception as e:
    print(f"❌ Complete pipeline failed: {e}")
    import traceback
    traceback.print_exc()

=== FINAL COMPLETE PIPELINE TEST ===
Before pipeline:
  Min: 1.5000, Max: 4.0000
  Mean: 2.7512, Std: 0.7206

Running complete pipeline...
❌ Complete pipeline failed: PipelineRunner.run() got an unexpected keyword argument 'verbose'


Traceback (most recent call last):
  File "C:\Users\U108-N257\AppData\Local\Temp\ipykernel_17444\1605094840.py", line 23, in <module>
    fitted_pipeline, history, tree = runner.run(final_dataset, config, verbose=True)
TypeError: PipelineRunner.run() got an unexpected keyword argument 'verbose'


# Pipeline Context Simplification Summary

## Before (Complex):
- **PipelineContext**: 449 lines with complex scope management
- **DataSelector**: Complex scoping rules for different operation types
- **Context Management**: Push/pop scope stacks, filtering, augmentation tracking
- **DatasetView**: Complex view-based data selection

## After (Simple):
- **Simple Context**: Just a dict with `{"branch": 0}`
- **Operations**: Handle their own data selection using `dataset.select(partition="train", branch=context["branch"])`
- **No Scoping**: Operations directly specify what data they want
- **Direct Selection**: Operations call `dataset.select()` with simple filters

## Benefits:
1. **Clarity**: Operations explicitly show what data they operate on
2. **Simplicity**: Context is just branch information
3. **Maintainability**: No complex scope management to debug
4. **Performance**: No overhead from complex context tracking
5. **Flexibility**: Operations can implement custom selection logic easily

## Example:
```python
# Old way (complex):
fit_view = dataset.select(partition=self.fit_partition, **context.current_filters)

# New way (simple):
branch = context.get('branch', 0)
fit_view = dataset.select(partition=self.fit_partition, branch=branch)
```