In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_decomposition import PLSRegression

from nirs4all.operators.transformations import Gaussian, SavitzkyGolay, StandardNormalVariate, Haar
from nirs4all.pipeline.config import PipelineConfigs
from nirs4all.dataset.dataset_config import DatasetConfigs
from nirs4all.pipeline.runner import PipelineRunner
import json

pipeline_separated = [
    # Normalize the spectra reflectance
    MinMaxScaler(feature_range=(0.1, 0.8)),

    # Generate 10 version of feature augmentation combinations (3 elements with size 1 to 2, ie. [SG, [SNV, GS], Haar])
    {
        "feature_augmentation": {
            "_or_": [
                Gaussian, StandardNormalVariate, SavitzkyGolay, Haar,
            ],
            "size": [3, (1,2)],
            "count": 2,
        }
    },

    # Split the dataset in train and validation
    ShuffleSplit(n_splits=3, test_size=.25),

    # Normalize the y values
    {"y_processing": MinMaxScaler},

    # PLS regression with 1 to 60 components
    {
        "model": PLSRegression,
        "model_params": {
            "n_components": {
                "_range_": [1, 4],
            }
        }
    }
]

pipeline_commons = [
    # Normalize the spectra reflectance
    MinMaxScaler(),

    # Generate 10 version of feature augmentation combinations (3 elements with size 1 to 2, ie. [SG, [SNV, GS], Haar])
    {
        "feature_augmentation": {
            "_or_": [
                Gaussian, StandardNormalVariate, SavitzkyGolay, Haar,
            ],
            "size": [3, (1,2)],
            "count": 2,
        }
    },

    # Split the dataset in train and validation
    ShuffleSplit(n_splits=3, test_size=.25),

    # Normalize the y values
    {"y_processing": MinMaxScaler},
]

for i in range(1, 61):
    pipeline_commons.append(PLSRegression(n_components=i))

# create pipeline config
config = PipelineConfigs(pipeline_commons)


# path = ['../../sample_data/regression', '../../sample_data/classification', '../../sample_data/binary']
path = '../../sample_data/regression'
dataset_config_obj = DatasetConfigs(path)

runner = PipelineRunner()
results = runner.run(config, dataset_config_obj)


✅ Loaded pipeline(s) with 2 configuration(s).
✅ Loaded dataset 'regression' with 130 training and 59 test samples.
[94m🚀 Starting pipeline config_68891f28 on dataset regression[0m
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[94m🔄 Running 64 steps in sequential mode[0m
[92m🔷 Step 1: {'class': 'sklearn.preprocessing._data.MinMaxScaler', '_runtime_instance': MinMaxScaler()}[0m
🔹 Executing controller TransformerMixinController with operator MinMaxScaler
💾 Saved file: 1_0_MinMaxScaler_1.pkl
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[92m🔷 Step 2: {'feature_augmentation': ['nirs4all.operators.transformations.nirs.Haar', ['nirs4all.operators.transformations.nirs.SavitzkyGolay'

In [None]:
pipeline_commons = [
    MinMaxScaler(feature_range=(0.1, 0.8)),
    {
        "y_processing": {
            "class": "sklearn.preprocessing._data.MinMaxScaler",
            "params": {
                "feature_range": (0.1, 0.8)
            }
        }
    },
    {
        "model": PLSRegression,
        "model_params": {
            "n_components": {
                "_range_": [1, 4],
            }
        }
    }
]

In [17]:
# Let's debug the serialization process
import json
from nirs4all.pipeline.serialization import serialize_component, deserialize_component

print("=== Testing serialization ===")

# Test case 1: Bare class
test_step_1 = {"y_processing": MinMaxScaler}
serialized_1 = serialize_component(test_step_1)
print(f"Original: {test_step_1}")
print(f"Serialized: {serialized_1}")
print(f"JSON: {json.dumps(serialized_1, indent=2)}")

# Test deserialization
deserialized_1 = deserialize_component(serialized_1["y_processing"])
print(f"Deserialized y_processing: {deserialized_1} (type: {type(deserialized_1)})")

print("\n" + "="*50)

# Test case 2: With params
test_step_2 = {
    "model": PLSRegression,
    "model_params": {"n_components": 1}
}

from nirs4all.pipeline.config import PipelineConfigs
preprocessed_2 = PipelineConfigs._preprocess_steps(test_step_2)
serialized_2 = serialize_component(preprocessed_2)
print(f"Original: {test_step_2}")
print(f"Preprocessed: {preprocessed_2}")
print(f"Serialized: {serialized_2}")
print(f"JSON: {json.dumps(serialized_2, indent=2)}")

# Test deserialization
deserialized_2 = deserialize_component(serialized_2["model"])
print(f"Deserialized model: {deserialized_2} (type: {type(deserialized_2)})")

print("\n" + "="*50)

# Test the full pipeline config
test_pipeline = [
    MinMaxScaler,
    {"y_processing": MinMaxScaler},
    {
        "model": PLSRegression,
        "model_params": {
            "n_components": 1,
        }
    }
]

print(f"Original pipeline: {test_pipeline}")
preprocessed = PipelineConfigs._preprocess_steps(test_pipeline)
print(f"Preprocessed pipeline: {preprocessed}")
serialized = serialize_component(preprocessed)
print(f"Serialized pipeline: {json.dumps(serialized, indent=2, default=str)}")

=== Testing serialization ===
Original: {'y_processing': <class 'sklearn.preprocessing._data.MinMaxScaler'>}
Serialized: {'y_processing': {'class': 'sklearn.preprocessing._data.MinMaxScaler'}}
JSON: {
  "y_processing": {
    "class": "sklearn.preprocessing._data.MinMaxScaler"
  }
}
Deserialized y_processing: MinMaxScaler() (type: <class 'sklearn.preprocessing._data.MinMaxScaler'>)

Original: {'model': {'class': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'params': {'n_components': 1}}}
Preprocessed: {'model': {'class': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'params': {'n_components': 1}}}
Serialized: {'model': {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 1}}}
JSON: {
  "model": {
    "class": "sklearn.cross_decomposition._pls.PLSRegression",
    "params": {
      "n_components": 1
    }
  }
}
Deserialized model: PLSRegression(n_components=1) (type: <class 'sklearn.cross_decomposition._pls.PLSRegression'>)

Origi

In [18]:
# Test the actual config object and examine its steps
print("=== Checking actual config steps ===")
print(f"Config steps type: {type(config.steps)}")
print(f"Number of pipeline configurations: {len(config.steps)}")

for i, steps in enumerate(config.steps):
    print(f"\nPipeline config {i}:")
    print(f"  Type: {type(steps)}")
    print(f"  Length: {len(steps) if hasattr(steps, '__len__') else 'N/A'}")

    if isinstance(steps, list):
        for j, step in enumerate(steps):
            print(f"  Step {j}: {step} (type: {type(step)})")
            if isinstance(step, dict):
                for key, value in step.items():
                    print(f"    {key}: {value} (type: {type(value)})")
                    if isinstance(value, dict) and 'class' in value:
                        print(f"      class: {value['class']} (type: {type(value['class'])})")
                        if 'params' in value:
                            print(f"      params: {value['params']} (type: {type(value['params'])})")

=== Checking actual config steps ===
Config steps type: <class 'list'>
Number of pipeline configurations: 1

Pipeline config 0:
  Type: <class 'list'>
  Length: 3
  Step 0: sklearn.preprocessing._data.MinMaxScaler (type: <class 'str'>)
  Step 1: {'y_processing': {'class': 'sklearn.preprocessing._data.MinMaxScaler'}} (type: <class 'dict'>)
    y_processing: {'class': 'sklearn.preprocessing._data.MinMaxScaler'} (type: <class 'dict'>)
      class: sklearn.preprocessing._data.MinMaxScaler (type: <class 'str'>)
  Step 2: {'model': {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 1}}} (type: <class 'dict'>)
    model: {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 1}} (type: <class 'dict'>)
      class: sklearn.cross_decomposition._pls.PLSRegression (type: <class 'str'>)
      params: {'n_components': 1} (type: <class 'dict'>)


In [19]:
# Let's test the exact deserialization that should happen in the runner
print("=== Testing runner logic simulation ===")

# Simulate step 3 processing
step = {'model': {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 1}}}
print(f"Step: {step}")

# Check what key is found
WORKFLOW_OPERATORS = ["sample_augmentation", "feature_augmentation", "branch", "dispatch", "model", "stack",
                      "scope", "cluster", "merge", "uncluster", "unscope", "chart_2d", "chart_3d", "fold_chart",
                      "model", "y_processing", "y_chart"]

key = next((k for k in step if k in WORKFLOW_OPERATORS), None)
print(f"Found key: {key}")

if key:
    print(f"step[key]: {step[key]}")
    print(f"'class' in step[key]: {'class' in step[key]}")

    if 'class' in step[key]:
        print("Deserializing step[key]...")
        operator = deserialize_component(step[key])
        print(f"Operator after deserialization: {operator}")
        print(f"Operator type: {type(operator)}")
        print(f"Has fit method: {hasattr(operator, 'fit')}")

        if hasattr(operator, 'fit'):
            print("✅ Operator correctly deserialized!")
        else:
            print("❌ Operator missing fit method!")
            print(f"Operator attributes: {dir(operator)}")
    else:
        print("No 'class' key found in step[key]")
else:
    print("No workflow operator key found")

=== Testing runner logic simulation ===
Step: {'model': {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 1}}}
Found key: model
step[key]: {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 1}}
'class' in step[key]: True
Deserializing step[key]...
Operator after deserialization: PLSRegression(n_components=1)
Operator type: <class 'sklearn.cross_decomposition._pls.PLSRegression'>
Has fit method: True
✅ Operator correctly deserialized!


In [23]:
# Test both syntax examples from the user's request
print("=== Testing both requested pipeline syntaxes ===")

# Test syntax 1: Without model_params
pipeline_1 = [
    MinMaxScaler,
    {"y_processing": MinMaxScaler},
    {
        "model": PLSRegression,
    }
]

print("Pipeline 1 (without model_params):")
config_1 = PipelineConfigs(pipeline_1)
print(f"✅ Configuration created with {len(config_1.steps)} step(s)")
print(f"Steps: {json.dumps(config_1.steps[0], indent=2, default=str)}")

print("\n" + "="*70)

# Test syntax 2: With model_params (already tested above)
pipeline_2 = [
    MinMaxScaler,
    {"y_processing": MinMaxScaler},
    {
        "model": PLSRegression,
        "model_params": {
            "n_components":1,
        }
    }
]

print("Pipeline 2 (with model_params):")
config_2 = PipelineConfigs(pipeline_2)
print(f"✅ Configuration created with {len(config_2.steps)} step(s)")
print(f"Steps: {json.dumps(config_2.steps[0], indent=2, default=str)}")

# Verify the expected serialization format
expected_format = [
  {
    "class": "sklearn.preprocessing._data.MinMaxScaler",
  },
  {
    "y_processing": {
        "class": "sklearn.preprocessing._data.MinMaxScaler",
      }
  },
  {
    "model": {
        "class": "sklearn.cross_decomposition._pls.PLSRegression",
        "params": {
          "n_components": 1,
        }
      }
  }
]

print(f"\n✅ Both syntaxes now work and produce the expected serialization format!")

=== Testing both requested pipeline syntaxes ===
Pipeline 1 (without model_params):
✅ Loaded pipeline(s) with 1 configuration(s).
✅ Configuration created with 1 step(s)
Steps: [
  "sklearn.preprocessing._data.MinMaxScaler",
  {
    "y_processing": {
      "class": "sklearn.preprocessing._data.MinMaxScaler"
    }
  },
  {
    "model": {
      "class": "sklearn.cross_decomposition._pls.PLSRegression"
    }
  }
]

Pipeline 2 (with model_params):
✅ Loaded pipeline(s) with 1 configuration(s).
✅ Configuration created with 1 step(s)
Steps: [
  "sklearn.preprocessing._data.MinMaxScaler",
  {
    "y_processing": {
      "class": "sklearn.preprocessing._data.MinMaxScaler"
    }
  },
  {
    "model": {
      "class": "sklearn.cross_decomposition._pls.PLSRegression",
      "params": {
        "n_components": 1
      }
    }
  }
]

✅ Both syntaxes now work and produce the expected serialization format!


In [25]:
# Test the improved agnostic preprocessing logic
print("=== Testing Agnostic XX/XX_params Pattern Detection ===")

# Test case 1: Multiple different component types with params
test_cases = [
    # Case 1: model/model_params
    {"model": PLSRegression, "model_params": {"n_components": 2}},

    # Case 2: y_processing/y_processing_params
    {"y_processing": MinMaxScaler, "y_processing_params": {"feature_range": (0, 1)}},

    # Case 3: splitter/splitter_params
    {"splitter": ShuffleSplit, "splitter_params": {"n_splits": 5, "test_size": 0.3}},

    # Case 4: Direct class/params format
    {"class": ShuffleSplit, "params": {"n_splits": 3, "test_size": 0.25}},

    # Case 5: Mixed components in one dict
    {
        "model": PLSRegression,
        "model_params": {"n_components": 1},
        "y_processing": MinMaxScaler,
        "other_key": "some_value"
    }
]

from nirs4all.pipeline.config import PipelineConfigs

for i, test_case in enumerate(test_cases, 1):
    print(f"\nTest Case {i}:")
    print(f"Original: {test_case}")

    preprocessed = PipelineConfigs._preprocess_steps(test_case)
    print(f"Preprocessed: {preprocessed}")

    serialized = serialize_component(preprocessed)
    print(f"Serialized: {json.dumps(serialized, indent=2, default=str)}")

=== Testing Agnostic XX/XX_params Pattern Detection ===

Test Case 1:
Original: {'model': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'model_params': {'n_components': 2}}
Preprocessed: {'model': {'class': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'params': {'n_components': 2}}}
Serialized: {
  "model": {
    "class": "sklearn.cross_decomposition._pls.PLSRegression",
    "params": {
      "n_components": 2
    }
  }
}

Test Case 2:
Original: {'y_processing': <class 'sklearn.preprocessing._data.MinMaxScaler'>, 'y_processing_params': {'feature_range': (0, 1)}}
Preprocessed: {'y_processing': {'class': <class 'sklearn.preprocessing._data.MinMaxScaler'>, 'params': {'feature_range': (0, 1)}}}
Serialized: {
  "y_processing": {
    "class": "sklearn.preprocessing._data.MinMaxScaler",
    "params": {
      "feature_range": [
        0,
        1
      ]
    }
  }
}

Test Case 3:
Original: {'splitter': <class 'sklearn.model_selection._split.ShuffleSplit'>, 'splitter_

In [26]:
# Test consistency: different syntax variations should produce identical serialized output
print("\n=== Testing Serialization Consistency ===")

# Define equivalent configurations using different syntax
equivalent_configs = [
    # Variation 1: XX/XX_params pattern
    {"model": PLSRegression, "model_params": {"n_components": 2}},

    # Variation 2: Direct class/params pattern
    {"model": {"class": PLSRegression, "params": {"n_components": 2}}},

    # Variation 3: Already preprocessed format
    {"model": {"class": "sklearn.cross_decomposition._pls.PLSRegression", "params": {"n_components": 2}}},
]

print("Testing equivalent configurations...")
serialized_results = []

for i, config in enumerate(equivalent_configs, 1):
    print(f"\nVariation {i}: {config}")

    # Apply full pipeline processing
    preprocessed = PipelineConfigs._preprocess_steps(config)
    serialized = serialize_component(preprocessed)
    serialized_results.append(serialized)

    print(f"Serialized: {json.dumps(serialized, indent=2, default=str)}")

# Check if all serialized results are identical
print(f"\n=== Consistency Check ===")
all_equal = all(serialized_results[0] == result for result in serialized_results[1:])
print(f"All variations produce identical serialized output: {all_equal}")

if all_equal:
    print("✅ Serialization is consistent across different syntax variations!")

    # Test hash consistency
    hash1 = PipelineConfigs.get_hash([serialized_results[0]])
    hash2 = PipelineConfigs.get_hash([serialized_results[1]])
    hash3 = PipelineConfigs.get_hash([serialized_results[2]])

    print(f"Hash 1: {hash1}")
    print(f"Hash 2: {hash2}")
    print(f"Hash 3: {hash3}")
    print(f"All hashes equal: {hash1 == hash2 == hash3}")
else:
    print("❌ Serialization is not consistent!")
    for i, result in enumerate(serialized_results):
        print(f"Result {i+1}: {result}")


=== Testing Serialization Consistency ===
Testing equivalent configurations...

Variation 1: {'model': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'model_params': {'n_components': 2}}
Serialized: {
  "model": {
    "class": "sklearn.cross_decomposition._pls.PLSRegression",
    "params": {
      "n_components": 2
    }
  }
}

Variation 2: {'model': {'class': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'params': {'n_components': 2}}}
Serialized: {
  "model": {
    "class": "sklearn.cross_decomposition._pls.PLSRegression",
    "params": {
      "n_components": 2
    }
  }
}

Variation 3: {'model': {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 2}}}
Serialized: {
  "model": {
    "class": "sklearn.cross_decomposition._pls.PLSRegression",
    "params": {
      "n_components": 2
    }
  }
}

=== Consistency Check ===
All variations produce identical serialized output: True
✅ Serialization is consistent across different synt

In [28]:
# Test the enhanced preprocessing that handles bare classes in component keys
print("=== Testing Enhanced Preprocessing for Bare Classes ===")

test_cases = [
    # Case 1: Bare class in component key (like y_processing)
    {"y_processing": MinMaxScaler},

    # Case 2: Combination of bare class and XX/XX_params
    {"model": PLSRegression, "model_params": {"n_components": 1}, "y_processing": MinMaxScaler},

    # Case 3: Direct class/params format
    {"class": ShuffleSplit, "params": {"n_splits": 3}},

    # Case 4: Mixed scenarios
    {"splitter": ShuffleSplit, "validator": MinMaxScaler, "other_key": "some_value"}
]

for i, test_case in enumerate(test_cases, 1):
    print(f"\nTest Case {i}:")
    print(f"Original: {test_case}")

    preprocessed = PipelineConfigs._preprocess_steps(test_case)
    print(f"Preprocessed: {preprocessed}")

    serialized = serialize_component(preprocessed)
    print(f"Serialized: {json.dumps(serialized, indent=2, default=str)}")

# Test our original pipeline case
print("\n=== Testing Original Pipeline Case ===")
original_pipeline = [
    MinMaxScaler,
    {"y_processing": MinMaxScaler},
    {
        "model": PLSRegression,
        "model_params": {
            "n_components": 1,
        }
    }
]

print(f"Original pipeline: {original_pipeline}")
preprocessed_pipeline = PipelineConfigs._preprocess_steps(original_pipeline)
print(f"Preprocessed pipeline: {preprocessed_pipeline}")
serialized_pipeline = serialize_component(preprocessed_pipeline)
print(f"Serialized pipeline: {json.dumps(serialized_pipeline, indent=2, default=str)}")

=== Testing Enhanced Preprocessing for Bare Classes ===

Test Case 1:
Original: {'y_processing': <class 'sklearn.preprocessing._data.MinMaxScaler'>}
Preprocessed: {'y_processing': {'class': <class 'sklearn.preprocessing._data.MinMaxScaler'>}}
Serialized: {
  "y_processing": {
    "class": "sklearn.preprocessing._data.MinMaxScaler"
  }
}

Test Case 2:
Original: {'model': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'model_params': {'n_components': 1}, 'y_processing': <class 'sklearn.preprocessing._data.MinMaxScaler'>}
Preprocessed: {'model': {'class': <class 'sklearn.cross_decomposition._pls.PLSRegression'>, 'params': {'n_components': 1}}, 'y_processing': {'class': <class 'sklearn.preprocessing._data.MinMaxScaler'>}}
Serialized: {
  "model": {
    "class": "sklearn.cross_decomposition._pls.PLSRegression",
    "params": {
      "n_components": 1
    }
  },
  "y_processing": {
    "class": "sklearn.preprocessing._data.MinMaxScaler"
  }
}

Test Case 3:
Original: {'class': <cla

In [30]:
# Test the final consistency check for the two requested syntaxes
print("=== Final Consistency Check for Requested Syntaxes ===")

# The two syntaxes from the user's request
syntax_1 = [
    MinMaxScaler,
    {"y_processing": MinMaxScaler},
    {"model": PLSRegression}
]

syntax_2 = [
    MinMaxScaler,
    {"y_processing": MinMaxScaler},
    {
        "model": PLSRegression,
        "model_params": {"n_components": 1}
    }
]

print("Syntax 1 (without model_params):")
config_1 = PipelineConfigs(syntax_1)
hash_1 = config_1.get_hash(config_1.steps[0])
print(f"Hash: {hash_1}")
print(f"Serialized: {json.dumps(config_1.steps[0], indent=2, default=str)}")

print("\nSyntax 2 (with model_params):")
config_2 = PipelineConfigs(syntax_2)
hash_2 = config_2.get_hash(config_2.steps[0])
print(f"Hash: {hash_2}")
print(f"Serialized: {json.dumps(config_2.steps[0], indent=2, default=str)}")

print(f"\nSame hash (as expected, different params): {hash_1 == hash_2}")

# Test equivalent configurations with same params
syntax_2_alt_formats = [
    # Format 1: XX/XX_params
    [MinMaxScaler, {"y_processing": MinMaxScaler}, {"model": PLSRegression, "model_params": {"n_components": 1}}],

    # Format 2: nested class/params
    [MinMaxScaler, {"y_processing": MinMaxScaler}, {"model": {"class": PLSRegression, "params": {"n_components": 1}}}],

    # Format 3: string class name
    [MinMaxScaler, {"y_processing": MinMaxScaler}, {"model": {"class": "sklearn.cross_decomposition._pls.PLSRegression", "params": {"n_components": 1}}}]
]

print(f"\n=== Testing Equivalent Configurations ===")
hashes = []

for i, config_def in enumerate(syntax_2_alt_formats, 1):
    config = PipelineConfigs(config_def)
    hash_val = config.get_hash(config.steps[0])
    hashes.append(hash_val)
    print(f"Format {i} hash: {hash_val}")

all_same = all(h == hashes[0] for h in hashes[1:])
print(f"\nAll equivalent configurations have same hash: {all_same}")

if all_same:
    print("✅ Perfect! All equivalent syntax variations produce identical hashes!")
else:
    print("❌ Issue: Equivalent configurations produce different hashes")

=== Final Consistency Check for Requested Syntaxes ===
Syntax 1 (without model_params):
✅ Loaded pipeline(s) with 1 configuration(s).
Hash: 4380155b
Serialized: [
  "sklearn.preprocessing._data.MinMaxScaler",
  {
    "y_processing": {
      "class": "sklearn.preprocessing._data.MinMaxScaler"
    }
  },
  {
    "model": {
      "class": "sklearn.cross_decomposition._pls.PLSRegression"
    }
  }
]

Syntax 2 (with model_params):
✅ Loaded pipeline(s) with 1 configuration(s).
Hash: 7b748deb
Serialized: [
  "sklearn.preprocessing._data.MinMaxScaler",
  {
    "y_processing": {
      "class": "sklearn.preprocessing._data.MinMaxScaler"
    }
  },
  {
    "model": {
      "class": "sklearn.cross_decomposition._pls.PLSRegression",
      "params": {
        "n_components": 1
      }
    }
  }
]

Same hash (as expected, different params): False

=== Testing Equivalent Configurations ===
✅ Loaded pipeline(s) with 1 configuration(s).
Format 1 hash: 7b748deb
✅ Loaded pipeline(s) with 1 configuration(s

In [37]:
# Investigate the naming collision issue
print("=== Investigating Pipeline Configuration Generation ===")

# Check how many configurations are generated
print(f"Number of configurations generated: {len(config.steps)}")
print(f"Configuration names: {config.names}")

# Check each configuration
for i, (steps, name) in enumerate(zip(config.steps, config.names)):
    print(f"\n--- Configuration {i+1}: {name} ---")
    for j, step in enumerate(steps):
        if isinstance(step, dict):
            print(f"  Step {j}: {step}")
        else:
            print(f"  Step {j}: {step}")

# The issue is that all configurations have identical y_processing steps
# Let's see what the y_processing step looks like in each config
print(f"\n=== Y-Processing Steps in Each Configuration ===")
for i, steps in enumerate(config.steps):
    y_processing_step = steps[1]  # Should be the y_processing step
    print(f"Config {i+1}: {y_processing_step}")

print(f"\n=== Analysis ===")
print("The issue: All configurations share identical y_processing steps.")
print("When the pipeline runner executes them sequentially on the same dataset,")
print("they try to create the same processing name 'numeric_MinMaxScaler1',")
print("causing a collision after the first configuration completes.")

=== Investigating Pipeline Configuration Generation ===
Number of configurations generated: 4
Configuration names: ['config_408030b0', 'config_d0a3f61e', 'config_bc2891fd', 'config_5b9b41c8']

--- Configuration 1: config_408030b0 ---
  Step 0: {'class': 'sklearn.preprocessing._data.MinMaxScaler', 'params': {'feature_range': [0.1, 0.8]}, '_runtime_instance': MinMaxScaler(feature_range=(0.1, 0.8))}
  Step 1: {'y_processing': {'class': 'sklearn.preprocessing._data.MinMaxScaler', 'params': {'feature_range': [0.1, 0.8]}}}
  Step 2: {'model': {'class': 'sklearn.cross_decomposition._pls.PLSRegression', 'params': {'n_components': 1}}}

--- Configuration 2: config_d0a3f61e ---
  Step 0: {'class': 'sklearn.preprocessing._data.MinMaxScaler', 'params': {'feature_range': [0.1, 0.8]}, '_runtime_instance': MinMaxScaler(feature_range=(0.1, 0.8))}
  Step 1: {'y_processing': {'class': 'sklearn.preprocessing._data.MinMaxScaler', 'params': {'feature_range': [0.1, 0.8]}}}
  Step 2: {'model': {'class': 'skl

In [41]:
# Analyze the current pipeline issue
print("=== Current Pipeline Analysis ===")
print(f"Configuration has {len(config.steps)} pipeline(s)")
print(f"Each pipeline has {len(config.steps[0])} steps")

# Check how many PLS regression models are created
pls_models = [step for step in config.steps[0] if isinstance(step, dict) and 'n_components' in str(step)]
print(f"Number of PLS regression models: {len(pls_models)}")

# Check the n_components range
n_components_values = []
for step in config.steps[0]:
    if isinstance(step, dict) and step.get('params', {}).get('n_components'):
        n_components_values.append(step['params']['n_components'])

print(f"n_components range: {min(n_components_values) if n_components_values else 'N/A'} to {max(n_components_values) if n_components_values else 'N/A'}")
print(f"Dataset sizes:")
for d_config in dataset_config_obj.data_configs:
    dataset = dataset_config_obj.get_dataset(d_config)
    print(f"  - {dataset.name}: {len(dataset.X_train)} training samples")

print(f"\n=== Issue Analysis ===")
print("The problem: PLS regression n_components cannot exceed the number of training samples.")
print("Classification dataset has 48 training samples, but the pipeline tries n_components up to 60.")
print("Solution: Limit n_components to be <= min(training samples across all datasets)")

=== Current Pipeline Analysis ===
Configuration has 2 pipeline(s)
Each pipeline has 63 steps
Number of PLS regression models: 59
n_components range: 1 to 60
Dataset sizes:
✅ Loaded dataset 'regression' with 130 training and 59 test samples.


AttributeError: 'SpectroDataset' object has no attribute 'X_train'

In [None]:
# Test the new score management functionality with simpler pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.cross_decomposition import PLSRegression

# Simpler pipeline configuration to test score functionality
simple_pipeline = [
    MinMaxScaler(),
    {"y_processing": MinMaxScaler()},
    {
        "model": PLSRegression,
        "model_params": {"n_components": 2},
        "train_params": {"verbose": 1}  # Enable verbose output to see scores
    }
]

print("Creating simple test configuration...")
config_test = PipelineConfigs(simple_pipeline)
print(f"Configuration created with {len(config_test.steps)} step(s)")

# Use existing dataset config from previous cell
print("Running test with new score management...")
runner_test = PipelineRunner()

# Select only regression dataset for testing
path_test = '../../sample_data/regression'
dataset_config_test = DatasetConfigs(path_test)

# Run just the first dataset config
results_test = runner_test.run(config_test, dataset_config_test)

Creating simple test configuration...
✅ Loaded pipeline(s) with 1 configuration(s).
Configuration created with 1 step(s)
Running test with new score management...
✅ Loaded dataset 'regression' with 130 training and 59 test samples.
[94m🚀 Starting pipeline config_5138d6a7 on dataset regression[0m
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[94m🔄 Running 3 steps in sequential mode[0m
[92m🔷 Step 1: {'class': 'sklearn.preprocessing._data.MinMaxScaler', '_runtime_instance': MinMaxScaler()}[0m
🔹 Executing controller TransformerMixinController with operator MinMaxScaler
💾 Saved file: 1_0_MinMaxScaler_1.pkl
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[92m🔷 Step 2: {'y_processing

In [None]:
# Test the new enhanced Predictions functionality
print("🔍 Testing enhanced Predictions functionality...")

# Access the dataset that was used in the pipeline run
# The dataset object gets modified during the pipeline run
dataset_result, history_result, pipeline_result = results_test[0]

print("\n📊 Dataset Predictions Summary:")
print(dataset_result._predictions)

if len(dataset_result._predictions) > 0:
    print("\n🏆 Best Scores Summary:")
    dataset_result._predictions.print_best_scores_summary()

    print("\n📈 All Scores in DataFrame:")
    scores_df = dataset_result._predictions.get_all_scores_summary()
    print(scores_df)

    print("\n🥇 Top Rankings by MSE:")
    mse_rankings = dataset_result._predictions.get_scores_ranking('mse', ascending=True)
    for i, (key, score) in enumerate(mse_rankings[:5], 1):
        print(f"{i}. {key}: MSE = {score:.4f}")

    print("\n🥈 Best Score:")
    best_result = dataset_result._predictions.get_best_score('mse')
    if best_result:
        key, score = best_result
        print(f"Best model: {key} with MSE: {score:.4f}")

    print("\n💾 Saving predictions to CSV...")
    dataset_result._predictions.save_predictions_to_csv(
        '../../results/test_predictions.csv',
        include_scores=True
    )

    print("\n✅ Enhanced Predictions functionality test complete!")
else:
    print("⚠️ No predictions found in dataset object")

    # Let's check what keys are available in the dataset
    print("Available prediction keys:", dataset_result._predictions.list_keys())
    print("Available datasets:", dataset_result._predictions.list_datasets())
    print("Available pipelines:", dataset_result._predictions.list_pipelines())
    print("Available models:", dataset_result._predictions.list_models())

🔍 Testing enhanced Predictions functionality...

📊 Dataset Predictions Summary:
📈 Predictions: 1 entries
   Datasets: ['regression']
   Pipelines: ['config_5138d6a7']
   Models: ['PLSRegression_3']

🏆 Best Scores Summary:
🏆 Best Scores Summary (mse):
📊 Task Type: regression
📈 Optimization: Lower is better
--------------------------------------------------------------------------------
 1. PLSRegression_3           | test            | mse: 413.8856 ↓

📈 All Scores in DataFrame:
                                    prediction_key     dataset  \
0  regression_config_5138d6a7_PLSRegression_3_test  regression   

          pipeline            model partition fold_idx  n_samples         mse  \
0  config_5138d6a7  PLSRegression_3      test     None         59  413.885571   

         mae  
0  16.455721  

🥇 Top Rankings by MSE:
1. regression_config_5138d6a7_PLSRegression_3_test: MSE = 413.8856

🥈 Best Score:
Best model: regression_config_5138d6a7_PLSRegression_3_test with MSE: 413.8856

💾 Savi

In [None]:
# Test with multiple models to show ranking functionality
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

print("🚀 Testing multiple models for comprehensive ranking...")

# Create pipeline with multiple models
multi_model_pipeline = [
    MinMaxScaler(),
    {"y_processing": MinMaxScaler()},
    {
        "model": PLSRegression,
        "model_params": {"n_components": 3},
        "train_params": {"verbose": 1}
    }
]

# Add another configuration with RandomForest
multi_model_pipeline_2 = [
    MinMaxScaler(),
    {"y_processing": MinMaxScaler()},
    {
        "model": RandomForestRegressor,
        "model_params": {"n_estimators": 50, "random_state": 42},
        "train_params": {"verbose": 1}
    }
]

# Add Ridge regression
multi_model_pipeline_3 = [
    MinMaxScaler(),
    {"y_processing": MinMaxScaler()},
    {
        "model": Ridge,
        "model_params": {"alpha": 1.0},
        "train_params": {"verbose": 1}
    }
]

# Create configs for all three models
config_pls = PipelineConfigs(multi_model_pipeline)
config_rf = PipelineConfigs(multi_model_pipeline_2)
config_ridge = PipelineConfigs(multi_model_pipeline_3)

print("🏃 Running multiple models...")
for i, (config, name) in enumerate([(config_pls, "PLS"), (config_rf, "RandomForest"), (config_ridge, "Ridge")], 1):
    print(f"\n=== Running Model {i}: {name} ===")
    runner = PipelineRunner()
    results = runner.run(config, dataset_config_test)

    if i == 1:
        # Store the first result for comparison
        first_dataset = results[0][0]
    else:
        # Merge predictions from other models into first dataset for comparison
        current_dataset = results[0][0]
        for key, pred_data in current_dataset._predictions._predictions.items():
            first_dataset._predictions._predictions[key] = pred_data

print("\n🏆 Final Multi-Model Comparison:")
print("=====================================")
first_dataset._predictions.print_best_scores_summary()

print("\n📊 Detailed Scores DataFrame:")
detailed_scores = first_dataset._predictions.get_all_scores_summary()
print(detailed_scores[['model', 'mse', 'mae']].sort_values('mse'))

🚀 Testing multiple models for comprehensive ranking...
✅ Loaded pipeline(s) with 1 configuration(s).
✅ Loaded pipeline(s) with 1 configuration(s).
✅ Loaded pipeline(s) with 1 configuration(s).
🏃 Running multiple models...

=== Running Model 1: PLS ===
✅ Loaded dataset 'regression' with 130 training and 59 test samples.
[94m🚀 Starting pipeline config_1990306e on dataset regression[0m
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
[94m🔄 Running 3 steps in sequential mode[0m
[92m🔷 Step 1: {'class': 'sklearn.preprocessing._data.MinMaxScaler', '_runtime_instance': MinMaxScaler()}[0m
🔹 Executing controller TransformerMixinController with operator MinMaxScaler
💾 Saved file: 1_0_MinMaxScaler_1.pkl
---------------------------------------------------------------------------------------------------------------------------------------------

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    5.5s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished
