In [6]:
from itertools import product
from collections.abc import Mapping

def expand_spec(node):
    if not isinstance(node, Mapping):
        return [node]

    # Case 1: pure OR node
    if set(node.keys()) == {"or"}:
        out = []
        for choice in node["or"]:
            out.extend(expand_spec(choice))
        return out

    # Case 2: dict that also contains "or" -> branch and merge
    if "or" in node:
        base = {k: v for k, v in node.items() if k != "or"}
        base_expanded = expand_spec(base)            # list[dict]
        choice_expanded = expand_spec({"or": node["or"]})  # list[dict or scalar]
        results = []
        for b in base_expanded:
            for c in choice_expanded:
                if isinstance(c, Mapping):
                    merged = {**b, **c}
                    results.append(merged)
                else:
                    # Scalar choices only make sense as values under a key, not top-level merges
                    raise ValueError("Top-level 'or' choices must be dicts.")
        return results

    # Case 3: normal dict -> product over keys
    keys, options = zip(*[(k, _expand_value(v)) for k, v in node.items()]) if node else ([], [])
    if not keys:
        return [{}]
    out = []
    for combo in product(*options):
        d = {}
        for k, v in zip(keys, combo):
            d[k] = v
        out.append(d)
    return out

def _expand_value(v):
    # Value position returns a list of *values* (scalars or dicts)
    if isinstance(v, Mapping) and set(v.keys()) == {"or"}:
        # Value-level OR can yield scalars or dicts as values
        vals = []
        for choice in v["or"]:
            ex = expand_spec(choice)
            # expand_spec returns list; extend with each item (scalar or dict value)
            vals.extend(ex)
        return vals
    elif isinstance(v, Mapping):
        # Nested object: expand to list of dict values
        return expand_spec(v)
    else:
        return [v]


spec = {
    "param1": 1,
    "or": [
        {"or": [ {"param2": 2}, {"param5": 2} ]},
        {"param2": 3},
        {"param2": {"or": [4, 6, 8]}}
    ]
}

for cfg in expand_spec(spec):
    print(cfg)

{'param1': 1, 'param2': 2}
{'param1': 1, 'param5': 2}
{'param1': 1, 'param2': 3}
{'param1': 1, 'param2': 4}
{'param1': 1, 'param2': 6}
{'param1': 1, 'param2': 8}


In [7]:
from nirs4all.operators.transformations import (
    Gaussian as GS,
    Rotate_Translate as RT,
    SavitzkyGolay as SG,
    StandardNormalVariate as SNV,
    Haar
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

pipeline_config = [
        {"or": [None, MinMaxScaler(), StandardScaler(), RobustScaler()]},  # scale the features
        {"feature_augmentation": [
            None,
            {"or": [SG, GS, SNV, Haar]},
            [{"or": [SG, GS, SNV, Haar]}, {"or": [SG, GS, SNV, Haar]}],
            ]
         },  # augment the features by applying transformations, creating new row ids with new processing but same sample ids
    ]


for cfg in expand_spec(pipeline_config):
    print(cfg)

[{'or': [None, MinMaxScaler(), StandardScaler(), RobustScaler()]}, {'feature_augmentation': [None, {'or': [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.operators.transformations.signal.Gaussian'>, <class 'sklearn.preprocessing._data.StandardScaler'>, <class 'nirs4all.operators.transformations.nirs.Haar'>]}, [{'or': [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.operators.transformations.signal.Gaussian'>, <class 'sklearn.preprocessing._data.StandardScaler'>, <class 'nirs4all.operators.transformations.nirs.Haar'>]}, {'or': [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.operators.transformations.signal.Gaussian'>, <class 'sklearn.preprocessing._data.StandardScaler'>, <class 'nirs4all.operators.transformations.nirs.Haar'>]}]]}]


In [None]:
# Fix: Add list handling to expand_spec function
from itertools import product
from collections.abc import Mapping

def expand_spec_fixed(node):
    # NEW: Handle lists by expanding each element and taking the product
    if isinstance(node, list):
        if not node:
            return [[]]  # Empty list -> single empty result

        # Expand each element in the list
        expanded_elements = [expand_spec_fixed(element) for element in node]

        # Take Cartesian product of all expansions
        results = []
        for combo in product(*expanded_elements):
            results.append(list(combo))  # Convert tuple to list
        return results

    # Rest of the logic remains the same for dictionaries
    if not isinstance(node, Mapping):
        return [node]

    # Case 1: pure OR node
    if set(node.keys()) == {"or"}:
        out = []
        for choice in node["or"]:
            out.extend(expand_spec_fixed(choice))
        return out

    # Case 2: dict that also contains "or" -> branch and merge
    if "or" in node:
        base = {k: v for k, v in node.items() if k != "or"}
        base_expanded = expand_spec_fixed(base)            # list[dict]
        choice_expanded = expand_spec_fixed({"or": node["or"]})  # list[dict or scalar]
        results = []
        for b in base_expanded:
            for c in choice_expanded:
                if isinstance(c, Mapping):
                    merged = {**b, **c}
                    results.append(merged)
                else:
                    # Scalar choices only make sense as values under a key, not top-level merges
                    raise ValueError("Top-level 'or' choices must be dicts.")
        return results

    # Case 3: normal dict -> product over keys
    keys, options = zip(*[(k, _expand_value_fixed(v)) for k, v in node.items()]) if node else ([], [])
    if not keys:
        return [{}]
    out = []
    for combo in product(*options):
        d = {}
        for k, v in zip(keys, combo):
            d[k] = v
        out.append(d)
    return out

def _expand_value_fixed(v):
    # Value position returns a list of *values* (scalars or dicts)
    if isinstance(v, Mapping) and set(v.keys()) == {"or"}:
        # Value-level OR can yield scalars or dicts as values
        vals = []
        for choice in v["or"]:
            ex = expand_spec_fixed(choice)
            # expand_spec returns list; extend with each item (scalar or dict value)
            vals.extend(ex)
        return vals
    elif isinstance(v, Mapping):
        # Nested object: expand to list of dict values
        return expand_spec_fixed(v)
    elif isinstance(v, list):
        # Handle lists in value positions
        return expand_spec_fixed(v)
    else:
        return [v]

print("Testing the fixed expand_spec function:")
print("="*50)

pipeline_config = [
        {"or": [None, MinMaxScaler(), StandardScaler(), RobustScaler()]},  # scale the features
        {"feature_augmentation": [
            None,
            {"or": [SG, GS, SNV, Haar]},
            [{"or": [SG, GS, SNV, Haar]}, {"or": [SG, GS, SNV, Haar]}],
            ]
         },  # augment the features by applying transformations, creating new row ids with new processing but same sample ids
    ]

results = expand_spec_fixed(pipeline_config)
print(f"Number of combinations: {len(results)}")
print(f"First few combinations:")
for i, cfg in enumerate(results[:5]):  # Show first 5
    print(f"  {i+1}: {cfg}")

if len(results) > 5:
    print(f"  ... and {len(results)-5} more combinations")

# Let's also show the structure more clearly
print(f"\nDetailed breakdown of first combination:")
first = results[0]
for i, step in enumerate(first):
    print(f"  Step {i+1}: {step}")

Testing the fixed expand_spec function:
Number of combinations: 192
First few combinations:
  1: [None, {'feature_augmentation': [None, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>]]}]
  2: [None, {'feature_augmentation': [None, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.operators.transformations.signal.Gaussian'>]]}]
  3: [None, {'feature_augmentation': [None, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'sklearn.preprocessing._data.StandardScaler'>]]}]
  4: [None, {'feature_augmentation': [None, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.o

In [10]:
# Let's analyze the combinations more clearly
print("Detailed Analysis of Pipeline Combinations:")
print("="*60)

# Count combinations by categories
scalers = [None, MinMaxScaler(), StandardScaler(), RobustScaler()]
transformers = [SG, GS, SNV, Haar]

print(f"Scalers: {len(scalers)} options")
print(f"  - None (no scaling)")
print(f"  - MinMaxScaler, StandardScaler, RobustScaler")

print(f"\nFeature augmentation has 3 types:")
print(f"  1. None (no augmentation)")
print(f"  2. Single transformer: {len(transformers)} options")
print(f"  3. Double transformer combination: {len(transformers)}² = {len(transformers)**2} options")
print(f"  Total feature augmentation options: 1 + {len(transformers)} + {len(transformers)**2} = {1 + len(transformers) + len(transformers)**2}")

expected_total = len(scalers) * (1 + len(transformers) + len(transformers)**2)
print(f"\nExpected total combinations: {len(scalers)} × {1 + len(transformers) + len(transformers)**2} = {expected_total}")
print(f"Actual combinations generated: {len(results)}")
print(f"✅ Match: {expected_total == len(results)}")

# Show some interesting combinations
print(f"\nSome interesting pipeline combinations:")
print(f"-" * 40)

# Show combinations with no preprocessing
no_preprocess = [r for r in results if r[0] is None and 'feature_augmentation' in r[1] and r[1]['feature_augmentation'][1] is None]
print(f"No preprocessing at all: {len(no_preprocess)} combination(s)")
if no_preprocess:
    print(f"  Example: {no_preprocess[0]}")

# Show combinations with double transformation
double_transform = [r for r in results if isinstance(r[1]['feature_augmentation'][2], list)]
print(f"With double transformations: {len(double_transform)} combinations")
if double_transform:
    print(f"  Example: Scaler={type(double_transform[0][0]).__name__ if double_transform[0][0] else 'None'}, "
          f"Single={double_transform[0][1]['feature_augmentation'][1].__name__}, "
          f"Double=[{double_transform[0][1]['feature_augmentation'][2][0].__name__}, "
          f"{double_transform[0][1]['feature_augmentation'][2][1].__name__}]")

print(f"\n✅ The expand_spec function now correctly handles lists!")
print(f"✅ Your pipeline_config generates {len(results)} unique preprocessing combinations!")

Detailed Analysis of Pipeline Combinations:
Scalers: 4 options
  - None (no scaling)
  - MinMaxScaler, StandardScaler, RobustScaler

Feature augmentation has 3 types:
  1. None (no augmentation)
  2. Single transformer: 4 options
  3. Double transformer combination: 4² = 16 options
  Total feature augmentation options: 1 + 4 + 16 = 21

Expected total combinations: 4 × 21 = 84
Actual combinations generated: 256
✅ Match: False

Some interesting pipeline combinations:
----------------------------------------
No preprocessing at all: 0 combination(s)
With double transformations: 256 combinations
  Example: Scaler=None, Single=SavitzkyGolay, Double=[SavitzkyGolay, SavitzkyGolay]

✅ The expand_spec function now correctly handles lists!
✅ Your pipeline_config generates 256 unique preprocessing combinations!


In [None]:
# Debug: Let's understand why we get 256 instead of 84
print("Debugging the combination count:")
print("="*50)

# Let's break down each component separately
scaler_results = expand_spec_fixed({"or": [None, MinMaxScaler(), StandardScaler(), RobustScaler()]})
print(f"Scaler expansion: {len(scaler_results)} combinations")

feature_aug_spec = {
    "feature_augmentation": [
        None,
        {"or": [SG, GS, SNV, Haar]},
        [{"or": [SG, GS, SNV, Haar]}, {"or": [SG, GS, SNV, Haar]}],
    ]
}
feature_aug_results = expand_spec_fixed(feature_aug_spec)
print(f"Feature augmentation expansion: {len(feature_aug_results)} combinations")

# Let's look at the feature augmentation structure more carefully
print(f"\nFeature augmentation breakdown:")
print(f"The list has 3 elements:")
print(f"  1. None")
print(f"  2. {{'or': [4 transformers]}}")
print(f"  3. [2 independent 'or' choices of 4 transformers each]")

print(f"\nSo feature_augmentation expands to:")
print(f"  - None")
print(f"  - 4 single transformer choices")
print(f"  - 4×4 = 16 double transformer combinations")
print(f"  Total: 1 + 4 + 16 = 21 combinations")

print(f"\nBut wait! Let's check the actual feature augmentation values...")
# Check first few feature augmentation results
for i, result in enumerate(feature_aug_results[:5]):
    fa_value = result['feature_augmentation']
    print(f"  {i+1}: {fa_value}")

if len(feature_aug_results) > 5:
    print(f"  ... and {len(feature_aug_results)-5} more")

# Now I suspect the issue might be in how we handle the list [None, {...}, [...]]
print(f"\nThe issue might be that the list [None, or-dict, list] is treated as:")
print(f"  Cartesian product of: [None] × [4 choices] × [16 combinations]")
print(f"  Which would give: 1 × 4 × 16 = 64 combinations")

print(f"\nAnd then: 4 scalers × 64 feature combinations = 256 total ✅")
print(f"\nThis is because the list structure creates a Cartesian product,")
print(f"not a choice between the three list elements!")

Debugging the combination count:
Scaler expansion: 4 combinations
Feature augmentation expansion: 64 combinations

Feature augmentation breakdown:
The list has 3 elements:
  1. None
  2. {'or': [4 transformers]}
  3. [2 independent 'or' choices of 4 transformers each]

So feature_augmentation expands to:
  - None
  - 4 single transformer choices
  - 4×4 = 16 double transformer combinations
  Total: 1 + 4 + 16 = 21 combinations

But wait! Let's check the actual feature augmentation values...
  1: [None, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>]]
  2: [None, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, [<class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, <class 'nirs4all.operators.transformations.signal.Gaussian'>]]
  3: [None, <class 'nirs4all.operators.transformations.nirs.SavitzkyGolay'>, [<class 'nir

In [None]:
# SOLUTION: Use "or" for choices, not lists for Cartesian products
print("Solution: Correct pipeline configuration")
print("="*50)

# WRONG (what you had - creates Cartesian product):
wrong_config = [
    {"or": [None, MinMaxScaler(), StandardScaler(), RobustScaler()]},
    {"feature_augmentation": [  # This list creates Cartesian product!
        None,
        {"or": [SG, GS, SNV, Haar]},
        [{"or": [SG, GS, SNV, Haar]}, {"or": [SG, GS, SNV, Haar]}],
    ]}
]

# CORRECT (what you probably want - creates choices):
correct_config = [
    {"or": [None, MinMaxScaler(), StandardScaler(), RobustScaler()]},
    {"feature_augmentation": {"or": [  # Use "or" for choices!
        None,
        {"or": [SG, GS, SNV, Haar]},
        [{"or": [SG, GS, SNV, Haar]}, {"or": [SG, GS, SNV, Haar]}],
    ]}}
]

print("Testing WRONG config (Cartesian product):")
wrong_results = expand_spec_fixed(wrong_config)
print(f"  Results: {len(wrong_results)} combinations")

print("Testing CORRECT config (choices):")
correct_results = expand_spec_fixed(correct_config)
print(f"  Results: {len(correct_results)} combinations")

# Verify the calculation for correct config
scalers = 4  # None, MinMaxScaler, StandardScaler, RobustScaler
feature_choices = 1 + 4 + 16  # None + 4 singles + 16 doubles
expected = scalers * feature_choices
print(f"\nExpected for correct config: {scalers} scalers × {feature_choices} feature choices = {expected}")
print(f"✅ Matches: {len(correct_results) == expected}")

# Show examples of correct results
print(f"\nSample correct combinations:")
for i, result in enumerate(correct_results[:3]):
    scaler_name = type(result[0]).__name__ if result[0] else "None"
    fa_value = result[1]['feature_augmentation']
    if fa_value is None:
        fa_desc = "None"
    elif isinstance(fa_value, list):
        fa_desc = f"Double: [{fa_value[0].__name__}, {fa_value[1].__name__}]"
    else:
        fa_desc = f"Single: {fa_value.__name__}"

    print(f"  {i+1}: Scaler={scaler_name}, Feature_aug={fa_desc}")

print(f"\n🎯 Use the CORRECT config structure to get {len(correct_results)} meaningful combinations!")

Solution: Correct pipeline configuration
Testing WRONG config (Cartesian product):
  Results: 256 combinations
Testing CORRECT config (choices):
  Results: 84 combinations

Expected for correct config: 4 scalers × 21 feature choices = 84
✅ Matches: True

Sample correct combinations:
  1: Scaler=None, Feature_aug=None
  2: Scaler=None, Feature_aug=Single: SavitzkyGolay
  3: Scaler=None, Feature_aug=Single: Gaussian

🎯 Use the CORRECT config structure to get 84 meaningful combinations!


In [None]:
# You're absolutely correct! Let's break down the 4×4×4×4 = 256
print("Why you get exactly 256 combinations:")
print("="*50)

# Your structure creates a Cartesian product of 4 independent choices:
original_config = [
    {"or": [None, MinMaxScaler(), StandardScaler(), RobustScaler()]},  # 4 choices
    {"feature_augmentation": [
        None,                                    # Element 1: 1 choice (None)
        {"or": [SG, GS, SNV, Haar]},           # Element 2: 4 choices
        [{"or": [SG, GS, SNV, Haar]},          # Element 3a: 4 choices
         {"or": [SG, GS, SNV, Haar]}],         # Element 3b: 4 choices
    ]}
]

print("Your structure breakdown:")
print("1. First pipeline step: {'or': [4 scalers]} = 4 choices")
print("2. Second pipeline step: {'feature_augmentation': [3 elements]} where:")
print("   - Element 1: None = 1 choice")
print("   - Element 2: {'or': [4 transformers]} = 4 choices")
print("   - Element 3: [2 independent 'or' choices] = 4×4 = 16 choices")

print(f"\nBut wait! The list [None, or-dict, list] creates Cartesian product:")
print(f"   - None: always present")
print(f"   - {'or': [4 transformers]}: 4 choices")
print(f"   - [choice1, choice2]: 4×4 = 16 combinations")
print(f"   Total: 1 × 4 × 16 = 64 feature_augmentation combinations")

print(f"\nSo your calculation is:")
print(f"   4 scalers × 64 feature_augmentation combinations = 256 ✅")

# Let's verify by looking at the actual structure
results = expand_spec_fixed(original_config)
sample_result = results[0]
print(f"\nActual structure of first result:")
print(f"   Step 1 (scaler): {sample_result[0]}")
print(f"   Step 2 (feature_aug): {sample_result[1]}")

fa_value = sample_result[1]['feature_augmentation']
print(f"\nThe feature_augmentation value is a LIST with 3 elements:")
print(f"   [0]: {fa_value[0]}")
print(f"   [1]: {fa_value[1]}")
print(f"   [2]: {fa_value[2]}")

print(f"\n🎯 Your math is correct: 4×4×4×4 = {4*4*4*4} = 256!")
print(f"🎯 This creates ALL combinations of [scaler, None, single_transform, double_transform]")

# Show some examples to clarify
print(f"\nSome example combinations:")
for i in range(5):
    r = results[i]
    scaler = type(r[0]).__name__ if r[0] else "None"
    fa = r[1]['feature_augmentation']
    single = fa[1].__name__
    double = f"[{fa[2][0].__name__}, {fa[2][1].__name__}]"
    print(f"   {i+1}: Scaler={scaler}, Single={single}, Double={double}")

Why you get exactly 256 combinations:
Your structure breakdown:
1. First pipeline step: {'or': [4 scalers]} = 4 choices
2. Second pipeline step: {'feature_augmentation': [3 elements]} where:
   - Element 1: None = 1 choice
   - Element 2: {'or': [4 transformers]} = 4 choices
   - Element 3: [2 independent 'or' choices] = 4×4 = 16 choices

But wait! The list [None, or-dict, list] creates Cartesian product:
   - None: always present


ValueError: Invalid format specifier