# NIRS Features Simple Demo

Simple test of the `Features` class (plus `SpectroDataset` and `FeatureSource`) with single-source and multi-source data.

In [7]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell runs, so edits to the library source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Setup: extend the import path, then import NumPy and the classes under test
import sys
# Adds the parent directory to the module search path — presumably where the
# nirs4all package lives relative to this notebook (verify against repo layout).
sys.path.append('../')
import numpy as np
from nirs4all.dataset.features import Features
from nirs4all.dataset.dataset import SpectroDataset

# Build reproducible synthetic spectra (legacy global NumPy RNG, fixed seed)
np.random.seed(42)

# Single-source case: one 20-sample x 100-wavelength matrix shifted by +2.0
single_data = 2.0 + np.random.randn(20, 100)
print(f"Single source data: {single_data.shape}")

# Multi-source case: three 15-sample sources, each with its own wavelength
# count and offset. The sources are drawn strictly in order so the random
# stream is consumed as source1 -> source2 -> source3.
multi_data = [
    np.random.randn(15, n_wavelengths) + offset
    for n_wavelengths, offset in ((80, 1.5), (120, 1.8), (90, 2.2))
]
# Keep the individual names: later cells derive processed variants from them.
source1, source2, source3 = multi_data
print(f"Multi-source data: {[s.shape for s in multi_data]}")

print("✓ Fake data created")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Single source data: (20, 100)
Multi-source data: [(15, 80), (15, 120), (15, 90)]
✓ Fake data created


In [8]:
# Single-source scenario: feed the same 2D array first to a bare Features
# container, then to a SpectroDataset, and print what each one reports.

# --- Features container ---
features_single = Features()
features_single.add_samples(single_data)
print(f"Features single - Sources: {len(features_single.sources)}, Samples: {features_single.num_samples}, Features: {features_single.num_features}")
print(f"Features representation: {features_single}")

# --- SpectroDataset ---
dataset_single = SpectroDataset()
dataset_single.add_samples(single_data)
print(f"\nDataset single - Sources: {dataset_single.n_sources}, Multi-source: {dataset_single.is_multi_source()}")
dataset_single.print_summary()

Features single - Sources: 1, Samples: 20, Features: 100
Features representation: FeatureBlock with 1 sources and 20 samples
Source 0: FeatureSource(shape=(20, 1, 100), dtype=float32, processing_ids=['raw'], mean=2.045084238052368, variance=0.9765757918357849)

Dataset single - Sources: 1, Multi-source: False
=== SpectroDataset Summary ===

📊 Features: 20 samples, 1 source(s)
Features: 100, processings: 1
Processing IDs: ['raw']



In [9]:
# Multi-source scenario: feed the list of three arrays first to a bare Features
# container, then to a SpectroDataset, and print what each one reports.

# --- Features container ---
features_multi = Features()
features_multi.add_samples(multi_data)
print(f"Features multi - Sources: {len(features_multi.sources)}, Samples: {features_multi.num_samples}, Features: {features_multi.num_features}")
print(f"Features representation: {features_multi}")

# --- SpectroDataset ---
dataset_multi = SpectroDataset()
dataset_multi.add_samples(multi_data)
print(f"\nDataset multi - Sources: {dataset_multi.n_sources}, Multi-source: {dataset_multi.is_multi_source()}")
dataset_multi.print_summary()

# Per-source details read from the Features container (peeks at the private
# _array attribute to show the stored array shape).
for i, source in enumerate(features_multi.sources):
    print(f"Source {i}: {source._array.shape} - {source.num_features} features")

# Dump the dataset's private indexer object for inspection.
print(dataset_multi._indexer)

Features multi - Sources: 3, Samples: 15, Features: [80, 120, 90]
Features representation: FeatureBlock with 3 sources and 15 samples
Source 0: FeatureSource(shape=(15, 1, 80), dtype=float32, processing_ids=['raw'], mean=1.481632113456726, variance=0.9760969877243042)
Source 1: FeatureSource(shape=(15, 1, 120), dtype=float32, processing_ids=['raw'], mean=1.7777127027511597, variance=1.0189955234527588)
Source 2: FeatureSource(shape=(15, 1, 90), dtype=float32, processing_ids=['raw'], mean=2.1727027893066406, variance=0.9904318451881409)

Dataset multi - Sources: 3, Multi-source: True
=== SpectroDataset Summary ===

📊 Features: 15 samples, 3 source(s)
Features: [80, 120, 90], processings: [1, 1, 1]
Processing IDs: [['raw'], ['raw'], ['raw']]

Source 0: (15, 1, 80) - 80 features
Source 1: (15, 1, 120) - 120 features
Source 2: (15, 1, 90) - 90 features
shape: (15, 8)
┌─────┬────────┬────────┬───────────┬───────┬────────┬─────────────┬──────────────┐
│ row ┆ sample ┆ origin ┆ partition ┆ gr

In [11]:
# Test add_features for single source
print("=== Testing add_features for single source ===")

# Create new processed versions of single source data (same shape as the raw array)
savgol_data = single_data + np.random.randn(*single_data.shape) * 0.1  # Simulated savgol filtering
msc_data = single_data * 0.9 + 0.05  # Simulated MSC correction

# Test adding features to existing single source dataset.
# NOTE(review): this mutates dataset_single, so re-running this cell alone
# presumably adds the processings a second time (or fails) — re-run the
# dataset creation cell first for a clean state.
try:
    dataset_single.add_features([savgol_data, msc_data], ["savgol", "msc"])
    print("✓ Successfully added new processings to single source")
    print(f"Dataset sources: {dataset_single.n_sources}")
    print(f"Processing IDs: {dataset_single._features.preprocessing_str}")
    print(f"Number of processings: {dataset_single._features.num_processings}")

    # Check the source details
    source = dataset_single._features.sources[0]
    print(f"Source array shape: {source._array.shape} (samples, processings, features)")
    print(f"Source processing IDs: {source._processing_ids}")

except Exception as e:
    print(f"✗ Error adding features to single source: {e}")
    # Re-raise so the failure (with its full traceback) stops the notebook
    # instead of being reduced to a one-line message under "Run All".
    raise

print("\nUpdated dataset summary:")
dataset_single.print_summary()

=== Testing add_features for single source ===
✓ Successfully added new processings to single source
Dataset sources: 1
Processing IDs: ['raw', 'savgol', 'msc']
Number of processings: 3
Source array shape: (20, 3, 100) (samples, processings, features)
Source processing IDs: ['raw', 'savgol', 'msc']

Updated dataset summary:
=== SpectroDataset Summary ===

📊 Features: 20 samples, 1 source(s)
Features: 100, processings: 3
Processing IDs: ['raw', 'savgol', 'msc']



In [12]:
# Test add_features for multi-source
print("=== Testing add_features for multi-source ===")

# Create new processed versions for each source.
# Each inner list holds one array per processing name, in the same order as
# processing_names below; the outer list holds one entry per source.
processed_source1 = [
    source1 + np.random.randn(*source1.shape) * 0.05,  # Simulated detrend
    source1 * 1.1 + 0.02  # Simulated normalization
]
processed_source2 = [
    source2 + np.random.randn(*source2.shape) * 0.08,  # Simulated detrend
    source2 * 0.95 - 0.01  # Simulated normalization
]
processed_source3 = [
    source3 + np.random.randn(*source3.shape) * 0.06,  # Simulated detrend
    source3 * 1.05 + 0.03  # Simulated normalization
]

multi_processed_data = [processed_source1, processed_source2, processed_source3]
processing_names = ["detrend", "normalize"]

# Test adding features to existing multi-source dataset.
# NOTE(review): this mutates dataset_multi, so re-running this cell alone
# presumably adds the processings a second time (or fails) — re-run the
# dataset creation cell first for a clean state.
try:
    dataset_multi.add_features(multi_processed_data, processing_names)
    print("✓ Successfully added new processings to multi-source")
    print(f"Dataset sources: {dataset_multi.n_sources}")
    print(f"Processing IDs: {dataset_multi._features.preprocessing_str}")
    print(f"Number of processings: {dataset_multi._features.num_processings}")

    # Check each source details
    for i, source in enumerate(dataset_multi._features.sources):
        print(f"Source {i} array shape: {source._array.shape} (samples, processings, features)")
        print(f"Source {i} processing IDs: {source._processing_ids}")

except Exception as e:
    print(f"✗ Error adding features to multi-source: {e}")
    # Re-raise: the comparison below indexes processings 1 and 2, so carrying
    # on after a failed add would only surface as an unrelated IndexError and
    # hide the real cause.
    raise

print("\nUpdated multi-source dataset summary:")
dataset_multi.print_summary()

# Compare original vs processed data for one source
# (axis 1 of the stored array selects the processing, in insertion order)
print("\nData comparison for source 0:")
s0 = dataset_multi._features.sources[0]
print(f"Raw data mean: {np.mean(s0._array[:, 0, :]):.3f}")  # First processing (raw)
print(f"Detrend data mean: {np.mean(s0._array[:, 1, :]):.3f}")  # Second processing (detrend)
print(f"Normalize data mean: {np.mean(s0._array[:, 2, :]):.3f}")  # Third processing (normalize)

=== Testing add_features for multi-source ===
✓ Successfully added new processings to multi-source
Dataset sources: 3
Processing IDs: [['raw', 'detrend', 'normalize'], ['raw', 'detrend', 'normalize'], ['raw', 'detrend', 'normalize']]
Number of processings: [3, 3, 3]
Source 0 array shape: (15, 3, 80) (samples, processings, features)
Source 0 processing IDs: ['raw', 'detrend', 'normalize']
Source 1 array shape: (15, 3, 120) (samples, processings, features)
Source 1 processing IDs: ['raw', 'detrend', 'normalize']
Source 2 array shape: (15, 3, 90) (samples, processings, features)
Source 2 processing IDs: ['raw', 'detrend', 'normalize']

Updated multi-source dataset summary:
=== SpectroDataset Summary ===

📊 Features: 15 samples, 3 source(s)
Features: [80, 120, 90], processings: [3, 3, 3]
Processing IDs: [['raw', 'detrend', 'normalize'], ['raw', 'detrend', 'normalize'], ['raw', 'detrend', 'normalize']]


Data comparison for source 0:
Raw data mean: 1.482
Detrend data mean: 1.480
Normalize d

In [None]:
# Test the new update_features functionality.
# Every step prints the processing ids and then asserts them, so the final
# success line is only printed when the API behaved as expected.
print("=== Testing new update_features API ===")

# Create a fresh feature source for testing
from nirs4all.dataset.feature_source import FeatureSource
test_source = FeatureSource()

# Add initial data
initial_data = np.random.randn(10, 50) + 1.0
test_source.add_samples(initial_data)
print(f"Initial: {test_source._processing_ids}")
assert list(test_source._processing_ids) == ["raw"]

# Test 1: pure addition through update_features — an empty source id ("")
# means "add as a new processing"
new_data1 = initial_data + np.random.randn(*initial_data.shape) * 0.1
new_data2 = initial_data * 0.9
test_source.update_features(["", ""], [new_data1, new_data2], ["savgol", "msc"])
print(f"After add_features: {test_source._processing_ids}")
assert list(test_source._processing_ids) == ["raw", "savgol", "msc"]

# Test 2: pure replacement through update_features — the existing "raw"
# processing is replaced by "normalized"
replacement_data = initial_data * 1.1 + 0.05
test_source.update_features(["raw"], [replacement_data], ["normalized"])
print(f"After replace_features: {test_source._processing_ids}")
assert list(test_source._processing_ids) == ["normalized", "savgol", "msc"]

# Test 3: Mixed add/replace using update_features
mixed_data1 = initial_data + 0.1  # New processing
mixed_data2 = initial_data * 0.8  # Replace existing
mixed_data3 = initial_data - 0.05 # New processing

test_source.update_features(
    ["", "msc", ""],  # "" = add new, "msc" = replace existing
    [mixed_data1, mixed_data2, mixed_data3],
    ["detrend", "msc_v2", "baseline"]
)
print(f"After update_features: {test_source._processing_ids}")
# Replacements keep their slot; additions are appended after existing ones.
assert list(test_source._processing_ids) == ["normalized", "savgol", "msc_v2", "detrend", "baseline"]
print(f"Final shape: {test_source._array.shape} (samples, processings, features)")
assert test_source._array.shape == (10, 5, 50)

print("✓ New API working correctly!")

=== Testing new update_features API ===
Initial: ['raw']
After add_features: ['raw', 'savgol', 'msc']
After replace_features: ['normalized', 'savgol', 'msc']
After update_features: ['normalized', 'savgol', 'msc_v2', 'detrend', 'baseline']
Final shape: (10, 5, 50) (samples, processings, features)
✓ New API working correctly!
