In [21]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from nirs4all.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG, Gaussian as GS
from nirs4all.transformations import Rotate_Translate as RT

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, ShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier


# Declarative description of one experiment: which data to load and the ordered
# list of pipeline steps to apply. The "data" and "pipeline" entries are read
# separately elsewhere in this notebook (config["data"], config["pipeline"]).
config = {
    "data": {
        "action": "classification",
        "dataset": "data/sample_data.csv"
    },

    # Steps are listed in order. Entries come in several shapes: estimator
    # instances, single-key dicts, {"class", "params"} dicts, and bare strings.
    # NOTE(review): how each shape is interpreted is decided by the pipeline
    # builder, which is not visible here -- comments below are hedged accordingly.
    "pipeline": [
        MinMaxScaler(),
        # Three alternatives: None, a single transform class, and a list of two
        # transform classes (presumably "no change", "SG only", "SNV then GS" -- TODO confirm).
        { "feature_augmentation": [ None, SG, [SNV, GS] ] },
        # Mixes the bare RT class with a configured RT instance; presumably the
        # builder instantiates bare classes with default arguments -- TODO confirm.
        { "sample_augmentation": [ RT, RT(p_range=3) ] },

        ShuffleSplit(), # First splitter in the pipeline; original note: "target:test by default" -- presumably it defines the train/test split (TODO confirm)

        { "cluster": KMeans(n_clusters=5, random_state=42) },

        # Same splitter family as the sklearn import at the top of the cell, but
        # given as a dotted class path plus constructor params instead of an instance.
        {
            "class": "sklearn.model_selection.RepeatedStratifiedKFold",
            "params": { "n_splits": 5, "n_repeats": 2, "random_state": 42 }
        },

        # Bare string steps; presumably named operations resolved by the builder -- TODO confirm.
        "uncluster",

        "PlotData",
        "PlotClusters",
        "PlotResults",

        # Three parallel model definitions: two single models and one stacked model.
        {
            "branch": [
                {
                    "model": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
                    # String form; presumably looked up by name (a `presets` dict with a
                    # "StandardScaler" key is defined in this cell) -- TODO confirm.
                    "y_pipeline": "StandardScaler",
                },
                {
                    "model": SVC(kernel='linear', C=1.0, random_state=42),
                    "y_pipeline": [MinMaxScaler(), RobustScaler()],
                    # Candidate values for the model's C parameter.
                    "finetune_params": {
                        "C": [0.1, 1.0, 10.0]
                    },
                },
                {
                    # Stacked model: a RandomForest with two base learners, each
                    # carrying its own y_pipeline (and optional finetune_params).
                    "stack": {
                        "model": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
                        "y_pipeline": StandardScaler(),
                        "base_learners": [
                            {
                                "model": GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=5),
                                "y_pipeline": MinMaxScaler(),
                            },
                            {
                                "model": DecisionTreeClassifier(random_state=42, max_depth=5),
                                "y_pipeline": MinMaxScaler(),
                                # Candidate values for the tree's max_depth parameter.
                                "finetune_params": {
                                    "max_depth": [3, 5, 7]
                                }
                            }
                        ]
                    }
                }
            ]
        },

        "PlotModelPerformance",
        "PlotFeatureImportance",
        "PlotConfusionMatrix"
    ]
}

# Named shortcuts: maps a preset name to the dotted path of the class it stands for.
presets = {
    "StandardScaler": {
        "class": "sklearn.preprocessing.StandardScaler",
    },
}

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from spectradataset import SpectraDataset
import polars as pl
from pipeline import PipelineRunner
from sklearn.preprocessing import MinMaxScaler

# Fixed seed and a single sample-count constant so the synthetic data (and the
# rows printed below) are identical on every Restart & Run All.
SEED = 42
N_SAMPLES = 1000
rng = np.random.default_rng(SEED)

# Two synthetic feature sources of different widths, uniform in [0, 1).
features = [rng.random((N_SAMPLES, 2500)), rng.random((N_SAMPLES, 500))]
targets = rng.random((N_SAMPLES, 2))
metadata = pd.DataFrame({
    'sample_id': np.arange(N_SAMPLES),
    'condition': rng.choice(['A', 'B'], size=N_SAMPLES),
    'age': rng.integers(20, 60, size=N_SAMPLES)  # upper bound exclusive, like np.random.randint
})

dataset = SpectraDataset()

# First batch: added with the default spectra type.
dataset.add_spectra(
    sources=features,
    targets=targets,
    metadata=metadata,
)

# Second batch: the same arrays again, tagged as "raman".
dataset.add_spectra(
    sources=features,
    targets=targets,
    metadata=metadata,
    # sample=[i for i in range(N_SAMPLES)],
    spectra_type="raman"
)

# Example of a filtered read (kept for reference, not executed):
# x = dataset.x({
#     "sample": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
#     # "spectra_type": "raman",
#     },
#     source_merge=True,
# )

pipeline = [
    MinMaxScaler(feature_range=(0, 2), copy=False),
]
runner = PipelineRunner()

# Print the first row before and after the run to see the effect of the scaler.
# NOTE(review): in the recorded output of this cell both prints showed the same
# values, so the scaled data was presumably not written back to what
# dataset.x() returns -- verify in PipelineRunner / SpectraDataset.
print(dataset.x()[0])
runner.run_pipeline(pipeline, data=dataset)
print(dataset.x()[0])

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[0.96557783 0.04731449 0.01774207 ... 0.55277315 0.22486138 0.54856955]
Running pipeline
Transforming source 0 with shape (2000, 2500)
Transforming source 1 with shape (2000, 500)
Pipeline finished.
[0.96557783 0.04731449 0.01774207 ... 0.55277315 0.22486138 0.54856955]


In [None]:
%load_ext autoreload
%autoreload 2

import json
import builder, data_loader, runner


# 1) Load the dataset described by the "data" section of the config and show
#    the shapes of its X and y arrays plus its `features` attribute.
spectra_set = data_loader.load_data(config["data"])
print(*(part.shape for part in (spectra_set.X(), spectra_set.y())))
print(spectra_set.features)

# 2) Turn the declarative "pipeline" section into a built pipeline and dump it
#    as JSON (objects that are not JSON-serialisable fall back to str()).
pipeline_builder = builder.PipelineBuilder(presets)
pipeline = pipeline_builder.build_pipeline(config["pipeline"])
print(json.dumps(pipeline, default=str, indent=4))

# 3) Execute the built pipeline on the loaded dataset.
pipeline_runner = runner.PipelineRunner()
pipeline_runner.run_pipeline(pipeline, spectra_set)


In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
from new4all import SpectraDataset, TransformOperation, ClusterOperation, ModelOperation, PipelineRunner

def example_usage(seed=42):
    """Run a small end-to-end demo of the pipeline on synthetic data.

    Builds a ``SpectraDataset`` with a random "train" partition (100 samples)
    and "test" partition (30 samples), 1000 features each and labels drawn
    from three classes, then runs a scale -> cluster -> classify pipeline and
    prints whatever ``PipelineRunner.run`` returns.

    Args:
        seed: Seed used for the synthetic data and passed as ``random_state``
            to KMeans and RandomForestClassifier, so the inputs and the
            estimators' initialisation are repeatable between runs.

    Returns:
        None. The pipeline results are printed, not returned.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans
    from sklearn.ensemble import RandomForestClassifier

    n_features = 1000
    n_classes = 3
    rng = np.random.default_rng(seed)

    # Create dataset
    dataset = SpectraDataset()

    # Add some dummy data, one partition at a time.
    for partition, n_samples in (("train", 100), ("test", 30)):
        X = rng.standard_normal((n_samples, n_features))  # already float64
        # Labels are cast to float64 as in the original code; presumably that is
        # what add_data expects -- TODO confirm.
        y = rng.integers(0, n_classes, n_samples).astype(np.float64)
        dataset.add_data(X, y, partition=partition)

    # Define pipeline
    pipeline = [
        TransformOperation(StandardScaler()),
        ClusterOperation(KMeans(n_clusters=n_classes, random_state=seed)),
        ModelOperation(RandomForestClassifier(n_estimators=100, random_state=seed))
    ]

    # Run pipeline
    runner = PipelineRunner()
    results = runner.run(pipeline, dataset)

    print("Pipeline results:", results)


# Run the demo when this cell/script is executed directly.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: 'module' object is not callable". Presumably one of the names
# imported from `new4all` is a module rather than a class -- verify the imports.
if __name__ == "__main__":
    example_usage()

TypeError: 'module' object is not callable