In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from nirs4all.transformations import StandardNormalVariate as SNV, SavitzkyGolay as SG, Gaussian as GS
from nirs4all.transformations import Rotate_Translate as RT

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold, ShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier


config = {
    "data": {
        "action": "classification",
        "dataset": "data/sample_data.csv"
    },

    "pipeline": [
        MinMaxScaler(),
        { "feature_augmentation": [ None, SG, [SNV, GS] ] },
        { "sample_augmentation": [ RT, RT(p_range=3) ] },

        ShuffleSplit(), # First one is target:test by default
        
        { "cluster": KMeans(n_clusters=5, random_state=42) },
        
        {
            "class": "sklearn.model_selection.RepeatedStratifiedKFold",
            "params": { "n_splits": 5, "n_repeats": 2, "random_state": 42 }
        },
        
        "uncluster",
        
        "PlotData",
        "PlotClusters",
        "PlotResults",
        
        {
            "branch": [
                {
                    "model": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
                    "y_pipeline": "StandardScaler",
                },
                {
                    "model": SVC(kernel='linear', C=1.0, random_state=42),
                    "y_pipeline": [MinMaxScaler(), RobustScaler()],
                    "finetune_params": {
                        "C": [0.1, 1.0, 10.0]
                    },
                },
                {
                    "stack": {
                        "model": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
                        "y_pipeline": StandardScaler(),
                        "base_learners": [
                            {
                                "model": GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=5),
                                "y_pipeline": MinMaxScaler(),
                            },
                            {
                                "model": DecisionTreeClassifier(random_state=42, max_depth=5),
                                "y_pipeline": MinMaxScaler(),
                                "finetune_params": {
                                    "max_depth": [3, 5, 7]
                                }
                            }
                        ]
                    }
                }
            ]
        },

        "PlotModelPerformance",
        "PlotFeatureImportance",
        "PlotConfusionMatrix"
    ]   
}

presets = {
    'StandardScaler': {'class': "sklearn.preprocessing.StandardScaler"}
}

In [None]:
%load_ext autoreload
%autoreload 2

import json
import builder, data_loader, runner


spectra_set = data_loader.load_data(config["data"])
print(spectra_set.X().shape, spectra_set.y().shape)

pipeline_builder = builder.PipelineBuilder(presets)
pipeline = pipeline_builder.build_pipeline(config["pipeline"])
print(json.dumps(pipeline, indent=4, default=str))

runner.run(pipeline, spectra_set)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
(1000, 1000) (1000,)
[
    "MinMaxScaler()",
    {
        "feature_augmentation": [
            null,
            "SavitzkyGolay()",
            [
                "StandardScaler()",
                "Gaussian()"
            ]
        ]
    },
    {
        "sample_augmentation": [
            "Rotate_Translate()",
            "Rotate_Translate(p_range=3)"
        ]
    },
    "ShuffleSplit(n_splits=10, random_state=None, test_size=None, train_size=None)",
    {
        "cluster": "KMeans(n_clusters=5, random_state=42)"
    },
    "RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=42)",
    "uncluster",
    "PlotData",
    "PlotClusters",
    "PlotResults",
    {
        "branch": [
            {
                "model": "RandomForestClassifier(max_depth=10, random_state=42)",
                "y_pipeline": "StandardScaler()"
            },
            {
                "model": "SVC(kern

TypeError: Pipeline.run() missing 1 required positional argument: 'data'