In [1]:
# Nyata is not available as a Pip package
import sys
sys.path.append("./backend/")
import nyata

In [2]:
import sklearn.ensemble
import sklearn.datasets
import sklearn.model_selection
import sklearn.tree
import sklearn.preprocessing
import sklearn.impute
import os
import numpy as np

# Make sure the output folders for the serialized models ("pickles") and the
# exported held-out splits ("test_data") exist before anything is written to
# them. `exist_ok=True` keeps this cell safe to re-run; unlike swallowing
# FileExistsError, it still raises when one of the names is already taken by
# a regular file instead of a directory, so the problem surfaces here rather
# than at the first write.
for output_dir in ("pickles", "test_data"):
    os.makedirs(output_dir, exist_ok=True)

## 1. Building Classifiers

In [3]:
# Load the iris classification dataset and keep its feature names so they
# can be attached to the exported models later on.
dataset_cls = sklearn.datasets.load_iris()
X_cls, y_cls = dataset_cls.data, dataset_cls.target
feat_names_cls = dataset_cls.feature_names

# Shuffled train/test split with a fixed seed, so the split is the same on
# every run.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = sklearn.model_selection.train_test_split(
    X_cls,
    y_cls,
    shuffle=True,
    random_state=16,
)

# Save the held-out split (features first, target as the last column) as a
# separate CSV file to exercise the 'load test dataset' Nyata feature.
test_table_cls = np.column_stack((X_test_cls, y_test_cls))
np.savetxt("test_data/test_data_classification.csv", test_table_cls, delimiter=",", fmt="%.3f")

### Random Forest Classifiers

In [4]:
# Hyperparameters are drawn with np.random.randint (upper bound exclusive),
# in the same order as the estimator receives them.
rfc_hyperparams = {
    "n_estimators": np.random.randint(100, 301),
    "max_depth": np.random.randint(4, 8),
    "min_samples_split": np.random.randint(10, 25),
}
model = sklearn.ensemble.RandomForestClassifier(**rfc_hyperparams)
model.fit(X_train_cls, y_train_cls)

# A fitted preprocessing pipeline (any scikit-learn transformer, or several
# of them chained in a scikit-learn Pipeline) can be handed to Nyata through
# the 'preprocessing_pipeline' argument. It is applied to the test data
# before the model predicts, which is handy, for example, to fill in missing
# values first.
preprocessing = sklearn.impute.SimpleImputer(strategy="mean").fit(X_train_cls, y_train_cls)

# Arguments shared by both exports below.
rfc_dump_args = {"model": model, "preprocessing_pipeline": preprocessing}

# Nyata-specific .pickle holding only the model and the preprocessing step.
nyata.dump(output_uri="pickles/test_nyata_package_rfc_no_data.pickle", **rfc_dump_args)

# Second export that also carries the training data (passed through
# 'train_data') and the attribute labels: the hierarchical clustering
# technique 'DNA' needs the training data alongside the model.
nyata.dump(
    output_uri="pickles/test_nyata_package_rfc.pickle",
    train_data=(X_train_cls, y_train_cls),
    attr_labels=feat_names_cls,
    **rfc_dump_args,
)

### Decision Tree Classifiers

In [5]:
# Single decision tree with randomly drawn depth / split limits
# (np.random.randint upper bounds are exclusive).
dtc_hyperparams = {
    "max_depth": np.random.randint(4, 8),
    "min_samples_split": np.random.randint(10, 25),
}
model = sklearn.tree.DecisionTreeClassifier(**dtc_hyperparams)
model.fit(X_train_cls, y_train_cls)

# Mean imputer fitted on the training split; exported next to the model so
# it can be applied to the test data before predicting.
preprocessing = sklearn.impute.SimpleImputer(strategy="mean").fit(X_train_cls, y_train_cls)

# Write the Nyata-specific .pickle for the tree classifier.
nyata.dump(
    output_uri="pickles/test_nyata_package_dtc.pickle",
    model=model,
    preprocessing_pipeline=preprocessing,
)

## 2. Building Regressors

In [6]:
# Load the diabetes regression dataset and keep its feature names so they
# can be attached to the exported models later on.
dataset_reg = sklearn.datasets.load_diabetes()

X_reg = dataset_reg.data
y_reg = dataset_reg.target
feat_names_reg = dataset_reg.feature_names

# Shuffled train/test split with a fixed seed, so the split is the same on
# every run.
(
    X_train_reg,
    X_test_reg,
    y_train_reg,
    y_test_reg,
) = sklearn.model_selection.train_test_split(X_reg, y_reg, shuffle=True, random_state=16)

# Save the held-out split (features first, target as the last column) as a
# separate CSV file to exercise the 'load test dataset' Nyata feature.
#
# scikit-learn ships the diabetes features mean-centred and rescaled, so
# their magnitudes are small (mostly below 0.1). Writing them with only three
# decimals would keep just one or two significant digits and the exported
# file would no longer match the split used above, hence the wider format.
np.savetxt(
    "test_data/test_data_regression.csv",
    np.column_stack((X_test_reg, y_test_reg)),
    delimiter=",",
    fmt="%.10f",
)

### Random Forest Regressors

In [7]:
# Hyperparameters are drawn with np.random.randint (upper bound exclusive),
# in the same order as the estimator receives them.
rfr_hyperparams = {
    "n_estimators": np.random.randint(100, 301),
    "max_depth": np.random.randint(4, 8),
    "min_samples_split": np.random.randint(10, 25),
}
model = sklearn.ensemble.RandomForestRegressor(**rfr_hyperparams)
model.fit(X_train_reg, y_train_reg)

# Mean imputer fitted on the training split; exported next to the model so
# it can be applied to the test data before predicting.
preprocessing = sklearn.impute.SimpleImputer(strategy="mean").fit(X_train_reg, y_train_reg)

# Arguments shared by both exports below.
rfr_dump_args = {"model": model, "preprocessing_pipeline": preprocessing}

# Nyata-specific .pickle holding only the model and the preprocessing step.
nyata.dump(output_uri="pickles/test_nyata_package_rfr_no_data.pickle", **rfr_dump_args)

# Second export that also carries the training data and the attribute
# labels: the hierarchical clustering technique 'DNA' requires the training
# data alongside the model.
nyata.dump(
    train_data=(X_train_reg, y_train_reg),
    attr_labels=feat_names_reg,
    output_uri="pickles/test_nyata_package_rfr.pickle",
    **rfr_dump_args,
)

### Decision Tree Regressors

In [8]:
# Single regression tree with randomly drawn depth / split limits
# (np.random.randint upper bounds are exclusive).
dtr_hyperparams = {
    "max_depth": np.random.randint(4, 8),
    "min_samples_split": np.random.randint(10, 25),
}
model = sklearn.tree.DecisionTreeRegressor(**dtr_hyperparams)
model.fit(X_train_reg, y_train_reg)

# Mean imputer fitted on the training split; exported next to the model so
# it can be applied to the test data before predicting.
preprocessing = sklearn.impute.SimpleImputer(strategy="mean").fit(X_train_reg, y_train_reg)

# Write the Nyata-specific .pickle for the tree regressor.
nyata.dump(
    preprocessing_pipeline=preprocessing,
    output_uri="pickles/test_nyata_package_dtr.pickle",
    model=model,
)