In [1]:
import os
import shutil
import pandas as pd
import mercury as mr
from supervised.automl import AutoML 

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Declare the Mercury app with the title and description used for this notebook.
app = mr.App(
    title="Train AutoML (advanced) 🤓",
    description="Train ML pipeline with MLJAR AutoML with more params",
)

# Train Machine Learning Pipeline with MLJAR AutoML

You can control AutoML behavior with additional parameters. This notebook runs AutoML in `Compete` mode.

You can choose:
- feature preprocessing options: golden features and feature selection,
- select algorithms, stack, and ensemble them,
- set cross-validation strategy (number of folds, stratify and shuffle),
- choose evaluation metric.

### Steps
1. Upload a CSV file with data. The first line should contain the column names.
2. Select the input features and the target column.
3. Configure feature preprocessing, algorithms, validation, the evaluation metric, and the training time limit, then click **Start Training**.
4. The directory with all ML models will be zipped and made available for download.

In [4]:
# File-upload widget for the training CSV; uploads are capped at 1MB.
data_file = mr.File(
    label="Upload CSV with training data",
    max_file_size="1MB",
)

mercury.File

In [5]:
# Without an uploaded file there is no path to read from, so call mr.Stop()
# (expected to halt the cells below — confirm against the Mercury docs).
file_missing = data_file.filepath is None
if file_missing:
    mr.Stop()

In [None]:
# Parse the uploaded CSV into a DataFrame using pandas' default settings.
uploaded_csv_path = data_file.filepath
df = pd.read_csv(uploaded_csv_path)

In [None]:
# Heading rendered above the preview of the uploaded data.
training_data_heading = "### Training data"
mr.Markdown(training_data_heading)

In [None]:
# Bare last expression: the notebook renders the uploaded DataFrame as this cell's output.
df

In [None]:
# Any column may be chosen as an input feature; by default every column
# except the last one is preselected.
feature_choices = list(df.columns)
x_columns = mr.MultiSelect(
    label="Input features",
    value=feature_choices[:-1],
    choices=feature_choices,
)

In [None]:
# Any column may be chosen as the target; the last column is preselected.
target_choices = list(df.columns)
y_column = mr.Select(
    label="Target",
    value=target_choices[-1],
    choices=target_choices,
)

In [None]:
# Validate the column selection before anything is configured for training.
# mr.Stop() is expected to halt the cells below — confirm against the Mercury docs.
selected_features = x_columns.value
selected_target = y_column.value
if not selected_features or selected_target is None:
    # Nothing selected (None or an empty list) for features, or no target.
    print("Please select input features and target column")
    mr.Stop()
elif selected_target in selected_features:
    # The feature widget offers every column, including the target. Training
    # on the target itself would leak the label into the inputs.
    print("Target column cannot be used as an input feature. Please remove it from input features")
    mr.Stop()

In [None]:
# Section heading for the data-preparation options; assigning to `_` keeps
# the returned object from being echoed as the cell output.
_ = mr.Note(
    "#### Prepare data"
)

In [None]:
# Toggle whose value is passed to AutoML as `golden_features`.
golden_features = mr.Checkbox(
    label="Construct Golden Features",
)

In [None]:
# Toggle whose value is passed to AutoML as `features_selection`.
features_selection = mr.Checkbox(
    label="Features Selection",
)

In [None]:
# Section heading for the algorithm options; assigning to `_` keeps the
# returned object from being echoed as the cell output.
_ = mr.Note(
    "#### Algorithms"
)

In [None]:
# Algorithm names offered in the "Algorithms" multi-select, in display order.
algos = [
    "Decision Tree",
    "Linear",
    "Random Forest",
    "Extra Trees",
    "LightGBM",
    "Xgboost",
    "CatBoost",
    "Neural Network",
    "Nearest Neighbors",
]


In [None]:
# All algorithms are preselected; the selection is passed to AutoML as `algorithms`.
algorithms = mr.MultiSelect(
    label="Algorithms",
    value=algos,
    choices=algos,
)

In [None]:
# Toggle whose value is passed to AutoML as `stack_models`.
stack_models = mr.Checkbox(
    label="Stack Models",
)

In [None]:
# Toggle whose value is passed to AutoML as `train_ensemble`.
train_ensemble = mr.Checkbox(
    label="Train Ensemble",
)

In [None]:
# Section heading for the validation options; assigning to `_` keeps the
# returned object from being echoed as the cell output.
_ = mr.Note(
    "#### Validation"
)

In [None]:
# Number of cross-validation folds: defaults to 5, limited to the range 2..100.
# The value is converted with int() when the validation strategy is built.
folds = mr.Numeric(
    label="Number of Folds",
    value=5,
    min=2,
    max=100,
)

In [None]:
# Toggle whose value becomes the "shuffle" entry of the k-fold validation strategy.
# The label previously read "Suffle Samples" (typo in the user-facing text).
shuffle = mr.Checkbox(label="Shuffle Samples")

In [None]:
# Toggle whose value becomes the "stratify" entry of the k-fold validation strategy.
stratify = mr.Checkbox(
    label="Stratify Samples",
)

In [None]:
# Metrics offered in the dropdown; "auto" is the preselected entry.
metric_choices = [
    "auto",
    "logloss",
    "f1",
    "average_precision",
    "accuracy",
    "rmse",
    "mse",
    "mae",
    "r2",
    "mape",
    "spearman",
    "pearson",
]
eval_metric = mr.Select(
    label="Evaluation Metric",
    value="auto",
    choices=metric_choices,
)

In [None]:
# Training time budgets are offered as strings; the chosen value is converted
# with int() when the AutoML object is configured.
time_limit_choices = ["60", "120", "240", "300"]
time_limit = mr.Select(
    label="Time Limit (seconds)",
    value="60",
    choices=time_limit_choices,
)

In [None]:
# Button whose `clicked` flag gates the training cell.
start_training = mr.Button(
    label="Start Training",
    style="success",
)

In [None]:
# Mercury output directory handle; its `path` is where the zipped AutoML
# results are written after training.
output_dir = mr.OutputDir()

In [None]:
# K-fold cross-validation settings assembled from the validation widgets;
# the seed is fixed at 123.
cv_settings = {
    "validation_type": "kfold",
    "k_folds": int(folds.value),
    "shuffle": shuffle.value,
    "stratify": stratify.value,
    "random_seed": 123,
}

# Configure AutoML in "Compete" mode from the current widget values.
# Nothing is trained here; fit() is called only after the button is clicked.
automl = AutoML(
    mode="Compete",
    algorithms=algorithms.value,
    train_ensemble=train_ensemble.value,
    stack_models=stack_models.value,
    golden_features=golden_features.value,
    features_selection=features_selection.value,
    validation_strategy=cv_settings,
    eval_metric=eval_metric.value,
    total_time_limit=int(time_limit.value),
)

In [None]:
# Train only once the "Start Training" button has been clicked.
if start_training.clicked:
    mr.Markdown("### AutoML training logs")

    # Fit on the selected feature columns against the selected target column.
    features = df[x_columns.value]
    target = df[y_column.value]
    automl.fit(features, target)

    # Zip the AutoML results directory. The archive base name is the results
    # path joined onto the Mercury output directory; shutil appends ".zip".
    results_dir = automl._results_path
    output_filename = os.path.join(output_dir.path, results_dir)
    shutil.make_archive(output_filename, "zip", results_dir)

In [None]:
# With no best model on the AutoML object there is nothing to report, so call
# mr.Stop() (expected to halt the cells below — confirm against the Mercury docs).
has_no_model = automl._best_model is None
if has_no_model:
    mr.Stop()

In [None]:
# Bare last expression: whatever report() returns is rendered as this cell's output.
automl.report()