In [1]:
import pyarrow as pa
import pyarrow.ipc as ipc
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from supervised.automl import AutoML
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
import numpy as np

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Binary Classification Exemple

In [2]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]], df["income"], test_size=0.25
)

automl = AutoML(mode="Explain", explain_level=2,)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 0.552923 trained in 0.22 seconds
log_loss_eps() got an unexpected keyword argument 'response_method'
Problem during computing permutation importance. Skipping ...




2_DecisionTree logloss 0.367314 trained in 3.66 seconds
* Step default_algorithms will try to check up to 3 models
log_loss_eps() got an unexpected keyword argument 'response_method'
Problem during computing permutation importance. Skipping ...
3_Default_Xgboost logloss 0.277682 trained in 1.95 seconds
log_loss_eps() got an unexpected keyword argument 'response_method'
Problem during computing permutation importance. Skipping ...
4_Default_NeuralNetwork logloss 0.325852 trained in 2.54 seconds
log_loss_eps() got an unexpected keyword argument 'response_method'
Problem during computing permutation importance. Skipping ...




5_Default_RandomForest logloss 0.340462 trained in 5.97 seconds
* Step ensemble will try to check up to 1 model
Ensemble logloss 0.277682 trained in 1.23 seconds
AutoML fit time: 18.87 seconds
AutoML best model: 3_Default_Xgboost


## Multi-Class Classification Example

In [None]:
# load the data
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25,
    random_state=123
)

# train models with AutoML
automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

# compute the accuracy on test data
predictions = automl.predict_all(X_test)
print(predictions.head())
print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int)))

## Regression Exemple

In [2]:
# Load the data
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(housing.data, columns=housing.feature_names),
    housing.target,
    test_size=0.25,
    random_state=123,
)

# train models with AutoML
automl = AutoML(mode="Explain")
automl.fit(X_train, y_train)

# compute the MSE on test data
predictions = automl.predict(X_test)
print("Test MSE:", mean_squared_error(y_test, predictions))

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline rmse 1.175568 trained in 0.14 seconds




2_DecisionTree rmse 0.814045 trained in 3.59 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost rmse 0.466835 trained in 3.27 seconds
4_Default_NeuralNetwork rmse 0.576015 trained in 0.78 seconds




5_Default_RandomForest rmse 0.73272 trained in 2.34 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 0.466835 trained in 0.07 seconds
AutoML fit time: 12.36 seconds
AutoML best model: 3_Default_Xgboost
Test MSE: 0.21027419363314073


## Classify Random Data

In [None]:
COLS = 10

for ROWS in [1000, 5000, 10000]:
    X = np.random.uniform(size=(ROWS, COLS))
    y = np.random.randint(0, 2, size=(ROWS,))

    automl = AutoML(results_path=f"AutoML_{ROWS//1000}k", mode="Explain", features_selection=True)
    automl.fit(X, y)