In [5]:
import pyarrow as pa
import pyarrow.ipc as ipc
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from supervised.automl import AutoML
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import classification_report

## Binary Classification Exemple

In [None]:
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]], df["income"], test_size=0.25
)

automl = AutoML(mode="Explain", explain_level=2,)
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

print(classification_report(y_test, predictions))

In [7]:
# Chemin vers le fichier .arrow
file_path = '/Users/dominicprenovost/Programmation/AutoML/AutoML-Practice/pricing-canada.arrow'

# Lecture du fichier .arrow
with pa.memory_map(file_path, 'r') as source:
    table = ipc.RecordBatchFileReader(source).read_all()

# Conversion en DataFrame Pandas
df = table.to_pandas()

X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]], df["trading_value_1d_usd"], test_size=0.25
)

automl = AutoML()
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

#print(classification_report(y_test, predictions))



Linear algorithm was disabled.
AutoML directory: AutoML_4
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 2 models
1_Baseline rmse 65722829.988101 trained in 0.17 seconds
2_DecisionTree rmse 30631625.064356 trained in 1.45 seconds
* Step default_algorithms will try to check up to 3 models
3_Default_Xgboost rmse 17250154.794919 trained in 2.2 seconds
4_Default_NeuralNetwork rmse 13518628.250886 trained in 0.95 seconds
5_Default_RandomForest rmse 17867705.455813 trained in 6.14 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 10912067.751898 trained in 0.07 seconds
AutoML fit time: 13.47 seconds
AutoML best model: Ensemble


## Multi-Class Classification Example

In [None]:
# load the data
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(digits.data), digits.target, stratify=digits.target, test_size=0.25,
    random_state=123
)

# train models with AutoML
automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

# compute the accuracy on test data
predictions = automl.predict_all(X_test)
print(predictions.head())
print("Test accuracy:", accuracy_score(y_test, predictions["label"].astype(int)))

## Regression Exemple

In [None]:
# Load the data
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(housing.data, columns=housing.feature_names),
    housing.target,
    test_size=0.25,
    random_state=123,
)

# train models with AutoML
automl = AutoML(mode="Explain")
automl.fit(X_train, y_train)

# compute the MSE on test data
predictions = automl.predict(X_test)
print("Test MSE:", mean_squared_error(y_test, predictions))

## Classify Random Data

In [None]:
COLS = 10

for ROWS in [1000, 5000, 10000]:
    X = np.random.uniform(size=(ROWS, COLS))
    y = np.random.randint(0, 2, size=(ROWS,))

    automl = AutoML(results_path=f"AutoML_{ROWS//1000}k", mode="Explain", features_selection=True)
    automl.fit(X, y)

## Classify Titanic Passenger

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from supervised import AutoML

train = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv")

X = train[train.columns[2:]]
y = train["Survived"]

automl = AutoML(results_path="AutoML_3")
automl.fit(X, y)

test = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv")
predictions = automl.predict(test)
print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%" )