In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.ipc as ipc
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


## Binary Classification Example

In [7]:
# Load the "adult" CSV; skipinitialspace strips the blank that follows each
# comma so string values do not carry a leading space.
df = pd.read_csv(
    "https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv",
    skipinitialspace=True,
)

# Pick the target by name and use every other column as a feature, so the
# split does not depend on where "income" sits in the file.
target_column = "income"
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target_column]),
    df[target_column],
    test_size=0.25,
    random_state=123,  # fixed seed, as in the other examples, for a repeatable split
)

# Train AutoML in "Perform" mode, restricted to Xgboost.
automl = AutoML(mode="Perform", algorithms=["Xgboost"])
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

# Per-class precision / recall / F1 on the held-out 25%.
print(classification_report(y_test, predictions))

AutoML directory: AutoML_3
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost logloss 0.278835 trained in 9.71 seconds (1-sample predict time 0.0111 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost logloss 0.282561 trained in 8.17 seconds (1-sample predict time 0.0124 seconds)
3_Xgboost logloss 0.280678 trained in 7.87 seconds (1-sample predict time 0.0115 seconds)
4_Xgboost logloss 0.289586 trained in 9.56 seconds (1-sample predict time 0.0113 seconds)
5_Xgboost logloss 0.299258 trained in 7.95 seconds (1-sample predict time 0.0111 seconds)
* Step g

In [4]:
# Path to the .arrow file, built from the current user's home directory so the
# notebook does not hardcode one specific account name.
# NOTE(review): still machine-specific below the home directory; consider a
# configurable data directory.
file_path = str(
    Path.home() / "Programmation" / "AutoML" / "AutoML-Practice" / "pricing-canada.arrow"
)

# Read the whole .arrow file into a single table through a memory map.
with pa.memory_map(file_path, "r") as source:
    table = ipc.RecordBatchFileReader(source).read_all()

# Convert to a pandas DataFrame.
df = table.to_pandas()

# Drop the target by name: slicing off the last column would leak the target
# into the features whenever it is not the final column.
target_column = "trading_value_1d_usd"
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target_column]),
    df[target_column],
    test_size=0.25,
    random_state=123,  # fixed seed, as in the other examples, for a repeatable split
)

automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

predictions = automl.predict(X_test)

# This is a regression target, so report RMSE rather than a classification report.
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, predictions)))



Linear algorithm was disabled.
AutoML directory: AutoML_3
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 5 models
1_Default_LightGBM rmse 15765106.76579 trained in 21.85 seconds (1-sample predict time 0.0139 seconds)
2_Default_Xgboost rmse 16242055.558968 trained in 5.97 seconds (1-sample predict time 0.0154 seconds)
There was an error during 3_Default_CatBoost training.
Please check AutoML_3/errors.md for details.
4_Default_NeuralNetwork rmse 10148698.87322 trained in 6.22 seconds (1-sample predict time 0.0298 seconds)


KeyboardInterrupt: 

## Multi-Class Classification Example

In [None]:
# Digits dataset: stratified 75/25 split with a fixed seed.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(digits.data),
    digits.target,
    test_size=0.25,
    random_state=123,
    stratify=digits.target,
)

# Fit AutoML in "Perform" mode on the training portion.
automl = AutoML(mode="Perform")
automl.fit(X_train, y_train)

# predict_all returns a frame; show its first rows, then score the "label"
# column (cast to int) against the held-out targets.
predictions = automl.predict_all(X_test)
print(predictions.head())
print(
    "Test accuracy:",
    accuracy_score(y_true=y_test, y_pred=predictions["label"].astype(int)),
)

## Regression Example

In [None]:
# California housing data: named feature columns, seeded 75/25 split.
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(data=housing.data, columns=housing.feature_names),
    housing.target,
    random_state=123,
    test_size=0.25,
)

# Fit AutoML in "Explain" mode on the training portion.
automl = AutoML(mode="Explain")
automl.fit(X_train, y_train)

# Mean squared error of the predictions on the held-out portion.
predictions = automl.predict(X_test)
print("Test MSE:", mean_squared_error(y_true=y_test, y_pred=predictions))

## Classify Random Data

In [None]:
COLS = 10
SEED = 123

# One seeded generator for the whole cell, so a fresh run draws the same data.
rng = np.random.default_rng(SEED)

for ROWS in [1000, 5000, 10000]:
    # Uniform features and labels drawn from {0, 1} independently of them
    # (the upper bound of `integers` is exclusive).
    X = rng.uniform(size=(ROWS, COLS))
    y = rng.integers(0, 2, size=(ROWS,))

    # One results directory per dataset size: AutoML_1k, AutoML_5k, AutoML_10k.
    automl = AutoML(results_path=f"AutoML_{ROWS//1000}k", mode="Explain", features_selection=True)
    automl.fit(X, y)

## Classify Titanic Passengers

In [None]:
# pd, accuracy_score and AutoML are already imported in the notebook's first
# cell, so they are not re-imported here.
train = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/train.csv")

# Features are every column from the third one onward; "Survived" is the label.
X = train[train.columns[2:]]
y = train["Survived"]

# Use a dedicated results directory. "AutoML_3" is a name the earlier cells
# get auto-assigned (see their "AutoML directory: AutoML_3" output), and
# pointing at an existing results directory would reuse those unrelated models.
automl = AutoML(results_path="AutoML_titanic")
automl.fit(X, y)

# Score on the separate test file, which carries the true "Survived" column.
test = pd.read_csv("https://raw.githubusercontent.com/pplonski/datasets-for-start/master/Titanic/test_with_Survived.csv")
predictions = automl.predict(test)
print(f"Accuracy: {accuracy_score(test['Survived'], predictions)*100.0:.2f}%" )