In [None]:
# !pip install mljar-supervised

In [1]:
import os
import time
from os import path

import joblib
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from catboost import CatBoostClassifier
from scipy.stats import randint, uniform
from sklearn.ensemble import (
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import (
    RFE,
    RFECV,
    SelectKBest,
    VarianceThreshold,
    f_classif,
)
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.metrics import balanced_accuracy_score, mutual_info_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from supervised.automl import AutoML
from xgboost import XGBClassifier

# from supervised.automl import AutoML  # mljar-supervised

In [2]:
# Constants

SEED = 42  # random seed handed to the estimators / AutoML objects created below
N_JOBS = -1  # not referenced in the visible cells
RANDOM_SEARCH_N_ITER = 1  # not referenced in the visible cells
TRAIN_TIME_LIMIT_AUTOGLUON = 60 * 60 * 6  # passed as AutoGluon `time_limit` (21600)
TRAIN_TIME_LIMIT_MLJAR = 60 * 60 * 6  # passed as mljar `total_time_limit` (21600)
OUTPUT_DIR_MANUAL = path.join("output", "manual")
OUTPUT_DIR_AUTOGLUON = path.join("output", "autogluon")
OUTPUT_DIR_MLJAR = path.join("output", "mljar")
# Timestamp taken when this cell runs; used as the per-run sub-directory name.
UNIQUE_ID = time.strftime("%Y%m%d_%H%M%S")
# Switches for the optional feature-filtering cells further down.
APPLY_REMOVE_LOW_VARIANCE_FEATURES = False
APPLY_REMOVE_CORRELATED_FEATURES = False
APPLY_REMOVE_RANDOM_FEATURES = False
APPLY_ANOVA = False
# `k` handed to anova_filter. NOTE(review): the name looks like a typo of
# ANOVA_FEATURES; it is left as-is because the ANOVA cell refers to it.
ANOVE_FEATURES = 25

In [3]:
# Make sure every backend has its own folder for this run.
for base_dir in (OUTPUT_DIR_MANUAL, OUTPUT_DIR_AUTOGLUON, OUTPUT_DIR_MLJAR):
    run_dir = path.join(base_dir, UNIQUE_ID)
    if path.exists(run_dir):
        continue
    print(f"Creating output directory {run_dir}")
    os.makedirs(run_dir)

Creating output directory output\manual\20240114_145731
Creating output directory output\autogluon\20240114_145731
Creating output directory output\mljar\20240114_145731


In [4]:
def remove_highly_correlated_features(train_x, valid_x, test_x, threshold=0.95):
    """Drop columns that are almost linearly dependent on an earlier column.

    The Pearson correlation matrix is estimated on ``train_x`` only. A column
    is dropped when its absolute correlation with any column to its left
    exceeds ``threshold``, so strongly negative correlations count as
    redundant too. The same column indices are then removed from all three
    matrices.

    Parameters
    ----------
    train_x, valid_x, test_x : np.ndarray
        2-D arrays with identical column layout.
    threshold : float
        Absolute correlation above which the later column of a pair is dropped.

    Returns
    -------
    tuple of np.ndarray
        ``(train_x, valid_x, test_x)`` without the dropped columns.
    """
    # np.corrcoef collapses to a 0-d value for a single column; lift it back
    # to 2-D so the triangle extraction below still works.
    corr_matrix = np.atleast_2d(np.corrcoef(train_x, rowvar=False))

    # Strict upper triangle: each pair is inspected once and the diagonal
    # (self-correlation of 1.0) is ignored. NaN entries (constant columns)
    # compare as False and are therefore kept.
    upper = np.triu(np.abs(corr_matrix), k=1)
    to_drop = np.flatnonzero((upper > threshold).any(axis=0))

    train_x = np.delete(train_x, to_drop, axis=1)
    valid_x = np.delete(valid_x, to_drop, axis=1)
    test_x = np.delete(test_x, to_drop, axis=1)

    return train_x, valid_x, test_x

In [5]:
# Remove Low Variance Columns
def remove_low_variance_features(train_x, valid_x, test_x, threshold=(0.8 * (1 - 0.8))):
    """Keep only the columns whose training-set variance passes ``threshold``.

    A ``VarianceThreshold`` selector is fitted on ``train_x``; the surviving
    column indices are then applied to all three matrices.

    Returns the filtered ``(train_x, valid_x, test_x)`` tuple.
    """
    selector = VarianceThreshold(threshold=threshold).fit(train_x)
    kept_columns = selector.get_support(indices=True)
    return tuple(part[:, kept_columns] for part in (train_x, valid_x, test_x))

In [6]:
# Remove Random Columns
def remove_random_features(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    importance=0.005,
):
    """Drop columns that a single decision tree considers unimportant.

    A ``DecisionTreeClassifier(random_state=0)`` is fitted on the training
    data. Columns whose feature importance is strictly greater than
    ``importance`` are kept in all three matrices; the rest are treated as
    noise and removed.

    Returns the filtered ``(train_x, valid_x, test_x)``.
    """
    probe = DecisionTreeClassifier(random_state=0)
    probe.fit(train_x, train_y)

    # Column positions that clear the importance cut-off.
    keep = np.flatnonzero(probe.feature_importances_ > importance)
    return train_x[:, keep], valid_x[:, keep], test_x[:, keep]

In [7]:
def anova_filter(
    train_x: np.ndarray,
    train_y: np.ndarray,
    valid_x: np.ndarray,
    test_x: np.ndarray,
    k: int = 50,
):
    """Keep the ``k`` columns with the best ANOVA F-score on the training data.

    ``SelectKBest(f_classif)`` is fitted on ``(train_x, train_y)`` and the
    resulting selection is applied to the training, validation and test
    matrices alike.

    Returns the transformed ``(train_x, valid_x, test_x)``.
    """
    selector = SelectKBest(score_func=f_classif, k=k).fit(train_x, train_y)
    return tuple(selector.transform(part) for part in (train_x, valid_x, test_x))

In [8]:
def dump_proba(model, test_x, output_path_proba):
    """Write the second ``predict_proba`` column for ``test_x`` to a text file.

    ``model.predict_proba(test_x)`` may return either an ndarray or a pandas
    DataFrame. Column index 1 is written one value per line, preceded by the
    fixed header line ``"313201_313212"``.
    """
    probabilities = model.predict_proba(test_x)
    # Some predictors hand back a DataFrame; reduce it to the raw array first.
    as_array = (
        probabilities.values
        if isinstance(probabilities, pd.DataFrame)
        else probabilities
    )
    second_column = as_array[:, 1]

    np.savetxt(
        output_path_proba,
        second_column,
        header='"313201_313212"',
        comments="",  # write the header verbatim, without a leading "# "
        delimiter="\n",
    )

In [9]:
def dump_model(model, output_path_model):
    """Serialise ``model`` to ``output_path_model`` with joblib."""
    joblib.dump(value=model, filename=output_path_model)

In [10]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV


def perform_feature_selection(
    train_x, train_y, valid_x, test_x, n_features_to_select=None
):
    """Shrink the feature set with four successive recursive-elimination passes.

    Stages, each fitted on the training data that survived the previous one:

    1. RFE with ExtraTrees down to 250 features
    2. RFE with RandomForest down to 125 features
    3. RFECV (3-fold) with ExtraTrees, at least 25 features
    4. RFECV (3-fold) with RandomForest, at least 15 features

    After every stage the boolean support mask is applied to the training,
    validation and test matrices, and the three shapes are printed.

    Both estimators are seeded with the notebook-wide ``SEED`` so that the
    selected columns are reproducible between runs.

    Parameters
    ----------
    train_x, valid_x, test_x : np.ndarray
        2-D feature matrices with identical column layout.
    train_y : np.ndarray
        1-D training labels.
    n_features_to_select : optional
        Accepted for backward compatibility; the stage sizes above are fixed
        and this argument is not used.

    Returns
    -------
    tuple of np.ndarray
        ``(train_x, valid_x, test_x)`` restricted to the selected columns.
    """
    estimator_et = ExtraTreesClassifier(random_state=SEED)
    estimator_rf = RandomForestClassifier(random_state=SEED)

    # RFE / RFECV clone the estimator they are given, so sharing one instance
    # between two stages is safe.
    stages = [
        RFE(estimator=estimator_et, n_features_to_select=250),
        RFE(estimator=estimator_rf, n_features_to_select=125),
        RFECV(estimator=estimator_et, cv=3, min_features_to_select=25),
        RFECV(estimator=estimator_rf, cv=3, min_features_to_select=15),
    ]

    for selector in stages:
        selector.fit(train_x, train_y)
        keep = selector.support_
        train_x, valid_x, test_x = (
            train_x[:, keep],
            valid_x[:, keep],
            test_x[:, keep],
        )
        print(train_x.shape, valid_x.shape, test_x.shape)

    return train_x, valid_x, test_x

In [11]:
prefix = ""


def _read_features(file_name):
    """Read a space-separated feature file and drop the column at position 500."""
    # NOTE(review): presumably column 500 is an empty artefact of a trailing
    # separator on each line — confirm against the data files.
    frame = pd.read_table(prefix + file_name, sep=" ", header=None)
    return frame.drop(columns=frame.columns[500])


_test_x = _read_features("artificial_test.data")
_train_y = pd.read_table(prefix + "artificial_train.labels", header=None)
_train_x = _read_features("artificial_train.data")

In [12]:
# Turn the three raw tables into independent float ndarrays.
_test_x, _train_x, _train_y = (
    np.array(table, dtype=float, copy=True)
    for table in (_test_x, _train_x, _train_y)
)

In [13]:
# Shuffle features and labels in unison. The seed comes from the shared SEED
# constant instead of a repeated literal, so the notebook has one seed to change.
_train_x, _train_y = shuffle(_train_x, _train_y, random_state=SEED)

In [14]:
def get_train_and_validation_data(split=400):
    """Split the shuffled module-level training arrays into train / validation.

    The first ``split`` rows of ``_train_x`` / ``_train_y`` become the
    validation set and the remaining rows the training set. Copies are
    returned so later in-place edits cannot touch the originals.

    Parameters
    ----------
    split : int
        Number of leading rows reserved for validation (default 400).

    Returns
    -------
    tuple of np.ndarray
        ``(train_x, train_y, valid_x, valid_y)``.
    """
    valid_x, train_x = _train_x[:split].copy(), _train_x[split:].copy()
    valid_y, train_y = _train_y[:split].copy(), _train_y[split:].copy()
    return train_x, train_y, valid_x, valid_y

In [17]:
# Materialise the train / validation split and report the four array shapes.
train_x, train_y, valid_x, valid_y = get_train_and_validation_data()
print(train_x.shape, train_y.shape, valid_x.shape, valid_y.shape)

(1600, 500) (1600, 1) (400, 500) (400, 1)


In [18]:
# Run the recursive feature elimination. Labels are flattened to 1-D with
# ravel(); the raw `_test_x` is passed so the test matrix gets the same column
# masks as the training data and comes back as `test_x`.
train_x, valid_x, test_x = perform_feature_selection(
    train_x, train_y.copy().ravel(), valid_x, _test_x, n_features_to_select=None
)
# Final shapes (the function already prints the shapes after each stage).
print(train_x.shape, valid_x.shape, test_x.shape)

(1600, 250) (400, 250) (600, 250)
(1600, 125) (400, 125) (600, 125)
(1600, 26) (400, 26) (600, 26)
(1600, 20) (400, 20) (600, 20)
(1600, 20) (400, 20) (600, 20)


In [19]:
if APPLY_REMOVE_CORRELATED_FEATURES:
    # Use the already feature-selected `test_x`, not the raw `_test_x`: the
    # column indices are computed on `train_x` and only make sense for a test
    # matrix with the same column layout.
    train_x, valid_x, test_x = remove_highly_correlated_features(
        train_x, valid_x, test_x
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [20]:
if APPLY_REMOVE_LOW_VARIANCE_FEATURES:
    # Optional step: discard near-constant columns, then report the new shapes.
    train_x, valid_x, test_x = remove_low_variance_features(train_x, valid_x, test_x)
    for label, array in (("train_x", train_x), ("valid_x", valid_x), ("test_x", test_x)):
        print(f"{label}.shape: ", array.shape)

In [21]:
if APPLY_REMOVE_RANDOM_FEATURES:
    # Optional step: drop columns a decision tree finds unimportant, then
    # report the new shapes.
    train_x, valid_x, test_x = remove_random_features(
        train_x=train_x, train_y=train_y, valid_x=valid_x, test_x=test_x
    )
    for label, array in (("train_x", train_x), ("valid_x", valid_x), ("test_x", test_x)):
        print(f"{label}.shape: ", array.shape)

In [22]:
if APPLY_ANOVA:
    train_x, valid_x, test_x = anova_filter(
        train_x=train_x,
        # f_classif expects 1-D labels; a column vector triggers a
        # conversion warning. ravel() is a no-op if the labels are already 1-D.
        train_y=train_y.ravel(),
        valid_x=valid_x,
        test_x=test_x,
        k=ANOVE_FEATURES,
    )
    print("train_x.shape: ", train_x.shape)
    print("valid_x.shape: ", valid_x.shape)
    print("test_x.shape: ", test_x.shape)

In [23]:
# Shapes of every split after all (optional) feature filters.
for label, array in (
    ("train_x", train_x),
    ("train_y", train_y),
    ("valid_x", valid_x),
    ("valid_y", valid_y),
    ("test_x", test_x),
):
    print(f"{label}.shape: ", array.shape)

train_x.shape:  (1600, 20)
train_y.shape:  (1600, 1)
valid_x.shape:  (400, 20)
valid_y.shape:  (400, 1)
test_x.shape:  (600, 20)


In [24]:
# Sanity check: print one "ok" for each split that is a NumPy array.
for split in (train_x, train_y, valid_x, valid_y, test_x):
    if isinstance(split, np.ndarray):
        print("ok")

ok
ok
ok
ok
ok


### Manual model

In [25]:
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import RandomizedSearchCV

def _scaled(estimator):
    """Return a pipeline that standardises the features before ``estimator``."""
    return make_pipeline(StandardScaler(), estimator)


def _new_mlp():
    """Fresh scaled MLP; every MLP member below uses these settings."""
    # NOTE(review): scikit-learn documents `early_stopping` as only effective
    # for the "sgd" / "adam" solvers, so it is presumably a no-op with
    # "lbfgs" — confirm before relying on it.
    return _scaled(
        MLPClassifier(
            hidden_layer_sizes=(100, 300, 200, 100),
            solver="lbfgs",
            alpha=0.001,
            tol=1e-3,
            max_iter=1000,
            early_stopping=True,
            random_state=SEED,
        )
    )


def _new_forest(forest_cls):
    """Fresh scaled forest; RandomForest and ExtraTrees share these settings."""
    return _scaled(
        forest_cls(
            n_estimators=500,
            max_depth=30,
            min_samples_split=2,
            min_samples_leaf=4,
            random_state=SEED,
        )
    )


def _new_gbc():
    """Fresh scaled gradient-boosting classifier."""
    return _scaled(
        GradientBoostingClassifier(
            n_estimators=500,
            max_depth=30,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features=None,
            random_state=SEED,
        )
    )


def _new_stack(members):
    """Stack ``members`` under a logistic-regression meta-learner (5-fold CV)."""
    return StackingClassifier(
        estimators=members, final_estimator=LogisticRegression(), cv=5
    )


# Two stacked ensembles: MLP + random forest, and MLP + gradient boosting.
base_classifiers_1 = [("mlp", _new_mlp()), ("rf", _new_forest(RandomForestClassifier))]
base_classifiers_2 = [("mlp", _new_mlp()), ("gbc", _new_gbc())]

stacked_ensamble_1 = _new_stack(base_classifiers_1)
stacked_ensamble_2 = _new_stack(base_classifiers_2)

# Members of the soft-voting committee: the two stacks plus six single models.
committee_models = [
    ("stacked_ensemble_1", stacked_ensamble_1),
    ("stacked_ensemble_2", stacked_ensamble_2),
    ("gbc", _new_gbc()),
    ("et", _new_forest(ExtraTreesClassifier)),
    ("rf", _new_forest(RandomForestClassifier)),
    ("mlp", _new_mlp()),
    (
        "cb",
        _scaled(
            CatBoostClassifier(
                iterations=500,
                learning_rate=0.03,
                depth=6,
                l2_leaf_reg=3,
                border_count=32,
                cat_features=None,
                loss_function="Logloss",
                eval_metric="Accuracy",
                random_seed=SEED,
                early_stopping_rounds=50,
                verbose=100,
            )
        ),
    ),
    (
        "xgb",
        _scaled(
            XGBClassifier(
                random_state=SEED,
                use_label_encoder=False,
                eval_metric=balanced_accuracy_score,
                n_estimators=500,
                learning_rate=0.02,
                max_depth=6,
                min_child_weight=1,
                subsample=0.8,
                colsample_bytree=0.8,
                gamma=0,
                reg_alpha=0.1,
                reg_lambda=1.0,
                scale_pos_weight=1,
            )
        ),
    ),
]

# from sklearn.model_selection import cross_val_score, cross_validate
# from sklearn.metrics import make_scorer
# balanced_accuracy_scorer = make_scorer(balanced_accuracy_score)
# results = cross_validate(committee_model, train_x, train_y.ravel(), cv=5, scoring=balanced_accuracy_scorer)
# print("Cross-validation results: ", results)

# Average the members' predicted probabilities and fit on the training split.
committee_model = VotingClassifier(committee_models, voting="soft")
committee_model.fit(train_x.copy(), train_y.copy().ravel())

0:	learn: 0.7775000	total: 160ms	remaining: 1m 20s
100:	learn: 0.9162500	total: 536ms	remaining: 2.12s
200:	learn: 0.9418750	total: 900ms	remaining: 1.34s
300:	learn: 0.9631250	total: 1.24s	remaining: 823ms
400:	learn: 0.9775000	total: 1.59s	remaining: 392ms
499:	learn: 0.9862500	total: 1.93s	remaining: 0us


In [26]:
# Score the fitted committee on the held-out validation split.
y_pred = committee_model.predict(valid_x)
balanced_accuracy = balanced_accuracy_score(valid_y, y_pred)

print(f"Model Balanced Accuracy: {balanced_accuracy}")

Model Balanced Accuracy: 0.8999774949363607


In [27]:
# Persist the committee's test-set probabilities and the fitted model itself
# inside this run's "manual" output directory.
output_path_proba = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model_proba.txt")
output_path_model = path.join(OUTPUT_DIR_MANUAL, UNIQUE_ID, "manual_model.pkl")
dump_proba(committee_model, test_x, output_path_proba)
dump_model(committee_model, output_path_model)

### AutoGluon

In [28]:
def _with_label_column(features, labels):
    """Append ``labels`` as the last column and return (array, DataFrame).

    The DataFrame's last column is renamed to "class", the label name the
    AutoGluon predictor is configured with.
    """
    stacked = np.concatenate((features.copy(), labels.copy()), axis=1)
    frame = pd.DataFrame(stacked, copy=True)
    return stacked, frame.rename(columns={frame.columns[-1]: "class"})


train_data, train_data_pd = _with_label_column(train_x, train_y)
valid_data, valid_data_pd = _with_label_column(valid_x, valid_y)

print(train_data_pd.shape, valid_data_pd.shape)

(1600, 21) (400, 21)


In [29]:
save_path = path.join(OUTPUT_DIR_AUTOGLUON, UNIQUE_ID)

# Hyperparameter-search settings forwarded to AutoGluon.
autogluon_hpo_kwargs = {
    "scheduler": "local",
    "searcher": "auto",
    "time_out": 1200,
    "num_trials": 30,
}

# Everything passed to TabularPredictor.fit, collected in one place.
autogluon_fit_kwargs = dict(
    train_data=train_data_pd,
    time_limit=TRAIN_TIME_LIMIT_AUTOGLUON,
    presets="best_quality",
    hyperparameters="default",
    fit_weighted_ensemble=True,
    fit_full_last_level_weighted_ensemble=True,
    full_weighted_ensemble_additionally=True,
    num_bag_folds=15,
    num_bag_sets=25,
    num_stack_levels=3,
    auto_stack=True,
    dynamic_stacking=True,
    feature_generator="auto",
    hyperparameter_tune_kwargs=autogluon_hpo_kwargs,
)

# Binary classifier on the "class" column, optimised for balanced accuracy and
# saved under this run's AutoGluon directory.
predictor = TabularPredictor(
    label="class",
    path=save_path,
    eval_metric="balanced_accuracy",
    problem_type="binary",
).fit(**autogluon_fit_kwargs)

No model was trained during hyperparameter tuning NeuralNetTorch_BAG_L1... Skipping this model.
Fitting model: LightGBMLarge_BAG_L1 ... Training model for up to 74.42s of the -621.65s of remaining time.
	Fitting 15 child models (S1F1 - S1F15) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.19%)
	0.8994	 = Validation score   (balanced_accuracy)
	31.01s	 = Training   runtime
	0.12s	 = Validation runtime
Completed 1/25 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the -664.37s of remaining time.
	Ensemble Weights: {'CatBoost_BAG_L1\T19': 1.0}
	0.9056	 = Validation score   (balanced_accuracy)
	2.59s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting 11 L2 models ...
Completed 1/25 k-fold bagging repeats ...
No base models to train on, skipping auxiliary stack level 3...
No base models to train on, skipping stack level 3...
No base models to train on, skipping auxiliary stack level 4...


In [30]:
# Show the table of all models AutoGluon trained, with their validation scores.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost_BAG_L1\T19,0.905649,balanced_accuracy,0.000000,111.773090,0.000000,111.773090,1,True,83
1,WeightedEnsemble_L2,0.905649,balanced_accuracy,0.005999,114.366095,0.005999,2.593004,2,True,117
2,WeightedEnsemble_ALL_L5,0.905649,balanced_accuracy,0.009055,114.320091,0.009055,2.547000,5,True,118
3,CatBoost_BAG_L1\T14,0.902513,balanced_accuracy,0.000000,103.769992,0.000000,103.769992,1,True,78
4,LightGBMXT_BAG_L1\T2,0.901904,balanced_accuracy,0.000000,27.887532,0.000000,27.887532,1,True,4
...,...,...,...,...,...,...,...,...,...,...
113,XGBoost_BAG_L1\T14,0.850625,balanced_accuracy,0.000000,30.261424,0.000000,30.261424,1,True,99
114,LightGBMXT_BAG_L1\T12,0.838790,balanced_accuracy,0.000000,26.204720,0.000000,26.204720,1,True,14
115,LightGBMXT_BAG_L1\T15,0.830681,balanced_accuracy,0.000000,26.520986,0.000000,26.520986,1,True,17
116,LightGBMXT_BAG_L1\T25,0.803766,balanced_accuracy,0.000000,26.606920,0.000000,26.606920,1,True,27


In [31]:
# Evaluate the AutoGluon predictor on the held-out validation frame, which was
# not passed to fit().
predictor.evaluate(valid_data_pd)

{'balanced_accuracy': 0.884899102298017,
 'accuracy': 0.885,
 'mcc': 0.7699522451097641,
 'roc_auc': 0.9496886799529896,
 'f1': 0.8826530612244897,
 'precision': 0.8871794871794871,
 'recall': 0.8781725888324873}

In [32]:
# Write AutoGluon's test-set probabilities. The test matrix is wrapped in a
# DataFrame before being handed to the predictor.
output_path_proba = path.join(
    OUTPUT_DIR_AUTOGLUON, UNIQUE_ID, "autogluon_model_proba.txt"
)
dump_proba(predictor, pd.DataFrame(test_x), output_path_proba)

### MLJAR

In [33]:
# MLJAR AutoML in "Compete" mode, optimising F1, with results stored in this
# run's mljar directory.
automl = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    results_path=path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "tmp"),
)
# Flatten the labels into a separate variable instead of rebinding `train_y`:
# earlier cells (e.g. the AutoGluon frame construction) concatenate `train_y`
# column-wise and would fail on re-run if it silently became 1-D.
train_y_flat = train_y.copy().reshape(-1)
print(train_y_flat)
automl.fit(train_x.copy(), train_y_flat)

[ 1.  1.  1. ...  1. -1. -1.]
AutoML directory: output\mljar\20240114_145731\tmp
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Linear', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.702703 trained in 4.44 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle,Stratify
* Step simple_algorithms will try to check up to 4 models
1_DecisionTree f1 0.749035 trained in 7.69 seconds
2_DecisionTree f1 0.65162 trained in 7.44 secon



6_Default_Xgboost f1 0.862305 trained in 31.16 seconds
7_Default_CatBoost f1 0.884184 trained in 18.16 seconds
8_Default_NeuralNetwork f1 0.864831 trained in 17.46 seconds




9_Default_RandomForest f1 0.823602 trained in 24.34 seconds
10_Default_ExtraTrees f1 0.769231 trained in 18.15 seconds
11_Default_NearestNeighbors f1 0.871411 trained in 8.93 seconds
* Step not_so_random will try to check up to 61 models
21_LightGBM f1 0.882426 trained in 21.8 seconds




12_Xgboost f1 0.860377 trained in 26.64 seconds
30_CatBoost f1 0.891386 trained in 21.74 seconds
39_RandomForest f1 0.848823 trained in 24.76 seconds
48_ExtraTrees f1 0.85625 trained in 24.29 seconds
57_NeuralNetwork f1 0.825955 trained in 17.91 seconds




66_NearestNeighbors f1 0.880597 trained in 9.36 seconds
22_LightGBM f1 0.875 trained in 26.75 seconds




In [None]:
# Point AutoML at an existing results directory so it can predict without
# refitting. NOTE(review): "20240114_002215" is presumably the ID of an earlier
# run — confirm the directory exists before running this cell.
loaded = AutoML(
    mode="Compete",
    ml_task="binary_classification",
    total_time_limit=TRAIN_TIME_LIMIT_MLJAR,
    eval_metric="f1",
    random_state=SEED,
    # Built with path.join so the path also resolves on non-Windows systems.
    results_path=path.join(OUTPUT_DIR_MLJAR, "20240114_002215"),
)

print(valid_x.shape, valid_y.shape)

print(train_x.shape, train_y.shape)

# Balanced accuracy compares class labels, so use predict() rather than
# predict_proba(), and keep valid_x two-dimensional (rows = samples).
predictions = loaded.predict(valid_x.copy())

score = balanced_accuracy_score(valid_y.ravel(), predictions)

print(f"Model Balanced Accuracy: {score}")

In [None]:
# Write the test-set probabilities of `automl` (the object fitted in this run,
# not `loaded`) to this run's mljar directory.
output_path = path.join(OUTPUT_DIR_MLJAR, UNIQUE_ID, "mljar_model_proba.txt")
dump_proba(automl, test_x, output_path)

In [None]:
def ensemble_predict(X, model1, model2):
    """Average the second ``predict_proba`` column of two models.

    ``model1`` receives ``X`` wrapped in a DataFrame and ``model2`` receives
    ``X`` unchanged. Either model may return an ndarray or a DataFrame; both
    are reduced to a plain array before column index 1 is taken.

    Parameters
    ----------
    X : array-like
        2-D feature matrix.
    model1, model2 : objects exposing ``predict_proba``

    Returns
    -------
    np.ndarray
        Element-wise mean of the two models' column-1 probabilities.
    """
    # np.asarray accepts both return types, so neither model is tied to one.
    pred1 = np.asarray(model1.predict_proba(pd.DataFrame(X)))[:, 1]
    pred2 = np.asarray(model2.predict_proba(X))[:, 1]
    print(pred1, pred2)
    avg_pred = (pred1 + pred2) / 2

    return avg_pred


# Blend the AutoGluon predictor with the MLJAR model on the test set.
final_predictions = ensemble_predict(test_x, predictor, automl)

In [None]:
# Per-run folder for the blended predictions.
ensemble_dir = path.join("ensamble", UNIQUE_ID)
os.makedirs(ensemble_dir, exist_ok=True)

# One probability per line, preceded by the quoted header line.
# NOTE(review): header and file name differ from those used by dump_proba —
# confirm this is intended.
ensemble_output_path = path.join(ensemble_dir, "123manual_model_pred.txt")
np.savetxt(
    ensemble_output_path,
    final_predictions,
    header='"313201"',
    comments="",
    delimiter="\n",
)