Imports

In [21]:
import functools
import os
import random

import lightgbm as lgb
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed=34):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything()

# Config

We will define a global Config class which holds common configuration.

We will also define a configuration class for each of the models that will be used in the ensemble. These classes will hold the default hyperparameters, static parameters (parameters that are not tuned), extra parameters to pass to .fit() etc., for the respective models.

Additionally, we will have a mapping between model names and their configuration classes so that we can get to them easily.

Note: The get_fit_params() method is required in the configuration classes of the models since there are subtle differences between what the .fit() method of each model accepts. For example, LogisticRegression.fit() has no parameter called eval_set while XGBClassifier.fit() does. LogisticRegression takes the verbose argument in the initializer while XGBoost takes it in .fit(). XGBoost, since version 1.6.0, has moved arguments like callbacks and eval_metric to the initializer while LightGBM continues to use them in .fit(). These differences are reconciled by dividing the arguments appropriately between STATIC_PARAMS and the output of get_fit_params().

In [3]:
class Config:
    DATA_DIR = "C:/Users/Emincan/Desktop/Playground/"
    L1_N_TRIALS = 100
    L2_N_TRIALS = 20
    N_JOBS = 2
    
    # Map internal identifier to human-friendly name
    MODELS = {
        "lr": "Logisitc Regression",
        "ada": "AdaBoost",
        "rf": "Random Forest",
        "xgb": "XGBoost",
        "lgb": "LightGBM",
        "cb": "CatBoost",
        "gbr": "GradientBoosting",
        "hgbr": "HistGradientBoosting",
    }
    
    @classmethod
    def filepath(cls, filename):
        return os.path.join(cls.DATA_DIR, filename)

In [4]:
# Configuration for Logistic Regression
class LRConfig:
    DEFAULT_VALUES = {
        "tol": 1e-4,
        "C": 1.0,
        "solver": "lbfgs",
    }
    STATIC_PARAMS = {
        "max_iter": 1000,
        "verbose": False,
    }
    
    USE_PRUNER = False
    
    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        return {}

In [5]:
# Configuration for AdaBoost
class AdaConfig:
    DEFAULT_VALUES = {
        "base_estimator": None,
        "n_estimators": 50,
        "learning_rate": 1.0,
        
    }
    STATIC_PARAMS = {}
    
    USE_PRUNER = False
    
    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        return {}

In [6]:
# Configuration for Random Forest
class RFConfig:
    DEFAULT_VALUES = {
        "n_estimators": 100,
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "max_features": "sqrt",
        "bootstrap": True,
        "ccp_alpha": 0.0,
        "max_samples": None,
    }

    STATIC_PARAMS = {
        "n_jobs": Config.N_JOBS,
    }
    
    USE_PRUNER = False
    
    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        return {}

In [7]:
# Configuration for XGBoost
class XGBConfig:
    EVAL_METRIC = "logloss"
    
    DEFAULT_VALUES = {
        "max_depth": 6,
        "n_estimators": 100,
        "alpha": 0.0,
        "lambda": 1.0,
        "learning_rate": 0.3,
        "colsample_bytree": 1.0,
        "colsample_bylevel": 1.0,
        "min_child_weight": 1.0,
        "sampling_method": "uniform",
        "early_stopping_rounds": None,
    }
    
    STATIC_PARAMS = {
        "tree_method": "gpu_hist",
        "use_label_encoder": False,
        "n_jobs": Config.N_JOBS,
        "predictor": "gpu_predictor",
        "max_bin": 1024,
        "eval_metric": EVAL_METRIC,
    }
    
    USE_PRUNER = True
    
    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        return {
            "eval_set": [(X_train, y_train), (X_val, y_val)],
            "verbose": False,
        }

In [8]:
# Configuration for LightGBM
class LGBConfig:
    DEFAULT_VALUES = {
        "num_leaves": 31,
        "max_depth": -1,
        "learning_rate": 0.1,
        "n_estimators": 100,
        "reg_alpha": 0.0,
        "reg_lambda": 0.0,
        "min_child_samples": 20,
        "subsample_for_bin": 200000,
    }
    
    STATIC_PARAMS = {
        "n_jobs": Config.N_JOBS,
        "verbose": -1,
        "objective": "binary",
    }
    
    USE_PRUNER = True
    
    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        # To suppress training output
        callbacks = params.get("callbacks", []) + [lgb.log_evaluation(period=0)]
        return {
            "eval_set": [(X_train, y_train), (X_val, y_val)],
            "eval_metric": "logloss",
            "callbacks": callbacks,
        }

In [None]:
# Configuration for LightGBM
class LGBConfig:
    DEFAULT_VALUES = {
        "num_leaves": 31,
        "max_depth": -1,
        "learning_rate": 0.1,
        "n_estimators": 100,
        "reg_alpha": 0.0,
        "reg_lambda": 0.0,
        "min_child_samples": 20,
        "subsample_for_bin": 200000,
    }
    
    STATIC_PARAMS = {
        "n_jobs": Config.N_JOBS,
        "verbose": -1,
        "objective": "binary",
    }
    
    USE_PRUNER = True
    
    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        # To suppress training output
        callbacks = params.get("callbacks", []) + [lgb.log_evaluation(period=0)]
        return {
            "eval_set": [(X_train, y_train), (X_val, y_val)],
            "eval_metric": "logloss",
            "callbacks": callbacks,
        }

In [10]:
class CBConfig:
    DEFAULT_VALUES = {
        "iterations": 1000,
        "learning_rate": 0.1,
        "depth": 6,
        "l2_leaf_reg": 3,
        "border_count": 32,
        "thread_count": -1,
        "random_seed": 42,
    }

    STATIC_PARAMS = {
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "task_type": "CPU",
        "verbose": False,
    }

    USE_PRUNER = True

    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        return {
            "eval_set": [(X_train, y_train), (X_val, y_val)],
            "use_best_model": True,
            "early_stopping_rounds": 100,
            "verbose_eval": False,
            "plot": False,
        }

In [11]:
class GBRConfig:
    DEFAULT_VALUES = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 3,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "subsample": 1.0,
        "max_features": None,
        "random_state": None,
        "alpha": 0.9,
    }
    
    STATIC_PARAMS = {
        "loss": "ls",
    }
    
    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        return {
            "verbose": 0,
        }

In [12]:
class HGBRConfig:
    DEFAULT_VALUES = {
        "learning_rate": 0.1,
        "max_iter": 100,
        "max_depth": None,
        "min_samples_leaf": 20,
        "l2_regularization": 0.0,
        "max_leaf_nodes": 31,
        "random_state": None,
    }

    STATIC_PARAMS = {
        "loss": "least_squares",
    }

    @classmethod
    def get_fit_params(cls, X_train, y_train, X_val, y_val, params):
        return {
            "verbose": 0,
        }

In [13]:
CONFIG_MAP = {
    "lr": LRConfig,
    "ada": AdaConfig,
    "rf": RFConfig,
    "xgb": XGBConfig,
    "lgb": LGBConfig,
    "cb" : CBConfig,
    "gbr" : GBRConfig,
    "hgbr" : HGBRConfig
}

# Load the dataset

In [17]:
train = pd.read_csv(Config.filepath('train.csv'))
test = pd.read_csv(Config.filepath('test.csv'))
original = pd.read_csv(Config.filepath('CrabAgePrediction.csv'))

In [18]:
train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [20]:
test.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,74051,I,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552
1,74052,I,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893
2,74053,F,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415
3,74054,F,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676
4,74055,I,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066


# Model Map

In [31]:
MODEL_MAP = {
    "lr": LogisticRegression,
    "ada": AdaBoostClassifier,
    "rf": RandomForestClassifier,
    "xgb": xgb.XGBClassifier,
    "lgb": lgb.LGBMClassifier,
    "cat": CatBoostClassifier,
    "gb" : GradientBoostingClassifier,
    "hgb": HistGradientBoostingClassifier,
}

# TrainingLoop

The train() function will be used as a generic training loop. It takes the training and test dataframes, a string identifier for the model that is being trained, the hyperparameters for the model and an optional verbosity argument. 

In the end, it returns the predictions for the training set, the test set and the overall accuracy.

In [32]:
def train(train_df, test_df, model, params, verbose=True):
    # Create copies so that original datatsets do not change
    df = train_df.copy()
    test = test_df.drop("PassengerId", axis=1)
    
    df["preds"] = pd.NA
    
    drop = ["Transported", "preds", "kfold"]
    
    # Get the initializer class and the configuration class
    klass = MODEL_MAP[model]
    config = CONFIG_MAP[model]
    
    # Default values in the config class are for tuned parameters
    # So, only those are filtered from params
    # This is mainly added for AdaBoost, since it has a slightly different objective
    params = {k: v for k, v in params.items() if k in config.DEFAULT_VALUES}
    
    # Add default values for parameters not defined in params
    params.update({k: v for k, v in config.DEFAULT_VALUES.items() if k not in params})
    
    # Add static params - Parameters that are not tuned
    params.update(config.STATIC_PARAMS)
    
    # For storing total accuracy across folds for averaging
    total_acc = 0.0
    
    # Empty list for storing test predictions in each fold
    test_preds = []
    
    for fold in range(5):
        train = df[df["kfold"] != fold]
        
        # Get training features and labels
        y_train = train["Transported"]
        X_train = train.drop(drop, axis=1)
        
        val = df[df["kfold"] == fold]
        
        # Get validation features and labels
        y_val = val["Transported"]
        X_val = val.drop(drop, axis=1)
        
        # Initialize model
        clf = klass(**params)
        
        # Get parameters for .fit() other than X and y
        fit_params = config.get_fit_params(
            X_train=X_train,
            y_train=y_train,
            X_val=X_val,
            y_val=y_val,
            params=params
        )
        
        # Train model on training set
        clf.fit(
            X=X_train,
            y=y_train,
            **fit_params,
        )
        
        # Make predictions on validation set
        val_pred = clf.predict(X_val)
        acc = metrics.accuracy_score(y_val, val_pred)
        
        # Report accuracy if verbose is True
        if verbose is True:
            print(f"\tFold {fold + 1} - Accuracy = {acc: .4f}")
        
        # Add to total accuracy
        total_acc += acc
        
        # Make predictions on validation set again
        # But this time in terms of probabilities
        # And store in the df
        # These will be used in the meta model
        df.loc[val.index, "preds"] = clf.predict_proba(X_val)[:, 1]
        
        # Get the test predictions for this fold in terms of probability
        test_preds.append(clf.predict_proba(test)[:, 1])
        
    acc = total_acc / 5
    
    if verbose is True:
        print(f"\tOverall accuracy = {acc: .4f}")   
    
    # Calculate final test predictions
    # These will be used in the meta model
    test_preds = np.vstack(test_preds)
    test_preds = test_preds.mean(axis=0)
    
    # Return val preds, test preds and overall accuracy
    return df["preds"].values, test_preds, acc

# Optuna Objectives

The functions below will act as the objective functions that should be used for each of the models. 

These take the Optuna trial, the training dataframe and the test dataframe as arguments. They first use Optuna to get a dictionary of parameters and then call the train() function with the appropriate arguments.

In [33]:
# Objective for Logistic Regression
def lr_objective(trial, train_df, test_df):
    params = {
        "tol": trial.suggest_float("tol", 1e-6, 1e-4, log=True),
        "C": trial.suggest_float("C", 0.5, 2.0, log=True),
        "solver": trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]),
    }
    
    _, _, acc = train(df=train_df, test_df=test_df, model="lr", params=params, verbose=False)
    return acc

In [34]:
# Objective for AdaBoost
def adaboost_objective(trial, train_df, test_df):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 5.0, log=True),
    }
    
    tune_estimator = trial.suggest_categorical("tune_estimator", [True, False])
    
    if tune_estimator:
        # Parameters for the Decision Tree in AdaBoost
        max_depth = trial.suggest_int("max_depth", 1, 50)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10, log=True)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 5, log=True)
        ccp_alpha = trial.suggest_float("ccp_alpha", 0.01, 1.0, log=True)
        
        params["base_estimator"] = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            ccp_alpha=ccp_alpha,
        )
    
    _, _, acc = train(df=train_df, test_df=test_df, model="ada", params=params, verbose=False)
    return acc

In [35]:
# Objectuve for Random Forest
def rf_objective(trial, train_df, test_df):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1, 500),
        "max_depth": trial.suggest_int("max_depth", 1, 50),
        "min_samples_split": trial.suggest_int("min_samples_plit", 2, 10, log=True),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5, log=True),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        "ccp_alpha": trial.suggest_float("ccp_alpha", 0.01, 1.0, log=True),
    }
    
    if params["bootstrap"] is True:
        params["max_samples"] = trial.suggest_float("max_samples", 0.01, 1.0, log=True)
    
    _, _, acc = train(df=train_df, test_df=test_df, model="rf", params=params, verbose=False)
    return acc

In [36]:
# Objective for XGBoost
def xgb_objective(trial, train_df, test_df):
    params = {
        "max_depth": trial.suggest_int("max_depth", 1, 11),
        "n_estimators": trial.suggest_int("n_estimators", 5, 500),
        "alpha": trial.suggest_uniform("alpha", 0.0, 5.0),
        "lambda": trial.suggest_float("lambda", 1.0, 5.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.8, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.2, 1.0),
        "min_child_weight": trial.suggest_uniform("min_child_weight", 1, 100),
        "sampling_method": trial.suggest_categorical("sampling_method", ["uniform", "gradient_based"]),
        "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 5, 20, step=5)
    }
    
    obs_k = f"validation_1-{XGBConfig.EVAL_METRIC}"
    params["callbacks"] = [optuna.integration.XGBoostPruningCallback(trial, obs_k)]
    
    _, _, acc = train(df=train_df, test_df=test_df, model="xgb", params=params, verbose=False)
    return acc

In [37]:
# Objective for LightGBM
def lgb_objective(trial, train_df, test_df):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 31, 100, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 100, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1.0),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 50, log=True),
        "subsample_for_bin": trial.suggest_int("subsample_for_bin", 2000, 8000),
    }
    
    params["callbacks"] = [optuna.integration.LightGBMPruningCallback(trial, "logloss", "valid_1")]
    
    _, _, acc = train(df=train_df, test_df=test_df, model="lgb", params=params, verbose=False)
    return acc

In [38]:
# Objective for CatBoost
def cb_objective(trial, train_df, test_df):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }


    params['callbacks'] = optuna.integration.CatBoostPruningCallback(trial, "Logloss", 'valid_1')


    _, _, acc = train(df=train_df, test_df=test_df, model="cb", params=params, verbose=False)
    return acc

In [None]:
# # Objective for GradientBoosting
# def cb_objective(trial, train_df, test_df):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
#         "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
#         "subsample": trial.suggest_float("subsample", 0.1, 1.0),
#     }


#     params['callbacks'] = optuna.integration.CatBoostPruningCallback(trial, "Logloss", 'valid_1')


#     _, _, acc = train(df=train_df, test_df=test_df, model="cb", params=params, verbose=False)
#     return acc

In [39]:
OBJECTIVE_MAP = {
    "lr": lr_objective,
    "ada": adaboost_objective,
    "rf": rf_objective,
    "xgb": xgb_objective,
    "lgb": lgb_objective,
    "cb" : cb_objective
}

# HyperParameter Search

The function hyperparameter_search() finds the best hyperparameters for the given model by utilizing the proper objective function. It takes the training dataframe, test dataframe and the string identifier of the model for which hyperparameters are required.

In [40]:
def hyperparameter_search(train_df, test_df, model, n_trials=Config.L1_N_TRIALS):
    # Turn off verbose output
    v = optuna.logging.get_verbosity()
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    
    objective = OBJECTIVE_MAP[model]
    objective = functools.partial(objective, train_df=train_df, test_df=test_df)
    
    # Check if pruning is required
    pruner = optuna.pruners.HyperbandPruner() if CONFIG_MAP[model].USE_PRUNER is True else None
    
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(
        direction="maximize",
        pruner=pruner,
        sampler=sampler,
    )
    
    study.optimize(objective, n_trials=n_trials)
    
    # Restore verbosity level
    optuna.logging.set_verbosity(v)

    return study.best_params

# Datasets for LGBMClassifier

LightGBM is unique from other boosting algorithms in that it supports categorical features out of the box. 

We only need to make sure they are label encoded and have their datatype as category. 

The function below takes the two datasets and label encodes all the one-hot encoded columns.

In [None]:
def lgb_datasets(train_df, test_df):
    # Make copies so that original datasets remain unchanged
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Drop Transported and kfold
    drop = ["Transported", "kfold"]
    dropped = train_df[drop].values
    train_df = train_df.drop(drop, axis=1)
    
    # Drop PassengerId
    passenger_id = test_df["PassengerId"].values
    test_df = test_df.drop("PassengerId", axis=1)
    
    # Add suffix to index and store indices
    # So that the dataframes can be merged and split
    train_df = train_df.rename("train_{}".format)
    test_df = test_df.rename("test_{}".format)
    
    tr_idx = train_df.index
    te_idx = test_df.index
    
    # Merge
    df = pd.concat([train_df, test_df])
    
    oh_cols = ["CabinDeck", "HomePlanet", "Destination", "GroupSize"]
    
    for oh_col in oh_cols:
        # Get all columns associated with the one-hot column
        columns = [column for column in df.columns if column.startswith(f"{oh_col}_")]
        
        # .idxmax() returns that column name which has the maximum value in the row
        values = df[columns].idxmax(axis=1)
        
        # Get all levels and make a mapping from level to index
        levels = values.value_counts().index
        mapping = {level: idx for idx, level in enumerate(levels)}
        
        # Add column with the mapping and specify type as category
        df[oh_col] = values.map(mapping).astype("category")
        
        # Drop one-hot columns
        df = df.drop(columns, axis=1)
        
    # Make sure other categorical features have the correct type
    missing = (col for col in df.columns if col.endswith("_missing"))
    others = ["CryoSleep", "VIP", "Alone", "CabinNum", "GroupId", *missing]
    df[others] = df[others].astype("category")
        
    # Split and add dropped columns
    train_df = df.loc[tr_idx, :]
    train_df[drop] = dropped
    
    test_df = df.loc[te_idx, :]
    test_df["PassengerId"] = passenger_id
    
    return train_df, test_df

# Ensemble Class

The Ensemble class will implement all the logic required for stacking. It takes the training dataframe, the test dataframe and an optional list of strings which specifies the models that should be excluded from the ensemble. It has the following methods:

fit_level_one_models(): This method fits all the different models (Logistic Regression, XGBoost, etc.) on the original training data and also creates the training set and test sets for the meta-classifier that will generate the final predictions.
fit_level_two_model(): This method fits a logistic regression model on the dataset generated by fit_level_one_models() and gets the final predictions.

In [41]:
class Ensemble:
    def __init__(self, train_df, test_df, exclude=None):
        self.train_df = train_df
        self.test_df = test_df
        
        models = Config.MODELS.keys()
        
        # Exclude models
        if exclude is not None:
            models = models - exclude
            
        self.models = list(models)
        
        # Create empty dataframe that will store
        # The training set for the level 2 model
        columns = [f"{m}_preds" for m in self.models]
        extra_cols = ["Transported", "kfold"]
        
        meta_train_df = pd.DataFrame(columns=columns + extra_cols)
        meta_train_df[extra_cols] = train_df[extra_cols]
        
        self.meta_train_df = meta_train_df
        
        # Create empty dataframe that will store
        # The test set for the level 2 model
        meta_test_df = pd.DataFrame(columns=["PassengerId"] + columns)
        meta_test_df["PassengerId"] = test_df["PassengerId"]
        
        self.meta_test_df = meta_test_df
        
    def fit_level_one_models(self):
        print("Training level 1 models...")
        
        for model in self.models:
            if model == "lgb":
                # Modify dataset for LGBMClassifier
                train_df, test_df = lgb_datasets(self.train_df, self.test_df)
            else:
                train_df, test_df = self.train_df, self.test_df
            
            print(f"{Config.MODELS[model]}:")
            
            print("\tFinding optimal hyperparameters using Optuna...")
            params = hyperparameter_search(
                train_df=train_df, test_df=test_df, model=model
            )
            
            print(f"\n\tBest params: {params}\n")
            
            print("\tTraining model with optimal parameters...\n")
            val_preds, test_preds, acc = train(
                df=train_df,
                test_df=test_df,
                model=model,
                params=params
            )
            
            print("\tDone!\n")
            
            # Add predictions to the datasets for the level 2 model
            self.meta_train_df[f"{model}_preds"] = val_preds
            self.meta_test_df[f"{model}_preds"] = test_preds
            
    def fit_level_two_model(self):    
        print("Training a Logistic Regression model as level 2 model...")
        
        train_df = self.meta_train_df
        test_df = self.meta_test_df
        
        print("\tFinding optimal hyperparameters using Optuna...")
        params = hyperparameter_search(
            train_df=train_df,
            test_df=test_df,
            model="lr",
            n_trials=Config.L2_N_TRIALS,
        )

        print(f"\n\tBest params: {params}\n")

        print("\tTraining model with optimal parameters...\n")
        
        _, test_preds, _ = train(
            df=train_df,
            test_df=test_df,
            model="lr",
            params=params
        )
        
        print("\tDone!")
        
        self.meta_test_df["Transported"] = test_preds >= 0.5
        
        return self.meta_test_df

# Training

Finally, we will use the Ensemble class to train our ensemble model.

In [42]:
# Initialize the ensemble
ensemble = Ensemble(train, test)
ensemble.models

TypeError: 'function' object is not subscriptable

In [None]:
# Fit the level one models
ensemble.fit_level_one_models()

In [None]:
# Generated training set for level 2 model
ensemble.meta_train_df.head()

In [None]:
# Generated test set for level 2 model
ensemble.meta_test_df.head()

In [None]:
# Fit the level 2 model
test_predictions = ensemble.fit_level_two_model()

In [None]:
test_predictions.head()