# ICR - Identifying Age-Related Conditions: Training Notebook

In [1]:
import argparse
import gc
import os
import lightgbm as lgb
import numpy as np
import pandas as pd
import pickle
import shutil
import yaml
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from pprint import pprint
from sklearn.preprocessing import StandardScaler

In [2]:
def get_dataframes(data_path):
    """Load the four competition CSV files located under ``data_path``.

    Returns a ``(train, test, greeks, sub)`` tuple of DataFrames read from
    ``train.csv``, ``test.csv``, ``greeks.csv`` and ``sample_submission.csv``.
    In the train and test frames the headers that carry a trailing space
    ("BD ", "CD ", "CW ", "FD ") are renamed to their stripped form.
    """
    padded_headers = {"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"}

    def _load(file_name):
        return pd.read_csv(os.path.join(data_path, file_name))

    train = _load("train.csv")
    test = _load("test.csv")
    greeks = _load("greeks.csv")
    sub = _load("sample_submission.csv")

    train = train.rename(columns=padded_headers)
    test = test.rename(columns=padded_headers)
    return train, test, greeks, sub

In [3]:
def balanced_log_loss(y_true, y_pred):
    """Competition metric: binary log loss averaged equally over both classes.

    Args:
        y_true: Integer 0/1 labels (list, NumPy array or pandas Series). Both
            classes must be present; if class 1 is missing, indexing the
            class counts raises ``IndexError``.
        y_pred: Predicted probability of class 1 for each sample, aligned
            positionally with ``y_true``.

    Returns:
        The mean of the per-class average negative log-likelihoods.
    """
    # Coerce labels to an array: comparing a plain list with ``== 0`` yields a
    # single bool instead of an element-wise mask, which silently corrupts
    # the score.
    y_true = np.asarray(y_true)
    # Clip so that log(0) can never occur for over-confident predictions.
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    nc = np.bincount(y_true)
    balanced_log_loss_score = (
        -1.0 / nc[0] * (np.sum(np.where(y_true == 0, 1, 0) * np.log(1 - y_pred)))
        - 1.0 / nc[1] * (np.sum(np.where(y_true != 0, 1, 0) * np.log(y_pred)))
    ) / 2.0
    return balanced_log_loss_score

In [4]:
def calc_log_loss_weight(y_true):
    """Return inverse class-frequency weights ``(w0, w1)`` for 0/1 labels.

    Each weight is ``1 / (class count / number of samples)``, so the rarer
    class receives the larger weight.
    """
    class_counts = np.bincount(y_true)
    n_samples = y_true.shape[0]
    inverse_frequency = [1 / (class_counts[label] / n_samples) for label in (0, 1)]
    return inverse_frequency[0], inverse_frequency[1]

In [5]:
def train_catboost(x_train, y_train, x_valid, y_valid, categorical_features, cat_params):
    """Fit a CatBoost classifier on class-weighted train/validation pools.

    Per-sample weights come from ``calc_log_loss_weight`` and are computed
    separately for the training and the validation labels. The validation
    pool is passed as the eval set with ``use_best_model=True``.

    Returns:
        ``(model, valid_pred)`` where ``valid_pred`` is the second column of
        ``predict_proba`` on ``x_valid``.
    """
    train_weight_map = dict(zip((0, 1), calc_log_loss_weight(y_train)))
    valid_weight_map = dict(zip((0, 1), calc_log_loss_weight(y_valid)))

    train_pool = Pool(
        data=x_train,
        label=y_train,
        weight=y_train.map(train_weight_map),
        cat_features=categorical_features,
    )
    valid_pool = Pool(
        data=x_valid,
        label=y_valid,
        weight=y_valid.map(valid_weight_map),
        cat_features=categorical_features,
    )

    classifier = CatBoostClassifier(**cat_params)
    classifier.fit(train_pool, eval_set=[valid_pool], use_best_model=True)

    positive_proba = classifier.predict_proba(x_valid)[:, 1]
    return classifier, positive_proba

In [6]:
def train_lightgbm(
    x_train, y_train, x_valid, y_valid, categorical_features, lgb_params, train_params
):
    """Fit a LightGBM booster on class-weighted datasets with early stopping.

    Per-sample weights come from ``calc_log_loss_weight`` and are computed
    separately for the training and the validation labels. Both datasets are
    passed as validation sets. ``train_params`` supplies ``num_boost_round``,
    ``stopping_rounds`` and the logging ``period``.

    ``categorical_features`` is accepted for signature parity with the other
    trainers but is not used here.

    Returns:
        ``(model, valid_pred)`` where ``valid_pred`` is ``model.predict`` on
        ``x_valid``.
    """
    train_weight_map = dict(zip((0, 1), calc_log_loss_weight(y_train)))
    valid_weight_map = dict(zip((0, 1), calc_log_loss_weight(y_valid)))

    train_set = lgb.Dataset(x_train, y_train, weight=y_train.map(train_weight_map))
    valid_set = lgb.Dataset(x_valid, y_valid, weight=y_valid.map(valid_weight_map))

    booster = lgb.train(
        params=lgb_params,
        train_set=train_set,
        valid_sets=[train_set, valid_set],
        num_boost_round=train_params["num_boost_round"],
        callbacks=[
            lgb.early_stopping(train_params["stopping_rounds"], verbose=True),
            lgb.log_evaluation(train_params["period"]),
        ],
    )

    return booster, booster.predict(x_valid)

In [7]:
def train_xgboost(x_train, y_train, x_valid, y_valid, categorical_features, xgb_params, train):
    """XGBoost training on class-weighted DMatrix objects.

    Args:
        x_train, y_train: Training features and 0/1 labels (pandas objects;
            ``y_train.map`` is used to build per-sample weights).
        x_valid, y_valid: Validation features and labels, weighted the same
            way from their own class counts.
        categorical_features: Accepted for signature parity with the other
            trainers; not used here.
        xgb_params: Booster parameters. Not modified by this function.
        train: Extra keyword arguments forwarded to ``xgb.train``.

    Returns:
        ``(model, valid_pred)`` where ``valid_pred`` holds the predictions on
        ``x_valid`` from the best iteration when one was recorded, otherwise
        from all boosting rounds.
    """
    train_w0, train_w1 = calc_log_loss_weight(y_train)
    valid_w0, valid_w1 = calc_log_loss_weight(y_valid)
    xgb_train = xgb.DMatrix(
        data=x_train, label=y_train, weight=y_train.map({0: train_w0, 1: train_w1})
    )
    xgb_valid = xgb.DMatrix(
        data=x_valid, label=y_valid, weight=y_valid.map({0: valid_w0, 1: valid_w1})
    )

    # Work on a copy so the caller's parameter dict, which is reused for every
    # fold, is not mutated as a side effect.
    params = dict(xgb_params)
    # NOTE(review): the classes are already balanced through the sample
    # weights above, so scale_pos_weight re-weights positives a second time.
    # Kept as in the original; confirm this is intended.
    params["scale_pos_weight"] = train_w1 / train_w0
    model = xgb.train(
        params, dtrain=xgb_train, evals=[(xgb_train, "train"), (xgb_valid, "eval")], **train
    )

    # ``Booster.best_ntree_limit`` was removed in XGBoost 2.0 and counts trees
    # rather than iterations; ``iteration_range`` expects iterations. Fall back
    # to (0, 0), meaning all rounds, when no best iteration was recorded.
    best_iteration = getattr(model, "best_iteration", None)
    iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)
    valid_pred = model.predict(xgb.DMatrix(x_valid), iteration_range=iteration_range)
    return model, valid_pred

In [8]:
def _fit_fold_model(config, x_train, y_train, x_valid, y_valid, cat_features):
    """Train one model of the configured method; return ``(model, valid_pred)``."""
    method = config["method"]
    if method == "catboost":
        return train_catboost(x_train, y_train, x_valid, y_valid, cat_features, config["model"])
    if method == "lightgbm":
        return train_lightgbm(
            x_train, y_train, x_valid, y_valid, cat_features, config["model"], config["train"]
        )
    if method == "xgboost":
        return train_xgboost(
            x_train, y_train, x_valid, y_valid, cat_features, config["model"], config["train"]
        )
    raise ValueError(f"Unsupported method: {method!r}")


def _predict_positive_proba(method, model, x_data):
    """Return the class-1 predictions of a fitted ``method`` model on ``x_data``."""
    if method == "catboost":
        return model.predict_proba(x_data)[:, 1]
    if method == "lightgbm":
        return model.predict(x_data)
    if method == "xgboost":
        # ``best_ntree_limit`` was removed in XGBoost 2.0; use the recorded best
        # iteration when available, otherwise (0, 0), meaning all rounds.
        best_iteration = getattr(model, "best_iteration", None)
        iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)
        return model.predict(xgb.DMatrix(x_data), iteration_range=iteration_range)
    raise ValueError(f"Unsupported method: {method!r}")


def run_training(config_path):
    """Run nested stratified cross-validation as described by a YAML config.

    The config is read with ``yaml.safe_load`` and must provide ``data_path``,
    ``models_path``, ``method`` ("catboost", "lightgbm" or "xgboost"),
    ``model``, ``folds`` (keyword arguments for ``MultilabelStratifiedKFold``)
    and, for lightgbm/xgboost, ``train``. Optional keys: ``denoise`` and
    ``scaler`` ("StandardScaler" or "RobustScaler").

    For every outer fold, an inner cross-validation is run on the outer
    training part; the inner models are then averaged to predict the outer
    validation part. Scores are printed; nothing is returned.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    with open(config_path, "r", encoding="utf-8") as file_obj:
        config = yaml.safe_load(file_obj)

    # Fail fast on a bad method name instead of hitting an unbound variable
    # deep inside the fold loop.
    method = config["method"]
    if method not in ("catboost", "lightgbm", "xgboost"):
        raise ValueError(f"Unsupported method: {method!r}")

    df_train, _, df_greeks, _ = get_dataframes(config["data_path"])

    # The first and last columns are skipped when collecting features
    # (presumably the Id and Class columns -- confirm against train.csv).
    cat_features = ["EJ"]
    num_features = [col for col in df_train.columns[1:-1] if col != "EJ"]
    features = num_features + cat_features

    df_train["EJ"] = df_train["EJ"].map({"A": 1, "B": 0})
    if config.get("denoise"):
        df_train[num_features] = df_train[num_features].apply(lambda i: np.floor(i * 10))

    # Honour the configured scaler name. Previously "RobustScaler" silently
    # instantiated StandardScaler.
    scaler_name = config.get("scaler")
    if scaler_name in ("StandardScaler", "RobustScaler"):
        # Local import: only StandardScaler is imported at file level.
        from sklearn.preprocessing import RobustScaler

        scaler_classes = {"StandardScaler": StandardScaler, "RobustScaler": RobustScaler}
        scaler = scaler_classes[scaler_name]()
        df_train[num_features] = scaler.fit_transform(df_train[num_features])

    os.makedirs(config["models_path"], exist_ok=True)
    shutil.copy(config_path, os.path.join(config["models_path"], "config.yaml"))

    gc.enable()

    print(f"\nTraining {method} model")

    fold_scores = []  # outer-fold scores of the averaged inner models
    fold_scores_ = []  # inner out-of-fold score for each outer fold
    oof_preds = np.zeros(len(df_train))

    # Folds are stratified on all greeks columns except the first and last.
    # NOTE(review): assumes greeks rows are aligned positionally with train rows.
    kfold = MultilabelStratifiedKFold(**config["folds"])
    outer_splits = kfold.split(df_train, df_greeks.iloc[:, 1:-1])
    for fold, (train_idx, valid_idx) in enumerate(outer_splits):
        print("")

        df_train_inner = df_train.iloc[train_idx]
        df_greeks_inner = df_greeks.iloc[train_idx]
        x_valid_outer = df_train[features].iloc[valid_idx]
        y_valid_outer = df_train["Class"].iloc[valid_idx]

        fold_inner_scores = []
        oof_inner_preds = np.zeros(len(df_train_inner))
        models = []

        kfold_inner = MultilabelStratifiedKFold(**config["folds"])
        inner_splits = kfold_inner.split(df_train_inner, df_greeks_inner.iloc[:, 1:-1])
        for fold_inner, (train_idx_inner, valid_idx_inner) in enumerate(inner_splits):
            x_train = df_train_inner[features].iloc[train_idx_inner]
            y_train = df_train_inner["Class"].iloc[train_idx_inner]
            x_valid = df_train_inner[features].iloc[valid_idx_inner]
            y_valid = df_train_inner["Class"].iloc[valid_idx_inner]

            model, valid_pred = _fit_fold_model(
                config, x_train, y_train, x_valid, y_valid, cat_features
            )

            oof_inner_preds[valid_idx_inner] = valid_pred

            fold_inner_scores.append(balanced_log_loss(y_valid, valid_pred))
            print(f"Outer fold {fold+1}, Inner fold {fold_inner+1} OOF CV: {fold_inner_scores[-1]:.4f}")

            models.append(model)

            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        print(f"mean/std: {np.mean(fold_inner_scores):.4f}/{np.std(fold_inner_scores):.4f}")
        score = balanced_log_loss(df_train_inner["Class"], oof_inner_preds)
        print(f"Inner OOF CV score is {score:.4f}")
        fold_scores_.append(score)

        # Average the inner-fold models to predict the outer validation part.
        valid_pred = np.zeros(len(x_valid_outer))
        for model in models:
            valid_pred += _predict_positive_proba(method, model, x_valid_outer)
        valid_pred = valid_pred / len(models)

        oof_preds[valid_idx] = valid_pred

        fold_scores.append(balanced_log_loss(y_valid_outer, valid_pred))
        print(f"Outer Fold {fold+1} OOF CV: {fold_scores[-1]:.4f}")

    print("")
    for outer_score, inner_score in zip(fold_scores, fold_scores_):
        print(f"{outer_score:.4f} {inner_score:.4f}")
    print(f"mean/std: {np.mean(fold_scores):.4f}/{np.std(fold_scores):.4f}")
    print(f"mean/std_: {np.mean(fold_scores_):.4f}/{np.std(fold_scores_):.4f}")
    score = balanced_log_loss(df_train["Class"], oof_preds)
    print(f"Outer OOF CV score is {score:.4f}")

In [9]:
# Run the nested cross-validation using the settings in config.yaml
# (the relative path is resolved against the current working directory).
run_training("config.yaml")


Training xgboost model

[0]	train-logloss:0.67558	eval-logloss:0.67499
[100]	train-logloss:0.19361	eval-logloss:0.32319
[200]	train-logloss:0.07341	eval-logloss:0.25595
[300]	train-logloss:0.03077	eval-logloss:0.23198
[400]	train-logloss:0.01528	eval-logloss:0.23984
[500]	train-logloss:0.00985	eval-logloss:0.23830
[600]	train-logloss:0.00748	eval-logloss:0.24214
[700]	train-logloss:0.00622	eval-logloss:0.24813
[800]	train-logloss:0.00543	eval-logloss:0.25289
[900]	train-logloss:0.00496	eval-logloss:0.25799
[1000]	train-logloss:0.00468	eval-logloss:0.26219
[1100]	train-logloss:0.00445	eval-logloss:0.26354
[1200]	train-logloss:0.00422	eval-logloss:0.26421
[1300]	train-logloss:0.00408	eval-logloss:0.26348
[1311]	train-logloss:0.00406	eval-logloss:0.26306
Outer fold 1, Inner fold 1 OOF CV: 0.2300
[0]	train-logloss:0.67499	eval-logloss:0.67941
[100]	train-logloss:0.18893	eval-logloss:0.29181
[200]	train-logloss:0.07041	eval-logloss:0.24120
[300]	train-logloss:0.02922	eval-logloss:0.25767
[

[900]	train-logloss:0.00477	eval-logloss:0.33082
[1000]	train-logloss:0.00448	eval-logloss:0.33751
[1100]	train-logloss:0.00426	eval-logloss:0.33918
[1185]	train-logloss:0.00408	eval-logloss:0.34775
Outer fold 2, Inner fold 5 OOF CV: 0.2120
mean/std: 0.1929/0.0899
Inner OOF CV score is 0.1923
Outer Fold 2 OOF CV: 0.3529

[0]	train-logloss:0.67747	eval-logloss:0.67609
[100]	train-logloss:0.20279	eval-logloss:0.24226
[200]	train-logloss:0.07501	eval-logloss:0.17505
[300]	train-logloss:0.03028	eval-logloss:0.19359
[400]	train-logloss:0.01509	eval-logloss:0.22217
[500]	train-logloss:0.00951	eval-logloss:0.23232
[600]	train-logloss:0.00733	eval-logloss:0.25077
[700]	train-logloss:0.00613	eval-logloss:0.26444
[800]	train-logloss:0.00537	eval-logloss:0.26754
[900]	train-logloss:0.00485	eval-logloss:0.27135
[1000]	train-logloss:0.00462	eval-logloss:0.27132
[1100]	train-logloss:0.00440	eval-logloss:0.27135
[1200]	train-logloss:0.00422	eval-logloss:0.27107
[1204]	train-logloss:0.00421	eval-loglo

[2100]	train-logloss:0.00323	eval-logloss:0.21149
[2200]	train-logloss:0.00320	eval-logloss:0.21042
[2300]	train-logloss:0.00318	eval-logloss:0.21025
[2400]	train-logloss:0.00316	eval-logloss:0.20986
[2500]	train-logloss:0.00315	eval-logloss:0.20902
[2600]	train-logloss:0.00313	eval-logloss:0.20916
[2700]	train-logloss:0.00313	eval-logloss:0.20846
[2800]	train-logloss:0.00312	eval-logloss:0.20847
[2900]	train-logloss:0.00312	eval-logloss:0.20841
[3000]	train-logloss:0.00310	eval-logloss:0.20881
[3100]	train-logloss:0.00310	eval-logloss:0.20844
[3200]	train-logloss:0.00310	eval-logloss:0.20833
[3300]	train-logloss:0.00310	eval-logloss:0.20782
[3400]	train-logloss:0.00309	eval-logloss:0.20743
[3500]	train-logloss:0.00309	eval-logloss:0.20752
[3600]	train-logloss:0.00307	eval-logloss:0.20742
[3700]	train-logloss:0.00308	eval-logloss:0.20738
[3800]	train-logloss:0.00308	eval-logloss:0.20718
[3900]	train-logloss:0.00307	eval-logloss:0.20711
[4000]	train-logloss:0.00307	eval-logloss:0.20686


[300]	train-logloss:0.03115	eval-logloss:0.13506
[400]	train-logloss:0.01525	eval-logloss:0.12459
[500]	train-logloss:0.00952	eval-logloss:0.12367
[600]	train-logloss:0.00723	eval-logloss:0.12362
[700]	train-logloss:0.00602	eval-logloss:0.12609
[800]	train-logloss:0.00523	eval-logloss:0.12554
[900]	train-logloss:0.00479	eval-logloss:0.12697
[1000]	train-logloss:0.00450	eval-logloss:0.13112
[1100]	train-logloss:0.00427	eval-logloss:0.13173
[1200]	train-logloss:0.00408	eval-logloss:0.13377
[1300]	train-logloss:0.00393	eval-logloss:0.13651
[1400]	train-logloss:0.00380	eval-logloss:0.13876
[1500]	train-logloss:0.00369	eval-logloss:0.14186
[1558]	train-logloss:0.00364	eval-logloss:0.14242
Outer fold 5, Inner fold 4 OOF CV: 0.1215
[0]	train-logloss:0.68024	eval-logloss:0.68443
[100]	train-logloss:0.19331	eval-logloss:0.33052
[200]	train-logloss:0.06778	eval-logloss:0.28106
[300]	train-logloss:0.02669	eval-logloss:0.29443
[400]	train-logloss:0.01301	eval-logloss:0.31334
[500]	train-logloss:0.