# ICR - Identifying Age-Related Conditions: Training Notebook

In [1]:
import argparse
import gc
import os
import lightgbm as lgb
import numpy as np
import pandas as pd
import pickle
import shutil
import yaml
import xgboost as xgb
from catboost import Pool, CatBoostClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from pprint import pprint
from sklearn.preprocessing import StandardScaler

In [2]:
def get_dataframes(data_path):
    """Load the four competition CSV files located in ``data_path``.

    Args:
        data_path: Directory containing ``train.csv``, ``test.csv``,
            ``greeks.csv`` and ``sample_submission.csv``.

    Returns:
        Tuple ``(train, test, greeks, sub)`` of DataFrames. In ``train`` and
        ``test`` the columns whose headers carry a trailing space
        (``"BD "``, ``"CD "``, ``"CW "``, ``"FD "``) are renamed to the
        stripped form.
    """
    # Headers that appear with a trailing blank mapped to their clean names.
    padded_to_clean = {"BD ": "BD", "CD ": "CD", "CW ": "CW", "FD ": "FD"}

    def _load(file_name):
        return pd.read_csv(os.path.join(data_path, file_name))

    train = _load("train.csv").rename(columns=padded_to_clean)
    test = _load("test.csv").rename(columns=padded_to_clean)
    greeks = _load("greeks.csv")
    sub = _load("sample_submission.csv")
    return train, test, greeks, sub

In [3]:
def balanced_log_loss(y_true, y_pred):
    """Compute the class-balanced binary log loss (competition metric).

    Each class contributes the mean negative log-likelihood of its own
    samples, and the two class means are averaged.

    Args:
        y_true: Non-negative integer labels; 0 is the negative class and any
            non-zero value is treated as positive.
        y_pred: Predicted probability of the positive class, same length as
            ``y_true``. Values are clipped to ``[1e-15, 1 - 1e-15]`` before
            taking logarithms.

    Returns:
        The balanced log loss as a float.
    """
    probs = np.clip(y_pred, 1e-15, 1 - 1e-15)
    class_counts = np.bincount(y_true)

    # 0/1 indicator vectors selecting the samples of each class.
    is_negative = np.where(y_true == 0, 1, 0)
    is_positive = np.where(y_true != 0, 1, 0)

    negative_term = -1.0 / class_counts[0] * (np.sum(is_negative * np.log(1 - probs)))
    positive_term = 1.0 / class_counts[1] * (np.sum(is_positive * np.log(probs)))
    return (negative_term - positive_term) / 2.0

In [4]:
def calc_log_loss_weight(y_true):
    """Return inverse-frequency weights for the two classes in ``y_true``.

    Args:
        y_true: Array-like of 0/1 integer labels exposing ``.shape``; both
            classes must be present.

    Returns:
        Tuple ``(w0, w1)`` where each weight is the reciprocal of that
        class's share of the samples.
    """
    class_counts = np.bincount(y_true)
    n_samples = y_true.shape[0]
    weight_negative = 1 / (class_counts[0] / n_samples)
    weight_positive = 1 / (class_counts[1] / n_samples)
    return weight_negative, weight_positive

In [5]:
def train_catboost(x_train, y_train, x_valid, y_valid, categorical_features, cat_params):
    """Fit a CatBoost classifier on one fold and predict its validation part.

    Samples in each split are weighted by the inverse frequency of their
    class within that split (see ``calc_log_loss_weight``).

    Args:
        x_train: Training feature frame.
        y_train: Training labels; must support ``.map`` (pandas Series).
        x_valid: Validation feature frame.
        y_valid: Validation labels; must support ``.map``.
        categorical_features: Passed to ``Pool`` as ``cat_features``.
        cat_params: Keyword arguments for ``CatBoostClassifier``.

    Returns:
        Tuple ``(model, valid_pred)`` where ``valid_pred`` is the predicted
        probability of class 1 for ``x_valid``.
    """

    def _weighted_pool(features, labels):
        # Class weights are derived from the labels of this split only.
        w_negative, w_positive = calc_log_loss_weight(labels)
        return Pool(
            data=features,
            label=labels,
            weight=labels.map({0: w_negative, 1: w_positive}),
            cat_features=categorical_features,
        )

    train_pool = _weighted_pool(x_train, y_train)
    valid_pool = _weighted_pool(x_valid, y_valid)

    classifier = CatBoostClassifier(**cat_params)
    classifier.fit(train_pool, eval_set=[valid_pool], use_best_model=True)

    # Column 1 of predict_proba holds the positive-class probability.
    positive_proba = classifier.predict_proba(x_valid)[:, 1]
    return classifier, positive_proba

In [6]:
def train_lightgbm(
    x_train, y_train, x_valid, y_valid, categorical_features, lgb_params, train_params
):
    """Fit a LightGBM booster on one fold and predict its validation part.

    Samples in each split are weighted by the inverse frequency of their
    class within that split (see ``calc_log_loss_weight``).

    Args:
        x_train: Training feature frame.
        y_train: Training labels; must support ``.map`` (pandas Series).
        x_valid: Validation feature frame.
        y_valid: Validation labels; must support ``.map``.
        categorical_features: Accepted for signature parity with the other
            trainers; it is not forwarded to LightGBM here.
        lgb_params: Booster parameters passed to ``lgb.train``.
        train_params: Mapping providing ``"num_boost_round"``,
            ``"stopping_rounds"`` (early stopping) and ``"period"``
            (logging interval).

    Returns:
        Tuple ``(model, valid_pred)`` with the trained booster and its
        predictions for ``x_valid``.
    """

    def _weighted_dataset(features, labels):
        # Class weights are derived from the labels of this split only.
        w_negative, w_positive = calc_log_loss_weight(labels)
        return lgb.Dataset(
            features,
            labels,
            weight=labels.map({0: w_negative, 1: w_positive}),
        )

    train_set = _weighted_dataset(x_train, y_train)
    valid_set = _weighted_dataset(x_valid, y_valid)

    training_callbacks = [
        lgb.early_stopping(train_params["stopping_rounds"]),
        lgb.log_evaluation(train_params["period"]),
    ]
    booster = lgb.train(
        params=lgb_params,
        train_set=train_set,
        valid_sets=[train_set, valid_set],
        num_boost_round=train_params["num_boost_round"],
        callbacks=training_callbacks,
    )

    return booster, booster.predict(x_valid)

In [7]:
def train_xgboost(x_train, y_train, x_valid, y_valid, categorical_features, xgb_params, train):
    """Fit an XGBoost booster on one fold and predict its validation part.

    Samples in each split are weighted by the inverse frequency of their
    class within that split (see ``calc_log_loss_weight``), and
    ``scale_pos_weight`` is set to the ratio of the training class weights.

    Args:
        x_train: Training feature frame.
        y_train: Training labels; must support ``.map`` (pandas Series).
        x_valid: Validation feature frame.
        y_valid: Validation labels; must support ``.map``.
        categorical_features: Accepted for signature parity with the other
            trainers; it is not forwarded to XGBoost here.
        xgb_params: Booster parameters. The mapping is copied, so the
            caller's object is left untouched.
        train: Extra keyword arguments forwarded to ``xgb.train`` (for
            example ``num_boost_round`` / ``early_stopping_rounds``).

    Returns:
        Tuple ``(model, valid_pred)`` with the trained booster and its
        predictions for ``x_valid``, limited to the best iteration when the
        booster recorded one.
    """
    train_w0, train_w1 = calc_log_loss_weight(y_train)
    valid_w0, valid_w1 = calc_log_loss_weight(y_valid)
    xgb_train = xgb.DMatrix(
        data=x_train, label=y_train, weight=y_train.map({0: train_w0, 1: train_w1})
    )
    xgb_valid = xgb.DMatrix(
        data=x_valid, label=y_valid, weight=y_valid.map({0: valid_w0, 1: valid_w1})
    )

    # Copy before adding the fold-specific key so the shared config dict is
    # not mutated from inside the cross-validation loop.
    params = dict(xgb_params)
    params["scale_pos_weight"] = train_w1 / train_w0
    model = xgb.train(
        params, dtrain=xgb_train, evals=[(xgb_train, "train"), (xgb_valid, "eval")], **train
    )

    # `iteration_range` counts boosting rounds. `best_ntree_limit` counted
    # trees and was removed in XGBoost 2.0, so use `best_iteration` instead.
    # On XGBoost >= 2.0 the attribute raises AttributeError when early
    # stopping was not used; getattr() turns that into None.
    best_iteration = getattr(model, "best_iteration", None)
    valid_matrix = xgb.DMatrix(x_valid)
    if best_iteration is None:
        valid_pred = model.predict(valid_matrix)
    else:
        valid_pred = model.predict(valid_matrix, iteration_range=(0, best_iteration + 1))
    return model, valid_pred

In [8]:
def run_training_stack(config_path, models):
    """Train a stacking model with stratified K-fold CV and save its artefacts.

    The training table is extended with the out-of-fold predictions of the
    given base models, then one model per fold is trained with the method
    selected in the config.

    Config keys read: ``data_path``, ``denoise``, ``scaler``,
    ``models_path``, ``method`` (``"catboost"``, ``"lightgbm"`` or
    ``"xgboost"``), ``folds`` (keyword arguments for
    ``MultilabelStratifiedKFold``), ``model`` and, for LightGBM/XGBoost,
    ``train``.

    Files written to ``config["models_path"]``: a copy of the config as
    ``config.yaml``, one ``model_f{k}.pkl`` per fold, and ``oof.csv`` with
    columns ``Id``, ``Class``, ``prediction`` and ``fold``.

    Args:
        config_path: Path to the YAML configuration file.
        models: Iterable of directories, each containing an ``oof.csv`` with
            a ``prediction`` column; these become features ``OOF_0``,
            ``OOF_1``, ... in the given order.

    Raises:
        ValueError: If ``config["method"]`` is not a supported method.
    """
    with open(config_path, "r", encoding="utf-8") as file_obj:
        config = yaml.safe_load(file_obj)

    print(config_path)
    pprint(config)

    # Fail early with a clear message instead of hitting an unbound `model`
    # inside the first fold.
    supported_methods = ("catboost", "lightgbm", "xgboost")
    method = config["method"]
    if method not in supported_methods:
        raise ValueError(
            f"Unsupported method {method!r}; expected one of {supported_methods}"
        )

    df_train, _, df_greeks, _ = get_dataframes(config["data_path"])

    cat_features = ["EJ"]
    # Every column between the first (Id) and the last (Class) is a feature.
    num_features = list(df_train.columns)[1:-1]
    num_features.remove("EJ")
    print("Original number of numerical features:", len(num_features))
    print("Original number of categorical features:", len(cat_features))
    features = num_features + cat_features

    str2int_dict = {}
    str2int_dict["EJ"] = {"A": 1, "B": 0}
    df_train["EJ"] = df_train["EJ"].map(str2int_dict["EJ"])
    if config["denoise"]:
        # Coarsen numeric values: keep one decimal digit as an integer.
        df_train[num_features] = df_train[num_features].apply(lambda i: np.floor(i * 10))
    if config["scaler"] == "RobustScaler":
        # NOTE(review): the config value says "RobustScaler" but a
        # StandardScaler is applied. Left as-is so results stay comparable
        # with previously trained models; confirm which one is intended.
        scaler = StandardScaler()
        df_train[num_features] = scaler.fit_transform(df_train[num_features])

    # Append the base models' out-of-fold predictions as extra features.
    # Assignment aligns on the row index, so each oof.csv is assumed to be
    # in the same row order as train.csv.
    for i, model_path in enumerate(models):
        oof = pd.read_csv(os.path.join(model_path, "oof.csv"))
        df_train[f"OOF_{i}"] = oof["prediction"]
        features += [f"OOF_{i}"]

    os.makedirs(config["models_path"], exist_ok=True)
    shutil.copy(config_path, os.path.join(config["models_path"], "config.yaml"))

    gc.enable()

    print(f"\nTraining {config['method']} model")

    fold_scores = []
    oof_preds = np.zeros(len(df_train))
    oof_fold = np.zeros(len(df_train))

    kfold = MultilabelStratifiedKFold(**config["folds"])
    # Stratify on the greeks columns between the first and the last one.
    for fold, (train_idx, valid_idx) in enumerate(kfold.split(df_train, df_greeks.iloc[:, 1:-1])):
        print(f"\nFold {fold + 1}", flush=True)

        x_train = df_train[features].iloc[train_idx]
        y_train = df_train["Class"].iloc[train_idx]
        x_valid = df_train[features].iloc[valid_idx]
        y_valid = df_train["Class"].iloc[valid_idx]

        if method == "catboost":
            model, valid_pred = train_catboost(
                x_train, y_train, x_valid, y_valid, cat_features, config["model"]
            )
        elif method == "lightgbm":
            model, valid_pred = train_lightgbm(
                x_train, y_train, x_valid, y_valid, cat_features, config["model"], config["train"]
            )
        else:  # "xgboost" - the method was validated above
            model, valid_pred = train_xgboost(
                x_train, y_train, x_valid, y_valid, cat_features, config["model"], config["train"]
            )

        oof_preds[valid_idx] = valid_pred
        oof_fold[valid_idx] = fold + 1

        fold_scores.append(balanced_log_loss(y_valid, valid_pred))
        print(f"Fold {fold+1} OOF CV: {fold_scores[-1]:.4f}")

        with open(os.path.join(config["models_path"], f"model_f{fold + 1}.pkl"), "wb") as file_obj:
            pickle.dump(model, file_obj)

        # Release per-fold objects before the next fold allocates its own.
        del x_train, x_valid, y_train, y_valid, model, valid_pred
        gc.collect()

    print("")
    # Iterate the collected scores directly so the summary does not depend
    # on an explicit "n_splits" entry in the config.
    for fold_number, fold_score in enumerate(fold_scores, start=1):
        print(f"Fold {fold_number} {fold_score:.4f}")
    print(f"mean/std: {np.mean(fold_scores):.4f}/{np.std(fold_scores):.4f}")
    score = balanced_log_loss(df_train["Class"], oof_preds)
    print(f"OOF CV score is {score:.4f}")

    oof_df = pd.DataFrame(
        {
            "Id": df_train["Id"],
            "Class": df_train["Class"],
            "prediction": oof_preds,
            "fold": oof_fold,
        }
    )

    oof_df.to_csv(os.path.join(config["models_path"], "oof.csv"), index=False)

In [9]:
# Directories of the base models; each must contain the oof.csv that the
# stacker reads as an extra feature.
models = [
    "/home/trushin/Kaggle/ICR/MODELS/catboost_1748",
    "/home/trushin/Kaggle/ICR/MODELS/lightgbm_1831",
    "/home/trushin/Kaggle/ICR/MODELS/xgboost_1851_dart",
]
run_training_stack(config_path="config.yaml", models=models)

config.yaml
{'data_path': '/home/trushin/Kaggle/icr-identify-age-related-conditions/',
 'denoise': True,
 'folds': {'n_splits': 10, 'random_state': 42, 'shuffle': True},
 'method': 'catboost',
 'model': {'auto_class_weights': 'Balanced',
           'colsample_bylevel': 0.75,
           'depth': 4,
           'diffusion_temperature': 500,
           'early_stopping_rounds': 5000,
           'grow_policy': 'Depthwise',
           'iterations': 500000,
           'l2_leaf_reg': 1.0,
           'langevin': True,
           'learning_rate': 0.001,
           'min_data_in_leaf': 52,
           'random_seed': 42,
           'random_strength': 10.0,
           'subsample': 0.8,
           'verbose': 1000},
 'models_path': 'models',
 'scaler': 'RobustScaler'}
Original number of numerical features: 55
Original number of categorical features: 1

Training catboost model

Fold 1
0:	learn: 0.6927031	test: 0.6926871	best: 0.6926871 (0)	total: 47ms	remaining: 6h 31m 18s
1000:	learn: 0.4338248	test: 0.

2000:	learn: 0.3344103	test: 0.3171749	best: 0.3171749 (2000)	total: 1.05s	remaining: 4m 21s
3000:	learn: 0.2708966	test: 0.2509115	best: 0.2509115 (3000)	total: 1.57s	remaining: 4m 20s
4000:	learn: 0.2321872	test: 0.2147797	best: 0.2147797 (4000)	total: 2.15s	remaining: 4m 26s
5000:	learn: 0.2037392	test: 0.1880275	best: 0.1880275 (5000)	total: 2.67s	remaining: 4m 24s
6000:	learn: 0.1833856	test: 0.1703672	best: 0.1703672 (6000)	total: 3.18s	remaining: 4m 21s
7000:	learn: 0.1660727	test: 0.1575166	best: 0.1575166 (7000)	total: 3.7s	remaining: 4m 20s
8000:	learn: 0.1494116	test: 0.1458580	best: 0.1458455 (7998)	total: 4.21s	remaining: 4m 19s
9000:	learn: 0.1305093	test: 0.1330708	best: 0.1330708 (9000)	total: 4.73s	remaining: 4m 18s
10000:	learn: 0.1050069	test: 0.1167151	best: 0.1166877 (9997)	total: 5.26s	remaining: 4m 17s
11000:	learn: 0.0757661	test: 0.1013139	best: 0.1013139 (11000)	total: 5.81s	remaining: 4m 18s
12000:	learn: 0.0534951	test: 0.0907406	best: 0.0907406 (12000)	tota

12000:	learn: 0.0524501	test: 0.1459366	best: 0.1434626 (11346)	total: 6.43s	remaining: 4m 21s
13000:	learn: 0.0403472	test: 0.1536776	best: 0.1434626 (11346)	total: 6.86s	remaining: 4m 16s
14000:	learn: 0.0326089	test: 0.1645804	best: 0.1434626 (11346)	total: 7.26s	remaining: 4m 12s
15000:	learn: 0.0267144	test: 0.1741855	best: 0.1434626 (11346)	total: 7.67s	remaining: 4m 7s
16000:	learn: 0.0224707	test: 0.1844480	best: 0.1434626 (11346)	total: 8.08s	remaining: 4m 4s
Stopped by overfitting detector  (5000 iterations wait)

bestTest = 0.1434625624
bestIteration = 11346

Shrink model to first 11347 iterations.
Fold 8 OOF CV: 0.1434

Fold 9
0:	learn: 0.6930793	test: 0.6930836	best: 0.6930836 (0)	total: 577us	remaining: 4m 48s
1000:	learn: 0.4432146	test: 0.4249962	best: 0.4249962 (1000)	total: 536ms	remaining: 4m 27s
2000:	learn: 0.3331140	test: 0.3087892	best: 0.3087892 (2000)	total: 1.06s	remaining: 4m 23s
3000:	learn: 0.2731462	test: 0.2453171	best: 0.2452941 (2999)	total: 1.59s	remai