In [2]:
import time
import numpy as np
import pandas as pd
import catboost as cb

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Notebook-wide display setting: show up to 30 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 30)

## Useful Functions

In [3]:
def get_input(
    data_path: str,
    base_path: str = "../geekbrains-competitive-data-analysis",
) -> pd.DataFrame:
    """
    Read a CSV file and print basic information about the dataset.

    Parameters
    ----------
    data_path: str
        File name, resolved relative to ``base_path``.

    base_path: str, optional, default = "../geekbrains-competitive-data-analysis"
        Directory that contains the data files. The default keeps the
        location this notebook has always read from; pass another directory
        to reuse the loader elsewhere.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Loaded dataset with every column name converted to lower case.

    """
    data = pd.read_csv(f"{base_path}/{data_path}")
    # Lower-case the headers so the rest of the notebook can use one spelling.
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data


def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Cross-validation for a CatBoost classifier.

    One model is fitted per fold; the predictions made on each fold's
    validation part are collected into a single out-of-fold (OOF) vector.

    Parameters
    ----------
    params: dict
        Hyper-parameters forwarded to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training. It is not modified: when
        categorical columns are given, the string cast is done on a copy.

    y: pandas.core.frame.Series or array-like
        Target vector, aligned with ``X`` by position.

    cv: KFold or StratifiedKFold generator.
        Splitter object whose ``split(X, y)`` yields positional
        (train, validation) index arrays.

    categorical: str or sequence of str, optional, default = None
        Name(s) of the categorical features. These columns are cast to
        ``str`` before training. By default no categorical features are used.

    Returns
    -------
    estimators: list
        Fitted models, one per fold.

    oof_preds: np.array
        OOF predictions (probability of class 1), ordered like the rows of X.

    """
    # Normalise `categorical` to a plain list; None / a single name / any
    # sequence (including a pandas Index) are all accepted.
    if categorical is None:
        cat_features = []
    elif isinstance(categorical, str):
        cat_features = [categorical]
    else:
        cat_features = list(categorical)

    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])
    # Positional view of the target: cv.split yields positions, not labels.
    y_values = np.asarray(y)

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    if cat_features:
        # Work on a copy so the caller's frame keeps its original dtypes.
        X = X.copy()
        X[cat_features] = X[cat_features].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        # .iloc (not .loc): the indices are positions, so this also works
        # when X does not carry a 0..n-1 index.
        x_train, x_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y_values[train_idx], y_values[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train,
            cat_features=cat_features or None,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds


def catboost_hold_out_validation(params, X, y, split_params = [0.7, 0.2, 0.1], categorical = None):
    """
    Hold-out validation for a CatBoost classifier.

    Parameters
    ----------
    params: dict
        Hyper-parameters forwarded to ``cb.CatBoostClassifier``.

    X: pandas.core.frame.DataFrame
        Feature matrix used for training. It is not modified: when
        categorical columns are given, the string cast is done on a copy.

    y: pandas.core.frame.Series
        Target vector, aligned with ``X``.

    split_params: List[float], optional, default = [0.7, 0.2, 0.1]
        Split fractions. ``split_params[0]`` is the training share. When a
        third value is present, ``int(split_params[2] * n_rows)`` rows are
        carved out of the hold-out part as a test set. The list is only
        read, never modified.

    categorical: str or sequence of str, optional, default = None
        Name(s) of the categorical features. These columns are cast to
        ``str`` before training, as in ``catboost_cross_validation``.
        By default no categorical features are used.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Fitted CatBoost classifier.

    test_prediction: np.array, optional
        Predicted class-1 probabilities for the test part.
        Returned only when split_params contains 3 values.

    """
    # Normalise `categorical` to a plain list (None / single name / sequence).
    if categorical is None:
        cat_features = []
    elif isinstance(categorical, str):
        cat_features = [categorical]
    else:
        cat_features = list(categorical)

    if cat_features:
        # Work on a copy so the caller's frame keeps its original dtypes.
        X = X.copy()
        X[cat_features] = X[cat_features].astype(str)

    # Split features and target in one call so they cannot get out of sync.
    x_train, x_valid, y_train, y_valid = train_test_split(
        X, y, train_size=split_params[0], random_state=27
    )

    with_test_part = len(split_params) == 3
    if with_test_part:
        # Absolute number of test rows, expressed as a share of the full data.
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test, y_valid, y_test = train_test_split(
            x_valid, y_valid, test_size=test_size, random_state=72
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train,
        cat_features=cat_features or None,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if with_test_part:
        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        return model, test_prediction

    return model

In [4]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Build additional features from the client profile table.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Feature matrix with the raw client profile.

    copy: bool, optional, default = True
        Whether to work on a copy of X. When False, the new columns are
        added to the passed frame itself.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Client profile extended with the engineered features.

    """
    if copy:
        X = X.copy()

    # 365243 is replaced with NaN (presumably a placeholder value rather than
    # a real number of days - verify against the data description).
    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)

    # Aggregates over all credit-bureau request counters.
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X.external_scoring_rating_1 * 2 + X.external_scoring_rating_2 * 1 + X.external_scoring_rating_3 * 3

    # Row-wise statistics of the three external ratings. The NumPy function is
    # looked up by name with getattr instead of eval-ing a code string.
    scoring_columns = ["external_scoring_rating_1", "external_scoring_rating_2", "external_scoring_rating_3"]
    scoring = X[scoring_columns]
    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = getattr(np, function_name)(scoring, axis=1)

    # Ratios between the main financial indicators
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    #X["total_salary_net"] = X["total_salary"] - X["amount_annuity"]

    # Ratios of financial indicators to age and to time-based features
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Ratios of time-based features
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    # NOTE(review): despite the "ratio_" name this is a product, not a quotient.
    # Kept as is so the feature values stay the same - confirm the intent.
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Credit amounts multiplied by the external ratings. The original author
    # treats the ratings as default probabilities and calls these products
    # the expected loss (mathematical expectation of default).
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    return X

## Base Tables

In [5]:
# Load both base tables (train is read first, then test).
train, test = get_input("train.csv"), get_input("test.csv")

# Stack them into one frame with a fresh 0..n-1 index; test rows follow train rows.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data.head(n=2)

train.csv: shape = 110093 rows, 3 cols
test.csv: shape = 165141 rows, 2 cols


Unnamed: 0,application_number,target,name_contract_type
0,123687442,0.0,Cash
1,123597908,1.0,Cash


## client_profile

In [7]:
# Read the client profile table and extend it with the engineered features in one step.
client_profile = create_client_profile_features(get_input("client_profile.csv"))
client_profile.head(n=2)

client_profile.csv: shape = 250000 rows, 24 cols


  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,...,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,...,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,...,23.155971,11.644456,,,,,5466.42,,,,237475.743779,431008.094056,,12590.802122,22851.755462


In [8]:
# Attach the profile features to every application; applications without a
# profile keep NaN in the new columns (left join).
# NOTE: re-running this cell merges again and produces suffixed duplicate columns.
data = data.merge(client_profile, on="application_number", how="left")

In [9]:
# Preview the merged table.
data.head(n=20)

Unnamed: 0,application_number,target,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,...,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123687442,0.0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,0.019101,15728.0,1719.0,11.0,0.0,...,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111
1,123597908,1.0,Cash,,,,,,,,,,,,,...,,,,,,,,,,,,,,,
2,123526683,0.0,Cash,F,0.0,135000.0,1006920.0,42660.0,Higher education,Married,0.026392,21557.0,3618.0,,1.0,...,46.709653,6.262467,37.313433,278.308458,11.791045,5.958264,3562.92,,,,686869.876357,269722.58888,,29100.49351,11427.288803
3,123710391,1.0,Cash,M,0.0,180000.0,518562.0,22972.5,Secondary / secondary special,Married,0.031329,22338.0,,,0.0,...,23.214343,8.058018,,,,,5639.22,,,,88829.188848,88386.882459,,3935.167908,3915.573562
4,123590329,1.0,Cash,,,,,,,,,,,,,...,,,,,,,,,,,,,,,
5,123718821,0.0,Cash,F,1.0,180000.0,755190.0,36459.0,Secondary / secondary special,Married,0.010032,16049.0,187.0,20.0,1.0,...,47.055268,11.215652,962.566845,4038.449198,194.967914,85.823529,1805.76,0.106952,0.001246,,414224.915008,250157.34043,,19997.91599,12077.075272
6,123544624,0.0,Cash,M,0.0,202500.0,1078200.0,38331.0,Secondary / secondary special,Married,0.031329,23087.0,16364.0,,1.0,...,46.701607,8.77117,12.374725,65.888536,2.342398,1.410841,6344.1225,,,,590229.235465,798729.57665,,20983.191267,28395.569841
7,123631154,0.0,Cash,M,0.0,99000.0,90000.0,10179.0,Secondary / secondary special,Single / not married,0.011703,9334.0,253.0,,0.0,...,9.642168,10.606385,391.304348,355.731225,40.233202,36.893281,1158.597,,,26394.29117,58819.293046,11565.393964,2985.194331,6652.462044,1308.046057
8,123702544,0.0,Cash,M,0.0,135000.0,171000.0,8446.5,Higher education,Single / not married,0.006296,12205.0,351.0,14.0,0.0,...,14.010651,11.061041,384.615385,487.179487,24.064103,34.77208,849.96,0.039886,0.001147,33396.658223,54629.860798,101228.001732,1649.619144,2698.427598,5000.130507
9,123660201,0.0,Cash,M,0.0,225000.0,473760.0,51151.5,Secondary / secondary special,Single / not married,0.018801,9494.0,163.0,18.0,1.0,...,49.90099,23.699178,1380.368098,2906.503067,313.812883,58.245399,4230.225,0.110429,0.001896,,260539.064613,244221.088936,,28130.200869,26368.361683


## baseline

In [10]:
# Rows without a target belong to the test part.
mask = data["target"].isnull()
features_to_drop = ["application_number", "target"]

train, test = data.loc[~mask], data.loc[mask]

target, test_id = train["target"], test["application_number"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(features_to_drop, axis=1)

categorial = train.dtypes[train.dtypes == "object"].index
numerical = list(set(train.columns) - set(categorial))

# Replace +/-inf with NaN in BOTH parts, so the models are applied to test
# data that went through the same preprocessing as the training data.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [11]:
# Preview the training features after the split.
train.head(n=30)

Unnamed: 0,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,...,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,0.019101,15728.0,1719.0,11.0,0.0,0.0,3.0,...,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111
1,Cash,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,
2,Cash,F,0.0,135000.0,1006920.0,42660.0,Higher education,Married,0.026392,21557.0,3618.0,,1.0,0.0,2.0,...,46.709653,6.262467,37.313433,278.308458,11.791045,5.958264,3562.92,,,,686869.876357,269722.58888,,29100.49351,11427.288803
3,Cash,M,0.0,180000.0,518562.0,22972.5,Secondary / secondary special,Married,0.031329,22338.0,,,0.0,0.0,2.0,...,23.214343,8.058018,,,,,5639.22,,,,88829.188848,88386.882459,,3935.167908,3915.573562
4,Cash,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,
5,Cash,F,1.0,180000.0,755190.0,36459.0,Secondary / secondary special,Married,0.010032,16049.0,187.0,20.0,1.0,0.0,3.0,...,47.055268,11.215652,962.566845,4038.449198,194.967914,85.823529,1805.76,0.106952,0.001246,,414224.915008,250157.34043,,19997.91599,12077.075272
6,Cash,M,0.0,202500.0,1078200.0,38331.0,Secondary / secondary special,Married,0.031329,23087.0,16364.0,,1.0,0.0,2.0,...,46.701607,8.77117,12.374725,65.888536,2.342398,1.410841,6344.1225,,,,590229.235465,798729.57665,,20983.191267,28395.569841
7,Cash,M,0.0,99000.0,90000.0,10179.0,Secondary / secondary special,Single / not married,0.011703,9334.0,253.0,,0.0,0.0,1.0,...,9.642168,10.606385,391.304348,355.731225,40.233202,36.893281,1158.597,,,26394.29117,58819.293046,11565.393964,2985.194331,6652.462044,1308.046057
8,Cash,M,0.0,135000.0,171000.0,8446.5,Higher education,Single / not married,0.006296,12205.0,351.0,14.0,0.0,0.0,1.0,...,14.010651,11.061041,384.615385,487.179487,24.064103,34.77208,849.96,0.039886,0.001147,33396.658223,54629.860798,101228.001732,1649.619144,2698.427598,5000.130507
9,Cash,M,0.0,225000.0,473760.0,51151.5,Secondary / secondary special,Single / not married,0.018801,9494.0,163.0,18.0,1.0,0.0,1.0,...,49.90099,23.699178,1380.368098,2906.503067,313.812883,58.245399,4230.225,0.110429,0.001896,,260539.064613,244221.088936,,28130.200869,26368.361683


## KFold

In [12]:
# CatBoost hyper-parameters for the baseline model.
cb_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    loss_function="Logloss",
    eval_metric="AUC",
    task_type="CPU",
    max_bin=20,
    verbose=10,
    max_depth=6,
    l2_leaf_reg=10,
    early_stopping_rounds=50,
    thread_count=6,
    random_seed=42,
)

# Shuffled 5-fold split with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=1234123)

estimators, oof_preds = catboost_cross_validation(
    cb_params, train, target, cv, categorical=categorial
)

Thu Dec 10 14:25:36 2020, Cross-Validation, 110093 rows, 52 cols
0:	test: 0.5867378	test1: 0.5812295	best: 0.5812295 (0)	total: 209ms	remaining: 6m 57s
10:	test: 0.7001402	test1: 0.6925714	best: 0.6925714 (10)	total: 1.33s	remaining: 4m
20:	test: 0.7035652	test1: 0.6963165	best: 0.6963165 (20)	total: 2.33s	remaining: 3m 39s
30:	test: 0.7047232	test1: 0.6979904	best: 0.6979904 (30)	total: 3.36s	remaining: 3m 33s
40:	test: 0.7054465	test1: 0.6992489	best: 0.6994170 (38)	total: 4.38s	remaining: 3m 29s
50:	test: 0.7069468	test1: 0.7006301	best: 0.7006301 (50)	total: 5.38s	remaining: 3m 25s
60:	test: 0.7081454	test1: 0.7013650	best: 0.7013650 (60)	total: 6.36s	remaining: 3m 22s
70:	test: 0.7090323	test1: 0.7018906	best: 0.7018906 (70)	total: 7.36s	remaining: 3m 20s
80:	test: 0.7095323	test1: 0.7027972	best: 0.7027972 (80)	total: 8.36s	remaining: 3m 18s
90:	test: 0.7093081	test1: 0.7030943	best: 0.7032636 (89)	total: 9.37s	remaining: 3m 16s
100:	test: 0.7099472	test1: 0.7032162	best: 0.70334

900:	test: 0.7433949	test1: 0.7211687	best: 0.7213110 (896)	total: 1m 28s	remaining: 1m 48s
910:	test: 0.7437403	test1: 0.7212308	best: 0.7213223 (909)	total: 1m 29s	remaining: 1m 47s
920:	test: 0.7438893	test1: 0.7211859	best: 0.7213223 (909)	total: 1m 30s	remaining: 1m 46s
930:	test: 0.7442164	test1: 0.7211960	best: 0.7213223 (909)	total: 1m 31s	remaining: 1m 45s
940:	test: 0.7444673	test1: 0.7211900	best: 0.7213223 (909)	total: 1m 32s	remaining: 1m 44s
950:	test: 0.7446176	test1: 0.7213102	best: 0.7213341 (944)	total: 1m 33s	remaining: 1m 43s
960:	test: 0.7447767	test1: 0.7213274	best: 0.7213536 (959)	total: 1m 34s	remaining: 1m 42s
970:	test: 0.7449901	test1: 0.7212787	best: 0.7213536 (959)	total: 1m 35s	remaining: 1m 41s
980:	test: 0.7452039	test1: 0.7213356	best: 0.7213536 (959)	total: 1m 36s	remaining: 1m 40s
990:	test: 0.7454340	test1: 0.7214053	best: 0.7214053 (990)	total: 1m 37s	remaining: 1m 39s
1000:	test: 0.7456573	test1: 0.7214711	best: 0.7215842 (998)	total: 1m 38s	remai

660:	test: 0.7364992	test1: 0.7251044	best: 0.7252898 (632)	total: 1m 5s	remaining: 2m 12s
670:	test: 0.7367153	test1: 0.7252754	best: 0.7253008 (668)	total: 1m 6s	remaining: 2m 11s
680:	test: 0.7370070	test1: 0.7253924	best: 0.7253924 (680)	total: 1m 7s	remaining: 2m 10s
690:	test: 0.7372993	test1: 0.7255150	best: 0.7255245 (688)	total: 1m 8s	remaining: 2m 9s
700:	test: 0.7376482	test1: 0.7255921	best: 0.7255921 (700)	total: 1m 9s	remaining: 2m 8s
710:	test: 0.7378369	test1: 0.7255575	best: 0.7256163 (707)	total: 1m 10s	remaining: 2m 7s
720:	test: 0.7380053	test1: 0.7253046	best: 0.7256163 (707)	total: 1m 11s	remaining: 2m 6s
730:	test: 0.7382912	test1: 0.7256792	best: 0.7257447 (728)	total: 1m 12s	remaining: 2m 5s
740:	test: 0.7386629	test1: 0.7258531	best: 0.7258531 (740)	total: 1m 13s	remaining: 2m 4s
750:	test: 0.7389277	test1: 0.7260086	best: 0.7260086 (750)	total: 1m 14s	remaining: 2m 3s
760:	test: 0.7391130	test1: 0.7259355	best: 0.7260086 (750)	total: 1m 15s	remaining: 2m 2s
7

650:	test: 0.7349543	test1: 0.7289876	best: 0.7289898 (649)	total: 1m 3s	remaining: 2m 12s
660:	test: 0.7353302	test1: 0.7288517	best: 0.7289898 (649)	total: 1m 4s	remaining: 2m 11s
670:	test: 0.7354908	test1: 0.7292261	best: 0.7292261 (670)	total: 1m 5s	remaining: 2m 10s
680:	test: 0.7356919	test1: 0.7294032	best: 0.7294757 (679)	total: 1m 6s	remaining: 2m 9s
690:	test: 0.7360473	test1: 0.7298147	best: 0.7298304 (689)	total: 1m 7s	remaining: 2m 8s
700:	test: 0.7363391	test1: 0.7296541	best: 0.7298603 (691)	total: 1m 8s	remaining: 2m 7s
710:	test: 0.7366030	test1: 0.7296896	best: 0.7298603 (691)	total: 1m 9s	remaining: 2m 6s
720:	test: 0.7367948	test1: 0.7296802	best: 0.7298603 (691)	total: 1m 10s	remaining: 2m 5s
730:	test: 0.7370834	test1: 0.7297406	best: 0.7298603 (691)	total: 1m 11s	remaining: 2m 4s
740:	test: 0.7373554	test1: 0.7299262	best: 0.7299262 (740)	total: 1m 12s	remaining: 2m 3s
750:	test: 0.7376026	test1: 0.7295805	best: 0.7299262 (740)	total: 1m 13s	remaining: 2m 2s
760

10:	test: 0.7009050	test1: 0.6971595	best: 0.6971595 (10)	total: 1.23s	remaining: 3m 41s
20:	test: 0.7027085	test1: 0.6991332	best: 0.7000237 (19)	total: 2.35s	remaining: 3m 41s
30:	test: 0.7034192	test1: 0.7001410	best: 0.7002587 (29)	total: 3.37s	remaining: 3m 34s
40:	test: 0.7043805	test1: 0.7000679	best: 0.7003638 (34)	total: 4.39s	remaining: 3m 29s
50:	test: 0.7053054	test1: 0.7007605	best: 0.7007605 (50)	total: 5.48s	remaining: 3m 29s
60:	test: 0.7070427	test1: 0.7018562	best: 0.7019658 (59)	total: 6.6s	remaining: 3m 29s
70:	test: 0.7085961	test1: 0.7025069	best: 0.7025069 (70)	total: 7.63s	remaining: 3m 27s
80:	test: 0.7092150	test1: 0.7029684	best: 0.7029684 (80)	total: 8.66s	remaining: 3m 25s
90:	test: 0.7097728	test1: 0.7031326	best: 0.7032270 (87)	total: 9.64s	remaining: 3m 22s
100:	test: 0.7103111	test1: 0.7032649	best: 0.7033486 (96)	total: 10.6s	remaining: 3m 19s
110:	test: 0.7103708	test1: 0.7031034	best: 0.7035129 (105)	total: 11.6s	remaining: 3m 18s
120:	test: 0.710600

920:	test: 0.7431134	test1: 0.7198842	best: 0.7200036 (917)	total: 1m 34s	remaining: 1m 51s
930:	test: 0.7432018	test1: 0.7200127	best: 0.7200127 (930)	total: 1m 35s	remaining: 1m 50s
940:	test: 0.7435705	test1: 0.7199882	best: 0.7200631 (934)	total: 1m 36s	remaining: 1m 49s
950:	test: 0.7436955	test1: 0.7201647	best: 0.7201716 (949)	total: 1m 37s	remaining: 1m 47s
960:	test: 0.7440479	test1: 0.7200598	best: 0.7201716 (949)	total: 1m 38s	remaining: 1m 46s
970:	test: 0.7441174	test1: 0.7201158	best: 0.7201716 (949)	total: 1m 39s	remaining: 1m 45s
980:	test: 0.7442351	test1: 0.7201732	best: 0.7202780 (978)	total: 1m 40s	remaining: 1m 44s
990:	test: 0.7443714	test1: 0.7202290	best: 0.7202780 (978)	total: 1m 41s	remaining: 1m 43s
1000:	test: 0.7446108	test1: 0.7202334	best: 0.7203413 (993)	total: 1m 42s	remaining: 1m 42s
1010:	test: 0.7449463	test1: 0.7202366	best: 0.7203413 (993)	total: 1m 43s	remaining: 1m 41s
1020:	test: 0.7450313	test1: 0.7202386	best: 0.7203802 (1018)	total: 1m 44s	re

710:	test: 0.7370909	test1: 0.7236543	best: 0.7236550 (709)	total: 1m 10s	remaining: 2m 8s
720:	test: 0.7373614	test1: 0.7236936	best: 0.7237238 (715)	total: 1m 11s	remaining: 2m 7s
730:	test: 0.7375626	test1: 0.7239316	best: 0.7239432 (729)	total: 1m 12s	remaining: 2m 6s
740:	test: 0.7377891	test1: 0.7241867	best: 0.7242118 (739)	total: 1m 13s	remaining: 2m 5s
750:	test: 0.7380222	test1: 0.7241201	best: 0.7243002 (742)	total: 1m 14s	remaining: 2m 4s
760:	test: 0.7382272	test1: 0.7242769	best: 0.7243002 (742)	total: 1m 15s	remaining: 2m 3s
770:	test: 0.7385541	test1: 0.7245926	best: 0.7245926 (770)	total: 1m 16s	remaining: 2m 2s
780:	test: 0.7388865	test1: 0.7244511	best: 0.7245926 (770)	total: 1m 17s	remaining: 2m 1s
790:	test: 0.7391762	test1: 0.7245447	best: 0.7245926 (770)	total: 1m 18s	remaining: 2m
800:	test: 0.7394589	test1: 0.7245659	best: 0.7246343 (791)	total: 1m 19s	remaining: 1m 59s
810:	test: 0.7397353	test1: 0.7245585	best: 0.7246343 (791)	total: 1m 20s	remaining: 1m 58s


In [14]:
# ROC AUC of the out-of-fold predictions over the whole training set.
oof_score = roc_auc_score(target, oof_preds)
print(f"OOF-score = {round(oof_score, 5)}")
# Recorded run, score by fold: [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# Recorded run, OOF-score = 0.72481

OOF-score = 0.72481


## Подготовка прогноза

In [15]:
# Give the test features the dtypes used for training: floats for numeric
# columns, strings for categorical ones.
test[numerical] = test[numerical].astype(float)
test[categorial] = test[categorial].astype(str)

# Accumulate the class-1 probabilities of every fold model.
y_pred = np.zeros(test.shape[0])
for estimator in estimators:
    fold_proba = estimator.predict_proba(test)
    y_pred += fold_proba[:, 1]

In [156]:
 = pd.DataFrame({
    "APPLICATION_NUMBER": test_id,
    "TARGET": y_pred / cv.n_splits
})
y_pred.to_csv("./geekbrains-competitive-data-analysis/baseline_submit.csv", index=False)

Unnamed: 0,APPLICATION_NUMBER,TARGET
110093,123724268,0.056370
110094,123456549,0.222611
110095,123428178,0.185588
110096,123619984,0.084373
110097,123671104,0.020926
...,...,...
275229,123487967,0.084373
275230,123536402,0.046302
275231,123718238,0.084373
275232,123631557,0.019437
