In [3]:
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from dataset import Dataset
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder,
    OrdinalEncoder,
    StandardScaler,
    TargetEncoder,
)
from sklearn.svm import SVC
from sklearn.utils import compute_class_weight
from tqdm.notebook import tqdm
from xgboost import XGBClassifier

import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping

# Seed passed as random_state to the splits and estimators in this notebook.
RAND = 0
# NOTE(review): this silences every warning for the whole session, including
# solver-convergence and deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [16]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)


def metrics_estimation(
    model,
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    weeks_train: pd.Series,
    weeks_test: pd.Series,
    name: str,
) -> pd.DataFrame:
    """Score a fitted classifier on the train and test splits.

    Parameters
    ----------
    model:
        Fitted classifier. It must expose ``predict`` and ``predict_proba``,
        except when ``name == "TabTransformer"``: then ``predict`` alone is
        called and must return a ``(labels, scores)`` pair.
    X_train, X_test: pd.DataFrame
        Feature matrices of the two splits.
    y_train, y_test: pd.Series
        True labels of the two splits.
    weeks_train, weeks_test: pd.Series
        ``WEEK_NUM`` values of the two splits, used by the gini-stability
        metric.
    name: str
        Model identifier; ``"_train"`` / ``"_test"`` are appended to form the
        row labels of the result.

    Returns
    -------
    pd.DataFrame
        Two rows (``<name>_train``, ``<name>_test``) indexed by ``model`` with
        the columns produced by ``get_metrics`` plus ``"overfitting, %"``: the
        absolute train/test ROC AUC gap relative to the test ROC AUC, in
        percent (the same value on both rows).
    """
    is_tabtransformer = name == "TabTransformer"
    # Score-matrix column used as the ranking score: column 0 for
    # TabTransformer, column 1 otherwise.
    score_column = 0 if is_tabtransformer else 1

    def _predict(features):
        """Return (predicted labels, score matrix) for one split."""
        if is_tabtransformer:
            labels, scores = model.predict(features)
            return labels, scores
        return model.predict(features), model.predict_proba(features)

    def _weekly_frame(weeks, target, scores):
        """Assemble the WEEK_NUM / target / score table for gini stability."""
        return pd.DataFrame(
            {
                "WEEK_NUM": weeks,
                "target": target,
                "score": scores[:, score_column],
            }
        )

    pred_train, scores_train = _predict(X_train)
    pred_test, scores_test = _predict(X_test)

    weekly_train = _weekly_frame(weeks_train, y_train, scores_train)
    weekly_test = _weekly_frame(weeks_test, y_test, scores_test)

    train_label = name + "_train"
    test_label = name + "_test"
    train_row = get_metrics(
        y_train, pred_train, scores_train, train_label, weekly_train, is_tabtransformer
    )
    test_row = get_metrics(
        y_test, pred_test, scores_test, test_label, weekly_test, is_tabtransformer
    )

    df = pd.concat([train_row, test_row]).set_index("model")
    auc_train = df.loc[train_label, "ROC_AUC"]
    auc_test = df.loc[test_label, "ROC_AUC"]
    df["overfitting, %"] = abs(auc_train - auc_test) / auc_test * 100
    return df


def get_metrics(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    y_score: np.ndarray,
    name: str,
    base: pd.DataFrame,
    is_tabtransformer: bool = False,
) -> pd.DataFrame:
    """
    Build a one-row table of classification metrics.

    Parameters
    ----------
    y_true: 1d array-like
        Ground truth labels.

    y_pred: 1d array-like
        Predicted labels, as returned by a classifier.

    y_score: array-like of shape (n_samples,) or (n_samples, n_classes)
        Target scores. When it is not one-dimensional a single column is
        taken: column 0 if ``is_tabtransformer`` is true, column 1 otherwise.

    name: str
        Value stored in the ``model`` column of the result.

    base: pd.DataFrame
        Table with 'WEEK_NUM', 'target' and 'score' columns; passed to
        ``gini_stability``.

    is_tabtransformer: bool
        Selects which column of a 2-D ``y_score`` is used (see above).

    Returns
    -------
    df: pd.DataFrame
        Single row with columns model, Accuracy, ROC_AUC, Precision, Recall,
        f1, Logloss and gini_stability.
    """
    if len(y_score.shape) != 1:
        column = 0 if is_tabtransformer else 1
        y_score = y_score[:, column]

    # The list under "model" fixes the frame at one row; the scalar metrics
    # are broadcast to it. Key order defines the column order.
    return pd.DataFrame(
        {
            "model": [name],
            "Accuracy": accuracy_score(y_true, y_pred),
            "ROC_AUC": roc_auc_score(y_true, y_score),
            "Precision": precision_score(y_true, y_pred),
            "Recall": recall_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred),
            "Logloss": log_loss(y_true, y_score),
            "gini_stability": gini_stability(base),
        }
    )


def gini_stability(base: pd.DataFrame, w_fallingrate=88.0, w_resstd=-0.5) -> float:
    """
    Target metric of the competition.

    A gini coefficient (``2 * ROC AUC - 1``) is computed per ``WEEK_NUM``, a
    straight line is fitted through the weekly values in week order, and the
    result is::

        mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)

    Parameters
    ----------
    base: pd.DataFrame
        with target, score and WEEK_NUM

    w_fallingrate: float
        Weight of the (non-positive part of the) trend slope. Default=88.0

    w_resstd: float
        Weight of the standard deviation of the residuals around the trend.
        Default=-0.5

    Returns
    -------
    out: float
    """

    def _week_gini(group):
        """Gini of one week's rows."""
        return 2 * roc_auc_score(group["target"], group["score"]) - 1

    by_week = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    weekly_gini = (
        by_week.groupby("WEEK_NUM")[["target", "score"]].apply(_week_gini).tolist()
    )

    # Linear trend over the position of each week in the sorted sequence.
    position = np.arange(len(weekly_gini))
    slope, intercept = np.polyfit(position, weekly_gini, 1)
    trend = slope * position + intercept
    residual_std = np.std(weekly_gini - trend)
    mean_gini = np.mean(weekly_gini)

    # Only a falling trend (negative slope) contributes to the slope term.
    return mean_gini + w_fallingrate * min(0, slope) + w_resstd * residual_std

Выберем наиболее подходящие модели.

# Data Import

In [None]:
# Load the training table through the project's Dataset helper, convert it to
# pandas and key it by case_id. The intermediate frame is never bound to a
# name, so nothing needs to be deleted afterwards.
ds = Dataset()
X = ds.get_dataframe().to_pandas().set_index("case_id")

In [6]:
# Split the week number and the label off the feature frame. `pop` removes the
# column from X in place, so this cell can only run once per load of X.
weeks = X.pop("WEEK_NUM")
y = X.pop("target")

In [7]:
# Preview the feature frame that remains after removing WEEK_NUM and target.
X

Unnamed: 0_level_0,month_decision,weekday_decision,days30_165L,maritalst_385M,pmtscount_423L,requesttype_4525192L,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avgoutstandbalancel6m_4187114A,...,purposeofcred_active,residualamount_closed,subjectrole_active,subjectrole_closed,empladdr,num_total_inst,num_total_paid_inst,cred_closure_date_days_ago,actualdpd,sellerplace_cnt
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,1,4,,,,,1917.6000,0.0,,,...,,,,,P167_100_165,0.0,,,,0
1.0,1,4,,,,,3134.0000,0.0,,,...,,,,,P49_46_174,0.0,,,,0
2.0,1,5,,,,,4937.0000,0.0,,,...,,,,,P63_14_24,0.0,,,0.0,0
3.0,1,4,,,,,4643.6000,0.0,,,...,,,,,P161_14_174,0.0,,,0.0,1
4.0,1,5,,,,,3390.2000,0.0,,,...,,,,,P8_88_79,0.0,,,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703450.0,10,1,0.0,2.0,,,3675.4001,0.0,7356.8003,16392.496,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,92.0,113.0,-27.0,0.0,8
2703451.0,10,1,0.0,2.0,,,7088.6000,6191.6,12553.2000,105129.310,...,a55475b1,,a55475b1,a55475b1,a55475b1,69.0,75.0,-7.0,0.0,3
2703452.0,10,1,0.0,2.0,,,7788.8003,0.0,2662.4001,,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,9.0,9.0,-321.0,0.0,1
2703453.0,10,1,1.0,0.0,,,1195.4000,2827.2,8212.6010,47943.062,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,119.0,119.0,-293.0,0.0,4


In [8]:
# Partition the columns by dtype: numeric ones and everything else.
num_features = list(X.select_dtypes(include=np.number).columns)
cat_features = list(X.select_dtypes(exclude=np.number).columns)
# Total number of features across both lists.
len(cat_features) + len(num_features)

190

Пропуски для категориальных значений заполним 'None'.

In [9]:
# Replace missing categorical values (None) with the literal string "None".
imputer_cat = SimpleImputer(missing_values=None, strategy="constant", fill_value="None")
# Overwrites the categorical columns of X in place.
X[cat_features] = imputer_cat.fit_transform(X[cat_features], y)
X

Unnamed: 0_level_0,month_decision,weekday_decision,days30_165L,maritalst_385M,pmtscount_423L,requesttype_4525192L,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avgoutstandbalancel6m_4187114A,...,purposeofcred_active,residualamount_closed,subjectrole_active,subjectrole_closed,empladdr,num_total_inst,num_total_paid_inst,cred_closure_date_days_ago,actualdpd,sellerplace_cnt
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,1,4,,,,,1917.6000,0.0,,,...,,,,,P167_100_165,0.0,,,,0
1.0,1,4,,,,,3134.0000,0.0,,,...,,,,,P49_46_174,0.0,,,,0
2.0,1,5,,,,,4937.0000,0.0,,,...,,,,,P63_14_24,0.0,,,0.0,0
3.0,1,4,,,,,4643.6000,0.0,,,...,,,,,P161_14_174,0.0,,,0.0,1
4.0,1,5,,,,,3390.2000,0.0,,,...,,,,,P8_88_79,0.0,,,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703450.0,10,1,0.0,2.0,,,3675.4001,0.0,7356.8003,16392.496,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,92.0,113.0,-27.0,0.0,8
2703451.0,10,1,0.0,2.0,,,7088.6000,6191.6,12553.2000,105129.310,...,a55475b1,,a55475b1,a55475b1,a55475b1,69.0,75.0,-7.0,0.0,3
2703452.0,10,1,0.0,2.0,,,7788.8003,0.0,2662.4001,,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,9.0,9.0,-321.0,0.0,1
2703453.0,10,1,1.0,0.0,,,1195.4000,2827.2,8212.6010,47943.062,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,119.0,119.0,-293.0,0.0,4


Переведем признаки типа `object` в категориальный тип данных.

In [10]:
# Cast the categorical columns to pandas' "category" dtype (in place on X).
X[cat_features] = X[cat_features].astype("category")

In [None]:
# Load the competition's test table (train=False) and convert it to pandas.
df_test = ds.get_dataframe(train=False)
df_test_p = df_test.to_pandas()

In [10]:
# Show the test table.
df_test_p

Unnamed: 0,WEEK_NUM,case_id,month_decision,weekday_decision,days30_165L,maritalst_385M,pmtscount_423L,requesttype_4525192L,annuity_780A,annuitynextmonth_57A,...,purposeofcred_active,residualamount_closed,subjectrole_active,subjectrole_closed,empladdr,num_total_inst,num_total_paid_inst,cred_closure_date_days_ago,actualdpd,sellerplace_cnt
0,100,57543.0,5,5,1,1,,,3674.6,1218.2001,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,112,96.0,-18.0,0,5
1,100,57549.0,1,1,3,3,,,5742.6,3546.6,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,38,44.0,-93.0,0,7
2,100,57551.0,11,5,1,0,,,2844.6,0.0,...,a55475b1,0.0,a55475b1,a55475b1,a55475b1,9,9.0,-150.0,0,2
3,100,57552.0,11,5,0,2,,,6298.8003,12155.4,...,,,,,a55475b1,62,32.0,,0,6
4,100,57569.0,12,1,1,0,,,4682.6,0.0,...,,,,,a55475b1,17,15.0,,0,3
5,100,57630.0,3,2,1,0,,,8905.0,0.0,...,,,,,a55475b1,7,7.0,,0,1
6,100,57631.0,6,6,0,2,,,2540.6,0.0,...,,,,,,0,,,0,0
7,100,57632.0,2,6,0,2,,,4732.0,0.0,...,,,,,,21,18.0,,0,1
8,100,57633.0,1,2,2,0,,,8273.0,0.0,...,5d1b0cdd,,a55475b1,a55475b1,,0,,-27.0,0,2
9,100,57634.0,1,3,1,2,,,1165.8,0.0,...,,,,,,10,10.0,,0,0


Размер доступной в соревновании тестовой выборки — всего 10 строк, поэтому за полный датасет возьмём именно train-таблицу.

# Logistic regression

Для логистической регрессии заполним пропуски в числовых данных нулями и сделаем нормализацию. Категориальные признаки закодируем с помощью TargetEncoder.

In [11]:
# Numeric branch: NaN -> 0.0, then standardisation.
num_pipe = Pipeline(
    steps=[
        (
            "zero_impute",
            SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0.0),
        ),
        ("scaler", StandardScaler()),
    ]
)
# Preprocessing for the linear models: numeric branch on the numeric columns,
# target encoding on the categorical ones. Output columns follow the order of
# the transformers (numeric first, categorical second).
pipe = ColumnTransformer(
    transformers=[
        ("float", num_pipe, num_features),
        ("cat", TargetEncoder(random_state=RAND), cat_features),
    ],
    remainder="passthrough",
    n_jobs=24,
    verbose=True,
)

In [12]:
# Fit the preprocessing on X, y and wrap the resulting array back into a frame
# with the original index; columns are numeric first, categorical second.
# NOTE(review): the scaler and target encoder are fitted on the full data set
# before the train/test split — confirm this is acceptable for the baseline.
X_lr = pd.DataFrame(
    pipe.fit_transform(X, y),
    columns=num_features + cat_features,
    index=X.index,
)
X_lr

Unnamed: 0_level_0,month_decision,weekday_decision,days30_165L,maritalst_385M,pmtscount_423L,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avgoutstandbalancel6m_4187114A,cntincpaycont9m_3716944L,...,subjectroles_name_541M,subjectroles_name_838M,classificationofcontr_active,contractst_active,employers_name,familystate,purposeofcred_active,subjectrole_active,subjectrole_closed,empladdr
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,-1.544517,0.008039,-0.540289,-1.168270,-0.576249,-0.705648,-0.511493,-0.561811,-0.424847,-0.735748,...,0.032494,0.032494,0.031986,0.031986,0.029776,0.028585,0.031986,0.031986,0.031986,0.040152
1.0,-1.544517,0.008039,-0.540289,-1.168270,-0.576249,-0.301073,-0.511493,-0.561811,-0.424847,-0.735748,...,0.032918,0.032918,0.032286,0.032286,0.029451,0.037873,0.032286,0.032286,0.032286,0.041005
2.0,-1.544517,0.510936,-0.540289,-1.168270,-0.576249,0.298607,-0.511493,-0.561811,-0.424847,-0.735748,...,0.032702,0.032702,0.032025,0.032025,0.029128,0.041243,0.032025,0.032025,0.032025,0.028794
3.0,-1.544517,0.008039,-0.540289,-1.168270,-0.576249,0.201022,-0.511493,-0.561811,-0.424847,-0.735748,...,0.032918,0.032918,0.032286,0.032286,0.029451,0.028448,0.032286,0.032286,0.032286,0.039149
4.0,-1.544517,0.510936,-0.540289,-1.168270,-0.576249,-0.215860,-0.511493,-0.561811,-0.424847,-0.735748,...,0.032494,0.032494,0.031986,0.031986,0.029776,0.028585,0.031986,0.031986,0.031986,0.036629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703450.0,1.017540,-1.500651,-0.540289,0.452756,-0.576249,-0.121003,-0.511493,0.733570,-0.087551,1.804338,...,0.031914,0.031499,0.030177,0.030542,0.030412,0.028589,0.030679,0.030578,0.030774,0.028345
2703451.0,1.017540,-1.500651,-0.540289,0.452756,-0.576249,1.014231,1.694264,1.648549,1.738323,2.143016,...,0.031733,0.031475,0.030165,0.030549,0.030442,0.028458,0.030688,0.030557,0.030750,0.028223
2703452.0,1.017540,-1.500651,-0.540289,0.452756,-0.576249,1.247118,-0.511493,-0.093017,-0.424847,-0.735748,...,0.031802,0.031429,0.030180,0.030545,0.030395,0.028488,0.030692,0.030496,0.030697,0.028284
2703453.0,1.017540,-1.500651,0.609497,-1.168270,-0.576249,-0.945853,0.495696,0.884259,0.561643,0.449625,...,0.031733,0.031475,0.030165,0.030549,0.030442,0.028458,0.030688,0.030557,0.030750,0.028223


Разделим данные на обучающую, валидационную и тестовую выборки для логистической регрессии.

In [13]:
# Hold out 20% of the rows as the test set, stratified on the target. The week
# numbers are split alongside because the metrics need them.
X_train, X_test, y_train, y_test, weeks_train, weeks_test = train_test_split(
    X_lr, y, weeks, test_size=0.2, stratify=y, shuffle=True, random_state=RAND
)

# Carve a validation set out of the remaining 80%: 16% of it, i.e. 12.8% of all
# rows. The trailing-underscore names are the final training portion (67.2%).
X_train_, X_val, y_train_, y_val, weeks_train_, weeks_val = train_test_split(
    X_train,
    y_train,
    weeks_train,
    test_size=0.16,
    stratify=y_train,
    shuffle=True,
    random_state=RAND,
)

In [None]:
# Baseline logistic regression with balanced class weights, fitted on the
# final training portion only.
# NOTE(review): warnings are globally ignored in this notebook, so a
# non-convergence warning from the solver would not be shown.
clf_lr = LogisticRegression(class_weight="balanced", random_state=RAND, n_jobs=24)
clf_lr.fit(X_train_, y_train_)

In [17]:
# Score on the training portion and on the held-out test set. This creates the
# running `metrics` table that the following sections append to.
metrics = metrics_estimation(
    clf_lr, X_train_, y_train_, X_test, y_test, weeks_train_, weeks_test, name="LR"
)
metrics

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,gini_stability,"overfitting, %"
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR_train,0.724331,0.794145,0.077931,0.717227,0.140587,0.562475,0.565279,0.366829
LR_test,0.725659,0.791242,0.07794,0.713408,0.140528,0.561868,0.552538,0.366829


# Linear Discriminant Analysis.

In [18]:
# LDA with default settings on the same scaled/encoded features as the
# logistic regression.
clf_lda = LinearDiscriminantAnalysis()
clf_lda.fit(X_train_, y_train_)
metrics_ = metrics_estimation(
    clf_lda, X_train_, y_train_, X_test, y_test, weeks_train_, weeks_test, name="LDA"
)
# Append the LDA rows to the running table.
metrics = pd.concat([metrics, metrics_])
metrics

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,gini_stability,"overfitting, %"
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR_train,0.724331,0.794145,0.077931,0.717227,0.140587,0.562475,0.565279,0.366829
LR_test,0.725659,0.791242,0.07794,0.713408,0.140528,0.561868,0.552538,0.366829
LDA_train,0.964572,0.797184,0.247034,0.061981,0.099098,0.127526,0.572686,0.244131
LDA_test,0.964514,0.795242,0.243781,0.061256,0.09791,0.127899,0.553696,0.244131


# Random Forest

Для Random forest не будем использовать нормализацию.

In [19]:
# Drop the name of the full scaled/encoded frame; it is not used below.
del X_lr

In [20]:
# Preprocessing for the random forest: numeric NaN -> 0.0 without scaling,
# target encoding for the categorical columns. Columns not listed in either
# transformer would be dropped (default remainder).
pipe = ColumnTransformer(
    transformers=[
        (
            "num",
            SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0.0),
            num_features,
        ),
        ("cat", TargetEncoder(random_state=RAND), cat_features),
    ],
    n_jobs=24,
    verbose=True,
)
# Output column order follows the transformers: numeric first, then categorical.
X_ = pd.DataFrame(
    pipe.fit_transform(X, y),
    columns=num_features + cat_features,
    index=X.index,
)
X_

Unnamed: 0_level_0,month_decision,weekday_decision,days30_165L,maritalst_385M,pmtscount_423L,annuity_780A,annuitynextmonth_57A,avginstallast24m_3658937A,avgoutstandbalancel6m_4187114A,cntincpaycont9m_3716944L,...,subjectroles_name_541M,subjectroles_name_838M,classificationofcontr_active,contractst_active,employers_name,familystate,purposeofcred_active,subjectrole_active,subjectrole_closed,empladdr
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,1.0,4.0,0.0,0.0,0.0,1917.6000,0.0,0.0000,0.000,0.0,...,0.032494,0.032494,0.031986,0.031986,0.029776,0.028585,0.031986,0.031986,0.031986,0.040152
1.0,1.0,4.0,0.0,0.0,0.0,3134.0000,0.0,0.0000,0.000,0.0,...,0.032918,0.032918,0.032286,0.032286,0.029451,0.037873,0.032286,0.032286,0.032286,0.041005
2.0,1.0,5.0,0.0,0.0,0.0,4937.0000,0.0,0.0000,0.000,0.0,...,0.032702,0.032702,0.032025,0.032025,0.029128,0.041243,0.032025,0.032025,0.032025,0.028794
3.0,1.0,4.0,0.0,0.0,0.0,4643.6000,0.0,0.0000,0.000,0.0,...,0.032918,0.032918,0.032286,0.032286,0.029451,0.028448,0.032286,0.032286,0.032286,0.039149
4.0,1.0,5.0,0.0,0.0,0.0,3390.2000,0.0,0.0000,0.000,0.0,...,0.032494,0.032494,0.031986,0.031986,0.029776,0.028585,0.031986,0.031986,0.031986,0.036629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703450.0,10.0,1.0,0.0,2.0,0.0,3675.4001,0.0,7356.8003,16392.496,15.0,...,0.031914,0.031499,0.030177,0.030542,0.030412,0.028589,0.030679,0.030578,0.030774,0.028345
2703451.0,10.0,1.0,0.0,2.0,0.0,7088.6000,6191.6,12553.2000,105129.310,17.0,...,0.031733,0.031475,0.030165,0.030549,0.030442,0.028458,0.030688,0.030557,0.030750,0.028223
2703452.0,10.0,1.0,0.0,2.0,0.0,7788.8003,0.0,2662.4001,0.000,0.0,...,0.031802,0.031429,0.030180,0.030545,0.030395,0.028488,0.030692,0.030496,0.030697,0.028284
2703453.0,10.0,1.0,1.0,0.0,0.0,1195.4000,2827.2,8212.6010,47943.062,7.0,...,0.031733,0.031475,0.030165,0.030549,0.030442,0.028458,0.030688,0.030557,0.030750,0.028223


In [21]:
# Same stratified 80/20 split and seed as for the linear models, applied to
# the unscaled frame X_.
X_train, X_test, y_train, y_test, weeks_train, weeks_test = train_test_split(
    X_, y, weeks, test_size=0.2, stratify=y, shuffle=True, random_state=RAND
)

# Validation set: 16% of the training part (12.8% of all rows).
X_train_, X_val, y_train_, y_val, weeks_train_, weeks_val = train_test_split(
    X_train,
    y_train,
    weeks_train,
    test_size=0.16,
    stratify=y_train,
    shuffle=True,
    random_state=RAND,
)

In [22]:
# Random forest with default hyper-parameters and balanced class weights.
rf = RandomForestClassifier(n_jobs=24, random_state=RAND, class_weight="balanced")
rf.fit(X_train_, y_train_)
metrics_ = metrics_estimation(
    rf, X_train_, y_train_, X_test, y_test, weeks_train_, weeks_test, name="RF"
)
# Append the RF rows to the running table.
metrics = pd.concat([metrics, metrics_])
metrics

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,gini_stability,"overfitting, %"
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR_train,0.724331,0.794145,0.077931,0.717227,0.140587,0.562475,0.565279,0.366829
LR_test,0.725659,0.791242,0.07794,0.713408,0.140528,0.561868,0.552538,0.366829
LDA_train,0.964572,0.797184,0.247034,0.061981,0.099098,0.127526,0.572686,0.244131
LDA_test,0.964514,0.795242,0.243781,0.061256,0.09791,0.127899,0.553696,0.244131
RF_train,0.999965,1.0,1.0,0.998884,0.999442,0.023363,1.0,26.521781
RF_test,0.968565,0.790378,0.526316,0.001042,0.002079,0.156773,0.535937,26.521781


# XGBoost

Для бустингов target encoder использовать не будем. Пропуски в числовых данных оставим. В категориальных пропуски уже заменены на 'None'.

In [23]:
# Same stratified splits as before, but on the frame X itself (no target
# encoding, no numeric imputation applied here).
X_train, X_test, y_train, y_test, weeks_train, weeks_test = train_test_split(
    X, y, weeks, test_size=0.2, stratify=y, shuffle=True, random_state=RAND
)

# Validation set: 16% of the training part (12.8% of all rows).
X_train_, X_val, y_train_, y_val, weeks_train_, weeks_val = train_test_split(
    X_train,
    y_train,
    weeks_train,
    test_size=0.16,
    stratify=y_train,
    shuffle=True,
    random_state=RAND,
)
# Ratio of negative to positive labels in the final training portion; passed
# as scale_pos_weight to the boosting models below.
scale_pos_weight = float(np.sum(y_train_ == 0)) / np.sum(y_train_ == 1)

In [None]:
# XGBoost on a CUDA device with categorical support enabled. AUC is evaluated
# on the validation pair and training stops after 100 rounds without
# improvement.
xgb_clf = XGBClassifier(
    eval_metric="auc",
    early_stopping_rounds=100,
    random_state=RAND,
    scale_pos_weight=scale_pos_weight,
    enable_categorical=True,
    device="cuda",
)
# Validation pair monitored during training.
eval_set = [(X_val, y_val)]

xgb_clf.fit(X_train_, y_train_, eval_set=eval_set, verbose=0)

In [26]:
# Evaluate XGBoost on the training portion and the test set, then append the
# rows to the running table.
metrics_ = metrics_estimation(
    xgb_clf,
    X_train_,
    y_train_,
    X_test,
    y_test,
    weeks_train_,
    weeks_test,
    name="XGBoost",
)
metrics = pd.concat([metrics, metrics_])
metrics

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,gini_stability,"overfitting, %"
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR_train,0.724331,0.794145,0.077931,0.717227,0.140587,0.562475,0.565279,0.366829
LR_test,0.725659,0.791242,0.07794,0.713408,0.140528,0.561868,0.552538,0.366829
LDA_train,0.964572,0.797184,0.247034,0.061981,0.099098,0.127526,0.572686,0.244131
LDA_test,0.964514,0.795242,0.243781,0.061256,0.09791,0.127899,0.553696,0.244131
RF_train,0.999965,1.0,1.0,0.998884,0.999442,0.023363,1.0,26.521781
RF_test,0.968565,0.790378,0.526316,0.001042,0.002079,0.156773,0.535937,26.521781
XGBoost_train,0.801754,0.898543,0.119748,0.835483,0.209472,0.43229,0.77209,12.090518
XGBoost_test,0.789452,0.801622,0.092986,0.650797,0.162723,0.447085,0.575426,12.090518


# LightGBM

In [27]:
# LightGBM classifier with the same seed and positive-class weight as the
# XGBoost model; all other hyper-parameters are left at their defaults.
lgb_clf = LGBMClassifier(
    random_state=RAND, scale_pos_weight=scale_pos_weight, n_jobs=24
)

In [None]:
# Train with AUC-monitored early stopping (100 rounds) on the validation split.
# The validation pair is spelled out here instead of reusing the `eval_set`
# variable created in the XGBoost cell, so this cell does not depend on that
# cell having been run.
# The callback is referenced as `lgb.early_stopping` because a later cell
# rebinds the bare name `early_stopping` to a Keras callback; the qualified
# name keeps this cell re-runnable afterwards.
lgb_clf.fit(
    X_train_,
    y_train_,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
    ],
)

In [29]:
# Evaluate LightGBM on the training portion and the test set, then append the
# rows to the running table.
metrics_ = metrics_estimation(
    lgb_clf,
    X_train_,
    y_train_,
    X_test,
    y_test,
    weeks_train_,
    weeks_test,
    name="LightGBM",
)
metrics = pd.concat([metrics, metrics_])
metrics

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,gini_stability,"overfitting, %"
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR_train,0.724331,0.794145,0.077931,0.717227,0.140587,0.562475,0.565279,0.366829
LR_test,0.725659,0.791242,0.07794,0.713408,0.140528,0.561868,0.552538,0.366829
LDA_train,0.964572,0.797184,0.247034,0.061981,0.099098,0.127526,0.572686,0.244131
LDA_test,0.964514,0.795242,0.243781,0.061256,0.09791,0.127899,0.553696,0.244131
RF_train,0.999965,1.0,1.0,0.998884,0.999442,0.023363,1.0,26.521781
RF_test,0.968565,0.790378,0.526316,0.001042,0.002079,0.156773,0.535937,26.521781
XGBoost_train,0.801754,0.898543,0.119748,0.835483,0.209472,0.43229,0.77209,12.090518
XGBoost_test,0.789452,0.801622,0.092986,0.650797,0.162723,0.447085,0.575426,12.090518
LightGBM_train,0.77506,0.896789,0.10993,0.867326,0.195128,0.435288,0.777927,8.768459
LightGBM_test,0.766949,0.824493,0.091529,0.718512,0.162374,0.446039,0.619085,8.768459


# CatBoost

In [30]:
# Validation pair used for early stopping.
eval_set = [(X_val, y_val)]
# CatBoost on GPU; the categorical columns are declared through cat_features
# and the positive class gets the same weight as in the other boosters.
cat_clf = CatBoostClassifier(
    eval_metric="AUC",
    random_state=RAND,
    scale_pos_weight=scale_pos_weight,
    cat_features=cat_features,
    task_type="GPU",
)

In [None]:
# Fit with early stopping after 100 rounds without improvement on the
# validation pair; verbose=0 turns the training log off.
cat_clf.fit(X_train_, y_train_, eval_set=eval_set, early_stopping_rounds=100, verbose=0)

In [None]:
# Evaluate CatBoost and append its rows to the running table (the table is
# not displayed in this cell).
metrics_ = metrics_estimation(
    cat_clf,
    X_train_,
    y_train_,
    X_test,
    y_test,
    weeks_train_,
    weeks_test,
    name="CatBoost",
)
metrics = pd.concat([metrics, metrics_])
In [36]:
# Compare the models on held-out data only. Test rows are selected by the
# "_test" suffix that metrics_estimation puts into the index rather than by
# position, so the selection does not rely on train and test rows strictly
# alternating in the table.
# Yellow marks each column's minimum and light green its maximum; note that
# for "Logloss" and "overfitting, %" the smaller value is the better one.
metrics.loc[metrics.index.str.endswith("_test")].style.highlight_min(
    color="yellow", axis=0
).highlight_max(color="lightgreen", axis=0)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,gini_stability,"overfitting, %"
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LR_test,0.725659,0.791242,0.07794,0.713408,0.140528,0.561868,0.552538,0.366829
LDA_test,0.964514,0.795242,0.243781,0.061256,0.09791,0.127899,0.553696,0.244131
RF_test,0.968565,0.790378,0.526316,0.001042,0.002079,0.156773,0.535937,26.521781
XGBoost_test,0.789452,0.801622,0.092986,0.650797,0.162723,0.447085,0.575426,12.090518
LightGBM_test,0.766949,0.824493,0.091529,0.718512,0.162374,0.446039,0.619085,8.768459
CatBoost_test,0.76969,0.853116,0.098676,0.777685,0.17513,0.463001,0.676212,2.496882


# TabTransformer

Проверим также, насколько [TabTransformer](https://keras.io/examples/structured_data/tabtransformer/) подходит для этой задачи.

In [None]:
import os

# Set TF_ENABLE_ONEDNN_OPTS to "0" before TensorFlow is imported below
# (presumably so the flag is picked up at import time).
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import tensorflow as tf
from tabtransformer import TabTransformer

# Set the TensorFlow logger level to INFO.
tf.get_logger().setLevel("INFO")

In [22]:
# Sorted unique values (as strings) of every categorical column, taken from
# the full frame X; used as the fixed vocabulary of the ordinal encoder.
cats = [np.unique(np.array(X[col]).ravel().astype(str)) for col in cat_features]
# Categorical branch: None -> "None", then each value is mapped to its
# position in the column vocabulary; values outside it are encoded as -1.
cat_pipe = Pipeline(
    [
        (
            "none_impute",
            SimpleImputer(missing_values=None, strategy="constant", fill_value="None"),
        ),
        (
            "orlEnc_with_map",
            OrdinalEncoder(
                categories=cats,
                handle_unknown="use_encoded_value",
                unknown_value=-1,
                dtype=int,
            ),
        ),
    ]
)
# Numeric branch: NaN -> 0, no scaling.
num_pipe = Pipeline(
    [
        (
            "zero_impute",
            SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=0),
        )
    ]
)
# Output column order: categorical features first, numeric features second.
preprocessor = ColumnTransformer(
    transformers=[
        ("orlEncdr_with_map", cat_pipe, cat_features),
        ("num", num_pipe, num_features),
    ],
    n_jobs=24,
    verbose=True,
    remainder="passthrough",
)

In [23]:
# Apply the TabTransformer preprocessing and rebuild a frame with the original
# index; columns come out categorical first, numeric second.
X_ = pd.DataFrame(
    preprocessor.fit_transform(X, y),
    columns=cat_features + num_features,
    index=X.index,
)
X_

Unnamed: 0_level_0,requesttype_4525192L,credtype_322L,disbursementtype_67L,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastcancelreason_561M,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectreason_759M,lastrejectreasonclient_4145040M,...,education_max,avg_days_before_or_past_due,installmentamount_active,isdebitcard,residualamount_closed,num_total_inst,num_total_paid_inst,cred_closure_date_days_ago,actualdpd,sellerplace_cnt
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,1.0,0.0,1.0,44.0,213.0,73.0,44.0,186.0,17.0,13.0,...,2.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,1.0,0.0,1.0,44.0,213.0,73.0,44.0,186.0,17.0,13.0,...,2.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,1.0,0.0,1.0,44.0,213.0,73.0,44.0,186.0,17.0,13.0,...,2.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,1.0,0.0,1.0,44.0,213.0,67.0,44.0,186.0,15.0,13.0,...,3.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4.0,1.0,0.0,1.0,44.0,213.0,41.0,44.0,186.0,17.0,13.0,...,3.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703450.0,1.0,0.0,1.0,8.0,48.0,73.0,44.0,186.0,15.0,12.0,...,1.0,-23.0,0.000,1.0,0.0,92.0,113.0,-27.0,0.0,8.0
2703451.0,1.0,0.0,1.0,44.0,213.0,73.0,44.0,186.0,17.0,13.0,...,1.0,-12.0,0.000,1.0,0.0,69.0,75.0,-7.0,0.0,3.0
2703452.0,1.0,0.0,1.0,17.0,178.0,22.0,44.0,186.0,17.0,13.0,...,1.0,-12.0,4497.532,0.0,0.0,9.0,9.0,-321.0,0.0,1.0
2703453.0,1.0,0.0,1.0,44.0,213.0,73.0,17.0,68.0,17.0,13.0,...,2.0,-33.0,0.000,1.0,0.0,119.0,119.0,-293.0,0.0,4.0


In [24]:
# Same stratified 80/20 split and seed as in the earlier sections, applied to
# the ordinal-encoded frame X_.
X_train, X_test, y_train, y_test, weeks_train, weeks_test = train_test_split(
    X_, y, weeks, test_size=0.2, stratify=y, shuffle=True, random_state=RAND
)

# Validation set: 16% of the training part (12.8% of all rows).
X_train_, X_val, y_train_, y_val, weeks_train_, weeks_val = train_test_split(
    X_train,
    y_train,
    weeks_train,
    test_size=0.16,
    stratify=y_train,
    shuffle=True,
    random_state=RAND,
)

In [25]:
# Balanced class weights for the Keras training loop.
# NOTE(review): the weights are computed from y_train (training + validation
# rows) while the model is fitted on y_train_; both splits are stratified, so
# the class ratio should be nearly the same — confirm this is intended.
class_weights = compute_class_weight(
    class_weight="balanced", classes=np.unique(y_train), y=y_train
)
# Map each class label to its weight, the form passed to fit(class_weight=...).
class_weights = dict(zip(np.unique(y_train), class_weights))
class_weights

{0: 0.5162287434949769, 1: 15.904766245604897}

In [None]:
# Build the TabTransformer model from the imported `tabtransformer` module.
# NOTE(review): the constructor is not visible here; presumably the training
# frame is used to derive per-column vocabularies/statistics — confirm.
clf = TabTransformer(
    X_train_,
    cat_features,
    num_features,
    embedding_dims=16,
    num_transformer_blocks=4,
    num_heads=6,
    use_column_embedding=True,
)

In [28]:
# Stop training when the validation AUC has not improved for 3 epochs and
# restore the weights of the best epoch.
# NOTE(review): this rebinds the name `early_stopping`, which the first cell
# imported from lightgbm; any code run afterwards that uses the bare name gets
# this Keras callback instead.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc", verbose=1, patience=3, mode="max", restore_best_weights=True
)

In [None]:
# Train for at most 20 epochs with class weighting; the validation split
# drives the early-stopping callback.
history = clf.fit(
    x=X_train_,
    y=y_train_,
    epochs=20,
    batch_size=128,
    validation_data=(X_val, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping],
)

In [30]:
# Evaluate the TabTransformer. The name "TabTransformer" selects the dedicated
# branches in metrics_estimation (predict returns labels and scores, score
# column 0). The result is only displayed, not appended to `metrics`.
metrics_estimation(
    clf,
    X_train_,
    y_train_,
    X_test,
    y_test,
    weeks_train_,
    weeks_test,
    name="TabTransformer",
)

[1m8015/8015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 10ms/step
[1m2386/2386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step


Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss,gini_stability,"overfitting, %"
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TabTransformer_train,0.707634,0.824228,0.07927,0.781905,0.143947,0.539401,0.630339,5.310152
TabTransformer_test,0.701361,0.782668,0.072446,0.720075,0.131647,0.553359,0.533733,5.310152


# Вывод

Создано 7 различных baseline-моделей. Лучший результат показал CatBoost (ROC AUC = 0.853116, gini = 0.676212) с большим отрывом от LightGBM (ROC AUC = 0.824493, gini = 0.619085). Также проверен TabTransformer: он показал результаты на уровне Random Forest и оказался примерно на 2 % хуже линейных моделей. Выберем 3 наилучшие модели: CatBoost, LightGBM, XGBoost.