In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Apply seaborn's default plotting settings for the rest of the notebook.
sns.set()

# Data processing

In [4]:
# Show up to 300 columns when displaying wide frames.
# The fully-qualified key is used on purpose: abbreviated keys such as
# "max_columns" are resolved by pattern matching and raise OptionError as soon
# as more than one registered option matches the pattern.
pd.set_option("display.max_columns", 300)

In [5]:
# Load the raw application tables (train carries the TARGET column used below).
train, test = (
    pd.read_csv("data/raw/application_train.csv"),
    pd.read_csv("data/raw/application_test.csv"),
)

In [6]:
# Names of every train column that contains the substring "FLAG".
fl_col = [name for name in train.columns if "FLAG" in name]

In [7]:
# Names of every train column that contains the substring "FLAG".
fl_col = [var for var in train.columns if "FLAG" in var]

# Recode the flag columns to booleans: True where the raw value is the string
# "Y", False for anything else. A vectorised comparison is used instead of
# `.agg([lambda x: ...])`, whose lambda is written for scalars and only works
# when pandas happens to fall back to element-wise application.
# NOTE(review): any FLAG column that is not encoded as "Y"/"N" (e.g. numeric
# 0/1) becomes all-False here, exactly as the original expression intended --
# confirm that this is desired for every column in `fl_col`.
train[fl_col] = train[fl_col].eq("Y")
test[fl_col] = test[fl_col].eq("Y")

In [8]:
# Missing annuities are approximated as 20% of the applicant's total income.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `frame[col]` is chained assignment, which operates on a temporary object
# and leaves the frame untouched when pandas Copy-on-Write is active.
train["AMT_ANNUITY"] = train["AMT_ANNUITY"].fillna(train["AMT_INCOME_TOTAL"] * 0.2)
test["AMT_ANNUITY"] = test["AMT_ANNUITY"].fillna(test["AMT_INCOME_TOTAL"] * 0.2)

In [10]:
# Difference between the goods price and the credit amount (goods price minus credit).
train['CREDIT_SUB_GOODP'] = train['AMT_GOODS_PRICE'].sub(train['AMT_CREDIT'])

In [11]:
# Mean difference and mean TARGET over the rows where the goods price exceeds the credit.
train.loc[train['CREDIT_SUB_GOODP'] > 0, ['CREDIT_SUB_GOODP', "TARGET"]].mean()

CREDIT_SUB_GOODP    179048.076923
TARGET                   0.050000
dtype: float64

In [12]:
# Engineered features for the train set.
# Ratios relative to total income.
train['CREDIT_REL_SIZE'] = train['AMT_CREDIT'].div(train['AMT_INCOME_TOTAL'])
train['ANNUITY_DIV_INCOME'] = train['AMT_ANNUITY'].div(train['AMT_INCOME_TOTAL'])
# Credit amount divided by the annuity ("pay years" in the original notes).
train['PAY_YEARS'] = train['AMT_CREDIT'].div(train['AMT_ANNUITY'])
# Goods price minus credit amount.
train['CREDIT_SUB_GOODP'] = train['AMT_GOODS_PRICE'].sub(train['AMT_CREDIT'])
# Flat 5 000 charge for car owners, 0 otherwise.
train['CAR_EXPENSES'] = train['FLAG_OWN_CAR'].mul(5_000)
# Income left after the car charge and the annuity.
train['FREE_MONEY'] = (
    train['AMT_INCOME_TOTAL']
    .sub(train['CAR_EXPENSES'])
    .sub(train['AMT_ANNUITY'])
)

In [13]:
# Engineered features for the test set (same formulas as for train).
# Ratios relative to total income.
test['CREDIT_REL_SIZE'] = test['AMT_CREDIT'].div(test['AMT_INCOME_TOTAL'])
test['ANNUITY_DIV_INCOME'] = test['AMT_ANNUITY'].div(test['AMT_INCOME_TOTAL'])
# Credit amount divided by the annuity ("pay years" in the original notes).
test['PAY_YEARS'] = test['AMT_CREDIT'].div(test['AMT_ANNUITY'])
# Goods price minus credit amount.
test['CREDIT_SUB_GOODP'] = test['AMT_GOODS_PRICE'].sub(test['AMT_CREDIT'])
# Flat 5 000 charge for car owners, 0 otherwise.
test['CAR_EXPENSES'] = test['FLAG_OWN_CAR'].mul(5_000)
# Income left after the car charge and the annuity.
test['FREE_MONEY'] = (
    test['AMT_INCOME_TOTAL']
    .sub(test['CAR_EXPENSES'])
    .sub(test['AMT_ANNUITY'])
)

In [14]:
# Medians are computed on the TRAIN set only and reused for the test set, so
# both frames are imputed with the same values.
car_age_median = train["OWN_CAR_AGE"].median()
ext1_median = train["EXT_SOURCE_1"].median()
ext2_median = train["EXT_SOURCE_2"].median()
ext3_median = train["EXT_SOURCE_3"].median()

train_medians = {
    "OWN_CAR_AGE": car_age_median,
    "EXT_SOURCE_1": ext1_median,
    "EXT_SOURCE_2": ext2_median,
    "EXT_SOURCE_3": ext3_median,
}

# Assign the filled column back instead of `frame[col].fillna(..., inplace=True)`:
# the in-place form is chained assignment and does not modify the frame when
# pandas Copy-on-Write is active.
for col, col_median in train_medians.items():
    train[col] = train[col].fillna(col_median)
    test[col] = test[col].fillna(col_median)

In [15]:
# Impute the goods-price/credit difference with the TRAIN median in both frames.
# The median is computed once, before filling, and the filled column is
# assigned back: `frame[col].fillna(..., inplace=True)` is chained assignment
# and does not modify the frame when pandas Copy-on-Write is active.
credit_sub_goodp_median = train['CREDIT_SUB_GOODP'].median()
train['CREDIT_SUB_GOODP'] = train['CREDIT_SUB_GOODP'].fillna(credit_sub_goodp_median)
test['CREDIT_SUB_GOODP'] = test['CREDIT_SUB_GOODP'].fillna(credit_sub_goodp_median)

In [16]:
def standartization(df, col, mn, st):
    """Standardise one column of ``df`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is overwritten.
    col : str
        Name of the column to transform.
    mn : float
        Value subtracted from the column (the mean supplied by the caller).
    st : float
        Value the centred column is divided by (the std supplied by the caller).

    Returns
    -------
    pandas.DataFrame
        The input ``df`` itself (mutated, not copied).
    """
    centred = df[col] - mn
    df[col] = centred / st
    return df

# Standardise the numeric features. The mean and std are always taken from the
# TRAIN column (before it is transformed) and applied to both train and test,
# so the two frames share one scale. One loop replaces eight copy-pasted stanzas.
for col in [
    'CREDIT_REL_SIZE',
    'ANNUITY_DIV_INCOME',
    'PAY_YEARS',
    'FREE_MONEY',
    'DAYS_EMPLOYED',
    'DAYS_REGISTRATION',
    'OWN_CAR_AGE',
    'CREDIT_SUB_GOODP',
]:
    col_mean = train[col].mean()
    col_std = train[col].std()
    train = standartization(train, col, col_mean, col_std)
    test = standartization(test, col, col_mean, col_std)

In [17]:
# Feature columns fed to every model below.
x_cols = [
    'CREDIT_REL_SIZE',
    'ANNUITY_DIV_INCOME',
    'PAY_YEARS',
    'FREE_MONEY',
    'DAYS_EMPLOYED',
    'DAYS_REGISTRATION',
    'OWN_CAR_AGE',
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'CREDIT_SUB_GOODP',
]

# Stacking

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, log_loss

In [19]:
def crossValidation(df, x_col, y_col, cvModel, debag=False, **kwargs):
    """Run 5-fold cross-validation for a classifier and return the mean ROC AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    x_col : list of str
        Names of the feature columns.
    y_col : str
        Name of the target column.
    cvModel : callable
        Estimator class (not an instance). A fresh model is built per fold as
        ``cvModel(random_state=<fold number>, **kwargs)`` and must provide
        ``fit`` and ``predict_proba``.
    debag : bool, default False
        When False, per-fold and averaged metrics are printed; when True the
        function is silent.
    **kwargs
        Extra keyword arguments forwarded to ``cvModel``.

    Returns
    -------
    float
        ROC AUC averaged over the five folds.
    """
    logList = []
    aucList = []

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for i, (train_idx, val_idx) in enumerate(kf.split(df), start=1):
        # KFold yields POSITIONAL indices, so rows are selected with .iloc;
        # .loc would pick wrong (or missing) rows for any non-default index.
        train_part = df.iloc[train_idx]
        val_part = df.iloc[val_idx]

        # The fold number doubles as the model seed.
        model = cvModel(random_state=i, **kwargs)
        model.fit(X=train_part[x_col], y=train_part[y_col])
        val_pred = model.predict_proba(val_part[x_col])

        logloss_val = log_loss(val_part[y_col], val_pred)
        # Column 1 holds the predicted probability of the positive class.
        auc_val = roc_auc_score(val_part[y_col], val_pred[:, 1])

        logList.append(logloss_val)
        aucList.append(auc_val)

        if not debag:
            print(f'Fold: {i}, AUC: {auc_val}, LOSS: {logloss_val}')
    if not debag:
        print(f'AV_AUC: {np.mean(aucList)}, AV_LOGLOSS: {np.mean(logList)}')

    return np.mean(aucList)

### Случайный лес

In [20]:
# Random-forest base learner used later in the stacking ensembles.
rfc = RandomForestClassifier(
    random_state=42,
    n_estimators=5,
)

In [21]:
# 5-fold CV of a 5-tree random forest on the selected features.
crossValidation(
    train,
    x_cols,
    'TARGET',
    RandomForestClassifier,
    n_estimators=5,
)

Fold: 1, AUC: 0.6182202415552809, LOSS: 1.4774948635945355
Fold: 2, AUC: 0.6125957720509214, LOSS: 1.5077271767734293
Fold: 3, AUC: 0.6135156998426794, LOSS: 1.5411311387715594
Fold: 4, AUC: 0.6109880899532014, LOSS: 1.5038289935523392
Fold: 5, AUC: 0.6094975862106257, LOSS: 1.5183719189471798
AV_AUC: 0.6129634779225417, AV_LOGLOSS: 1.5097108183278087


0.6129634779225417

### Логистическая регрессия 1

In [36]:
# L1-penalised, class-balanced logistic regression (liblinear solver).
logreg1 = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    class_weight='balanced',
)

In [37]:
# 5-fold CV of the L1-penalised logistic regression configuration.
crossValidation(
    train,
    x_cols,
    'TARGET',
    LogisticRegression,
    solver='liblinear',
    penalty='l1',
    class_weight='balanced',
)

Fold: 1, AUC: 0.728037936594313, LOSS: 0.6125242184884194
Fold: 2, AUC: 0.7279082084400457, LOSS: 0.6116663469003826
Fold: 3, AUC: 0.7256648811319947, LOSS: 0.6111431671004819
Fold: 4, AUC: 0.7228632494522054, LOSS: 0.6104076414451772
Fold: 5, AUC: 0.7261002472740659, LOSS: 0.6113319870104337
AV_AUC: 0.7261149045785249, AV_LOGLOSS: 0.6114146721889789


0.7261149045785249

### Логистическая регрессия 2

In [38]:
# L2-penalised, class-balanced logistic regression with C=0.2.
logreg2 = LogisticRegression(
    C=0.2,
    penalty='l2',
    class_weight='balanced',
)

In [39]:
# 5-fold CV of the L2-penalised logistic regression configuration.
crossValidation(
    train,
    x_cols,
    'TARGET',
    LogisticRegression,
    C=0.2,
    penalty='l2',
    class_weight='balanced',
)

Fold: 1, AUC: 0.7280393514573622, LOSS: 0.6125059996924037
Fold: 2, AUC: 0.7279052398952158, LOSS: 0.6116653310104097
Fold: 3, AUC: 0.7256641487265703, LOSS: 0.6111381086319656
Fold: 4, AUC: 0.7228657606517952, LOSS: 0.6104064247546546
Fold: 5, AUC: 0.7261043531382836, LOSS: 0.6113347133550744
AV_AUC: 0.7261157707738454, AV_LOGLOSS: 0.6114101154889016


0.7261157707738454

### CatBoost 1

In [40]:
# Hyper-parameters of the first (shallow) CatBoost model.
params1 = dict(depth=3, l2_leaf_reg=2, learning_rate=0.5)

In [51]:
# First CatBoost base learner: 50 silent iterations with the params1 settings.
cboost1 = CatBoostClassifier(
    iterations=50,
    random_seed=42,
    logging_level='Silent',
    **params1,
)

In [52]:
# 5-fold CV of CatBoost with the params1 settings.
crossValidation(
    train,
    x_cols,
    'TARGET',
    CatBoostClassifier,
    iterations=50,
    logging_level='Silent',
    **params1,
)

Fold: 1, AUC: 0.7540672006926712, LOSS: 0.24697885651925353
Fold: 2, AUC: 0.7532037715702247, LOSS: 0.2469967606020549
Fold: 3, AUC: 0.7520385336629576, LOSS: 0.2520959544951597
Fold: 4, AUC: 0.7493907260541662, LOSS: 0.24636518703175972
Fold: 5, AUC: 0.7516036800496809, LOSS: 0.24772944992508583
AV_AUC: 0.75206078240594, AV_LOGLOSS: 0.24803324171466273


0.75206078240594

### CatBoost 2

In [55]:
# Hyper-parameters of the second (deep) CatBoost model.
# NOTE(review): depth and l2_leaf_reg are stored as floats here -- confirm that
# the installed CatBoost version accepts a float tree depth.
params2 = dict(
    depth=10.0,
    l2_leaf_reg=1.0,
    learning_rate=0.3013830452079781,
)

In [56]:
# Second CatBoost base learner: 50 silent iterations with the params2 settings.
cboost2 = CatBoostClassifier(
    iterations=50,
    random_seed=42,
    logging_level='Silent',
    **params2,
)

In [57]:
# 5-fold CV of CatBoost with the params2 settings.
crossValidation(
    train,
    x_cols,
    'TARGET',
    CatBoostClassifier,
    iterations=50,
    logging_level='Silent',
    **params2,
)

Fold: 1, AUC: 0.7505386966008623, LOSS: 0.2480650118050493
Fold: 2, AUC: 0.7506552901572633, LOSS: 0.24796587494801398
Fold: 3, AUC: 0.7471726500004674, LOSS: 0.2536908762567409
Fold: 4, AUC: 0.7446106702448629, LOSS: 0.24779626365069513
Fold: 5, AUC: 0.7462448346041322, LOSS: 0.24962404094372911
AV_AUC: 0.7478444283215178, AV_LOGLOSS: 0.2494284135208457


0.7478444283215178

### Stacking

In [58]:
# Stack all five base learners under a logistic-regression meta-model,
# fit on the full train set and write test predictions to a submission file.
X = train[x_cols]
y = train["TARGET"]

estimators = [
    ('rf', rfc),
    ('logreg1', logreg1),
    ('logreg2', logreg2),
    ('cboost1', cboost1),
    ('cboost2', cboost2),
]

clf = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
)
clf.fit(X, y)
y_score = clf.predict_proba(test[x_cols])

# Column 1 is the predicted probability of the positive class.
# NOTE(review): this assumes the sample submission rows are in the same order
# as `test` -- confirm.
sub = pd.read_csv("data/raw/sample_submission.csv")
sub['TARGET'] = y_score[:, 1]
sub.to_csv("stacking_1.csv", index=False)

<b>SUBMISSION SCORE: 0.74419</b>

### Убираем лучшую модель

In [59]:
# Same stacking setup as the previous experiment, but without the 'cboost1'
# base learner; predictions are written to a second submission file.
X = train[x_cols]
y = train["TARGET"]

estimators = [
    ('rf', rfc),
    ('logreg1', logreg1),
    ('logreg2', logreg2),
    ('cboost2', cboost2),
]

clf = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
)
clf.fit(X, y)
y_score = clf.predict_proba(test[x_cols])

# Column 1 is the predicted probability of the positive class.
# NOTE(review): this assumes the sample submission rows are in the same order
# as `test` -- confirm.
sub = pd.read_csv("data/raw/sample_submission.csv")
sub['TARGET'] = y_score[:, 1]
sub.to_csv("stacking_2.csv", index=False)

<b>SUBMISSION SCORE: 0.74068</b>

### Results

После исключения самой лучшей модели результат снизился примерно на 0.0035 (с 0.74419 до 0.74068).
Это значительное ухудшение работы стекинга на фоне результатов каждой модели по отдельности.