In [1]:
import os
from tqdm import tqdm
from typing import List, Optional

import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from pandas_profiling import ProfileReport

pd.set_option("display.max_columns", 300)

  import pandas.util.testing as tm


In [2]:
os.listdir("data")

['1st_attempt',
 'train.csv',
 'applications_history.csv',
 'test_data_complete.csv',
 'payments_AFTER_EDA.csv',
 'train_data_complete.csv',
 'applications_history_AFTER_EDA.csv',
 'sample_submit.csv',
 'payments.csv',
 'bki_AFTER_EDA.csv',
 'client_profile.csv',
 'test.csv',
 'client_profile_AFTER_EDA.csv',
 'bki.csv']

# TRAIN.CSV

In [3]:
train = pd.read_csv("data/train.csv")
train.columns = [col.lower() for col in train.columns]
train.head(n=2)

Unnamed: 0,application_number,target,name_contract_type
0,123687442,0,Cash
1,123597908,1,Cash


# SAMPLE_SUBMIT.CSV

In [4]:
sample = pd.read_csv("data/sample_submit.csv")
sample.head(n=2)

Unnamed: 0,APPLICATION_NUMBER,TARGET
0,123724268,0
1,123456549,0


# CLIENT_PROFILE.CSV

In [5]:
client_profile = pd.read_csv("data/client_profile_AFTER_EDA.csv")
client_profile.columns = [col.lower() for col in client_profile.columns]
client_profile.head(n=2)

Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549,,1,0,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,365243,,0,0,2.0,0.62252,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0


# APPLICATIONS_HISTORY.CSV

In [6]:
apps = pd.read_csv("data/applications_history_AFTER_EDA.csv")
apps.columns = [col.lower() for col in apps.columns]
apps.head(n=2)

Unnamed: 0,prev_application_number,application_number,name_contract_type,amount_annuity,amt_application,amount_credit,amount_payment,amount_goods_payment,name_contract_status,days_decision,name_payment_type,code_reject_reason,name_type_suite,name_client_type,name_goods_category,name_portfolio,name_product_type,sellerplace_area,cnt_payment,name_yield_group,days_first_drawing,days_first_due,days_last_due_1st_version,days_last_due,days_termination,nflag_insured_on_approval
0,49298709,123595216,,1730.43,17145.0,17145.0,0.0,17145.0,Approved,73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,35,12.0,middle,365243.0,42.0,300.0,42.0,37.0,0.0
1,50070639,123431468,Cash,25188.615,607500.0,679671.0,,607500.0,Approved,164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,-1,36.0,low_action,365243.0,134.0,916.0,365243.0,365243.0,1.0


# BKI.CSV

In [7]:
bki = pd.read_csv("data/bki_AFTER_EDA.csv")
bki.columns = [col.lower() for col in bki.columns]
bki.head(n=2)

Unnamed: 0,application_number,bureau_id,credit_active,credit_currency,days_credit,credit_day_overdue,days_credit_enddate,days_enddate_fact,amt_credit_max_overdue,cnt_credit_prolong,amt_credit_sum,amt_credit_sum_debt,amt_credit_sum_limit,amt_credit_sum_overdue,credit_type,days_credit_update,amt_annuity
0,123538884,5223613,Active,currency 1,718.0,0,377.0,,19386.81,0,675000.0,320265.495,0.0,0.0,Consumer credit,39.0,
1,123436670,6207544,Closed,currency 1,696.0,0,511.0,511.0,0.0,0,93111.66,0.0,0.0,0.0,Consumer credit,505.0,


# PAYMENTS.CSV

In [8]:
payments = pd.read_csv("data/payments_AFTER_EDA.csv")
payments.columns = [col.lower() for col in payments.columns]
payments.head(n=2)

Unnamed: 0,prev_application_number,application_number,num_instalment_version,num_instalment_number,days_instalment,days_entry_payment,amt_instalment,amt_payment
0,49011181,123664960,1.0,5,1002.0,1015.0,12156.615,12156.615
1,48683432,123497205,1.0,13,442.0,432.0,18392.535,10047.645


# TEST.CSV

In [9]:
test = pd.read_csv("data/test.csv")
test.columns = [col.lower() for col in test.columns]
test.head(n=2)

Unnamed: 0,application_number,name_contract_type
0,123724268,Cash
1,123456549,Cash


###########################################################
###########################################################

In [10]:
# train_sample = train.sample(2100).copy()
# train_sample = train.sample(len(train)).copy()
train_sample = train.copy()
train_sample.head(n=2)

Unnamed: 0,application_number,target,name_contract_type
0,123687442,0,Cash
1,123597908,1,Cash


In [11]:
# DEBUG
# test = test.sample(2100).copy()

In [12]:
def create_client_profile_features(X: pd.DataFrame, copy: bool = True) -> pd.DataFrame:
    """
    Создание признаков на основе профиля клиентов.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков с исходным профилем клиента.

    copy: bool, optional, default = True
        Флаг использования копии датафрейма X.
        Опциональный параметр, по умолчанию, равен True.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Расширенная матрица признаков с профилем клиентов.

    """
    if copy:
        X = X.copy()

    X["days_on_last_job"] = X["days_on_last_job"].replace(365243, np.nan)
    bki_flags = [flag for flag in X.columns if "amt_req_credit_bureau" in flag]
    X["bki_requests_count"] = X[bki_flags].sum(axis=1)
    X["bki_kurtosis"] = X[bki_flags].kurtosis(axis=1)

    X["external_scoring_prod"] = X["external_scoring_rating_1"] * X["external_scoring_rating_2"] * X["external_scoring_rating_3"]
    X["external_scoring_weighted"] = X.external_scoring_rating_1 * 2 + X.external_scoring_rating_2 * 1 + X.external_scoring_rating_3 * 3

    for function_name in ["min", "max", "mean", "nanmedian", "var"]:
        feature_name = "external_scoring_rating_{}".format(function_name)
        X[feature_name] = eval("np.{}".format(function_name))(
            X[["external_scoring_rating_1", "external_scoring_rating_2", "external_scoring_rating_3"]], axis=1
        )

    # Отношение между основными фин. показателями
    X['ratio_credit_to_annuity'] = X['amount_credit'] / X['amount_annuity']
    X["ratio_annuity_to_salary"] = X['amount_annuity'] / X['total_salary']
    X['ratio_credit_to_salary'] = X['amount_credit'] / X['total_salary']
    #X["total_salary_net"] = X["total_salary"] - X["amount_annuity"]

    # Отношение фин. показателей к возрасту и временным фичам
    X["ratio_annuity_to_age"] = X["amount_annuity"] / X["age"]
    X["ratio_credit_to_age"] = X["amount_credit"] / X["age"]
    X["ratio_salary_to_age"] = X["total_salary"] / X["age"]
    X["ratio_salary_to_experience"] = X["total_salary"] / X["days_on_last_job"]
    X["ratio_credit_to_experience"] = X["amount_credit"] / X["days_on_last_job"]
    X["ratio_annuity_to_experience"] = X["amount_annuity"] / X["days_on_last_job"]

    # Отношение врменных признаков
    X["ratio_age_to_experience"] = X["age"] / X["days_on_last_job"]
    X["ratio_salary_to_region_population"] = X["total_salary"] * X["region_population"]
    X["ratio_car_to_experience"] = X["own_car_age"] / X["days_on_last_job"]
    X["ratio_car_to_age"] = X["own_car_age"] / X["age"]

    # Произведение фин. показателей кредита на вероятность дефолта
    # Такая штука называется математическим ожиданием дефолта или ожидаемыми потерями
    X["expected_total_loss_1"] = X["external_scoring_rating_1"] * X["amount_credit"]
    X["expected_total_loss_2"] = X["external_scoring_rating_2"] * X["amount_credit"]
    X["expected_total_loss_3"] = X["external_scoring_rating_3"] * X["amount_credit"]
    X["expected_monthly_loss_1"] = X["external_scoring_rating_1"] * X["amount_annuity"]
    X["expected_monthly_loss_2"] = X["external_scoring_rating_2"] * X["amount_annuity"]
    X["expected_monthly_loss_3"] = X["external_scoring_rating_3"] * X["amount_annuity"]

    return X

In [13]:
client_profile = create_client_profile_features(client_profile)

  mean = values.sum(axis, dtype=np.float64) / count


In [14]:
client_profile.head()

Unnamed: 0,application_number,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123666076,F,0,157500.0,270000.0,13500.0,Incomplete higher,Civil marriage,0.008068,8560,1549.0,,1,0,2.0,0.329471,0.236315,0.678568,0.0,0.0,0.0,0.0,1.0,2.0,3.0,1.428571,0.052832,2.930959,0.236315,0.678568,0.414784,0.329471,0.036237,20.0,0.085714,1.714286,1.577103,31.542056,18.399533,101.678502,174.306004,8.7153,5.526146,1270.71,,,88957.124333,63804.96656,183213.275945,4447.856217,3190.248328,9160.663797
1,123423688,F,0,270000.0,536917.5,28467.0,Secondary / secondary special,Married,0.020246,23187,,,0,0,2.0,0.62252,0.442295,0.802745,0.0,0.0,0.0,0.0,1.0,1.0,2.0,-1.875,0.221026,4.095571,0.442295,0.802745,0.62252,0.62252,0.021654,18.86105,0.105433,1.988583,1.227714,23.155971,11.644456,,,,,5466.42,,,334241.918918,237475.743779,431008.094056,17721.278792,12590.802122,22851.755462
2,123501780,M,1,427500.0,239850.0,23850.0,Incomplete higher,Married,0.072508,14387,326.0,18.0,0,0,3.0,0.409017,0.738159,0.573588,,,,,,,0.0,,0.173177,3.276957,0.409017,0.738159,0.573588,0.573588,0.018056,10.056604,0.055789,0.561053,1.657747,16.6713,29.714325,1311.349693,735.736196,73.159509,44.131902,30997.17,0.055215,0.001251,98102.721947,177047.426953,137575.07445,9755.054903,17605.091235,13680.073069
3,123588799,M,0,112500.0,254700.0,17149.5,Secondary / secondary special,Married,0.019101,14273,1726.0,12.0,0,0,2.0,0.449613,0.308994,0.590233,0.0,0.0,0.0,0.0,0.0,3.0,3.0,6.0,0.082,2.97892,0.308994,0.590233,0.449613,0.449613,0.013183,14.851745,0.15244,2.264,1.201534,17.844882,7.882015,65.179606,147.566628,9.935979,8.269409,2148.8625,0.006952,0.000841,114516.542088,78700.65283,150332.431346,7710.645617,5299.084593,10122.206641
4,123647485,M,0,130500.0,614574.0,19822.5,Lower secondary,Married,0.022625,22954,,,0,0,2.0,0.448024,0.739408,0.15664,0.0,0.0,1.0,0.0,0.0,6.0,7.0,5.393765,0.05189,2.105376,0.15664,0.739408,0.448024,0.448024,0.056603,31.003859,0.151897,4.709379,0.863575,26.774157,5.685284,,,,,2952.5625,,,275343.878527,454420.991996,96266.765058,8880.95499,14656.917009,3104.992971


In [15]:
train_sample = train_sample.merge(client_profile, how="left", on="APPLICATION_NUMBER".lower())
train_sample

Unnamed: 0,application_number,target,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123687442,0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,0.019101,15728.0,1719.0,11.0,0.0,0.0,3.0,0.700784,0.645914,0.716570,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.324353,4.197193,0.645914,0.716570,0.687756,0.700784,9.169231e-04,34.025788,0.159543,5.428571,1.597660,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111
1,123597908,1,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,123526683,0,Cash,F,0.0,135000.0,1006920.0,42660.0,Higher education,Married,0.026392,21557.0,3618.0,,1.0,0.0,2.0,0.475009,0.682149,0.267869,0.0,0.0,0.0,7.0,0.0,4.0,11.0,0.539379,0.086797,2.435775,0.267869,0.682149,0.475009,0.475009,2.860472e-02,23.603376,0.316000,7.458667,1.978940,46.709653,6.262467,37.313433,278.308458,11.791045,5.958264,3562.9200,,,478296.232619,686869.876357,269722.588880,20263.891157,29100.493510,11427.288803
3,123710391,1,Cash,M,0.0,180000.0,518562.0,22972.5,Secondary / secondary special,Married,0.031329,22338.0,,,0.0,0.0,2.0,0.170873,0.171299,0.170446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.004989,1.024383,0.170446,0.171299,0.170873,0.170873,1.212534e-07,22.573164,0.127625,2.880900,1.028405,23.214343,8.058018,,,,,5639.2200,,,88608.035653,88829.188848,88386.882459,3925.370735,3935.167908,3915.573562
4,123590329,1,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110088,123458312,0,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110089,123672463,0,Cash,F,0.0,175500.0,269550.0,12618.0,Secondary / secondary special,Married,0.010032,10519.0,1354.0,,0.0,0.0,2.0,0.528096,0.471774,0.581484,0.0,0.0,0.0,0.0,0.0,2.0,2.0,6.000000,0.144872,3.272417,0.471774,0.581484,0.527118,0.528096,2.006532e-03,21.362340,0.071897,1.535897,1.199544,25.625059,16.684095,129.615953,199.076809,9.319055,7.768833,1760.6160,,,142348.303154,127166.558499,156738.932900,6663.516562,5952.838565,7337.161400
110090,123723001,0,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
110091,123554358,0,Cash,F,2.0,270000.0,1024740.0,49428.0,Incomplete higher,Married,0.019101,15008.0,361.0,22.0,0.0,0.0,4.0,0.653826,0.651227,0.558507,0.0,0.0,0.0,1.0,0.0,1.0,2.0,-1.875000,0.237806,3.634399,0.558507,0.653826,0.621186,0.651227,1.965506e-03,20.731974,0.183067,3.795333,3.293443,68.279584,17.990405,747.922438,2838.614958,136.919668,41.573407,5157.2700,0.060942,0.001466,670001.539719,667338.205604,572324.081646,32317.305956,32188.840903,27605.865593


In [16]:
test = test.merge(client_profile, how="left", on="APPLICATION_NUMBER".lower())
test

Unnamed: 0,application_number,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3
0,123724268,Cash,M,0.0,117000.0,1125000.0,32895.0,Secondary / secondary special,Married,0.028663,16007.0,2646.0,20.0,0.0,0.0,2.0,0.628266,0.628266,0.628266,0.0,0.0,0.0,0.0,1.0,4.0,5.0,4.639906,0.247988,3.769597,0.628266,0.628266,0.628266,0.628266,0.000000,34.199726,0.281154,9.615385,2.055038,70.281752,7.309302,44.217687,425.170068,12.431973,6.049509,3353.5710,0.007559,0.001249,706799.427892,706799.427892,706799.427892,20666.815272,20666.815272,20666.815272
1,123456549,Cash,F,2.0,81000.0,312768.0,17095.5,Secondary / secondary special,Married,0.019689,10315.0,459.0,,0.0,0.0,4.0,0.383325,0.578161,0.188490,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.041774,1.910280,0.188490,0.578161,0.383325,0.383325,0.025307,18.295341,0.211056,3.861333,1.657344,30.321667,7.852642,176.470588,681.411765,37.245098,22.472767,1594.8090,,,119891.880210,180830.265914,58953.494506,6553.137271,9883.951718,3222.322825
2,123428178,Credit Card,F,2.0,157500.0,450000.0,22500.0,Secondary / secondary special,Married,0.019101,13016.0,977.0,,1.0,0.0,4.0,0.267523,0.152544,0.382502,0.0,0.0,0.0,0.0,1.0,6.0,7.0,5.393765,0.015610,1.835096,0.152544,0.382502,0.267523,0.267523,0.008813,20.000000,0.142857,2.857143,1.728642,34.572833,12.100492,161.207779,460.593654,23.029683,13.322416,3008.4075,,,120385.398093,68644.984320,172125.811865,6019.269905,3432.249216,8606.290593
3,123619984,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,123671104,Cash,F,1.0,90000.0,254700.0,24939.0,Higher education,Married,0.015221,17743.0,9258.0,,1.0,0.0,3.0,0.718604,0.505704,0.415347,0.0,0.0,0.0,0.0,1.0,0.0,1.0,6.000000,0.150937,3.188953,0.415347,0.718604,0.546552,0.505704,0.016162,10.212920,0.277100,2.830000,1.405568,14.354957,5.072423,9.721322,27.511342,2.693778,1.916505,1369.8900,,,183028.441193,128802.709015,105788.917802,17921.265390,12611.742286,10358.342446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165136,123487967,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
165137,123536402,Cash,M,0.0,135000.0,450000.0,16807.5,Secondary / secondary special,Single / not married,0.028663,18610.0,10398.0,23.0,0.0,0.0,1.0,0.600624,0.665971,0.535276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.214109,3.473047,0.535276,0.665971,0.600624,0.600624,0.002847,26.773762,0.124500,3.333333,0.903143,24.180548,7.254164,12.983266,43.277553,1.616417,1.789767,3869.5050,0.002212,0.001236,270280.577196,299686.841680,240874.312713,10094.979558,11193.303537,8996.655580
165138,123718238,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
165139,123631557,Cash,F,0.0,112500.0,350181.0,36769.5,Secondary / secondary special,Married,0.030755,20551.0,,,0.0,1.0,2.0,0.761376,0.761376,0.761376,,,,,,,0.0,,0.441364,4.568253,0.761376,0.761376,0.761376,0.761376,0.000000,9.523681,0.326840,3.112720,1.789183,17.039609,5.474186,,,,,3459.9375,,,266619.257702,266619.257702,266619.257702,27995.398940,27995.398940,27995.398940


In [17]:
"""
mask1:
Берем для каждого APPLICATION_NUMBER все предыдущие кредиты со статусом "Approved", такие 
чтобы DAYS_TERMINATION == 365243.0 дней. 
В моем понимании это одобренные, действующие, незакрытые кредиты. Суммируем по ним платежи AMOUNT_ANNUITY 
и добавляем как новую колонку. 
Ноль для отсутствующих значений.


mask2: тоже самое, но по закрытым кредитам. добавим как сумму по закрытым кредитам

Так же делаем для обеих масок и получаем количество закрытых и незакрытых кредитов.


"""
mask1 = (apps["NAME_CONTRACT_STATUS".lower()] == "Approved") & (apps["DAYS_TERMINATION".lower()] == 365243.0)
myseries1_sum = apps[mask1].groupby("APPLICATION_NUMBER".lower())["AMOUNT_ANNUITY".lower()].sum()
myseries1_count = apps[mask1].groupby("APPLICATION_NUMBER".lower())["AMOUNT_ANNUITY".lower()].count()

train_sample = train_sample.merge(myseries1_sum.rename("OTHER_OPEN_CREDITS_SUM".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
train_sample["OTHER_OPEN_CREDITS_SUM".lower()] = train_sample["OTHER_OPEN_CREDITS_SUM".lower()].fillna(0.0)
train_sample = train_sample.merge(myseries1_count.rename("OTHER_OPEN_CREDITS_COUNT".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
train_sample["OTHER_OPEN_CREDITS_COUNT".lower()] = train_sample["OTHER_OPEN_CREDITS_COUNT".lower()].fillna(0.0)

test = test.merge(myseries1_sum.rename("OTHER_OPEN_CREDITS_SUM".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
test["OTHER_OPEN_CREDITS_SUM".lower()] = test["OTHER_OPEN_CREDITS_SUM".lower()].fillna(0.0)
test = test.merge(myseries1_count.rename("OTHER_OPEN_CREDITS_COUNT".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
test["OTHER_OPEN_CREDITS_COUNT".lower()] = test["OTHER_OPEN_CREDITS_COUNT".lower()].fillna(0.0)


mask2 = (apps["NAME_CONTRACT_STATUS".lower()] == "Approved") & (apps["DAYS_TERMINATION".lower()] < 365243.0)
myseries2_sum = apps[mask2].groupby("APPLICATION_NUMBER".lower())["AMOUNT_ANNUITY".lower()].sum()
myseries2_count = apps[mask2].groupby("APPLICATION_NUMBER".lower())["AMOUNT_ANNUITY".lower()].count()

train_sample = train_sample.merge(myseries2_sum.rename("CLOSED_CREDITS_SUM".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
train_sample["CLOSED_CREDITS_SUM".lower()] = train_sample["CLOSED_CREDITS_SUM".lower()].fillna(0.0)
train_sample = train_sample.merge(myseries2_count.rename("CLOSED_CREDITS_COUNT".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
train_sample["CLOSED_CREDITS_COUNT".lower()] = train_sample["CLOSED_CREDITS_COUNT".lower()].fillna(0.0)

test = test.merge(myseries2_sum.rename("CLOSED_CREDITS_SUM".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
test["CLOSED_CREDITS_SUM".lower()] = test["CLOSED_CREDITS_SUM".lower()].fillna(0.0)
test = test.merge(myseries2_count.rename("CLOSED_CREDITS_COUNT".lower()), how="left", 
                                  on="APPLICATION_NUMBER".lower())
test["CLOSED_CREDITS_COUNT".lower()] = test["CLOSED_CREDITS_COUNT".lower()].fillna(0.0)


In [18]:
train_sample.head()

Unnamed: 0,application_number,target,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,other_open_credits_sum,other_open_credits_count,closed_credits_sum,closed_credits_count
0,123687442,0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,0.019101,15728.0,1719.0,11.0,0.0,0.0,3.0,0.700784,0.645914,0.71657,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.324353,4.197193,0.645914,0.71657,0.687756,0.700784,0.0009169231,34.025788,0.159543,5.428571,1.59766,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111,6510.015,1.0,16601.4,2.0
1,123597908,1,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,35355.33,2.0
2,123526683,0,Cash,F,0.0,135000.0,1006920.0,42660.0,Higher education,Married,0.026392,21557.0,3618.0,,1.0,0.0,2.0,0.475009,0.682149,0.267869,0.0,0.0,0.0,7.0,0.0,4.0,11.0,0.539379,0.086797,2.435775,0.267869,0.682149,0.475009,0.475009,0.02860472,23.603376,0.316,7.458667,1.97894,46.709653,6.262467,37.313433,278.308458,11.791045,5.958264,3562.92,,,478296.232619,686869.876357,269722.58888,20263.891157,29100.49351,11427.288803,0.0,0.0,130153.905,4.0
3,123710391,1,Cash,M,0.0,180000.0,518562.0,22972.5,Secondary / secondary special,Married,0.031329,22338.0,,,0.0,0.0,2.0,0.170873,0.171299,0.170446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004989,1.024383,0.170446,0.171299,0.170873,0.170873,1.212534e-07,22.573164,0.127625,2.8809,1.028405,23.214343,8.058018,,,,,5639.22,,,88608.035653,88829.188848,88386.882459,3925.370735,3935.167908,3915.573562,6025.275,1.0,2450.115,1.0
4,123590329,1,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,21873.15,2.0


In [19]:
test.head()

Unnamed: 0,application_number,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,other_open_credits_sum,other_open_credits_count,closed_credits_sum,closed_credits_count
0,123724268,Cash,M,0.0,117000.0,1125000.0,32895.0,Secondary / secondary special,Married,0.028663,16007.0,2646.0,20.0,0.0,0.0,2.0,0.628266,0.628266,0.628266,0.0,0.0,0.0,0.0,1.0,4.0,5.0,4.639906,0.247988,3.769597,0.628266,0.628266,0.628266,0.628266,0.0,34.199726,0.281154,9.615385,2.055038,70.281752,7.309302,44.217687,425.170068,12.431973,6.049509,3353.571,0.007559,0.001249,706799.427892,706799.427892,706799.427892,20666.815272,20666.815272,20666.815272,22500.0,1.0,11175.615,1.0
1,123456549,Cash,F,2.0,81000.0,312768.0,17095.5,Secondary / secondary special,Married,0.019689,10315.0,459.0,,0.0,0.0,4.0,0.383325,0.578161,0.18849,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.041774,1.91028,0.18849,0.578161,0.383325,0.383325,0.025307,18.295341,0.211056,3.861333,1.657344,30.321667,7.852642,176.470588,681.411765,37.245098,22.472767,1594.809,,,119891.88021,180830.265914,58953.494506,6553.137271,9883.951718,3222.322825,47870.775,1.0,4616.685,1.0
2,123428178,Credit Card,F,2.0,157500.0,450000.0,22500.0,Secondary / secondary special,Married,0.019101,13016.0,977.0,,1.0,0.0,4.0,0.267523,0.152544,0.382502,0.0,0.0,0.0,0.0,1.0,6.0,7.0,5.393765,0.01561,1.835096,0.152544,0.382502,0.267523,0.267523,0.008813,20.0,0.142857,2.857143,1.728642,34.572833,12.100492,161.207779,460.593654,23.029683,13.322416,3008.4075,,,120385.398093,68644.98432,172125.811865,6019.269905,3432.249216,8606.290593,0.0,0.0,134636.805,5.0
3,123619984,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13925.385,1.0,0.0,0.0
4,123671104,Cash,F,1.0,90000.0,254700.0,24939.0,Higher education,Married,0.015221,17743.0,9258.0,,1.0,0.0,3.0,0.718604,0.505704,0.415347,0.0,0.0,0.0,0.0,1.0,0.0,1.0,6.0,0.150937,3.188953,0.415347,0.718604,0.546552,0.505704,0.016162,10.21292,0.2771,2.83,1.405568,14.354957,5.072423,9.721322,27.511342,2.693778,1.916505,1369.89,,,183028.441193,128802.709015,105788.917802,17921.26539,12611.742286,10358.342446,0.0,0.0,22354.56,3.0


In [20]:
"""
добавил поле ОБЩЕЙ суммы платежей в месяц
"""
train_sample["TOTAL_OPEN_CREDITS_MONTHLY_PAMENTS".lower()] = train_sample["OTHER_OPEN_CREDITS_SUM".lower()] + train_sample["AMOUNT_ANNUITY".lower()]
test["TOTAL_OPEN_CREDITS_MONTHLY_PAMENTS".lower()] = test["OTHER_OPEN_CREDITS_SUM".lower()] + test["AMOUNT_ANNUITY".lower()]

In [21]:
"""
добавим поле показывающее какую часть от дохода человек должен отдавать в месяц
"""
train_sample["MONTHLY_PAYMENTS_SALARY_PART".lower()] = train_sample["TOTAL_OPEN_CREDITS_MONTHLY_PAMENTS".lower()] / train_sample["TOTAL_SALARY".lower()]
test["MONTHLY_PAYMENTS_SALARY_PART".lower()] = test["TOTAL_OPEN_CREDITS_MONTHLY_PAMENTS".lower()] / test["TOTAL_SALARY".lower()]

In [22]:
"""
посчитаем среднюю зарплату в зависимости от образования
"""
median_salaries = train_sample.groupby("EDUCATION_LEVEL".lower())["TOTAL_SALARY".lower()].median()

train_sample = train_sample.merge(median_salaries.rename("SALARY_PART_OF_MEDIAN_EDU_SALARY".lower()), how="left", on="EDUCATION_LEVEL".lower())
train_sample["SALARY_PART_OF_MEDIAN_EDU_SALARY".lower()] = train_sample["TOTAL_SALARY".lower()] / train_sample["SALARY_PART_OF_MEDIAN_EDU_SALARY".lower()]

test = test.merge(median_salaries.rename("SALARY_PART_OF_MEDIAN_EDU_SALARY".lower()), how="left", on="EDUCATION_LEVEL".lower())
test["SALARY_PART_OF_MEDIAN_EDU_SALARY".lower()] = test["TOTAL_SALARY".lower()] / test["SALARY_PART_OF_MEDIAN_EDU_SALARY".lower()]

In [23]:
"""
посчитаем среднюю зарплату в зависимости от семейного положения
"""
median_salaries = train_sample.groupby("FAMILY_STATUS".lower())["TOTAL_SALARY".lower()].median()

train_sample = train_sample.merge(median_salaries.rename("SALARY_PART_OF_MEDIAN_FAM_STAT_SALARY".lower()), how="left", on="FAMILY_STATUS".lower())
train_sample["SALARY_PART_OF_MEDIAN_FAM_STAT_SALARY".lower()] = train_sample["TOTAL_SALARY".lower()] / train_sample["SALARY_PART_OF_MEDIAN_FAM_STAT_SALARY".lower()]

test = test.merge(median_salaries.rename("SALARY_PART_OF_MEDIAN_FAM_STAT_SALARY".lower()), how="left", on="FAMILY_STATUS".lower())
test["SALARY_PART_OF_MEDIAN_FAM_STAT_SALARY".lower()] = test["TOTAL_SALARY".lower()] / test["SALARY_PART_OF_MEDIAN_FAM_STAT_SALARY".lower()]

In [24]:
"""
посчитаем среднюю зарплату в зависимости от числа детей
"""
median_salaries = train_sample.groupby("CHILDRENS".lower())["TOTAL_SALARY".lower()].median()

train_sample = train_sample.merge(median_salaries.rename("SALARY_PART_OF_MEDIAN_CHILDS_SALARY".lower()), how="left", on="CHILDRENS".lower())
train_sample["SALARY_PART_OF_MEDIAN_CHILDS_SALARY".lower()] = train_sample["TOTAL_SALARY".lower()] / train_sample["SALARY_PART_OF_MEDIAN_CHILDS_SALARY".lower()]

test = test.merge(median_salaries.rename("SALARY_PART_OF_MEDIAN_CHILDS_SALARY".lower()), how="left", on="CHILDRENS".lower())
test["SALARY_PART_OF_MEDIAN_CHILDS_SALARY".lower()] = test["TOTAL_SALARY".lower()] / test["SALARY_PART_OF_MEDIAN_CHILDS_SALARY".lower()]

In [25]:
train_sample

Unnamed: 0,application_number,target,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,other_open_credits_sum,other_open_credits_count,closed_credits_sum,closed_credits_count,total_open_credits_monthly_paments,monthly_payments_salary_part,salary_part_of_median_edu_salary,salary_part_of_median_fam_stat_salary,salary_part_of_median_childs_salary
0,123687442,0,Cash,M,1.0,157500.0,855000.0,25128.0,Secondary / secondary special,Married,0.019101,15728.0,1719.0,11.0,0.0,0.0,3.0,0.700784,0.645914,0.716570,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.324353,4.197193,0.645914,0.716570,0.687756,0.700784,9.169231e-04,34.025788,0.159543,5.428571,1.597660,54.361648,10.013988,91.623037,497.382199,14.617801,9.149506,3008.4075,0.006399,0.000699,599170.547652,552256.266546,612667.559305,17609.307043,16230.521013,18005.977111,6510.015,1.0,16601.400,2.0,31638.015,0.200876,1.166667,1.000000,1.000000
1,123597908,1,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0.0,35355.330,2.0,,,,,
2,123526683,0,Cash,F,0.0,135000.0,1006920.0,42660.0,Higher education,Married,0.026392,21557.0,3618.0,,1.0,0.0,2.0,0.475009,0.682149,0.267869,0.0,0.0,0.0,7.0,0.0,4.0,11.0,0.539379,0.086797,2.435775,0.267869,0.682149,0.475009,0.475009,2.860472e-02,23.603376,0.316000,7.458667,1.978940,46.709653,6.262467,37.313433,278.308458,11.791045,5.958264,3562.9200,,,478296.232619,686869.876357,269722.588880,20263.891157,29100.493510,11427.288803,0.000,0.0,130153.905,4.0,42660.000,0.316000,0.750000,0.857143,0.937500
3,123710391,1,Cash,M,0.0,180000.0,518562.0,22972.5,Secondary / secondary special,Married,0.031329,22338.0,,,0.0,0.0,2.0,0.170873,0.171299,0.170446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.004989,1.024383,0.170446,0.171299,0.170873,0.170873,1.212534e-07,22.573164,0.127625,2.880900,1.028405,23.214343,8.058018,,,,,5639.2200,,,88608.035653,88829.188848,88386.882459,3925.370735,3935.167908,3915.573562,6025.275,1.0,2450.115,1.0,28997.775,0.161099,1.333333,1.142857,1.250000
4,123590329,1,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0.0,21873.150,2.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110088,123458312,0,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0.0,53521.155,4.0,,,,,
110089,123672463,0,Cash,F,0.0,175500.0,269550.0,12618.0,Secondary / secondary special,Married,0.010032,10519.0,1354.0,,0.0,0.0,2.0,0.528096,0.471774,0.581484,0.0,0.0,0.0,0.0,0.0,2.0,2.0,6.000000,0.144872,3.272417,0.471774,0.581484,0.527118,0.528096,2.006532e-03,21.362340,0.071897,1.535897,1.199544,25.625059,16.684095,129.615953,199.076809,9.319055,7.768833,1760.6160,,,142348.303154,127166.558499,156738.932900,6663.516562,5952.838565,7337.161400,13500.000,1.0,23134.950,4.0,26118.000,0.148821,1.300000,1.114286,1.218750
110090,123723001,0,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0.0,8000.595,2.0,,,,,
110091,123554358,0,Cash,F,2.0,270000.0,1024740.0,49428.0,Incomplete higher,Married,0.019101,15008.0,361.0,22.0,0.0,0.0,4.0,0.653826,0.651227,0.558507,0.0,0.0,0.0,1.0,0.0,1.0,2.0,-1.875000,0.237806,3.634399,0.558507,0.653826,0.621186,0.651227,1.965506e-03,20.731974,0.183067,3.795333,3.293443,68.279584,17.990405,747.922438,2838.614958,136.919668,41.573407,5157.2700,0.060942,0.001466,670001.539719,667338.205604,572324.081646,32317.305956,32188.840903,27605.865593,0.000,0.0,20264.805,1.0,49428.000,0.183067,1.714286,1.714286,1.714286


In [26]:
test

Unnamed: 0,application_number,name_contract_type,gender,childrens,total_salary,amount_credit,amount_annuity,education_level,family_status,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,other_open_credits_sum,other_open_credits_count,closed_credits_sum,closed_credits_count,total_open_credits_monthly_paments,monthly_payments_salary_part,salary_part_of_median_edu_salary,salary_part_of_median_fam_stat_salary,salary_part_of_median_childs_salary
0,123724268,Cash,M,0.0,117000.0,1125000.0,32895.0,Secondary / secondary special,Married,0.028663,16007.0,2646.0,20.0,0.0,0.0,2.0,0.628266,0.628266,0.628266,0.0,0.0,0.0,0.0,1.0,4.0,5.0,4.639906,0.247988,3.769597,0.628266,0.628266,0.628266,0.628266,0.000000,34.199726,0.281154,9.615385,2.055038,70.281752,7.309302,44.217687,425.170068,12.431973,6.049509,3353.5710,0.007559,0.001249,706799.427892,706799.427892,706799.427892,20666.815272,20666.815272,20666.815272,22500.000,1.0,11175.615,1.0,55395.000,0.473462,0.866667,0.742857,0.812500
1,123456549,Cash,F,2.0,81000.0,312768.0,17095.5,Secondary / secondary special,Married,0.019689,10315.0,459.0,,0.0,0.0,4.0,0.383325,0.578161,0.188490,0.0,0.0,1.0,0.0,0.0,2.0,3.0,1.428571,0.041774,1.910280,0.188490,0.578161,0.383325,0.383325,0.025307,18.295341,0.211056,3.861333,1.657344,30.321667,7.852642,176.470588,681.411765,37.245098,22.472767,1594.8090,,,119891.880210,180830.265914,58953.494506,6553.137271,9883.951718,3222.322825,47870.775,1.0,4616.685,1.0,64966.275,0.802053,0.600000,0.514286,0.514286
2,123428178,Credit Card,F,2.0,157500.0,450000.0,22500.0,Secondary / secondary special,Married,0.019101,13016.0,977.0,,1.0,0.0,4.0,0.267523,0.152544,0.382502,0.0,0.0,0.0,0.0,1.0,6.0,7.0,5.393765,0.015610,1.835096,0.152544,0.382502,0.267523,0.267523,0.008813,20.000000,0.142857,2.857143,1.728642,34.572833,12.100492,161.207779,460.593654,23.029683,13.322416,3008.4075,,,120385.398093,68644.984320,172125.811865,6019.269905,3432.249216,8606.290593,0.000,0.0,134636.805,5.0,22500.000,0.142857,1.166667,1.000000,1.000000
3,123619984,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13925.385,1.0,0.000,0.0,,,,,
4,123671104,Cash,F,1.0,90000.0,254700.0,24939.0,Higher education,Married,0.015221,17743.0,9258.0,,1.0,0.0,3.0,0.718604,0.505704,0.415347,0.0,0.0,0.0,0.0,1.0,0.0,1.0,6.000000,0.150937,3.188953,0.415347,0.718604,0.546552,0.505704,0.016162,10.212920,0.277100,2.830000,1.405568,14.354957,5.072423,9.721322,27.511342,2.693778,1.916505,1369.8900,,,183028.441193,128802.709015,105788.917802,17921.265390,12611.742286,10358.342446,0.000,0.0,22354.560,3.0,24939.000,0.277100,0.500000,0.571429,0.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165136,123487967,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27659.745,2.0,0.000,0.0,,,,,
165137,123536402,Cash,M,0.0,135000.0,450000.0,16807.5,Secondary / secondary special,Single / not married,0.028663,18610.0,10398.0,23.0,0.0,0.0,1.0,0.600624,0.665971,0.535276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.214109,3.473047,0.535276,0.665971,0.600624,0.600624,0.002847,26.773762,0.124500,3.333333,0.903143,24.180548,7.254164,12.983266,43.277553,1.616417,1.789767,3869.5050,0.002212,0.001236,270280.577196,299686.841680,240874.312713,10094.979558,11193.303537,8996.655580,12065.265,1.0,24301.935,1.0,28872.765,0.213872,1.000000,0.937500,0.937500
165138,123718238,Cash,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9000.000,1.0,76233.915,4.0,,,,,
165139,123631557,Cash,F,0.0,112500.0,350181.0,36769.5,Secondary / secondary special,Married,0.030755,20551.0,,,0.0,1.0,2.0,0.761376,0.761376,0.761376,,,,,,,0.0,,0.441364,4.568253,0.761376,0.761376,0.761376,0.761376,0.000000,9.523681,0.326840,3.112720,1.789183,17.039609,5.474186,,,,,3459.9375,,,266619.257702,266619.257702,266619.257702,27995.398940,27995.398940,27995.398940,30734.370,1.0,84044.790,5.0,67503.870,0.600034,0.833333,0.714286,0.781250


In [27]:
def another_credits_info(row):
    main_appnum = row["APPLICATION_NUMBER".lower()]
    prev_app_nums = apps.loc[apps["APPLICATION_NUMBER".lower()] == main_appnum]["PREV_APPLICATION_NUMBER".lower()].values
    prev_app_nums = np.append(prev_app_nums, main_appnum)
    prolong_times = 0
    max_overdue = 0.0
    sum_overdue = 0.0
    max_days_overdue = 0
    for appnum in prev_app_nums:
        tmp_df = bki.loc[bki["APPLICATION_NUMBER".lower()] == appnum]
        prolong_times += tmp_df["CNT_CREDIT_PROLONG".lower()].sum()
        if tmp_df["AMT_CREDIT_MAX_OVERDUE".lower()].max() > max_overdue:
            max_overdue = tmp_df["AMT_CREDIT_MAX_OVERDUE".lower()].max()
        sum_overdue += tmp_df["AMT_CREDIT_SUM_OVERDUE".lower()].sum()
        if tmp_df["CREDIT_DAY_OVERDUE".lower()].max() > max_days_overdue:
            max_days_overdue = tmp_df["CREDIT_DAY_OVERDUE".lower()].max()
    
    row["PROLONG_TIMES".lower()] = prolong_times
    row["MAX_OVERDUE".lower()] = max_overdue
    row["SUM_OVERDUE".lower()] = sum_overdue
    row["MAX_DAYS_OVERDUE".lower()] = max_days_overdue
    return row

In [28]:
%%time
train_sample = train_sample.apply(another_credits_info, axis=1)

CPU times: user 11min 2s, sys: 705 ms, total: 11min 3s
Wall time: 11min 2s


In [29]:
%%time
test = test.apply(another_credits_info, axis=1)

CPU times: user 16min 32s, sys: 772 ms, total: 16min 32s
Wall time: 16min 31s


In [30]:
def another_payments_info(orig_row):
    main_appnum = orig_row["APPLICATION_NUMBER".lower()]
    tmp_df = payments.loc[payments["APPLICATION_NUMBER".lower()] == main_appnum]
    max_overdue_days = -1.0
    max_overdue_sum = 0.0
    for i, row in tmp_df.iterrows():
        overdue_days = row["DAYS_INSTALMENT".lower()] - row["DAYS_ENTRY_PAYMENT".lower()]
        if overdue_days >= 0:
            if overdue_days > max_overdue_days:
                max_overdue_days = overdue_days
        overdue_payment = row["AMT_INSTALMENT".lower()] - row["AMT_PAYMENT".lower()]
        if overdue_payment > 0:
            if overdue_payment > max_overdue_sum:
                max_overdue_sum = overdue_payment
    orig_row["MAX_DAY_OVERDUE_FROM_PAYMENTS".lower()] = max_overdue_days
    orig_row["MAX_OVERDUE_SUM_FROM_PAYMENTS".lower()] = max_overdue_sum
    orig_row["OVERDUE_MORE_THAN_0_DAYS".lower()] = 1.0 if max_overdue_days > 0 else 0.0
    orig_row["OVERDUE_MORE_THAN_10_DAYS".lower()] = 1.0 if max_overdue_days > 10 else 0.0
    orig_row["OVERDUE_MORE_THAN_30_DAYS".lower()] = 1.0 if max_overdue_days > 30 else 0.0
    orig_row["OVERDUE_MORE_THAN_60_DAYS".lower()] = 1.0 if max_overdue_days > 60 else 0.0
    orig_row["OVERDUE_MORE_THAN_90_DAYS".lower()] = 1.0 if max_overdue_days > 90 else 0.0
    orig_row["OVERDUE_MORE_THAN_180_DAYS".lower()] = 1.0 if max_overdue_days > 180 else 0.0
    orig_row["OVERDUE_MORE_THAN_360_DAYS".lower()] = 1.0 if max_overdue_days > 360 else 0.0
    return orig_row

In [31]:
%%time
train_sample = train_sample.apply(another_payments_info, axis=1)

CPU times: user 8min 49s, sys: 328 ms, total: 8min 49s
Wall time: 8min 49s


In [32]:
%%time
test = test.apply(another_payments_info, axis=1)

CPU times: user 13min 3s, sys: 408 ms, total: 13min 3s
Wall time: 13min 3s


In [33]:
# чтоб не перезаписать случайно
# train_sample.to_csv("data/train_data_complete.csv", index=False)
# test.to_csv("data/test_data_complete.csv", index=False)

In [34]:
train_sample.describe()

Unnamed: 0,application_number,target,childrens,total_salary,amount_credit,amount_annuity,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,other_open_credits_sum,other_open_credits_count,closed_credits_sum,closed_credits_count,total_open_credits_monthly_paments,monthly_payments_salary_part,salary_part_of_median_edu_salary,salary_part_of_median_fam_stat_salary,salary_part_of_median_childs_salary,prolong_times,max_overdue,sum_overdue,max_days_overdue,max_day_overdue_from_payments,max_overdue_sum_from_payments,overdue_more_than_0_days,overdue_more_than_10_days,overdue_more_than_30_days,overdue_more_than_60_days,overdue_more_than_90_days,overdue_more_than_180_days,overdue_more_than_360_days
count,110093.0,110093.0,89539.0,89539.0,89539.0,89534.0,89539.0,89539.0,73530.0,30533.0,89539.0,89539.0,89538.0,89539.0,89539.0,89539.0,77480.0,77480.0,77480.0,77480.0,77480.0,77480.0,89539.0,77480.0,89539.0,89539.0,89539.0,89539.0,89539.0,89539.0,89539.0,89534.0,89534.0,89539.0,89534.0,89539.0,89539.0,73530.0,73530.0,73525.0,73530.0,89539.0,27620.0,30533.0,89539.0,89539.0,89539.0,89534.0,89534.0,89534.0,110093.0,110093.0,110093.0,110093.0,89534.0,89534.0,89539.0,89539.0,89539.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0,110093.0
mean,123577200.0,0.080813,0.421034,168323.0,599749.5,27119.655449,0.020843,16029.821162,2383.686931,12.096224,0.28219,0.055373,2.157844,0.50609,0.513891,0.506551,0.006066,0.006776,0.034151,0.266585,0.265333,1.903601,2.14817,3.486055,0.1545153,3.045722,0.399107,0.615525,0.508844,0.5119,0.013467,21.624537,0.180611,3.953862,1.823217,39.725241,11.416766,220.052846,753.351762,34.928442,18.159553,3764.710509,0.015674,0.000855,314092.2,318534.8,309397.9,14024.462693,14295.505466,13904.615844,10257.275419,0.635317,30236.351897,2.164198,37371.298697,0.258068,1.148244,1.097539,1.137646,0.016332,2977.067,89.2201,2.097363,1.633092,1549.720488,0.161,0.042573,0.007312,0.002997,0.002244,0.001417,0.000827
std,88809.39,0.27255,0.728803,99622.78,403169.4,14503.018565,0.013797,4355.461281,2340.093356,11.932761,0.450068,0.228707,0.915425,0.183223,0.191181,0.193609,0.083264,0.111512,0.206492,0.905296,0.614651,1.875062,2.239779,3.008924,0.1175453,0.933115,0.187375,0.156221,0.149712,0.157152,0.017473,7.841686,0.093771,2.677549,1.107191,28.422119,7.560455,1131.769378,3386.192512,151.838395,66.704402,4654.251136,0.079655,0.000946,264648.8,268828.0,259179.7,9981.741984,10251.584771,9902.077079,16241.193442,0.74396,38110.341864,1.867453,21747.102058,0.178916,0.636506,0.647184,0.673033,0.151972,71707.69,7632.113,58.120806,31.332352,6298.836687,0.367533,0.201894,0.085197,0.054667,0.047313,0.037616,0.028738
min,123423300.0,0.0,0.0,25650.0,45000.0,1615.5,0.000533,7679.0,2.0,0.0,0.0,0.0,1.0,1.7e-05,1e-05,1.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.333333,4.663257e-15,0.0001,1e-05,1.7e-05,1.7e-05,1.7e-05,0.0,8.036674,0.006,0.07619,0.068777,1.808391,1.09188,2.717118,4.088611,0.315249,1.391833,19.188,0.0,0.0,10.69377,2.235707,10.69377,0.500706,0.147609,0.500706,0.0,0.0,0.0,0.0,1980.0,0.006,0.15,0.168571,0.168571,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,123500500.0,0.0,0.0,112500.0,270000.0,16524.0,0.010006,12433.0,762.0,5.0,0.0,0.0,2.0,0.378046,0.391413,0.368969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05556865,2.419297,0.253854,0.539462,0.413255,0.40906,0.000808,15.610373,0.1148,2.018667,1.020208,18.738832,6.566882,47.426188,143.906981,7.376516,4.568062,1366.605,0.002053,0.000316,122032.7,120874.7,120442.5,6915.212052,6749.118729,6785.723191,0.0,0.0,7154.73,1.0,21982.06125,0.143,0.75,0.714286,0.740625,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,123576900.0,0.0,0.0,148500.0,517788.0,24903.0,0.01885,15728.0,1646.0,9.0,0.0,0.0,2.0,0.522246,0.564984,0.531686,0.0,0.0,0.0,0.0,0.0,1.0,2.0,6.0,0.1306859,3.139657,0.402844,0.647977,0.52373,0.531686,0.006383,20.0,0.162671,3.26568,1.597588,32.814779,9.834632,97.234226,310.011502,15.220288,8.447932,2538.135,0.005303,0.000632,234450.0,239777.4,229874.8,11795.256415,12283.478841,11745.411017,0.0,0.0,17671.23,2.0,32806.89,0.213163,1.0,1.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,123653900.0,0.0,1.0,202500.0,808650.0,34596.0,0.028663,19668.0,3180.0,15.0,1.0,0.0,3.0,0.645418,0.663627,0.661024,0.0,0.0,0.0,0.0,0.0,3.0,3.0,6.0,0.2340353,3.759535,0.55245,0.725276,0.622503,0.634008,0.019713,27.117327,0.228766,5.151679,2.360687,53.477086,14.267144,219.512195,713.408223,34.090909,17.917104,4348.728,0.013793,0.001063,431049.9,444310.1,425512.9,18527.753429,19341.086524,18575.239746,15133.14,1.0,39164.94,3.0,48172.5,0.318937,1.35,1.333333,1.40625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,123730800.0,1.0,19.0,4500000.0,4050000.0,225000.0,0.072508,25196.0,17522.0,65.0,1.0,1.0,20.0,0.951624,0.855,0.89601,3.0,9.0,8.0,27.0,8.0,25.0,30.0,6.0,0.6250256,5.220172,0.855,0.951624,0.855,0.885488,0.158088,45.302013,1.373917,36.692308,21.436738,332.873064,373.634081,213750.0,506250.0,21514.5,9308.5,176791.5,11.5,0.008367,3200364.0,2686898.0,3260330.0,157343.068725,178802.217739,182389.054849,254434.365,5.0,922687.695,20.0,280943.865,2.91678,33.333333,28.571429,31.25,7.0,13975260.0,1748115.0,2792.0,2562.0,410971.095,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
test.describe()

Unnamed: 0,application_number,childrens,total_salary,amount_credit,amount_annuity,region_population,age,days_on_last_job,own_car_age,flag_phone,flag_email,family_size,external_scoring_rating_1,external_scoring_rating_2,external_scoring_rating_3,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,bki_requests_count,bki_kurtosis,external_scoring_prod,external_scoring_weighted,external_scoring_rating_min,external_scoring_rating_max,external_scoring_rating_mean,external_scoring_rating_nanmedian,external_scoring_rating_var,ratio_credit_to_annuity,ratio_annuity_to_salary,ratio_credit_to_salary,ratio_annuity_to_age,ratio_credit_to_age,ratio_salary_to_age,ratio_salary_to_experience,ratio_credit_to_experience,ratio_annuity_to_experience,ratio_age_to_experience,ratio_salary_to_region_population,ratio_car_to_experience,ratio_car_to_age,expected_total_loss_1,expected_total_loss_2,expected_total_loss_3,expected_monthly_loss_1,expected_monthly_loss_2,expected_monthly_loss_3,other_open_credits_sum,other_open_credits_count,closed_credits_sum,closed_credits_count,total_open_credits_monthly_paments,monthly_payments_salary_part,salary_part_of_median_edu_salary,salary_part_of_median_fam_stat_salary,salary_part_of_median_childs_salary,prolong_times,max_overdue,sum_overdue,max_days_overdue,max_day_overdue_from_payments,max_overdue_sum_from_payments,overdue_more_than_0_days,overdue_more_than_10_days,overdue_more_than_30_days,overdue_more_than_60_days,overdue_more_than_90_days,overdue_more_than_180_days,overdue_more_than_360_days
count,165141.0,134176.0,134176.0,134176.0,134170.0,134176.0,134176.0,110075.0,45501.0,134176.0,134176.0,134175.0,134176.0,134176.0,134176.0,116037.0,116037.0,116037.0,116037.0,116037.0,116037.0,134176.0,116037.0,134176.0,134176.0,134176.0,134176.0,134176.0,134176.0,134176.0,134170.0,134170.0,134176.0,134170.0,134176.0,134176.0,110075.0,110075.0,110069.0,110075.0,134176.0,41065.0,45501.0,134176.0,134176.0,134176.0,134170.0,134170.0,134170.0,165141.0,165141.0,165141.0,165141.0,134170.0,134170.0,134176.0,134176.0,134174.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0,165141.0
mean,123577100.0,0.415939,169407.2,598829.5,27129.127126,0.020878,16033.005962,2380.35087,11.991583,0.279051,0.057633,2.152771,0.506767,0.514132,0.506592,0.006403,0.006601,0.033886,0.267432,0.26763,1.891181,2.138795,3.482023,0.1549394,3.047443,0.399778,0.615588,0.509164,0.512126,0.013411,21.60896,0.181228,3.96207,1.825537,39.697008,11.510367,inf,inf,inf,inf,3771.1,0.01526,0.000845,313843.5,317822.9,308545.8,14051.778521,14296.915855,13898.453257,10206.282816,0.633507,30194.81318,2.165991,37331.447642,0.258414,1.156287,1.104545,1.144874,0.016695,3083.678,102.5617,1.941438,1.755803,1549.086629,0.160687,0.042963,0.007442,0.003131,0.002513,0.001653,0.000957
std,88756.06,0.718754,338673.8,400984.4,14546.022649,0.013803,4364.173246,2334.96537,11.832588,0.448535,0.23305,0.909045,0.183762,0.190942,0.193611,0.083666,0.098199,0.201103,0.915861,0.981682,1.862127,2.347466,3.008363,0.1177797,0.934956,0.187624,0.156139,0.150017,0.157695,0.017499,7.819855,0.095154,2.699799,1.116707,28.372371,26.656493,,,,,5889.86,0.044538,0.000935,263663.3,266362.3,257103.5,10069.709834,10243.372719,9934.835985,16261.327364,0.742586,38010.17322,1.870359,21857.310541,0.182367,2.4779,2.15581,2.163742,0.157047,286905.0,9784.638,56.381068,32.337874,6175.613887,0.367243,0.202775,0.085946,0.055865,0.050067,0.040625,0.030917
min,123423300.0,0.0,25650.0,45000.0,1980.0,0.00029,7489.0,0.0,0.0,0.0,0.0,1.0,6e-06,1e-06,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.333333,2.095476e-16,3.6e-05,1e-06,6e-06,6e-06,6e-06,0.0,8.036791,0.000224,0.004808,0.087394,1.826744,1.105523,2.232881,3.903539,0.1990805,1.372097,33.579,0.0,0.0,1.336421,0.236872,1.336421,0.106352,0.011844,0.106352,0.0,0.0,0.0,0.0,1980.0,0.000548,0.15,0.165714,0.171429,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,123500100.0,0.0,112500.0,270000.0,16537.5,0.010006,12399.0,767.0,5.0,0.0,0.0,2.0,0.378221,0.392524,0.368969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05587799,2.42316,0.253967,0.539684,0.413115,0.409112,0.000818,15.617122,0.114875,2.018667,1.023153,18.851884,6.559406,47.43976,144.1467,7.389163,4.570653,1366.605,0.002033,0.000309,122014.6,121865.3,121721.2,6910.97342,6813.193335,6814.723397,0.0,0.0,7128.495,1.0,21958.065,0.142857,0.75,0.714286,0.71875,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,123577200.0,0.0,144900.0,514777.5,24939.0,0.01885,15755.0,1650.0,9.0,0.0,0.0,2.0,0.523519,0.565569,0.531686,0.0,0.0,0.0,0.0,0.0,1.0,2.0,6.0,0.1315128,3.144274,0.404549,0.648266,0.524567,0.532573,0.006257,20.0,0.163,3.269539,1.595608,32.879132,9.792366,97.31834,309.0659,15.16276,8.418042,2544.75,0.005222,0.000625,234618.1,239900.8,230134.4,11805.475722,12245.147641,11738.59232,0.0,0.0,17716.905,2.0,32767.1775,0.213786,1.0,1.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,123654100.0,1.0,202500.0,808650.0,34635.375,0.028663,19684.0,3172.0,15.0,1.0,0.0,3.0,0.646246,0.663185,0.661024,0.0,0.0,0.0,0.0,0.0,3.0,3.0,6.0,0.2346778,3.76355,0.553165,0.724901,0.622985,0.633806,0.019602,27.099985,0.229696,5.166667,2.360218,53.168867,14.311286,217.0985,707.1429,34.0234,17.80859,4394.268,0.013553,0.001055,430842.4,444172.1,424901.9,18598.325942,19343.442691,18567.67981,14911.605,1.0,39208.5,3.0,47947.41,0.319217,1.366667,1.333333,1.40625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,123730900.0,9.0,117000000.0,4050000.0,230161.5,0.072508,25229.0,17912.0,69.0,1.0,1.0,11.0,0.962693,0.878903,0.893976,4.0,6.0,8.0,24.0,261.0,23.0,262.0,6.0,0.6789275,5.27342,0.878903,0.962693,0.878903,0.878903,0.15496,45.305079,1.875965,84.736842,22.805595,323.06956,9274.673008,inf,inf,inf,inf,1245231.0,4.333333,0.008679,3166811.0,2777865.0,3184380.0,195759.403752,168949.983453,187826.435827,222394.815,5.0,834652.215,24.0,273447.675,3.875422,866.666667,742.857143,742.857143,6.0,115987200.0,2387232.0,2766.0,2545.0,316342.35,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [36]:
for train_column_name in train_sample.columns.tolist():
    if train_column_name not in test.columns.tolist():
        print("%s not in test" % train_column_name)

target not in test


In [37]:
train_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110093 entries, 0 to 110092
Data columns (total 76 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   application_number                     110093 non-null  int64  
 1   target                                 110093 non-null  int64  
 2   name_contract_type                     110093 non-null  object 
 3   gender                                 89539 non-null   object 
 4   childrens                              89539 non-null   float64
 5   total_salary                           89539 non-null   float64
 6   amount_credit                          89539 non-null   float64
 7   amount_annuity                         89534 non-null   float64
 8   education_level                        89539 non-null   object 
 9   family_status                          89539 non-null   object 
 10  region_population                      89539 non-null   

In [38]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165141 entries, 0 to 165140
Data columns (total 75 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   application_number                     165141 non-null  int64  
 1   name_contract_type                     165141 non-null  object 
 2   gender                                 134176 non-null  object 
 3   childrens                              134176 non-null  float64
 4   total_salary                           134176 non-null  float64
 5   amount_credit                          134176 non-null  float64
 6   amount_annuity                         134170 non-null  float64
 7   education_level                        134176 non-null  object 
 8   family_status                          134176 non-null  object 
 9   region_population                      134176 non-null  float64
 10  age                                    134176 non-null  