In [1]:
%matplotlib notebook

In [2]:
import polars as pl
import numpy as np
import pandas as pd 

from sklearn.preprocessing import LabelEncoder

import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn .metrics import roc_auc_score

import lightgbm as lgb

In [3]:
print(os.listdir("/kaggle/input/"))

['home-credit-credit-risk-model-stability']


In [4]:
# reading the training data
path = ('../input/home-credit-credit-risk-model-stability/')

In [5]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for clm in df.columns:
        # last letter of column name will help you determine the type
        if clm[-1] in ("P", "A"):
            df = df.with_columns(pl.col(clm).cast(pl.Float64).alias(clm))

    return df

In [6]:
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

In [7]:
# checking missing data
def counting_null_values(df: pl.DataFrame) -> pl.DataFrame:
    total = 0
    
    for col in df.columns: 
        total += df.select(pl.col(col)).null_count()
    
    return(total)


In [8]:
train_basetable = pl.read_csv(path + "csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        pl.read_csv(path + "csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_csv(path + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv(path + "csv_files/train/train_person_1.csv").pipe(set_table_dtypes) 
train_credit_bureau_b_2 = pl.read_csv(path + "csv_files/train/train_credit_bureau_b_2.csv").pipe(set_table_dtypes) 
applprev_1 = pl.concat(
    [
        pl.read_csv(path + "csv_files/train/train_applprev_1_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/train/train_applprev_1_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
other_1 = pl.read_csv(path + "csv_files/train/train_other_1.csv").pipe(set_table_dtypes)
tax_registry_a_1 = pl.read_csv(path + "csv_files/train/train_tax_registry_a_1.csv").pipe(set_table_dtypes)
tax_registry_b_1 = pl.read_csv(path + "csv_files/train/train_tax_registry_b_1.csv").pipe(set_table_dtypes)
tax_registry_c_1 = pl.read_csv(path + "csv_files/train/train_tax_registry_c_1.csv").pipe(set_table_dtypes)
credit_bureau_b_1 = pl.read_csv(path + "csv_files/train/train_credit_bureau_b_1.csv").pipe(set_table_dtypes)
deposit_1 = pl.read_csv(path + "csv_files/train/train_deposit_1.csv").pipe(set_table_dtypes)
person_1 = pl.read_csv(path + "csv_files/train/train_person_1.csv").pipe(set_table_dtypes)
debitcard_1 = pl.read_csv(path + "csv_files/train/train_debitcard_1.csv").pipe(set_table_dtypes)
applprev_2 = pl.read_csv(path + "csv_files/train/train_applprev_2.csv").pipe(set_table_dtypes)
person_2 = pl.read_csv(path + "csv_files/train/train_person_2.csv").pipe(set_table_dtypes)

In [9]:
dataset_names = [train_basetable , train_static, train_static_cb, train_person_1, train_credit_bureau_b_2, applprev_1, 
                other_1, tax_registry_a_1, tax_registry_b_1, tax_registry_c_1, credit_bureau_b_1, deposit_1,
                person_1, debitcard_1, applprev_2, person_2]
dataset_names_str = ['train_basetable' , 'train_static', 'train_static_cb', 'train_person_1',
                     'train_credit_bureau_b_2','applprev_1', 'other_1', 'tax_registry_a_1', 
                     'tax_registry_b_1', 'tax_registry_c_1', 'credit_bureau_b_1',
                     'deposit_1','person_1', 'debitcard_1', 'applprev_2', 'person_2']
null_value_count = []

for dataset in dataset_names:
    null_value_count.append(pl.lit(dataset.pipe(counting_null_values))) 
    
null_dataset_train = pl.DataFrame(
    {
        "Dataset_Names": dataset_names_str,
        "Null_Value_Count": null_value_count
         
    }
)
null_dataset_train

Dataset_Names,Null_Value_Count
str,object
"""train_basetabl…",0
"""train_static""",78503607
"""train_static_c…",49375701
"""train_person_1…",51051536
"""train_credit_b…",10722
…,…
"""deposit_1""",79682
"""person_1""",51051536
"""debitcard_1""",450239
"""applprev_2""",16236709


In [10]:
credit_bureau_a_1 = pl.concat(
    [
        pl.read_csv(path + "csv_files/train/train_credit_bureau_a_1_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/train/train_credit_bureau_a_1_1.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_1_2.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_1_3.csv").pipe(set_table_dtypes),

    ],
    how="vertical_relaxed",
)

credit_bureau_a_2 = pl.concat(
    [
        pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_1.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_2.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_3.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_4.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_5.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_6.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_7.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_8.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_9.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/train/train_credit_bureau_a_2_10.csv").pipe(set_table_dtypes),


    ],
    how="vertical_relaxed",
)


# Explore the Train Datasets

In [11]:
train_basetable.head()

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1


In [12]:
train_static.head()

case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,payvacationpostpone_4187118D,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,…,str,str,f64,f64,str,f64,str,f64,str,f64,f64,f64,bool,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str
0,,,1917.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,,24.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,
1,,,3134.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0.0""",3.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,,"""OTHER""","""OTHER""",,,,,,,18.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""",,
2,,,4937.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,,,,,,36.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",
3,,,4643.6,0.0,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,1.0,False,"""OTHER""","""OTHER""",,,,,,,12.0,0.0,0.0,,"""a55475b1""",,1.0,1.0,,,0.0,0.0,,"""BO""","""AL""",
4,,,3390.2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,,,,,,24.0,0.0,0.0,,"""a55475b1""",,0.0,0.0,,,0.0,0.0,,"""BO""","""AL""",


In [13]:
train_static_cb.head()

case_id,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
i64,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,str,str,f64,f64,f64,str,str,str,str,str,f64,f64,f64
357,,,,"""1988-04-01""",,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,6301.4,,"""2019-01-25""",,,,,,
381,,,,"""1973-11-01""",,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,4019.6,,"""2019-01-25""",,,,,,
388,,,,"""1989-04-01""",,"""1989-04-01""",,6.0,8.0,2.0,10.0,4.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,,,,,,,,,,,,,,,,,6.0,"""a55475b1""","""a55475b1""",10.0,,,,,,,6.0,14548.0,,"""2019-01-28""",,,,,3.0,5.0
405,,,,"""1974-03-01""",,"""1974-03-01""",,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,4.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,6.0,10498.24,,"""2019-01-21""",,,,,2.0,0.0
409,,,,"""1993-06-01""",,"""1993-06-01""",,2.0,3.0,0.0,3.0,1.0,"""a55475b1""","""717ddd49""","""a55475b1""",4.0,,,,,,,,,,,,,,,,,1.0,"""a7fcb6e5""","""a55475b1""",3.0,,,,,,,7.0,6344.8804,,"""2019-01-21""",,,,,0.0,4.0


In [14]:
train_person_1.head()

case_id,birth_259D,birthdate_87D,childnum_185L,contaddr_district_15M,contaddr_matchlist_1032L,contaddr_smempladdr_334L,contaddr_zipcode_807M,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,empladdr_zipcode_114M,familystate_447L,gender_992L,housetype_905L,housingtype_772L,incometype_1044T,isreference_387L,language1_981M,mainoccupationinc_384A,maritalst_703L,num_group1,personindex_1023L,persontype_1072L,persontype_792L,registaddr_district_1083M,registaddr_zipcode_184M,relationshiptoclient_415T,relationshiptoclient_642T,remitter_829L,role_1084L,role_993L,safeguarantyflag_411L,sex_738L,type_25L
i64,str,str,f64,str,bool,bool,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,f64,str,i64,f64,f64,f64,str,str,str,str,bool,str,str,bool,str,str
0,"""1986-07-01""",,,"""P88_18_84""",False,False,"""P167_100_165""","""P97_36_170""","""2017-09-15""","""MORE_FIVE""","""OTHER""","""P142_57_166""","""P167_100_165""","""MARRIED""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",10800.0,,0,0.0,1.0,1.0,"""P88_18_84""","""P167_100_165""",,,,"""CL""",,True,"""F""","""PRIMARY_MOBILE…"
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,1.0,4.0,"""a55475b1""","""a55475b1""","""SPOUSE""",,False,"""EM""",,,,"""PHONE"""
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,2,2.0,4.0,5.0,"""a55475b1""","""a55475b1""","""COLLEAGUE""","""SPOUSE""",False,"""PE""",,,,"""PHONE"""
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,3,,5.0,,"""a55475b1""","""a55475b1""",,"""COLLEAGUE""",,"""PE""",,,,"""PHONE"""
1,"""1957-08-01""",,,"""P103_93_94""",False,False,"""P176_37_166""","""P97_36_170""","""2008-10-29""","""MORE_FIVE""","""OTHER""","""P49_46_174""","""P160_59_140""","""DIVORCED""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",10000.0,,0,0.0,1.0,1.0,"""P103_93_94""","""P176_37_166""",,,,"""CL""",,True,"""M""","""PRIMARY_MOBILE…"


In [15]:
train_credit_bureau_b_2.head()

case_id,num_group1,num_group2,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
i64,i64,i64,str,f64,f64
467,0,0,"""2018-11-15""",,
467,0,1,"""2018-12-15""",,
467,1,0,"""2018-12-15""",,
467,2,0,"""2016-10-15""",0.0,0.0
467,2,1,"""2016-11-15""",0.0,0.0


In [16]:
applprev_1.head()

case_id,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L
i64,f64,f64,str,f64,str,f64,str,f64,f64,f64,f64,str,f64,f64,str,f64,str,str,f64,str,str,str,str,str,str,str,bool,bool,f64,f64,i64,f64,f64,str,str,str,str,f64,str,f64
2,0.0,640.2,,,"""a55475b1""",0.0,"""2013-04-03""",,0.0,,,,,10000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""P97_36_170""","""2010-02-15""","""SINGLE""","""2013-05-04""","""CASH""",False,,8200.0,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",24.0
2,0.0,1682.4,,,"""a55475b1""",0.0,"""2013-04-03""",,0.0,,,,,16000.0,"""CAL""",,,"""P136_108_173""",0.0,,,"""P97_36_170""","""2010-02-15""","""SINGLE""","""2013-05-04""","""CASH""",False,,8200.0,,1,,12.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""D""",12.0
3,0.0,6140.0,,,"""P94_109_143""",,"""2019-01-07""",,0.0,,,,,59999.8,"""CAL""",,,"""P131_33_167""",0.0,,,"""P97_36_170""","""2018-05-15""","""MARRIED""","""2019-02-07""","""CASH""",False,,11000.0,,0,,12.0,"""a55475b1""","""a55475b1""","""P94_109_143""","""a55475b1""",,"""D""",12.0
4,0.0,2556.6,,,"""P24_27_36""",,"""2019-01-08""",,0.0,,,,,40000.0,"""CAL""",,,"""P194_82_174""",0.0,,,"""a55475b1""",,,"""2019-02-08""","""CASH""",False,,16000.0,,0,,24.0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",24.0
5,0.0,,,,"""P85_114_140""",,"""2019-01-16""",,,,,,,,,,,"""P54_133_26""",,,,"""a55475b1""",,,,,False,,62000.0,,0,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,"""T""",


In [17]:
other_1.head()

case_id,amtdebitincoming_4809443A,amtdebitoutgoing_4809440A,amtdepositbalance_4809441A,amtdepositincoming_4809444A,amtdepositoutgoing_4809442A,num_group1
i64,f64,f64,f64,f64,f64,i64
43801,12466.601,12291.2,914.2,0.0,304.80002,0
43991,3333.4001,3273.4001,0.0,0.0,0.0,0
44001,10000.0,10000.0,0.0,0.0,0.0,0
44053,0.0,0.0,2586.4001,0.0,88.8,0
44130,63.8,60.8,0.0,0.0,0.0,0


In [18]:
tax_registry_a_1.head()

case_id,amount_4527230A,name_4527232M,num_group1,recorddate_4527225D
i64,f64,str,i64,str
28631,711.0,"""f980a1ea""",3,"""2019-09-13"""
28631,1946.0,"""f980a1ea""",2,"""2019-09-13"""
28631,2600.0,"""f980a1ea""",1,"""2019-09-13"""
28631,3616.4001,"""f980a1ea""",0,"""2019-09-13"""
28632,400.0,"""5f9b74f5""",6,"""2019-09-13"""


In [19]:
tax_registry_b_1.head()

case_id,amount_4917619A,deductiondate_4917603D,name_4917606M,num_group1
i64,f64,str,str,i64
49435,6885.0,"""2019-10-16""","""6b730375""",0
49435,6885.0,"""2019-10-16""","""6b730375""",1
49435,6885.0,"""2019-10-16""","""6b730375""",2
49435,6885.0,"""2019-10-16""","""6b730375""",3
49435,6885.0,"""2019-10-16""","""6b730375""",4


In [20]:
tax_registry_c_1.head()

case_id,employername_160M,num_group1,pmtamount_36A,processingdate_168D
i64,str,i64,f64,str
357,"""c91b12ff""",0,1200.0,"""2019-01-04"""
357,"""c91b12ff""",1,1200.0,"""2018-11-28"""
357,"""c91b12ff""",2,972.8,"""2018-11-01"""
357,"""c91b12ff""",3,628.60004,"""2018-10-08"""
357,"""c91b12ff""",4,1200.0,"""2018-09-10"""


In [21]:
credit_bureau_b_1.head()

case_id,amount_1115A,classificationofcontr_1114M,contractdate_551D,contractmaturitydate_151D,contractst_516M,contracttype_653M,credlmt_1052A,credlmt_228A,credlmt_3940954A,credor_3940957M,credquantity_1099L,credquantity_984L,debtpastduevalue_732A,debtvalue_227A,dpd_550P,dpd_733P,dpdmax_851P,dpdmaxdatemonth_804T,dpdmaxdateyear_742T,installmentamount_644A,installmentamount_833A,instlamount_892A,interesteffectiverate_369L,interestrateyearly_538L,lastupdate_260D,maxdebtpduevalodued_3940955A,num_group1,numberofinstls_810L,overdueamountmax_950A,overdueamountmaxdatemonth_494T,overdueamountmaxdateyear_432T,periodicityofpmts_997L,periodicityofpmts_997M,pmtdaysoverdue_1135P,pmtmethod_731M,pmtnumpending_403L,purposeofcred_722M,residualamount_1093A,residualamount_127A,residualamount_3940956A,subjectrole_326M,subjectrole_43M,totalamount_503A,totalamount_881A
i64,f64,str,str,str,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,i64,f64,f64,f64,f64,str,str,f64,str,f64,str,f64,f64,f64,str,str,f64,f64
467,78000.0,"""ea6782cc""","""2016-10-25""","""2019-10-25""","""7241344e""","""4257cbed""",,,,"""c5a72b57""",,,0.0,26571.969,,,0.0,11.0,2016.0,,,2898.76,,,"""2019-01-10""",0.0,2,36.0,0.0,11.0,2016.0,,"""a0b598e4""",0.0,"""e914c86c""",10.0,"""96a8fdfe""",,,,"""a55475b1""","""a55475b1""",,
467,,"""ea6782cc""","""2011-06-15""","""2031-06-13""","""7241344e""","""724be82a""",3000000.0,10000.0,3000000.0,"""P164_34_168""",2.0,1.0,,,0.0,0.0,,,,0.0,0.0,,,,"""2019-01-20""",,0,,,,,,"""a55475b1""",,"""a55475b1""",,"""96a8fdfe""",0.0,0.0,,"""fa4f56f1""","""ab3c25cf""",3000000.0,10000.0
467,,"""ea6782cc""","""2019-01-04""","""2021-08-04""","""7241344e""","""724be82a""",,,130365.0,"""P164_34_168""",1.0,2.0,,,0.0,0.0,,,,0.0,26571.969,,,,"""2019-01-20""",,1,,,,,,"""a55475b1""",,"""a55475b1""",,"""96a8fdfe""",,,,"""ab3c25cf""","""ab3c25cf""",78000.0,960000.0
1445,12000.0,"""ea6782cc""","""2018-12-31""","""2019-01-29""","""7241344e""","""4257cbed""",,,,"""0aebc0bb""",,,0.0,19066.64,,,0.0,1.0,2019.0,,,19571.412,,,"""2019-01-27""",0.0,2,1.0,0.0,1.0,2019.0,,"""d479a207""",0.0,"""dbcbe8f8""",1.0,"""96a8fdfe""",,,,"""a55475b1""","""a55475b1""",,
1445,31400.0,"""01f63ac8""","""2018-07-25""","""2019-12-25""","""7241344e""","""4257cbed""",,,,"""50babcd4""",,,0.0,23390.16,,,0.0,8.0,2018.0,,,2124.142,,,"""2019-01-28""",0.0,3,17.0,0.0,8.0,2018.0,,"""a0b598e4""",0.0,"""dbcbe8f8""",12.0,"""60c73645""",,,,"""a55475b1""","""a55475b1""",,


In [22]:
deposit_1.head()

case_id,amount_416A,contractenddate_991D,num_group1,openingdate_313D
i64,f64,str,i64,str
225,0.0,,0,"""2016-08-16"""
331,260.374,"""2018-03-18""",0,"""2015-03-19"""
358,0.0,,0,"""2014-09-02"""
390,203.602,"""2017-09-30""",1,"""2015-10-01"""
390,223.68001,,2,"""2016-06-08"""


In [23]:
person_1.head()

case_id,birth_259D,birthdate_87D,childnum_185L,contaddr_district_15M,contaddr_matchlist_1032L,contaddr_smempladdr_334L,contaddr_zipcode_807M,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,empladdr_zipcode_114M,familystate_447L,gender_992L,housetype_905L,housingtype_772L,incometype_1044T,isreference_387L,language1_981M,mainoccupationinc_384A,maritalst_703L,num_group1,personindex_1023L,persontype_1072L,persontype_792L,registaddr_district_1083M,registaddr_zipcode_184M,relationshiptoclient_415T,relationshiptoclient_642T,remitter_829L,role_1084L,role_993L,safeguarantyflag_411L,sex_738L,type_25L
i64,str,str,f64,str,bool,bool,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,f64,str,i64,f64,f64,f64,str,str,str,str,bool,str,str,bool,str,str
0,"""1986-07-01""",,,"""P88_18_84""",False,False,"""P167_100_165""","""P97_36_170""","""2017-09-15""","""MORE_FIVE""","""OTHER""","""P142_57_166""","""P167_100_165""","""MARRIED""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",10800.0,,0,0.0,1.0,1.0,"""P88_18_84""","""P167_100_165""",,,,"""CL""",,True,"""F""","""PRIMARY_MOBILE…"
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,1.0,4.0,"""a55475b1""","""a55475b1""","""SPOUSE""",,False,"""EM""",,,,"""PHONE"""
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,2,2.0,4.0,5.0,"""a55475b1""","""a55475b1""","""COLLEAGUE""","""SPOUSE""",False,"""PE""",,,,"""PHONE"""
0,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,3,,5.0,,"""a55475b1""","""a55475b1""",,"""COLLEAGUE""",,"""PE""",,,,"""PHONE"""
1,"""1957-08-01""",,,"""P103_93_94""",False,False,"""P176_37_166""","""P97_36_170""","""2008-10-29""","""MORE_FIVE""","""OTHER""","""P49_46_174""","""P160_59_140""","""DIVORCED""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",10000.0,,0,0.0,1.0,1.0,"""P103_93_94""","""P176_37_166""",,,,"""CL""",,True,"""M""","""PRIMARY_MOBILE…"


In [24]:
debitcard_1.head()

case_id,last180dayaveragebalance_704A,last180dayturnover_1134A,last30dayturnover_651A,num_group1,openingdate_857D
i64,f64,f64,f64,i64,str
225,,,,0,"""2016-08-16"""
331,,,,0,"""2015-03-19"""
358,,,,0,"""2014-09-02"""
390,,,,0,"""2014-07-23"""
390,,,,1,"""2015-10-01"""


In [25]:
applprev_2.head()

case_id,cacccardblochreas_147M,conts_type_509L,credacc_cards_status_52L,num_group1,num_group2
i64,str,str,str,i64,i64
2,,"""PRIMARY_MOBILE…",,0,0
2,,"""EMPLOYMENT_PHO…",,0,1
2,,"""PRIMARY_MOBILE…",,1,0
2,,"""EMPLOYMENT_PHO…",,1,1
3,,"""PHONE""",,0,0


In [26]:
person_2.head()

case_id,addres_district_368M,addres_role_871L,addres_zip_823M,conts_role_79M,empls_economicalst_849M,empls_employedfrom_796D,empls_employer_name_740M,num_group1,num_group2,relatedpersons_role_762T
i64,str,str,str,str,str,str,str,i64,i64,str
5,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,0,
6,"""P55_110_32""","""CONTACT""","""P10_68_40""","""P38_92_157""","""P164_110_33""",,"""a55475b1""",0,0,
6,"""P55_110_32""","""PERMANENT""","""P10_68_40""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,1,
6,"""P204_92_178""","""CONTACT""","""P65_136_169""","""P38_92_157""","""P164_110_33""",,"""a55475b1""",1,0,"""OTHER_RELATIVE…"
6,"""P191_109_75""","""CONTACT""","""P10_68_40""","""P7_147_157""","""a55475b1""",,"""a55475b1""",1,1,"""OTHER_RELATIVE…"


In [27]:
credit_bureau_a_1.head()

case_id,annualeffectiverate_199L,annualeffectiverate_63L,classificationofcontr_13M,classificationofcontr_400M,contractst_545M,contractst_964M,contractsum_5085717L,credlmt_230A,credlmt_935A,dateofcredend_289D,dateofcredend_353D,dateofcredstart_181D,dateofcredstart_739D,dateofrealrepmt_138D,debtoutstand_525A,debtoverdue_47A,description_351M,dpdmax_139P,dpdmax_757P,dpdmaxdatemonth_442T,dpdmaxdatemonth_89T,dpdmaxdateyear_596T,dpdmaxdateyear_896T,financialinstitution_382M,financialinstitution_591M,instlamount_768A,instlamount_852A,interestrate_508L,lastupdate_1112D,lastupdate_388D,monthlyinstlamount_332A,monthlyinstlamount_674A,nominalrate_281L,nominalrate_498L,num_group1,numberofcontrsvalue_258L,…,numberofoverdueinstlmax_1039L,numberofoverdueinstlmax_1151L,numberofoverdueinstlmaxdat_148D,numberofoverdueinstlmaxdat_641D,numberofoverdueinstls_725L,numberofoverdueinstls_834L,outstandingamount_354A,outstandingamount_362A,overdueamount_31A,overdueamount_659A,overdueamountmax2_14A,overdueamountmax2_398A,overdueamountmax2date_1002D,overdueamountmax2date_1142D,overdueamountmax_155A,overdueamountmax_35A,overdueamountmaxdatemonth_284T,overdueamountmaxdatemonth_365T,overdueamountmaxdateyear_2T,overdueamountmaxdateyear_994T,periodicityofpmts_1102L,periodicityofpmts_837L,prolongationcount_1120L,prolongationcount_599L,purposeofcred_426M,purposeofcred_874M,refreshdate_3813885D,residualamount_488A,residualamount_856A,subjectrole_182M,subjectrole_93M,totalamount_6A,totalamount_996A,totaldebtoverduevalue_178A,totaldebtoverduevalue_718A,totaloutstanddebtvalue_39A,totaloutstanddebtvalue_668A
i64,str,f64,str,str,str,str,str,f64,f64,str,str,str,str,str,f64,f64,str,f64,f64,str,f64,f64,str,str,str,f64,f64,str,str,str,f64,f64,f64,str,i64,f64,…,f64,str,str,str,f64,str,f64,f64,f64,f64,f64,f64,str,str,f64,f64,str,f64,f64,str,str,f64,str,str,str,str,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64
388,,,"""ea6782cc""","""a55475b1""","""7241344e""","""a55475b1""",,,135806.0,"""2020-08-06""",,,"""2018-08-06""",,,,"""a55475b1""",0.0,,,8.0,2018.0,,"""a55475b1""","""P204_66_73""",8742.8,,,"""2019-01-11""",,8742.8,,,,1,,…,0.0,,,,0.0,,,,,0.0,0.0,,,,0.0,,,8.0,2018.0,,,,,,"""60c73645""","""a55475b1""",,,114325.805,"""a55475b1""","""a55475b1""",,,,,,
388,,,"""4408ff0f""","""a55475b1""","""7241344e""","""a55475b1""",,,,"""2023-06-20""",,,"""2018-06-20""",,374419.5,0.0,"""a55475b1""",0.0,,,7.0,2018.0,,"""a55475b1""","""55b002a9""",,,,"""2019-01-24""",,7811.4463,,,,0,2.0,…,0.0,,,,0.0,,,260093.7,,0.0,0.0,,,,0.0,,,7.0,2018.0,,,30.0,,,"""96a8fdfe""","""a55475b1""",,,,"""ab3c25cf""","""ab3c25cf""",,268897.62,0.0,0.0,374419.5,0.0
388,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,"""a55475b1""",,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,,,2,,…,,,,,,,,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""","""2019-01-28""",,,"""a55475b1""","""a55475b1""",,,,,,
388,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,"""a55475b1""",,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,,,3,,…,,,,,,,,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""","""2019-01-28""",,,"""a55475b1""","""a55475b1""",,,,,,
388,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,"""a55475b1""",,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,,,4,,…,,,,,,,,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""","""2019-01-28""",,,"""a55475b1""","""a55475b1""",,,,,,


In [28]:
credit_bureau_a_2.head()

case_id,collater_typofvalofguarant_298M,collater_typofvalofguarant_407M,collater_valueofguarantee_1124L,collater_valueofguarantee_876L,collaterals_typeofguarante_359M,collaterals_typeofguarante_669M,num_group1,num_group2,pmts_dpd_1073P,pmts_dpd_303P,pmts_month_158T,pmts_month_706T,pmts_overdue_1140A,pmts_overdue_1152A,pmts_year_1139T,pmts_year_507T,subjectroles_name_541M,subjectroles_name_838M
i64,str,str,f64,str,str,str,i64,i64,f64,f64,f64,str,f64,f64,f64,str,str,str
388,"""8fd95e4b""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0,0,,,2.0,,,,2018.0,,"""a55475b1""","""ab3c25cf"""
388,"""9a0c095e""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1,0,,,2.0,,,,2018.0,,"""a55475b1""","""ab3c25cf"""
388,"""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",0,1,,,3.0,,,,2018.0,,"""a55475b1""","""a55475b1"""
388,"""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",0,2,,,4.0,,,,2018.0,,"""a55475b1""","""a55475b1"""
388,"""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",0,3,,,5.0,,,,2018.0,,"""a55475b1""","""a55475b1"""


In [29]:
test_basetable = pl.read_csv(path + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        pl.read_csv(path + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(path + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv(path + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes) 
test_credit_bureau_b_2 = pl.read_csv(path + "csv_files/test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes)
applprev_1_t = pl.concat(
    [
        pl.read_csv(path + "csv_files/test/test_applprev_1_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_applprev_1_1.csv").pipe(set_table_dtypes),
    ],
    how="vertical_relaxed",
)
other_1_t = pl.read_csv(path + "csv_files/test/test_other_1.csv").pipe(set_table_dtypes)
tax_registry_a_1_t = pl.read_csv(path + "csv_files/test/test_tax_registry_a_1.csv").pipe(set_table_dtypes)
tax_registry_b_1_t = pl.read_csv(path + "csv_files/test/test_tax_registry_b_1.csv").pipe(set_table_dtypes)
tax_registry_c_1_t = pl.read_csv(path + "csv_files/test/test_tax_registry_c_1.csv").pipe(set_table_dtypes)
credit_bureau_b_1_t = pl.read_csv(path + "csv_files/test/test_credit_bureau_b_1.csv").pipe(set_table_dtypes)
deposit_1_t = pl.read_csv(path + "csv_files/test/test_deposit_1.csv").pipe(set_table_dtypes)
person_1_t= pl.read_csv(path + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes)
debitcard_1_t = pl.read_csv(path + "csv_files/test/test_debitcard_1.csv").pipe(set_table_dtypes)
applprev_2_t = pl.read_csv(path + "csv_files/test/test_applprev_2.csv").pipe(set_table_dtypes)
person_2_t = pl.read_csv(path + "csv_files/test/test_person_2.csv").pipe(set_table_dtypes)

In [30]:
dataset_names = [test_basetable , test_static, test_static_cb, test_person_1, test_credit_bureau_b_2, applprev_1, 
                other_1, tax_registry_a_1, tax_registry_b_1, tax_registry_c_1, credit_bureau_b_1, deposit_1,
                person_1, debitcard_1, applprev_2, person_2]
dataset_names_str = ['test_basetable' , 'test_static', 'test_static_cb', 'test_person_1',
                     'test_credit_bureau_b_2','applprev_1', 'other_1', 'tax_registry_a_1', 
                     'tax_registry_b_1', 'tax_registry_c_1', 'credit_bureau_b_1',
                     'deposit_1','person_1', 'debitcard_1', 'applprev_2', 'person_2']
null_value_count = []

for dataset in dataset_names:
    null_value_count.append(pl.lit(dataset.pipe(counting_null_values))) 
    
null_dataset_test = pl.DataFrame(
    {
        "Dataset_Names": dataset_names_str,
        "Null_Value_Count": null_value_count
         
    }
)
null_dataset_test

Dataset_Names,Null_Value_Count
str,object
"""test_basetable…",0
"""test_static""",1187
"""test_static_cb…",329
"""test_person_1""",149
"""test_credit_bu…",0
…,…
"""deposit_1""",79682
"""person_1""",51051536
"""debitcard_1""",450239
"""applprev_2""",16236709


In [31]:
credit_bureau_a_1_t = pl.concat(
    [
        pl.read_csv(path + "csv_files/test/test_credit_bureau_a_1_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_credit_bureau_a_1_1.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_credit_bureau_a_1_2.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_credit_bureau_a_1_3.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_credit_bureau_a_1_4.csv").pipe(set_table_dtypes),

    ],
    how="vertical_relaxed",
)

credit_bureau_a_2_t = pl.concat(
    [
        pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_0.csv").pipe(set_table_dtypes),
        pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_1.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_2.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_3.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_4.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_5.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_6.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_7.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_8.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_9.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_10.csv").pipe(set_table_dtypes),
#         pl.read_csv(path + "csv_files/test/test_credit_bureau_a_2_11.csv").pipe(set_table_dtypes),



   ],
    how="vertical_relaxed",
)


In [32]:
test_basetable.head()

case_id,date_decision,MONTH,WEEK_NUM
i64,str,i64,i64
57543,"""2021-05-14""",202201,100
57549,"""2022-01-17""",202201,100
57551,"""2020-11-27""",202201,100
57552,"""2020-11-27""",202201,100
57569,"""2021-12-20""",202201,100


In [33]:
test_static.head()

case_id,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,…,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,payvacationpostpone_4187118D,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,validfrom_1069D
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str
57543,0.0,191767.36,3674.6,1218.2001,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,25.0,1.0,31.0,96.0,96.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,False,,,"""2020-05-23""",0.35417,0.0,0.32292,0.07292,0.05208,6.0,0.0,0.0,0.0,"""a55475b1""",0.0,0.0,5.0,12154.4,12154.4,12154.4,456031.1,17859.6,"""FO""","""AL""",
57551,0.0,71036.4,2844.6,0.0,0.0,1.0,0.0,0.0,0.0,2.0,-1.0,,-1.0,1.0,8357.2,,1.0,0.0,9551.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,9.0,1.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,"""OTHER""","""OTHER""",,0.33333,0.0,0.11111,0.11111,0.0,12.0,,,,"""P11_36_178""",27095.201,1.0,2.0,0.0,0.0,0.0,75219.0,,"""FO""",,
57552,0.0,183992.0,6298.8003,12155.4,0.0,0.0,0.0,0.0,0.0,9.0,-9.0,-7.0,-9.0,0.0,7440.4,,0.0,199322.4,9148.4,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,24.0,0.0,0.0,32.0,32.0,30.0,30.0,22.0,22.0,1.0,0.0,1.0,False,"""OTHER""","""OTHER""",,0.83871,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,"""P21_87_50""",,0.0,6.0,191269.61,191269.61,191269.61,284213.0,18889.0,"""BO""",,
57569,0.0,0.0,4682.6,0.0,0.0,1.0,0.0,0.0,0.0,6.0,2824.0,,2824.0,2517.0,,,,,10796.4,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,5.0,5.0,10.0,15.0,15.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,True,,,,0.33333,0.6,0.66667,0.66667,0.6,24.0,0.0,,,"""a55475b1""",,2.0,3.0,0.0,0.0,0.0,95348.42,,"""FO""",,
57630,0.0,0.0,8905.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,3.0,7.0,2.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,,,,0.42857,0.0,0.28571,0.0,0.0,12.0,0.0,0.0,0.0,"""a55475b1""",96174.0,0.0,1.0,0.0,0.0,0.0,9677.601,,"""FO""",,


In [34]:
test_static_cb.head()

case_id,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
i64,str,str,str,str,f64,str,str,f64,f64,f64,f64,f64,str,str,str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,f64,f64,f64,str,f64,str,str,f64,str,str,str,str,str,str,f64,f64
57543,,,,,151364.0,"""1980-11-01""",,2.0,4.0,1.0,8.0,2.0,"""2fc785b2""","""6b2ae0fa""","""a55475b1""",4.0,,,,,,,,,,,,,,,,,9.0,"""38c061ee""","""a55475b1""",8.0,,,,,,,,,,,,"""2021-05-28""",,,2.0,3.0
57549,,,"""2018-05-06""",,1563100.0,"""1959-11-01""",,6.0,9.0,3.0,12.0,4.0,"""2fc785b2""","""39a0853f""","""a55475b1""",9.0,,,,,,,,,,,,,,,,,5.0,"""a7fcb6e5""","""a55475b1""",12.0,,,26815.6,,14.0,,,,,,,"""2022-01-31""",,,8.0,2.0
57551,,,,,2926195.3,"""1982-05-01""",,1.0,3.0,1.0,4.0,1.0,"""2fc785b2""","""6b2ae0fa""","""a55475b1""",3.0,,,,,,,,,,,,,,,,,2.0,"""3439d993""","""a55475b1""",4.0,,,,,,,,,,,,"""2020-12-11""",,,5.0,5.0
57552,,,"""2018-11-18""",,747031.73,"""1955-11-01""",,2.0,2.0,0.0,5.0,0.0,"""2fc785b2""","""a55475b1""","""a55475b1""",3.0,,,,,,,,,,,,,,,,,2.0,"""a55475b1""","""a55475b1""",5.0,,,23402.8,,14.0,,,,,,,"""2020-12-11""",,,7.0,1.0
57569,,,"""2011-10-14""",,,"""1949-09-01""",,4.0,4.0,1.0,4.0,4.0,"""2fc785b2""","""717ddd49""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,0.0,"""3439d993""","""a55475b1""",4.0,,,17333.6,,14.0,,,,,,,"""2022-01-03""",,,1.0,3.0


In [35]:
test_person_1.head()

case_id,birth_259D,birthdate_87D,childnum_185L,contaddr_district_15M,contaddr_matchlist_1032L,contaddr_smempladdr_334L,contaddr_zipcode_807M,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,empladdr_zipcode_114M,familystate_447L,gender_992L,housetype_905L,housingtype_772L,incometype_1044T,isreference_387L,language1_981M,mainoccupationinc_384A,maritalst_703L,num_group1,personindex_1023L,persontype_1072L,persontype_792L,registaddr_district_1083M,registaddr_zipcode_184M,relationshiptoclient_415T,relationshiptoclient_642T,remitter_829L,role_1084L,role_993L,safeguarantyflag_411L,sex_738L,type_25L
i64,str,str,f64,str,bool,bool,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,f64,str,i64,f64,f64,f64,str,str,str,str,bool,str,str,bool,str,str
57543,"""1980-11-01""",,,"""P107_155_111""",False,False,"""P91_47_168""","""P33_146_175""",,,,"""a55475b1""","""a55475b1""","""SINGLE""",,,,"""SALARIED_GOVT""",,"""a55475b1""",34000.0,,0,0.0,1.0,1.0,"""P107_155_111""","""P91_47_168""",,,,"""CL""",,False,"""F""","""PRIMARY_MOBILE…"
57543,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,5.0,5.0,"""a55475b1""","""a55475b1""","""SIBLING""","""SIBLING""",False,"""PE""",,,,"""PHONE"""
57549,"""1959-11-01""",,,"""a55475b1""",,,"""a55475b1""","""P106_81_188""",,,,"""a55475b1""","""a55475b1""","""SINGLE""",,,,"""RETIRED_PENSIO…",,"""a55475b1""",49800.0,,0,0.0,1.0,1.0,"""P121_131_159""","""P111_112_180""",,,,"""CL""",,True,"""F""","""PRIMARY_MOBILE…"
57549,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,5.0,5.0,"""a55475b1""","""a55475b1""","""COLLEAGUE""","""COLLEAGUE""",False,"""PE""",,,,"""PHONE"""
57551,"""1982-05-01""",,,"""P11_36_178""",False,False,"""P97_107_128""","""a55475b1""","""2002-01-08""",,,"""a55475b1""","""a55475b1""",,,,,"""SALARIED_GOVT""",,"""P10_39_147""",59600.0,,0,0.0,1.0,1.0,"""P11_36_178""","""P97_107_128""",,,,"""CL""",,False,"""F""","""PRIMARY_MOBILE…"


In [36]:
test_credit_bureau_b_2.head()

case_id,num_group1,num_group2,pmts_date_1107D,pmts_dpdvalue_108P,pmts_pmtsoverdue_635A
i64,i64,i64,str,f64,f64
57675,0,0,"""2020-05-15""",0.0,0.0
57675,0,1,"""2020-06-14""",0.0,0.0
57675,0,2,"""2020-07-15""",0.0,0.0
57675,0,3,"""2020-08-15""",0.0,0.0
57675,0,4,"""2020-09-13""",0.0,0.0


In [37]:
applprev_1_t.head()

case_id,actualdpd_943P,annuity_853A,approvaldate_319D,byoccupationinc_3656910L,cancelreason_3545846M,childnum_21L,creationdate_885D,credacc_actualbalance_314A,credacc_credlmt_575A,credacc_maxhisbal_375A,credacc_minhisbal_90A,credacc_status_367L,credacc_transactions_402L,credamount_590A,credtype_587L,currdebt_94A,dateactivated_425D,district_544M,downpmt_134A,dtlastpmt_581D,dtlastpmtallstes_3545839D,education_1138M,employedfrom_700D,familystate_726L,firstnonzeroinstldate_307D,inittransactioncode_279L,isbidproduct_390L,isdebitcard_527L,mainoccupationinc_437A,maxdpdtolerance_577P,num_group1,outstandingdebt_522A,pmtnum_8L,postype_4733339M,profession_152M,rejectreason_755M,rejectreasonclient_4145042M,revolvingaccount_394A,status_219L,tenor_203L
i64,f64,f64,str,f64,str,f64,str,f64,f64,f64,f64,str,str,f64,str,f64,str,str,f64,str,str,str,str,str,str,str,bool,str,f64,f64,i64,f64,f64,str,str,str,str,f64,str,f64
57543,0.0,935.2,"""2018-10-24""",,"""a55475b1""",,"""2018-10-24""",,0.0,,,,,8398.0,"""COL""",0.0,"""2018-11-06""","""P98_137_111""",0.0,"""2019-07-24""","""2019-07-24""","""P33_146_175""","""2011-01-01""","""SINGLE""","""2018-11-24""","""POS""",False,,30000.0,0.0,3,0.0,11.0,"""P177_117_192""","""a55475b1""","""a55475b1""","""a55475b1""",,"""K""",11.0
57543,0.0,1218.2001,"""2021-01-13""",,"""a55475b1""",,"""2021-01-13""",,0.0,,,,,14960.0,"""COL""",12154.4,"""2021-01-14""","""P59_55_87""",0.0,,"""2021-05-15""","""P106_81_188""",,"""SINGLE""","""2021-02-13""","""POS""",False,,34000.0,1.0,1,12154.4,14.0,"""P60_146_156""","""a55475b1""","""a55475b1""","""a55475b1""",,"""A""",14.0
57543,0.0,1892.0,"""2014-10-13""",1.0,"""a55475b1""",1.0,"""2014-10-13""",,0.0,,,,,20000.0,"""CAL""",0.0,"""2014-10-13""","""P98_137_111""",0.0,"""2016-04-13""","""2016-04-13""","""P33_146_175""","""2011-07-01""","""MARRIED""","""2014-11-13""","""CASH""",False,,30000.0,1.0,8,0.0,18.0,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""",,"""K""",18.0
57543,0.0,2570.8,"""2021-04-03""",,"""a55475b1""",,"""2021-04-03""",,0.0,,,,,14000.0,"""CAL""",0.0,"""2021-04-26""","""P107_155_111""",0.0,"""2021-04-26""","""2021-04-26""","""P33_146_175""",,"""SINGLE""","""2021-05-03""","""CASH""",False,,34000.0,0.0,0,0.0,6.0,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""",,"""K""",6.0
57543,0.0,2743.0,"""2011-12-24""",15000.0,"""a55475b1""",1.0,"""2011-12-24""",,0.0,,,,,25998.0,"""COL""",0.0,"""2011-12-29""","""a55475b1""",0.0,,,"""P33_146_175""","""2011-07-01""","""MARRIED""","""2012-01-24""","""POS""",False,,10000.0,0.0,9,0.0,15.0,"""P217_110_186""","""a55475b1""","""a55475b1""","""a55475b1""",,"""K""",15.0


In [38]:
other_1_t.head()

case_id,amtdebitincoming_4809443A,amtdebitoutgoing_4809440A,amtdepositbalance_4809441A,amtdepositincoming_4809444A,amtdepositoutgoing_4809442A,num_group1
i64,f64,f64,f64,f64,f64,i64
57543,0.0,0.0,1579.6,6900.0,33.4,0
57549,34933.402,34820.0,0.0,0.0,0.0,0
57644,0.0,51.8,0.0,0.0,0.0,1
57644,0.0,60.4,0.0,0.0,0.0,0
57648,15233.4,15233.4,0.0,0.0,0.0,0


In [39]:
tax_registry_a_1_t.head() 

case_id,amount_4527230A,name_4527232M,num_group1,recorddate_4527225D
i64,f64,str,i64,str
57675,2372.2,"""d7cce9e8""",1,"""2022-01-07"""
57675,2958.2,"""ff9eb829""",2,"""2022-01-07"""
57675,3055.8,"""ff9eb829""",4,"""2022-01-07"""
57675,3368.8,"""d7cce9e8""",0,"""2022-01-07"""
57675,4299.4,"""ff9eb829""",3,"""2022-01-07"""


In [40]:
tax_registry_b_1_t.head()

case_id,amount_4917619A,deductiondate_4917603D,name_4917606M,num_group1
i64,f64,str,str,i64
57543,24867.0,"""2020-12-15""","""787c689d""",0
57543,24867.0,"""2021-02-03""","""787c689d""",1
57543,24867.0,"""2021-02-26""","""787c689d""",2
57543,24867.0,"""2021-05-19""","""787c689d""",5
57543,25111.6,"""2021-03-29""","""787c689d""",3


In [41]:
tax_registry_c_1_t.head()

case_id,employername_160M,num_group1,pmtamount_36A,processingdate_168D
str,str,str,f64,str


In [42]:
credit_bureau_b_1_t.head()

case_id,amount_1115A,classificationofcontr_1114M,contractdate_551D,contractmaturitydate_151D,contractst_516M,contracttype_653M,credlmt_1052A,credlmt_228A,credlmt_3940954A,credor_3940957M,credquantity_1099L,credquantity_984L,debtpastduevalue_732A,debtvalue_227A,dpd_550P,dpd_733P,dpdmax_851P,dpdmaxdatemonth_804T,dpdmaxdateyear_742T,installmentamount_644A,installmentamount_833A,instlamount_892A,interesteffectiverate_369L,interestrateyearly_538L,lastupdate_260D,maxdebtpduevalodued_3940955A,num_group1,numberofinstls_810L,overdueamountmax_950A,overdueamountmaxdatemonth_494T,overdueamountmaxdateyear_432T,periodicityofpmts_997L,periodicityofpmts_997M,pmtdaysoverdue_1135P,pmtmethod_731M,pmtnumpending_403L,purposeofcred_722M,residualamount_1093A,residualamount_127A,residualamount_3940956A,subjectrole_326M,subjectrole_43M,totalamount_503A,totalamount_881A
i64,f64,str,str,str,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,i64,f64,f64,f64,f64,str,str,f64,str,f64,str,f64,f64,f64,str,str,f64,f64
57675,1488000.0,"""01f63ac8""","""2020-04-22""","""2035-04-23""","""04bf6e27""","""60e784d6""",,,,"""74bd67a8""",,,0.0,1433179.0,,,0.0,11.0,2019.0,,,17030.264,,,"""2022-01-02""",0.0,2,181.0,0.0,11.0,2019.0,,"""a0b598e4""",0.0,"""e914c86c""",160.0,"""60c73645""",,,,"""a55475b1""","""a55475b1""",,
57675,,"""ea6782cc""","""2020-06-01""","""2022-06-02""","""7241344e""","""1c9c5356""",220598.0,10000.0,20000.0,"""b619fa46""",2.0,1.0,0.0,,0.0,0.0,0.0,12.0,2019.0,0.0,182816.69,,,,"""2021-12-27""",0.0,0,,0.0,12.0,2019.0,,"""a55475b1""",0.0,"""a55475b1""",,"""60c73645""",0.0,182816.69,17720.8,"""ab3c25cf""","""ab3c25cf""",220598.0,10000.0
57675,,"""ea6782cc""","""2012-09-29""","""2022-09-29""","""7241344e""","""1c9c5356""",,,200598.0,"""b619fa46""",1.0,19.0,0.0,,0.0,0.0,37403.0,8.0,2013.0,0.0,1433179.0,,,,"""2021-12-24""",0.0,1,,10.6,4.0,2015.0,,"""a55475b1""",0.0,"""a55475b1""",,"""60c73645""",,,165095.89,"""ab3c25cf""","""ab3c25cf""",1488000.0,480262.6
57754,1932619.4,"""01f63ac8""","""2017-06-09""","""2037-06-09""","""7241344e""","""c6678a8f""",,,,"""3169c2b9""",,,0.0,1473622.4,,,0.0,2.0,2018.0,,,8052.5796,,,"""2022-03-14""",0.0,4,240.0,0.0,2.0,2018.0,,"""a0b598e4""",0.0,"""f6e26148""",183.0,"""9e302002""",,,,"""a55475b1""","""a55475b1""",,
57754,,"""ea6782cc""","""2018-11-18""","""2022-11-18""","""7241344e""","""1c9c5356""",132032.0,38000.0,35232.0,"""b619fa46""",4.0,3.0,0.0,,0.0,0.0,0.0,11.0,2017.0,0.0,68059.91,,,,"""2022-06-03""",0.0,0,,0.0,11.0,2017.0,,"""a55475b1""",0.0,"""a55475b1""",,"""60c73645""",0.0,68059.91,11287.2,"""ab3c25cf""","""ab3c25cf""",132032.0,38000.0


In [43]:
deposit_1_t.head()

case_id,amount_416A,contractenddate_991D,num_group1,openingdate_313D
i64,f64,str,i64,str
57644,0.0,"""2016-11-29""",0,"""2013-11-29"""
57644,0.0,,1,"""2012-08-01"""
57644,0.0,,2,"""2013-08-08"""
57701,0.0,"""2017-09-05""",0,"""2014-09-06"""
57708,262.558,"""2019-04-21""",0,"""2016-04-21"""


In [44]:
person_1_t.head()

case_id,birth_259D,birthdate_87D,childnum_185L,contaddr_district_15M,contaddr_matchlist_1032L,contaddr_smempladdr_334L,contaddr_zipcode_807M,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,empladdr_district_926M,empladdr_zipcode_114M,familystate_447L,gender_992L,housetype_905L,housingtype_772L,incometype_1044T,isreference_387L,language1_981M,mainoccupationinc_384A,maritalst_703L,num_group1,personindex_1023L,persontype_1072L,persontype_792L,registaddr_district_1083M,registaddr_zipcode_184M,relationshiptoclient_415T,relationshiptoclient_642T,remitter_829L,role_1084L,role_993L,safeguarantyflag_411L,sex_738L,type_25L
i64,str,str,f64,str,bool,bool,str,str,str,str,str,str,str,str,str,str,str,str,bool,str,f64,str,i64,f64,f64,f64,str,str,str,str,bool,str,str,bool,str,str
57543,"""1980-11-01""",,,"""P107_155_111""",False,False,"""P91_47_168""","""P33_146_175""",,,,"""a55475b1""","""a55475b1""","""SINGLE""",,,,"""SALARIED_GOVT""",,"""a55475b1""",34000.0,,0,0.0,1.0,1.0,"""P107_155_111""","""P91_47_168""",,,,"""CL""",,False,"""F""","""PRIMARY_MOBILE…"
57543,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,5.0,5.0,"""a55475b1""","""a55475b1""","""SIBLING""","""SIBLING""",False,"""PE""",,,,"""PHONE"""
57549,"""1959-11-01""",,,"""a55475b1""",,,"""a55475b1""","""P106_81_188""",,,,"""a55475b1""","""a55475b1""","""SINGLE""",,,,"""RETIRED_PENSIO…",,"""a55475b1""",49800.0,,0,0.0,1.0,1.0,"""P121_131_159""","""P111_112_180""",,,,"""CL""",,True,"""F""","""PRIMARY_MOBILE…"
57549,,,,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,,,"""a55475b1""",,,1,1.0,5.0,5.0,"""a55475b1""","""a55475b1""","""COLLEAGUE""","""COLLEAGUE""",False,"""PE""",,,,"""PHONE"""
57551,"""1982-05-01""",,,"""P11_36_178""",False,False,"""P97_107_128""","""a55475b1""","""2002-01-08""",,,"""a55475b1""","""a55475b1""",,,,,"""SALARIED_GOVT""",,"""P10_39_147""",59600.0,,0,0.0,1.0,1.0,"""P11_36_178""","""P97_107_128""",,,,"""CL""",,False,"""F""","""PRIMARY_MOBILE…"


In [45]:
debitcard_1_t.head()

case_id,last180dayaveragebalance_704A,last180dayturnover_1134A,last30dayturnover_651A,num_group1,openingdate_857D
i64,f64,f64,f64,i64,str
57644,,,,0,"""2013-11-29"""
57644,,,,1,"""2012-08-01"""
57644,,,,2,"""2013-08-08"""
57701,,,,0,"""2014-09-06"""
57708,,,,0,"""2016-04-21"""


In [46]:
applprev_2_t.head()

case_id,cacccardblochreas_147M,conts_type_509L,credacc_cards_status_52L,num_group1,num_group2
i64,str,str,str,i64,i64
57543,"""a55475b1""","""PRIMARY_MOBILE…",,0,0
57543,"""a55475b1""",,,0,1
57543,"""a55475b1""","""PRIMARY_MOBILE…",,1,0
57543,"""a55475b1""",,,1,1
57543,"""a55475b1""","""PRIMARY_MOBILE…",,2,0


In [47]:
person_2_t.head()

case_id,addres_district_368M,addres_role_871L,addres_zip_823M,conts_role_79M,empls_economicalst_849M,empls_employedfrom_796D,empls_employer_name_740M,num_group1,num_group2,relatedpersons_role_762T
i64,str,str,str,str,str,str,str,i64,i64,str
57551,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,0,
57552,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,0,
57569,"""P121_131_159""","""CONTACT""","""P96_113_139""","""P38_92_157""","""P148_57_109""",,"""a55475b1""",0,0,
57569,"""P121_131_159""","""PERMANENT""","""P96_113_139""","""a55475b1""","""a55475b1""",,"""a55475b1""",0,1,
57569,"""a55475b1""","""CONTACT""","""P47_66_61""","""P125_105_50""","""P148_57_109""",,"""a55475b1""",1,0,"""PARENT"""


# Feature engineering

In [48]:
# Dictionary of Polars DataFrames
tables = {'train_basetable':train_basetable, 'train_static': train_static,'train_static_cb':train_static_cb,
          'train_person_1': train_person_1, 'train_credit_bureau_b_2': train_credit_bureau_b_2, 
          'applprev_1':applprev_1, 'other_1':other_1, 'tax_registry_a_1':tax_registry_a_1, 
          'tax_registry_b_1':tax_registry_b_1,'tax_registry_c_1':tax_registry_c_1 ,
          'credit_bureau_b_1':credit_bureau_b_1, 'deposit_1':deposit_1,'person_1':person_1, 
          'debitcard_1':debitcard_1, 'applprev_2':applprev_2, 'person_2':person_2,
          'credit_bureau_a_1':credit_bureau_a_1,'credit_bureau_a_2':credit_bureau_a_2}

# Specify the target column
target_column_1 = "num_group1"
target_column_2 = "num_group2"


# Loop through each table
print("Table contains", target_column_1, "or", target_column_2 ,":","\n")

for table_name, table in tables.items():
    # Check if the target column exists in the current table
    
    if target_column_1 in table.columns or target_column_2 in table.columns:
              
        # If the column exists, print the table name
       
        print(table_name)

Table contains num_group1 or num_group2 : 

train_person_1
train_credit_bureau_b_2
applprev_1
other_1
tax_registry_a_1
tax_registry_b_1
tax_registry_c_1
credit_bureau_b_1
deposit_1
person_1
debitcard_1
applprev_2
person_2
credit_bureau_a_1
credit_bureau_a_2


In [49]:
# Dictionary of Polars DataFrames
table = {'train_basetable':train_basetable, 'train_static': train_static,'train_static_cb':train_static_cb,
          'train_person_1': train_person_1, 'train_credit_bureau_b_2': train_credit_bureau_b_2, 
          'applprev_1':applprev_1, 'other_1':other_1, 'tax_registry_a_1':tax_registry_a_1, 
          'tax_registry_b_1':tax_registry_b_1,'tax_registry_c_1':tax_registry_c_1 ,
          'credit_bureau_b_1':credit_bureau_b_1, 'deposit_1':deposit_1,'person_1':person_1, 
          'debitcard_1':debitcard_1, 'applprev_2':applprev_2, 'person_2':person_2,
          'credit_bureau_a_1':credit_bureau_a_1,'credit_bureau_a_2':credit_bureau_a_2}

# Specify the target column
target_column_1 = "num_group1"
target_column_2 = "num_group2"


# Loop through each table
print("Table contains", target_column_1, "and", target_column_2 ,":","\n")

for table_name, table in tables.items():
    # Check if the target column exists in the current table
    
    if target_column_1 in table.columns and target_column_2 in table.columns:
              
        # If the column exists, print the table name
       
        print(table_name)

Table contains num_group1 and num_group2 : 

train_credit_bureau_b_2
applprev_2
person_2
credit_bureau_a_2


In [50]:
# We need to use aggregation functions in tables with depth > 1, so tables that contain num_group1 column or 
# also num_group2 column.
train_person_1_feats_1 = train_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

train_person_1_feats_1

case_id,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed
i64,f64,bool
2317,19200.0,false
624343,20000.0,false
1357670,70000.0,false
1432989,118000.0,false
1807225,40000.0,false
…,…,…
1646207,42000.0,false
1538316,40000.0,false
1796935,20000.0,false
5826,80000.0,false


In [51]:
# Here num_group1=0 has special meaning, it is the person who applied for the loan.
train_person_1_feats_2 = train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

train_person_1_feats_2

case_id,person_housetype
i64,str
0,
1,
2,
3,
4,
…,…
2703450,"""OWNED"""
2703451,
2703452,
2703453,


In [52]:
# Here we have num_goup1 and num_group2, so we need to aggregate again.
train_credit_bureau_b_2_feats = train_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

train_credit_bureau_b_2_feats

case_id,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
i64,f64,bool
742970,22.0,true
997683,0.0,false
19204,0.0,false
131698,0.0,false
917695,0.0,false
…,…,…
1940997,25.2,false
171840,15.8,true
142686,3.8,true
221522,0.0,false


In [53]:
# 
applprev_1_feats = applprev_1.group_by("case_id").agg(
    pl.col("annuity_853A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("actualdpd_943P") > 31).max().alias("actualdpd_943P_above31")
)

applprev_1_feats

case_id,pmts_pmtsoverdue_635A_max,actualdpd_943P_above31
i64,f64,bool
1907893,10382.8,false
2599538,4356.2,false
231630,3957.2,false
1330669,2448.4001,false
2642988,6046.6,false
…,…,…
719059,1513.4,false
2694608,5835.4,false
1477865,15258.4,false
1510755,5248.0,false


In [54]:
# 
train_credit_bureau_b_1_feat = credit_bureau_b_1.group_by("case_id").agg(
    pl.col("dpd_733P").max().alias("dpd_for_terminated_loans")
)

train_credit_bureau_b_1_feat

case_id,dpd_for_terminated_loans
i64,f64
912634,0.0
1932715,0.0
744804,
259423,0.0
36130,0.0
…,…
1350964,0.0
1717852,0.0
1742290,0.0
2574087,0.0


In [55]:
# 
train_credit_bureau_a_1_feat = credit_bureau_a_1.group_by("case_id").agg(
    pl.col("dpdmax_139P").max().alias("dpdmax_for_active_contracts")
)

train_credit_bureau_a_1_feat

case_id,dpdmax_for_active_contracts
i64,f64
135336,0.0
1271573,20.0
801248,0.0
836576,0.0
859561,0.0
…,…
1597266,79.0
852206,48.0
2616409,0.0
881750,0.0


In [56]:
# We will process in this examples only A-type and M-type columns, so we need to select them.
selected_static_cols = []
for col in train_static.columns:
    if col[-1] in ("A", "M", "P"):
        selected_static_cols.append(col)
print(selected_static_cols)

['actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdbddpdlast1m_3658939P', 'maxdbddpdtollast12m_3658940P', 'maxdbddpdtollast6m_4187119P', 'maxdebt4_972A', 'maxdpdfr

In [57]:
train_static.select(selected_static_cols)

actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,disbursedcredamount_1113A,downpmt_116A,inittransactionamount_650A,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastapprcredamount_781A,lastcancelreason_561M,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,maininc_215A,maxannuity_159A,maxannuity_4075009A,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdebt4_972A,maxdpdfrom6mto36m_3546853P,maxdpdinstlnum_3546846P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str,f64,f64,str,str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64
,,1917.6,0.0,,,,,,,,,,30000.0,0.0,0.0,30000.0,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,
,,3134.0,0.0,,,,,,,,,,19999.8,0.0,0.0,19999.8,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,
,,4937.0,0.0,,,,,,,,,,78000.0,0.0,0.0,78000.0,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",10000.0,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,
,,4643.6,0.0,,,,,,,,,,40000.0,0.0,0.0,40000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",59999.8,"""P94_109_143""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,
,,3390.2,0.0,,,,,,,,,,44000.0,0.0,0.0,44000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P24_27_36""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,176561.36,3675.4001,0.0,-23.0,-43.0,-23.0,0.0,7356.8003,,0.0,16392.496,6750.2,30000.0,0.0,0.0,30000.0,0.0,,"""P12_6_178""","""P142_50_170""",20020.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",150000.0,"""P94_109_143""","""P94_109_143""",36000.0,75521.91,,0.0,0.0,0.0,105019.79,0.0,6.0,0.0,0.0,0.0,0.0,0.0,8.0,46718.2,49651.402,77533.76,14346.319,-144.0,-144.0,0.0,0.0,0.0,"""P123_39_170""",0.0,0.0,0.0,0.0,428159.66,14346.319
0.0,301276.47,7088.6,6191.6,-18.0,-12.0,-18.0,0.0,12553.2,,0.0,105129.31,15780.4,100000.0,68098.4,68098.4,40739.54,0.0,,"""a55475b1""","""a55475b1""",0.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,117251.6,,0.0,2.0,2.0,202775.55,0.0,7.0,2.0,2.0,0.0,2.0,2.0,2.0,40499.8,116813.4,250031.2,40499.805,-92.0,-92.0,0.0,0.0,0.0,"""P162_18_172""",,68098.4,68098.4,68098.4,701247.3,40499.805
0.0,14232.4,7788.8003,0.0,-12.0,,-16.0,1.0,2662.4001,,,,1500.6,60000.0,0.0,0.0,60000.0,0.0,,"""P159_130_59""","""P75_90_70""",3998.0,"""P180_60_137""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",24000.0,6600.0,,,-27.0,,17143.4,4.0,5.0,0.0,4.0,0.0,0.0,0.0,4.0,3243.4001,4182.0,0.0,,-27.0,-55.0,0.0,0.0,0.0,"""P133_44_167""",0.0,0.0,0.0,0.0,24002.0,
0.0,197371.58,1195.4,2827.2,-33.0,-64.0,-34.0,0.0,8212.601,,0.0,47943.062,9921.2,6000.0,46806.6,46806.6,6000.0,0.0,,"""a55475b1""","""a55475b1""",0.0,"""a55475b1""",,,"""P159_130_59""","""P174_113_42""",2198.0,"""a55475b1""","""a55475b1""",,163202.0,,-66.0,0.0,-33.0,126780.0,2.0,13.0,0.0,0.0,0.0,0.0,0.0,34.0,88740.805,94265.2,81604.6,2827.2,-68.0,-68.0,0.0,0.0,0.0,"""P123_6_84""",0.0,46806.6,46806.6,46806.6,440145.3,5654.4


In [58]:
selected_static_cb_cols = []
for col in train_static_cb.columns:
    if col[-1] in ("A", "M", "P"):
        selected_static_cb_cols.append(col)
print(selected_static_cb_cols)

['description_5085714M', 'education_1103M', 'education_88M', 'maritalst_385M', 'maritalst_893M', 'pmtaverage_3A', 'pmtaverage_4527227A', 'pmtaverage_4955615A', 'pmtssum_45A']


In [59]:
train_static_cb.select(selected_static_cb_cols)

description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A
str,str,str,str,str,f64,f64,f64,f64
"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,6301.4
"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,4019.6
"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,14548.0
"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,10498.24
"""a55475b1""","""717ddd49""","""a55475b1""","""a7fcb6e5""","""a55475b1""",,,,6344.8804
…,…,…,…,…,…,…,…,…
"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,12155.0,
"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,22904.6,
"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,
"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,15792.4,


In [60]:
# Join all tables together.
data = train_basetable.join(
    train_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    train_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    train_person_1_feats_1, how="left", on="case_id"
).join(
    train_person_1_feats_2, how="left", on="case_id"
).join(
    train_credit_bureau_b_2_feats, how="left", on="case_id"
).join(
    train_credit_bureau_b_1_feat, how="left", on="case_id"
).join(
    train_credit_bureau_a_1_feat, how="left", on="case_id"
).join(
    applprev_1_feats, how="left", on="case_id"
)

data

case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,disbursedcredamount_1113A,downpmt_116A,inittransactionamount_650A,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastapprcredamount_781A,lastcancelreason_561M,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,maininc_215A,maxannuity_159A,…,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed,person_housetype,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31,dpd_for_terminated_loans,dpdmax_for_active_contracts,pmts_pmtsoverdue_635A_max_right,actualdpd_943P_above31
i64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str,f64,f64,str,str,f64,str,str,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,bool,str,f64,bool,f64,f64,f64,bool
0,"""2019-01-03""",201901,0,0,,,1917.6,0.0,,,,,,,,,,30000.0,0.0,0.0,30000.0,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,…,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,,10800.0,false,,,,,,,
1,"""2019-01-03""",201901,0,0,,,3134.0,0.0,,,,,,,,,,19999.8,0.0,0.0,19999.8,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,…,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,,10000.0,false,,,,,,,
2,"""2019-01-04""",201901,0,0,,,4937.0,0.0,,,,,,,,,,78000.0,0.0,0.0,78000.0,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",10000.0,"""a55475b1""","""a55475b1""",,0.0,…,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,,14000.0,false,,,,,,1682.4,false
3,"""2019-01-03""",201901,0,0,,,4643.6,0.0,,,,,,,,,,40000.0,0.0,0.0,40000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",59999.8,"""P94_109_143""","""a55475b1""",,0.0,…,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,,10000.0,false,,,,,,6140.0,false
4,"""2019-01-04""",201901,0,1,,,3390.2,0.0,,,,,,,,,,44000.0,0.0,0.0,44000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P24_27_36""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,…,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,,24000.0,false,,,,,,2556.6,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2703450,"""2020-10-05""",202010,91,0,0.0,176561.36,3675.4001,0.0,-23.0,-43.0,-23.0,0.0,7356.8003,,0.0,16392.496,6750.2,30000.0,0.0,0.0,30000.0,0.0,,"""P12_6_178""","""P142_50_170""",20020.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",150000.0,"""P94_109_143""","""P94_109_143""",36000.0,75521.91,…,0.0,0.0,8.0,46718.2,49651.402,77533.76,14346.319,-144.0,-144.0,0.0,0.0,0.0,"""P123_39_170""",0.0,0.0,0.0,0.0,428159.66,14346.319,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,12155.0,,40000.0,false,"""OWNED""",,,,,30875.0,false
2703451,"""2020-10-05""",202010,91,0,0.0,301276.47,7088.6,6191.6,-18.0,-12.0,-18.0,0.0,12553.2,,0.0,105129.31,15780.4,100000.0,68098.4,68098.4,40739.54,0.0,,"""a55475b1""","""a55475b1""",0.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,117251.6,…,2.0,2.0,2.0,40499.8,116813.4,250031.2,40499.805,-92.0,-92.0,0.0,0.0,0.0,"""P162_18_172""",,68098.4,68098.4,68098.4,701247.3,40499.805,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,22904.6,,36800.0,false,,,,,,12809.2,false
2703452,"""2020-10-05""",202010,91,0,0.0,14232.4,7788.8003,0.0,-12.0,,-16.0,1.0,2662.4001,,,,1500.6,60000.0,0.0,0.0,60000.0,0.0,,"""P159_130_59""","""P75_90_70""",3998.0,"""P180_60_137""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",24000.0,6600.0,…,0.0,0.0,4.0,3243.4001,4182.0,0.0,,-27.0,-55.0,0.0,0.0,0.0,"""P133_44_167""",0.0,0.0,0.0,0.0,24002.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,30000.0,false,,,,,,9048.0,false
2703453,"""2020-10-05""",202010,91,0,0.0,197371.58,1195.4,2827.2,-33.0,-64.0,-34.0,0.0,8212.601,,0.0,47943.062,9921.2,6000.0,46806.6,46806.6,6000.0,0.0,,"""a55475b1""","""a55475b1""",0.0,"""a55475b1""",,,"""P159_130_59""","""P174_113_42""",2198.0,"""a55475b1""","""a55475b1""",,163202.0,…,0.0,0.0,34.0,88740.805,94265.2,81604.6,2827.2,-68.0,-68.0,0.0,0.0,0.0,"""P123_6_84""",0.0,46806.6,46806.6,46806.6,440145.3,5654.4,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,15792.4,,30000.0,false,,,,,,5981.4,false


In [61]:
test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_max"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

test_person_1_feats_1

case_id,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed
i64,f64,bool
57569,58000.0,False
57549,49800.0,False
57551,59600.0,False
57543,34000.0,False
57552,112000.0,False
57630,60000.0,False


In [62]:
test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_person_1_feats_2

case_id,person_housetype
i64,str
57543,
57549,
57551,
57552,"""OWNED"""
57569,"""OWNED"""
57630,"""OWNED"""


In [63]:
test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)

test_credit_bureau_b_2_feats

case_id,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31
i64,f64,bool
57675,0.0,False


In [64]:
# 
test_applprev_1_feats = applprev_1_t.group_by("case_id").agg(
    pl.col("annuity_853A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("actualdpd_943P") > 31).max().alias("actualdpd_943P_above31")
)

test_applprev_1_feats

case_id,pmts_pmtsoverdue_635A_max,actualdpd_943P_above31
i64,f64,bool
57543,5172.0,False
57549,3819.8,False


In [65]:
# 
train_credit_bureau_b_1_test_feat = credit_bureau_b_1_t.group_by("case_id").agg(
    pl.col("dpd_733P").max().alias("dpd_for_terminated_loans")
)

train_credit_bureau_b_1_test_feat

case_id,dpd_for_terminated_loans
i64,f64
57754,0.0
57675,0.0
57775,0.0


In [66]:
# 
train_credit_bureau_a_1_test_feat = credit_bureau_a_1_t.group_by("case_id").agg(
    pl.col("dpdmax_139P").max().alias("dpdmax_for_active_contracts")
)

train_credit_bureau_a_1_test_feat

case_id,dpdmax_for_active_contracts
i64,f64
57543,0.0
57549,0.0
57633,0.0
57760,0.0
57551,0.0


In [67]:
data_submission = test_basetable.join(
    test_static.select(["case_id"]+selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"]+selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
).join(
    train_credit_bureau_b_1_test_feat, how="left", on="case_id"
).join(
    train_credit_bureau_a_1_test_feat, how="left", on="case_id"
).join(
    test_applprev_1_feats, how="left", on="case_id"
)


data_submission

case_id,date_decision,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,disbursedcredamount_1113A,downpmt_116A,inittransactionamount_650A,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastapprcredamount_781A,lastcancelreason_561M,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,maininc_215A,maxannuity_159A,maxannuity_4075009A,…,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A,mainoccupationinc_384A_max,mainoccupationinc_384A_any_selfemployed,person_housetype,pmts_pmtsoverdue_635A_max,pmts_dpdvalue_108P_over31,dpd_for_terminated_loans,dpdmax_for_active_contracts,pmts_pmtsoverdue_635A_max_right,actualdpd_943P_above31
i64,str,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str,f64,f64,str,str,f64,str,str,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64,f64,bool,str,f64,bool,f64,f64,f64,bool
57543,"""2021-05-14""",202201,100,0.0,191767.36,3674.6,1218.2001,1.0,2.0,1.0,1.0,16049.4,17054.4,2.0,14554.4,24482.0,20000.0,12154.4,0.0,20000.0,0.0,,"""a55475b1""","""a55475b1""",14000.0,"""a55475b1""",,,"""P109_133_183""","""P49_111_165""",24000.0,"""a55475b1""","""a55475b1""",34000.0,280983.56,,…,3.0,3.0,7.0,131700.8,16672.6,157731.78,16641.4,-7.0,-7.0,0.0,0.0,0.0,"""a55475b1""",0.0,12154.4,12154.4,12154.4,456031.1,17859.6,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""38c061ee""","""a55475b1""",,,,,34000.0,False,,,,,0.0,5172.0,False
57549,"""2022-01-17""",202201,100,0.0,129704.4,5742.6,3546.6,0.0,0.0,-1.0,0.0,32426.201,118964.805,0.0,13681.714,32426.201,75000.0,10638.2,10638.2,75000.0,0.0,,"""a55475b1""","""a55475b1""",94000.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",160000.0,"""a55475b1""","""P30_86_84""",44000.0,337659.8,,…,0.0,0.0,54.0,122511.4,31820.6,21278.0,122511.4,-2.0,-2.0,0.0,0.0,0.0,"""a55475b1""",,10638.2,10638.2,10638.2,373720.84,126058.0,"""2fc785b2""","""39a0853f""","""a55475b1""","""a7fcb6e5""","""a55475b1""",,,26815.6,,49800.0,False,,,,,0.0,3819.8,False
57551,"""2020-11-27""",202201,100,0.0,71036.4,2844.6,0.0,-1.0,,-1.0,1.0,8357.2,,1.0,0.0,9551.0,27095.201,0.0,0.0,27095.201,0.0,,"""a55475b1""","""a55475b1""",200000.0,"""P85_114_140""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",70000.0,83400.0,,…,0.0,4.0,4.0,41783.402,54000.0,62619.0,,-4.0,-4.0,,,,"""P11_36_178""",27095.201,0.0,0.0,0.0,75219.0,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,,59600.0,False,,,,,0.0,,
57552,"""2020-11-27""",202201,100,0.0,183992.0,6298.8003,12155.4,-9.0,-7.0,-9.0,0.0,7440.4,,0.0,199322.4,9148.4,100000.0,191269.61,191269.61,100000.0,0.0,,"""a55475b1""","""a55475b1""",0.0,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",150000.0,"""a55475b1""","""P94_109_143""",,110500.0,,…,0.0,0.0,0.0,12155.4,104473.6,288642.6,12155.4,-13.0,-13.0,0.0,0.0,0.0,"""P21_87_50""",,191269.61,191269.61,191269.61,284213.0,18889.0,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,23402.8,,112000.0,False,"""OWNED""",,,,,,
57569,"""2021-12-20""",202201,100,0.0,0.0,4682.6,0.0,2824.0,,2824.0,2517.0,,,,,10796.4,60000.0,0.0,0.0,60000.0,0.0,,"""a55475b1""","""a55475b1""",20000.0,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",40000.0,"""a55475b1""","""P94_109_143""",6000.0,37704.0,,…,0.0,2865.0,2865.0,,,0.0,,2783.0,2783.0,0.0,,,"""a55475b1""",,0.0,0.0,0.0,95348.42,,"""2fc785b2""","""717ddd49""","""a55475b1""","""3439d993""","""a55475b1""",,,17333.6,,58000.0,False,"""OWNED""",,,,,,
57630,"""2021-03-16""",202201,100,0.0,0.0,8905.0,0.0,,,,0.0,,,,,,96174.0,0.0,0.0,96174.0,0.0,,"""P148_110_5""","""P161_88_182""",8876.0,"""P198_89_166""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",12000.0,1382.8,,…,0.0,0.0,2.0,,,,,,,0.0,0.0,0.0,"""a55475b1""",96174.0,0.0,0.0,0.0,9677.601,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,,60000.0,False,"""OWNED""",,,,,,
57631,"""2022-06-04""",202201,100,0.0,,2540.6,0.0,,,,,,,,,,24920.0,0.0,0.0,24920.0,0.0,,"""a55475b1""","""a55475b1""",,"""P94_109_143""",,,"""P100_96_175""","""P165_57_169""",46279.8,"""P45_84_106""","""P94_109_143""",,0.0,,…,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,"""a55475b1""",24920.0,,,0.0,0.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,
57632,"""2022-02-05""",202201,100,0.0,63647.402,4732.0,0.0,-7.0,-6.0,-7.0,0.0,3536.0,,0.0,10581.714,3536.0,25998.0,0.0,0.0,25998.0,0.0,,"""P53_45_92""","""P200_75_140""",50116.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",56000.0,7000.0,,…,0.0,0.0,0.0,3536.0,63647.402,42412.0,3536.0,-9.0,-9.0,0.0,0.0,0.0,"""P159_143_123""",25998.0,0.0,0.0,0.0,63652.0,7071.4,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,15841.2,,,,,,,,,,
57633,"""2022-01-25""",202201,100,0.0,,8273.0,0.0,,,,,,,,,,200000.0,0.0,0.0,200000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P85_114_140""",,,"""P159_130_59""","""P75_90_70""",64996.0,"""P45_84_106""","""P94_109_143""",,0.0,,…,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,"""a55475b1""",0.0,,,0.0,0.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""3439d993""","""a55475b1""",,,,,,,,,,,0.0,,
57634,"""2021-01-27""",202201,100,0.0,39948.8,1165.8,0.0,-4.0,,-4.0,0.0,3994.8,,0.0,1675.4,3358.4001,12108.2,0.0,0.0,12108.2,0.0,,"""P159_130_59""","""P174_113_42""",16494.201,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",50000.0,5000.0,,…,0.0,0.0,0.0,4949.6,20887.201,20150.8,,-26.0,-26.0,0.0,0.0,0.0,"""a55475b1""",13998.0,0.0,0.0,0.0,39950.8,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,,,,,,,,


In [68]:
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids

case_id
i64
106054
1875612
1523012
628698
861635
…
1012214
1320603
908701
662158


In [69]:
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=1)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=1)

In [70]:
cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

print(cols_pred)

['actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'annuity_780A', 'annuitynextmonth_57A', 'avgdbddpdlast24m_3658932P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'disbursedcredamount_1113A', 'downpmt_116A', 'inittransactionamount_650A', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastapprcredamount_781A', 'lastcancelreason_561M', 'lastotherinc_902A', 'lastotherlnsexpense_631A', 'lastrejectcommoditycat_161M', 'lastrejectcommodtypec_5251769M', 'lastrejectcredamount_222A', 'lastrejectreason_759M', 'lastrejectreasonclient_4145040M', 'maininc_215A', 'maxannuity_159A', 'maxannuity_4075009A', 'maxdbddpdlast1m_3658939P', 'maxdbddpdtollast12m_3658940P', 'maxdbddpdtollast6m_4187119P', 'maxdebt4_972A', 'maxdpdfr

In [71]:
data.select(cols_pred)

actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,credamount_770A,currdebt_22A,currdebtcredtyperange_828A,disbursedcredamount_1113A,downpmt_116A,inittransactionamount_650A,lastapprcommoditycat_1041M,lastapprcommoditytypec_5251766M,lastapprcredamount_781A,lastcancelreason_561M,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,maininc_215A,maxannuity_159A,maxannuity_4075009A,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdebt4_972A,maxdpdfrom6mto36m_3546853P,maxdpdinstlnum_3546846P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,previouscontdistrict_112M,price_1097A,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,str,f64,f64,str,str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,str,str,str,str,str,f64,f64,f64,f64
,,1917.6,0.0,,,,,,,,,,30000.0,0.0,0.0,30000.0,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,
,,3134.0,0.0,,,,,,,,,,19999.8,0.0,0.0,19999.8,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,
,,4937.0,0.0,,,,,,,,,,78000.0,0.0,0.0,78000.0,0.0,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,,"""a55475b1""","""a55475b1""",10000.0,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,
,,4643.6,0.0,,,,,,,,,,40000.0,0.0,0.0,40000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P94_109_143""",,,"""a55475b1""","""a55475b1""",59999.8,"""P94_109_143""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,
,,3390.2,0.0,,,,,,,,,,44000.0,0.0,0.0,44000.0,0.0,,"""a55475b1""","""a55475b1""",,"""P24_27_36""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,,"""a55475b1""",,,,0.0,0.0,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.0,176561.36,3675.4001,0.0,-23.0,-43.0,-23.0,0.0,7356.8003,,0.0,16392.496,6750.2,30000.0,0.0,0.0,30000.0,0.0,,"""P12_6_178""","""P142_50_170""",20020.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",150000.0,"""P94_109_143""","""P94_109_143""",36000.0,75521.91,,0.0,0.0,0.0,105019.79,0.0,6.0,0.0,0.0,0.0,0.0,0.0,8.0,46718.2,49651.402,77533.76,14346.319,-144.0,-144.0,0.0,0.0,0.0,"""P123_39_170""",0.0,0.0,0.0,0.0,428159.66,14346.319,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,12155.0,
0.0,301276.47,7088.6,6191.6,-18.0,-12.0,-18.0,0.0,12553.2,,0.0,105129.31,15780.4,100000.0,68098.4,68098.4,40739.54,0.0,,"""a55475b1""","""a55475b1""",0.0,"""a55475b1""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,117251.6,,0.0,2.0,2.0,202775.55,0.0,7.0,2.0,2.0,0.0,2.0,2.0,2.0,40499.8,116813.4,250031.2,40499.805,-92.0,-92.0,0.0,0.0,0.0,"""P162_18_172""",,68098.4,68098.4,68098.4,701247.3,40499.805,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,22904.6,
0.0,14232.4,7788.8003,0.0,-12.0,,-16.0,1.0,2662.4001,,,,1500.6,60000.0,0.0,0.0,60000.0,0.0,,"""P159_130_59""","""P75_90_70""",3998.0,"""P180_60_137""",,,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",24000.0,6600.0,,,-27.0,,17143.4,4.0,5.0,0.0,4.0,0.0,0.0,0.0,4.0,3243.4001,4182.0,0.0,,-27.0,-55.0,0.0,0.0,0.0,"""P133_44_167""",0.0,0.0,0.0,0.0,24002.0,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,
0.0,197371.58,1195.4,2827.2,-33.0,-64.0,-34.0,0.0,8212.601,,0.0,47943.062,9921.2,6000.0,46806.6,46806.6,6000.0,0.0,,"""a55475b1""","""a55475b1""",0.0,"""a55475b1""",,,"""P159_130_59""","""P174_113_42""",2198.0,"""a55475b1""","""a55475b1""",,163202.0,,-66.0,0.0,-33.0,126780.0,2.0,13.0,0.0,0.0,0.0,0.0,0.0,34.0,88740.805,94265.2,81604.6,2827.2,-68.0,-68.0,0.0,0.0,0.0,"""P123_6_84""",0.0,46806.6,46806.6,46806.6,440145.3,5654.4,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,15792.4,


In [72]:
def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

In [73]:
base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)
    
df

Unnamed: 0,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,...,totinstallast1m_4525188A,description_5085714M,education_1103M,education_88M,maritalst_385M,maritalst_893M,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtssum_45A
0,,,3390.2000,0.0,,,,,,,...,,,,,,,,,,
1,,,9568.6010,0.0,,,,,,,...,,,,,,,,,,
2,,,5109.6000,0.0,,,,,,,...,,,,,,,,,,
3,,,2581.0000,0.0,,,,,,,...,,,,,,,,,,
4,,,2400.0000,0.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305327,0.0,119089.99,4138.4000,0.0,-5.0,-2.0,-5.0,0.0,5671.000,,...,12445.385,2fc785b2,717ddd49,a55475b1,3439d993,a55475b1,,,16097.200,
305328,0.0,0.00,4747.2000,0.0,,,,0.0,,,...,,2fc785b2,6b2ae0fa,6b2ae0fa,3439d993,46b968c3,,,,
305329,0.0,335469.25,7088.6000,7216.0,-17.0,-45.0,-18.0,0.0,13376.601,,...,7216.000,2fc785b2,717ddd49,a55475b1,3439d993,a55475b1,,,20508.201,
305330,0.0,169487.72,4960.8003,2717.2,-11.0,-9.0,-11.0,0.0,7369.000,,...,2717.200,2fc785b2,717ddd49,a55475b1,3439d993,a55475b1,,,,


In [74]:
print(f"Train: {X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

Train: (915995, 70)
Valid: (305332, 70)
Test: (305332, 70)


# Training LightGBM

In [75]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 25,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)

Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.742776
[100]	valid_0's auc: 0.759971
[150]	valid_0's auc: 0.767515
[200]	valid_0's auc: 0.771745
[250]	valid_0's auc: 0.774267
[300]	valid_0's auc: 0.775872
[350]	valid_0's auc: 0.777276
[400]	valid_0's auc: 0.7784
[450]	valid_0's auc: 0.779294
[500]	valid_0's auc: 0.77995
[550]	valid_0's auc: 0.780564
[600]	valid_0's auc: 0.781375
[650]	valid_0's auc: 0.781908
[700]	valid_0's auc: 0.782373
Early stopping, best iteration is:
[696]	valid_0's auc: 0.782388


In [76]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = gbm.predict(X, num_iteration=gbm.best_iteration)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}') 
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}') 
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')  


The AUC score on the train set is: 0.792209066706691
The AUC score on the valid set is: 0.7823877168754692
The AUC score on the test set is: 0.7784630534469104


In [77]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 
print(f'The stability score on the test set is: {stability_score_test}')  


The stability score on the train set is: 0.562112479722089
The stability score on the valid set is: 0.5343616743750068
The stability score on the test set is: 0.5210892935596464


In [78]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    train_categories = set(X_train[col].cat.categories)
    submission_categories = set(X_submission[col].cat.categories)
    new_categories = submission_categories - train_categories
    X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
    X_train[col] = X_train[col].astype(new_dtype)
    X_submission[col] = X_submission[col].astype(new_dtype)

y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [79]:
submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")

In [80]:
submission

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.01292
57549,0.054264
57551,0.007886
57552,0.00901
57569,0.037663
57630,0.01203
57631,0.035755
57632,0.001523
57633,0.077396
57634,0.005463
