In [1]:
import os
import gc
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from category_encoders import *
from pathlib import Path
from glob import glob
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import StratifiedGroupKFold
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Dataset root and the per-split parquet directories derived from it.
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR, TEST_DIR = (
    ROOT.joinpath("parquet_files", split) for split in ("train", "test")
)

# Model checkpoint settings. Neither name is referenced in the cells shown
# here -- presumably consumed by later training/inference cells (TODO confirm).
MODEL_PATH = "/kaggle/working/homecredit-dataset/best_model.pth"
LOAD_MODEL = False

# Pipeline

In [3]:
class Pipeline:
    """Stateless polars preprocessing steps; each takes and returns a DataFrame.

    All methods are static so they can be passed straight to ``DataFrame.pipe``.
    """

    @staticmethod
    def set_data_type(data_frame):
        """Cast columns to a dtype selected from the column name.

        Rules, checked in this order:
          * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
          * ``date_decision`` -> Date
          * names ending in ``P`` or ``A`` -> Float64
          * names ending in ``D`` -> Date
          * names ending in ``M`` -> String

        Columns matching no rule are left untouched.

        Args:
            data_frame: polars DataFrame to retype.

        Returns:
            A DataFrame with the matching columns cast.
        """
        int_cols = ("case_id", "WEEK_NUM", "num_group1", "num_group2")

        casts = []
        for col in data_frame.columns:
            if col in int_cols:
                dtype = pl.Int64
            elif col == "date_decision":
                dtype = pl.Date
            elif col.endswith(("P", "A")):
                dtype = pl.Float64
            elif col.endswith("D"):
                dtype = pl.Date
            elif col.endswith("M"):
                dtype = pl.String
            else:
                continue
            casts.append(pl.col(col).cast(dtype))

        # One with_columns call for all casts instead of rebuilding the frame
        # once per column.
        return data_frame.with_columns(casts) if casts else data_frame

    @staticmethod
    def handle_dates(data_frame):
        """Turn every ``*D`` column into a day offset from ``date_decision``.

        Each column whose name ends in ``D`` is replaced by
        ``(column - date_decision)`` expressed in whole days. Afterwards the
        ``date_decision`` and ``MONTH`` columns are dropped.

        Args:
            data_frame: polars DataFrame containing ``date_decision`` and
                ``MONTH`` columns.

        Returns:
            The transformed DataFrame.
        """
        date_cols = [col for col in data_frame.columns if col.endswith("D")]

        if date_cols:
            decision = pl.col("date_decision")
            data_frame = data_frame.with_columns(
                [
                    (pl.col(col) - decision).dt.total_days().alias(col)
                    for col in date_cols
                ]
            )

        return data_frame.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(data_frame, null_threshold=0.95, max_cardinality=200):
        """Drop columns that are almost entirely null or have unusable cardinality.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Two passes:

        1. Any other column whose null fraction is strictly greater than
           ``null_threshold`` is dropped.
        2. Among the remaining String columns, those with exactly one distinct
           value, or with more than ``max_cardinality`` distinct values, are
           dropped. The count comes from ``Series.n_unique()``.

        Args:
            data_frame: polars DataFrame to filter.
            null_threshold: maximum tolerated null fraction (default 0.95).
            max_cardinality: maximum tolerated number of distinct values for
                a String column (default 200).

        Returns:
            The DataFrame without the rejected columns.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        mostly_null = []
        for col in data_frame.columns:
            if col in protected:
                continue
            null_fraction = data_frame[col].is_null().mean()
            # mean() yields None for a zero-row frame; treat that as "keep".
            if null_fraction is not None and null_fraction > null_threshold:
                mostly_null.append(col)
        if mostly_null:
            data_frame = data_frame.drop(mostly_null)

        bad_cardinality = []
        for col in data_frame.columns:
            if col in protected or data_frame[col].dtype != pl.String:
                continue
            distinct = data_frame[col].n_unique()
            # The previous test was `distinct == 200`, which dropped only
            # columns with exactly 200 values and kept every higher-cardinality
            # column; the cutoff is meant to be an upper bound.
            if distinct == 1 or distinct > max_cardinality:
                bad_cardinality.append(col)
        if bad_cardinality:
            data_frame = data_frame.drop(bad_cardinality)

        return data_frame
            
                

# Automatic Aggregation

In [4]:
class Aggregator:
    """Builds ``min_<col>`` / ``max_<col>`` aggregation expressions by column name.

    Each ``*_expr`` method selects columns by a naming rule and returns a pair
    of expression lists. ``get_exprs`` concatenates the pairs of all rules.
    """

    @staticmethod
    def _max_min(names):
        """Return ``(max expressions, min expressions)`` for the given column names."""
        highs = [pl.max(name).alias(f"max_{name}") for name in names]
        lows = [pl.min(name).alias(f"min_{name}") for name in names]
        return highs, lows

    @staticmethod
    def num_expr(data_frame):
        """Expressions for columns whose name ends in ``A`` or ``P``.

        NOTE: unlike the other ``*_expr`` methods this one returns
        ``(min expressions, max expressions)`` -- the order is kept as-is
        because ``get_exprs`` output order depends on it.
        """
        picked = [name for name in data_frame.columns if name[-1] in ("A", "P")]
        highs, lows = Aggregator._max_min(picked)
        return lows, highs

    @staticmethod
    def date_expr(data_frame):
        """``(max, min)`` expressions for columns whose name ends in ``D``."""
        picked = [name for name in data_frame.columns if name[-1] == "D"]
        return Aggregator._max_min(picked)

    @staticmethod
    def str_expr(data_frame):
        """``(max, min)`` expressions for columns whose name ends in ``M``."""
        picked = [name for name in data_frame.columns if name[-1] == "M"]
        return Aggregator._max_min(picked)

    @staticmethod
    def other_expr(data_frame):
        """``(max, min)`` expressions for columns whose name ends in ``T`` or ``L``."""
        picked = [name for name in data_frame.columns if name[-1] in ("T", "L")]
        return Aggregator._max_min(picked)

    @staticmethod
    def count_expr(data_frame):
        """``(max, min)`` expressions for columns whose name contains ``num_group``."""
        picked = [name for name in data_frame.columns if "num_group" in name]
        return Aggregator._max_min(picked)

    @staticmethod
    def get_exprs(df):
        """Collect the expression pairs of every rule into two flat lists.

        The first returned list is the concatenation of each rule's first
        element, the second list of each rule's second element, in the order
        num, date, str, other, count. Because ``num_expr`` returns
        ``(min, max)``, the first list starts with ``min_*`` expressions for
        the numeric columns and the second with their ``max_*`` counterparts.

        Args:
            df: polars DataFrame whose column names drive the selection.

        Returns:
            ``(maxexprs, minexprs)`` -- two lists of polars expressions.
        """
        rules = (
            Aggregator.num_expr,
            Aggregator.date_expr,
            Aggregator.str_expr,
            Aggregator.other_expr,
            Aggregator.count_expr,
        )

        maxexprs, minexprs = [], []
        for rule in rules:
            first, second = rule(df)
            maxexprs.extend(first)
            minexprs.extend(second)

        return maxexprs, minexprs

# File I/O

In [5]:
def read_file(path, depth=None):
    """Read one parquet file, retype its columns and optionally aggregate it.

    Args:
        path: location of the parquet file.
        depth: when 1 or 2, the frame is grouped by ``case_id`` and reduced
            with the expressions from ``Aggregator.get_exprs``; any other
            value returns the retyped frame unaggregated.

    Returns:
        A polars DataFrame.
    """
    typed = pl.read_parquet(path).pipe(Pipeline.set_data_type)

    if depth not in (1, 2):
        return typed

    maxexprs, minexprs = Aggregator.get_exprs(typed)
    return typed.group_by("case_id").agg(*maxexprs, *minexprs)

def read_files(regex_path, depth=None):
    """Read every parquet file matching a glob pattern into one DataFrame.

    Each matching file is read and retyped with ``Pipeline.set_data_type``,
    then all chunks are concatenated with ``how="vertical_relaxed"``.

    Args:
        regex_path: despite the name, a *glob* pattern (it is passed to
            ``glob``), e.g. ``TRAIN_DIR / "train_static_0_*.parquet"``.
        depth: when 1 or 2, the concatenated frame is grouped by ``case_id``
            and reduced with the expressions from ``Aggregator.get_exprs``.

    Returns:
        A polars DataFrame.

    Raises:
        ValueError: if no file matches the pattern.
    """
    # glob() returns matches in arbitrary order; sort so the concatenated row
    # order is reproducible across machines and runs.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise ValueError(f"read_files: no files match pattern {str(regex_path)!r}")

    chunks = [pl.read_parquet(path).pipe(Pipeline.set_data_type) for path in paths]

    data_frame = pl.concat(chunks, how="vertical_relaxed")
    if depth in [1, 2]:
        maxexprs, minexprs = Aggregator.get_exprs(data_frame)
        data_frame = data_frame.group_by("case_id").agg(*maxexprs, *minexprs)

    return data_frame

## Feature Engineering

In [6]:
def feature_eng(data_frame_base, depth_0 ,depth_1, depth_2):
    """Assemble the modelling frame from the base table and the feature tables.

    Steps:
      1. Add ``month_decision`` and ``weekday_decision``, derived from
         ``date_decision``, to the base frame.
      2. Left-join every frame of ``depth_0 + depth_1 + depth_2`` on
         ``case_id``; the i-th joined frame uses the suffix ``_{i}`` for
         clashing column names.
      3. Apply ``Pipeline.handle_dates`` to the joined result.

    Args:
        data_frame_base: base polars DataFrame (one row per ``case_id``).
        depth_0, depth_1, depth_2: lists of polars DataFrames to join.

    Returns:
        The joined and date-processed DataFrame.
    """
    decision_date = pl.col("date_decision")
    joined = data_frame_base.with_columns(
        month_decision=decision_date.dt.month(),
        weekday_decision=decision_date.dt.weekday(),
    )

    feature_tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(feature_tables):
        joined = joined.join(
            table,
            on="case_id",
            how="left",
            suffix=f"_{position}",
        )

    return joined.pipe(Pipeline.handle_dates)

In [7]:
def to_pandas(df_data, cat_cols=None):
    """Convert a frame to pandas and mark its categorical columns.

    Args:
        df_data: object exposing ``to_pandas()`` (a polars DataFrame in this
            notebook).
        cat_cols: column names to convert to ``category`` dtype. When
            ``None``, every column of ``object`` dtype is used.

    Returns:
        ``(pandas_frame, cat_cols)`` -- the converted frame and the list of
        columns that were made categorical, so the same list can be reused
        for another frame.
    """
    frame = df_data.to_pandas()

    categorical = (
        list(frame.select_dtypes("object").columns) if cat_cols is None else cat_cols
    )
    frame[categorical] = frame[categorical].astype("category")

    return frame, categorical

In [8]:
# Load every training table. Depth-1 and depth-2 tables are aggregated to one
# row per case_id while being read (second argument of read_file/read_files).
data_store = {
    "data_frame_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        # applprev is split across several files, hence the glob pattern.
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        *(
            read_file(TRAIN_DIR / file_name, 1)
            for file_name in (
                "train_tax_registry_a_1.parquet",
                "train_tax_registry_b_1.parquet",
                "train_tax_registry_c_1.parquet",
                "train_credit_bureau_b_1.parquet",
                "train_other_1.parquet",
                "train_person_1.parquet",
                "train_deposit_1.parquet",
                "train_debitcard_1.parquet",
            )
        ),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

In [9]:
# Join the base table with all training feature tables loaded above.
df_train = feature_eng(
    data_frame_base=data_store["data_frame_base"],
    depth_0=data_store["depth_0"],
    depth_1=data_store["depth_1"],
    depth_2=data_store["depth_2"],
)

print("train data shape:\t", df_train.shape)

train data shape:	 (1526659, 528)


In [10]:
# Load every test table, mirroring the training layout. Depth-1 and depth-2
# tables are aggregated to one row per case_id while being read.
data_store = {
    "data_frame_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        # applprev is split across several files, hence the glob pattern.
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        *(
            read_file(TEST_DIR / file_name, 1)
            for file_name in (
                "test_tax_registry_a_1.parquet",
                "test_tax_registry_b_1.parquet",
                "test_tax_registry_c_1.parquet",
                "test_credit_bureau_b_1.parquet",
                "test_other_1.parquet",
                "test_person_1.parquet",
                "test_deposit_1.parquet",
                "test_debitcard_1.parquet",
            )
        ),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

In [11]:
# Join the base table with all test feature tables loaded above.
df_test = feature_eng(
    data_frame_base=data_store["data_frame_base"],
    depth_0=data_store["depth_0"],
    depth_1=data_store["depth_1"],
    depth_2=data_store["depth_2"],
)

print("test data shape:\t", df_test.shape)

test data shape:	 (10, 527)


In [12]:
# Inspect the cardinality of every String column in the training frame:
# list the distinct values when there are fewer than 100, otherwise only
# report the count.
for col in df_train.columns:
    is_string = df_train[col].dtype == pl.String
    if col in ("target", "case_id", "WEEK_NUM") or not is_string:
        continue

    frq = df_train[col].n_unique()
    if frq < 100:
        print(df_train[col].unique())
    else:
        print("Too Much Unique values")
    print(frq)

shape: (3,)
Series: 'description_5085714M' [str]
[
	null
	"a55475b1"
	"2fc785b2"
]
3
shape: (6,)
Series: 'education_1103M' [str]
[
	"717ddd49"
	"a55475b1"
	"6b2ae0fa"
	"39a0853f"
	"c8e1a1d0"
	null
]
6
shape: (6,)
Series: 'education_88M' [str]
[
	"a55475b1"
	"c8e1a1d0"
	null
	"a34a13c8"
	"6b2ae0fa"
	"717ddd49"
]
6
shape: (7,)
Series: 'maritalst_385M' [str]
[
	"ecd83604"
	"b6cabe76"
	"a7fcb6e5"
	"38c061ee"
	"a55475b1"
	null
	"3439d993"
]
7
shape: (7,)
Series: 'maritalst_893M' [str]
[
	"1a19667c"
	"46b968c3"
	"e18430ff"
	"ecd83604"
	null
	"977b2a70"
	"a55475b1"
]
7
shape: (4,)
Series: 'requesttype_4525192L' [str]
[
	null
	"SOCIAL_6"
	"DEDUCTION_6"
	"PENSION_6"
]
4
shape: (17,)
Series: 'riskassesment_302T' [str]
[
	"6% - 8%"
	"20% - 25%"
	"59% - 66%"
	"4% - 6%"
	"41% - 49%"
	"2% - 3%"
	"15% - 19%"
	"2% - 2%"
	"67% - 100%"
	null
	"26% - 33%"
	"8% - 11%"
	"33% - 41%"
	"11% - 15%"
	"1% - 1%"
	"3% - 4%"
	"50% - 58%"
]
17
shape: (2,)
Series: 'bankacctype_710L' [str]
[
	null
	"CA"
]
2
shape: (3,

In [13]:
# Per-column summary statistics (count, null_count, mean, std, min,
# quartiles, max) of the joined training frame.
df_train.describe()

describe,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,…,min_relationshiptoclient_415T,min_relationshiptoclient_642T,min_remitter_829L,min_role_1084L,min_role_993L,min_safeguarantyflag_411L,min_sex_738L,min_type_25L,min_num_group1_8,min_amount_416A,max_contractenddate_991D,max_openingdate_313D,max_num_group1_9,max_amount_416A,min_contractenddate_991D,min_openingdate_313D,min_num_group1_9,min_last180dayaveragebalance_704A,min_last180dayturnover_1134A,min_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,min_openingdate_857D,min_num_group1_10,min_pmts_dpdvalue_108P,min_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,min_pmts_date_1107D,min_num_group1_11,min_num_group2
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",1526659.0,1526659.0,1526659.0,1526659.0,1526659.0,136996.0,114978.0,71633.0,607871.0,157329.0,1385691.0,36500.0,1385691.0,1385691.0,1385691.0,1385691.0,1385691.0,"""1500476""","""1500476""","""1500476""",1385691.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,36514.0,…,"""591033""","""591033""","""591033""","""1526659""","""12458""","""1526657""","""1526659""","""1526659""",1526659.0,105111.0,59619.0,105111.0,105111.0,105111.0,59619.0,105111.0,105111.0,11289.0,10272.0,10272.0,105087.0,111772.0,11289.0,10272.0,10272.0,105087.0,111772.0,36415.0,36415.0,36447.0,36447.0,36447.0,36415.0,36415.0,36447.0,36447.0,36447.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,1389663.0,1411681.0,1455026.0,918788.0,1369330.0,140968.0,1490159.0,140968.0,140968.0,140968.0,140968.0,140968.0,"""26183""","""26183""","""26183""",140968.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,1490145.0,…,"""935626""","""935626""","""935626""","""0""","""1514201""","""2""","""0""","""0""",0.0,1421548.0,1467040.0,1421548.0,1421548.0,1421548.0,1467040.0,1421548.0,1421548.0,1515370.0,1516387.0,1516387.0,1421572.0,1414887.0,1515370.0,1516387.0,1516387.0,1421572.0,1414887.0,1490244.0,1490244.0,1490212.0,1490212.0,1490212.0,1490244.0,1490244.0,1490212.0,1490212.0,1490212.0
"""mean""",1286100.0,40.769036,0.031437,6.425584,3.984014,-2368.489496,13.945155,-2837.152974,-16065.771127,641604.417651,-16282.376476,-15427.647562,1.607715,2.388656,0.517708,4.777066,1.21142,,,,2.86059,8.2e-05,4.382346,0.007696,0.0,0.000356,0.236759,0.000959,0.0,0.613929,0.036945,5.5e-05,0.09013,0.0,5.5e-05,0.002081,…,,,,,,,,,0.0,5961.107354,-616.17793,-1537.268421,0.380312,11053.429841,-638.582851,-1656.916907,0.0,105.089419,38249.815879,4842.416497,-1541.16728,0.407347,118.227329,40340.876365,5345.528562,-1660.075309,0.0,4514.094906,3.234041,-11.851181,1.246961,22.241419,63313.634217,36.426704,-692.897111,5.5e-05,0.0
"""std""",718946.592285,23.797981,0.174496,3.512803,1.98848,2316.350761,0.410493,1962.589538,5108.123925,980327.297131,5054.532403,4824.322847,2.083003,2.891115,0.899238,5.168856,1.655931,,,,3.610966,0.009064,5.815514,0.091071,0.0,0.018866,0.535493,0.033496,0.0,1.150001,0.189209,0.007401,0.291302,0.0,0.007401,0.049049,…,,,,,,,,,0.0,71454.327223,329.503471,451.406331,0.9579,99455.728421,347.294484,446.452488,0.0,724.018394,40149.934998,19018.025916,448.678936,0.986336,987.545689,42106.663966,19907.9379,443.9568,0.0,60645.593661,33.086764,18.043296,1.322809,12.829507,1267200.0,1547.789956,389.777527,0.010476,0.0
"""min""",0.0,0.0,0.0,1.0,1.0,-16454.0,7.0,-13040.0,-27774.0,0.0,-44073.0,-38192.0,0.0,0.0,0.0,0.0,0.0,"""2fc785b2""","""39a0853f""","""6b2ae0fa""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,"""CHILD""","""CHILD""","""False""","""CL""","""FULL""","""False""","""F""","""ALTERNATIVE_PH…",0.0,-40000.0,-6154.0,-6376.0,0.0,-33779.152,-6453.0,-6634.0,0.0,-308.79413,-187780.0,-477.506,-6376.0,0.0,-308.79413,-187780.0,-477.506,-6634.0,0.0,0.0,0.0,-1003.0,0.0,0.0,0.0,0.0,-1118.0,0.0,0.0
"""25%""",766198.0,23.0,0.0,3.0,2.0,-3409.0,14.0,-3971.0,-20251.0,78531.95,-20417.0,-19021.0,0.0,0.0,0.0,1.0,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,0.0,0.0,-781.0,-1887.0,0.0,0.0,-800.0,-1991.0,0.0,0.0,8000.0,0.0,-1887.0,0.0,0.0,10000.0,0.0,-1991.0,0.0,0.0,0.0,-19.0,0.0,10.0,0.0,0.0,-1092.0,0.0,0.0
"""50%""",1357358.0,40.0,0.0,7.0,4.0,-1850.0,14.0,-2489.0,-15434.0,307282.4,-15630.0,-14731.0,1.0,2.0,0.0,3.0,1.0,,,,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,0.0,213.848,-602.0,-1535.0,0.0,273.548,-620.0,-1706.0,0.0,0.0,29896.0,0.0,-1539.0,0.0,0.0,30000.0,0.0,-1708.0,0.0,0.0,0.0,-11.0,1.0,24.0,0.0,0.0,-750.0,0.0,0.0
"""75%""",1739023.0,55.0,0.0,9.0,6.0,-610.0,14.0,-1344.0,-11720.0,802114.08,-12033.0,-11478.0,2.0,3.0,1.0,7.0,2.0,,,,4.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,…,,,,,,,,,0.0,401.402,-422.0,-1186.0,0.0,852.1481,-442.0,-1331.0,0.0,0.936,59994.0,0.0,-1195.0,1.0,1.6620001,60000.0,0.0,-1339.0,0.0,0.0,0.0,-3.0,2.0,35.0,21058.0,2.6000001,-315.0,0.0,0.0
"""max""",2703454.0,91.0,1.0,12.0,7.0,14.0,14.0,4.0,-7656.0,31297000.0,-58.0,-1837.0,109.0,110.0,22.0,115.0,41.0,"""a55475b1""","""c8e1a1d0""","""c8e1a1d0""",76.0,1.0,57.0,2.0,0.0,1.0,11.0,2.0,0.0,19.0,2.0,1.0,4.0,0.0,1.0,2.0,…,"""SPOUSE""","""SPOUSE""","""False""","""CL""","""FULL""","""True""","""M""","""PRIMARY_MOBILE…",0.0,12213286.0,424.0,-524.0,64.0,12213286.0,424.0,-524.0,0.0,32115.504,900000.0,390000.0,-524.0,65.0,67777.77,1161820.0,390000.0,-524.0,0.0,4113920.0,929.60004,14.0,20.0,36.0,185124192.0,147470.61,10.0,2.0,0.0


In [14]:
# Number of null values in each column of the joined training frame.
df_train.null_count()

case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,…,min_relationshiptoclient_415T,min_relationshiptoclient_642T,min_remitter_829L,min_role_1084L,min_role_993L,min_safeguarantyflag_411L,min_sex_738L,min_type_25L,min_num_group1_8,min_amount_416A,max_contractenddate_991D,max_openingdate_313D,max_num_group1_9,max_amount_416A,min_contractenddate_991D,min_openingdate_313D,min_num_group1_9,min_last180dayaveragebalance_704A,min_last180dayturnover_1134A,min_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,min_openingdate_857D,min_num_group1_10,min_pmts_dpdvalue_108P,min_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,min_pmts_date_1107D,min_num_group1_11,min_num_group2
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,…,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,1389663,1411681,1455026,918788,1369330,140968,1490159,140968,140968,140968,140968,140968,26183,26183,26183,140968,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,1490145,…,935626,935626,935626,0,1514201,2,0,0,0,1421548,1467040,1421548,1421548,1421548,1467040,1421548,1421548,1515370,1516387,1516387,1421572,1414887,1515370,1516387,1516387,1421572,1414887,1490244,1490244,1490212,1490212,1490212,1490244,1490244,1490212,1490212,1490212
