In [1]:
import gc
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import warnings

from catboost import CatBoostClassifier, Pool
from glob import glob
from IPython.display import display
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from typing import Any

warnings.filterwarnings('ignore')

ROOT      = Path('/kaggle/input/home-credit-credit-risk-model-stability')
TRAIN_DIR = ROOT / 'parquet_files' / 'train'
TEST_DIR  = ROOT / 'parquet_files' / 'test'

In [2]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with:str):
        feat_defs:pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv')

        filtered_feats:pl.DataFrame = feat_defs.filter(pl.col('Variable').apply(lambda var: var.endswith(ending_with)))

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

     
    @staticmethod
    def find_index(lst:list, item:Any) -> int | None:
        try:
            return lst.index(item)
        except ValueError:
            return None

    
    @staticmethod
    def dtype_to_str(dtype:pl.DataType) -> str:
        dtype_map = {
            pl.Decimal: 'Decimal',

            pl.Float32: 'Float32',
            pl.Float64: 'Float64',

            pl.UInt8: 'UInt8',
            pl.UInt16: 'UInt16',
            pl.UInt32: 'UInt32',
            pl.UInt64: 'UInt64',

            pl.Int8: 'Int8',
            pl.Int16: 'Int16',
            pl.Int32: 'Int32',
            pl.Int64: 'Int64',

            pl.Date: 'Date',
            pl.Datetime: 'Datetime',
            pl.Duration: 'Duration',
            pl.Time: 'Time',

            pl.Array: 'Array',
            pl.List: 'List',
            pl.Struct: 'Struct',

            pl.String: 'String',
            pl.Categorical: 'Categorical',
            pl.Enum: 'Enum',
            pl.Utf8: 'Utf8',

            pl.Binary: 'Binary',
            pl.Boolean: 'Boolean',
            pl.Null: 'Null',
            pl.Object: 'Object',
            pl.Unknown: 'Unknown'
        }

        return dtype_map.get(dtype)

    
    @staticmethod
    def find_feat_occur(regex_path:str, ending_with:str) -> pl.DataFrame:
        feat_defs:pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv').filter(pl.col('Variable').apply(lambda var: var.endswith(ending_with)))
        feat_defs.sort(by=['Variable'])

        feats:list = feat_defs['Variable'].to_list()
        feats.sort()

        occurrences:list = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema:dict = pl.read_parquet_schema(path)

            for (feat, dtype) in df_schema.items():
                index:int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types:list[str] = [None] * feat_defs.height
        file_locs:list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias('Data_Type(s)'))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias('File_Loc(s)'))

        return feat_defs
    
    
    def reduce_memory_usage(df:pl.DataFrame, name) -> pl.DataFrame:
        print(f'Memory usage of dataframe \'{name}\' is {round(df.estimated_size("mb"), 2)} MB.')

        int_types = [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if (col_type in int_types + float_types):
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if c_min >= np.iinfo(np.uint8).min and c_max <= np.iinfo(np.uint8).max:
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif c_min >= np.iinfo(np.uint16).min and c_max <= np.iinfo(np.uint16).max:
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif c_min >= np.iinfo(np.uint32).min and c_max <= np.iinfo(np.uint32).max:
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif c_min >= np.iinfo(np.uint64).min and c_max <= np.iinfo(np.uint64).max:
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(f'Memory usage of dataframe \'{name}\' became {round(df.estimated_size("mb"), 4)} MB.')

        return df


    def to_pandas(df:pl.DataFrame, cat_cols:list[str]=None) -> (pd.DataFrame, list[str]):
        df:pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes('object').columns)

        df[cat_cols] = df[cat_cols].astype('str')

        return df, cat_cols

In [3]:
class Aggregator:
    @staticmethod
    def max_expr(df:pl.LazyFrame) -> list[pl.Series]:
        cols:list[str] = [col for col in df.columns if (col[-1] in ('P', 'M', 'A', 'D', 'T', 'L')) or ('num_group' in col)]

        expr_max:list[pl.Series] = [pl.col(col).max().alias(f'max_{col}') for col in cols]

        return expr_max
    
    
    @staticmethod
    def min_expr(df:pl.LazyFrame) -> list[pl.Series]:
        cols:list[str] = [col for col in df.columns if (col[-1] in ('P', 'M', 'A', 'D', 'T', 'L')) or ('num_group' in col)]

        expr_min:list[pl.Series] = [pl.col(col).min().alias(f'min_{col}') for col in cols]

        return expr_min
    
    
    @staticmethod
    def mean_expr(df:pl.LazyFrame) -> list[pl.Series]:
        cols:list[str] = [col for col in df.columns if col.endswith(('P', 'A', 'D'))]

        expr_mean:list[pl.Series] = [pl.col(col).mean().alias(f'mean_{col}') for col in cols]

        return expr_mean
    
    
    @staticmethod
    def var_expr(df:pl.LazyFrame) -> list[pl.Series]:
        cols:list[str] = [col for col in df.columns if col.endswith(('P', 'A', 'D'))]

        expr_mean:list[pl.Series] = [pl.col(col).var().alias(f'var_{col}') for col in cols]

        return expr_mean
    
    
    @staticmethod
    def mode_expr(df:pl.LazyFrame) -> list[pl.Series]:
        cols:list[str] = [col for col in df.columns if col.endswith('M')]

        expr_mode:list[pl.Series] = [pl.col(col).drop_nulls().mode().first().alias(f'mode_{col}') for col in cols]

        return expr_mode

    @staticmethod
    def get_exprs(df:pl.LazyFrame) -> list[pl.Series]:
        exprs = Aggregator.max_expr(df) + \
                Aggregator.mean_expr(df) + \
                Aggregator.var_expr(df)

        return exprs

In [4]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df:pl.LazyFrame) -> pl.LazyFrame:
        for col in df.columns:
            if col == 'case_id':
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ['WEEK_NUM', 'num_group1', 'num_group2']:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == 'date_decision' or col[-1] == 'D':
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ['P', 'A']:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ('M',):
                    df = df.with_columns(pl.col(col).cast(pl.String));
        return df


    @staticmethod
    def scan_files(glob_path: str, depth: int = None) -> pl.LazyFrame:
        chunks: list[pl.LazyFrame] = []
        for path in glob(str(glob_path)):
            df: pl.LazyFrame = pl.scan_parquet(path, low_memory=True, rechunk=True).pipe(SchemaGen.change_dtypes)
            print(f'File {Path(path).stem} loaded into memory.')
            
            if depth in (1, 2):
                exprs: list[pl.Series] = Aggregator.get_exprs(df)
                df = df.group_by('case_id').agg(exprs)

                del exprs
                gc.collect()
                
            chunks.append(df)

        df: pl.LazyFrame = pl.concat(chunks, how='vertical_relaxed')
        
        del chunks
        gc.collect()
                
        df = df.unique(subset=['case_id'])
    
        return df
    
    
    @staticmethod
    def join_dataframes(df_base: pl.LazyFrame, depth_0: list[pl.LazyFrame], depth_1: list[pl.LazyFrame], depth_2: list[pl.LazyFrame]) -> pl.DataFrame:
        for (i, df) in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how='left', on='case_id', suffix=f'_{i}')

        return df_base.collect()

In [5]:
def handle_dates(df:pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col.endswith('D'):
            df = df.with_columns(pl.col(col) - pl.col('date_decision'))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

    df = df.with_columns([pl.col('date_decision').dt.year().alias('year').cast(pl.Int16), pl.col('date_decision').dt.month().alias('month').cast(pl.UInt8), pl.col('date_decision').dt.weekday().alias('week_num').cast(pl.UInt8)])

    return df.drop('date_decision', 'MONTH', 'WEEK_NUM');


def filter_cols(df:pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if col not in ['case_id', 'year', 'month', 'week_num', 'target']:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ['case_id', 'year', 'month', 'week_num', 'target']) & (df[col].dtype == pl.String):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df

In [6]:
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'P')
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'M')
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'A')
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'D')
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'T')
# feat_defs:pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'L')
# feat_defs:pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv')
# with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
#     print(feat_defs)

In [6]:
data_store:dict = {
    'df_base': SchemaGen.scan_files(TRAIN_DIR / 'train_base.parquet'),
    'depth_0': [
        SchemaGen.scan_files(TRAIN_DIR / 'train_static_cb_0.parquet'),
        SchemaGen.scan_files(TRAIN_DIR / 'train_static_0_*.parquet'),
    ],
    'depth_1': [
        SchemaGen.scan_files(TRAIN_DIR / 'train_applprev_1_*.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_a_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_b_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_c_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_a_1_*.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_b_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_other_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_person_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_deposit_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_debitcard_1.parquet', 1),
    ],
    'depth_2': [
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_a_2_*.parquet', 2),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_b_2.parquet', 2),
    ]
}

df_train:pl.LazyFrame = SchemaGen.join_dataframes(**data_store).pipe(filter_cols).pipe(handle_dates).pipe(Utility.reduce_memory_usage, 'df_train')

del data_store
gc.collect()

print(f'Train data shape: {df_train.shape}')
display(df_train.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_6 loaded into memory.
File train_credit_bureau_a_2_1 loaded into memory.
File train_credit_bureau_a_2_0 loaded into memory.
File train_credit_bureau_a_2_7 loade

case_id,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,…,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,month,week_num
u32,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,f32,f32,…,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8,u8
137116,0,,,-19742.0,,-19742.0,1.0,1.0,1.0,3.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",1.0,2.0,"""3439d993""","""a55475b1""",3.0,,,,,5.0,9518.746094,,14.0,,,4.0,1.0,0.0,0.0,6304.600098,0.0,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""c7a5ad39""",1.0,35.0,4.0,0.0,12.0,12.0,3082.89624,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.153846,0.0,79.602364,0.0,0.502024,0.0,243620.796875,0.0,2019,6,4
1017348,0,,,,52920.328125,-23719.0,0.0,0.0,0.0,0.0,0.0,"""2fc785b2""","""717ddd49""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",0.0,,,,,,,,,,14.0,2.0,5.0,0.0,,7080.800293,0.0,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,144300.0,"""c7a5ad39""","""c7a5ad39""",7.0,35.0,17.0,6.0,12.0,12.0,390.389984,22267.220703,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.414634,0.2375,9.521707,1926.164673,7.04878,0.94288,3717.178955,17218864.0,2020,9,2
1484830,0,,,-9110.0,,-9110.0,0.0,0.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",5.0,3.0,"""a55475b1""","""a55475b1""",2.0,,,,,6.0,12091.574219,,14.0,,,0.0,0.0,0.0,6518.800293,4647.399902,0.0,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",0.0,35.0,32.0,0.0,12.0,12.0,2556.656006,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",2.041667,0.0,106.581001,0.0,46.650364,0.0,272341.84375,0.0,2019,8,7
948058,0,,,,,-12691.0,3.0,4.0,0.0,7.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",3.0,3.0,"""a55475b1""","""a55475b1""",7.0,,,,,,,"""DEDUCTION_6""",,14.0,,5.0,2.0,0.0,5155.399902,4682.600098,1419.599976,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,35.0,0.0,2301.0,12.0,12.0,0.0,99928.015625,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,1087.020386,0.0,49348.34375,0.0,852619.125,0.0,1677000000.0,2020,1,3
1442436,0,,,-20129.0,,-20129.0,4.0,6.0,1.0,14.0,3.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",6.0,6.0,"""3439d993""","""a55475b1""",14.0,,,,,12.0,27635.0,,14.0,,,9.0,9.0,0.0,0.0,1492.0,5264.800293,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,35.0,8.0,3.0,12.0,12.0,15076.131836,2972.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.612245,0.078947,1453.762085,78.210526,2.57568,0.236842,12906598.0,232441.6875,2019,7,5
1939027,0,,,,155714.6875,-8396.0,6.0,6.0,2.0,11.0,6.0,"""2fc785b2""","""a55475b1""","""a55475b1""",1.0,7.0,"""a55475b1""","""a55475b1""",11.0,,,,,,,,,,14.0,0.0,6.0,0.0,8679.400391,4090.800049,0.0,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,4.0,6.0,12.0,12.0,7380.0,1694.200073,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.083333,0.538462,153.75,140.184616,0.333333,2.769231,1134675.0,219273.0625,2020,9,6
2648940,0,,14.0,,,-25909.0,2.0,2.0,0.0,8.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",3.0,1.0,"""3439d993""","""a55475b1""",8.0,,9389.200195,6.0,,,,"""PENSION_6""",,14.0,,9.0,2.0,0.0,239404.609375,11492.600586,9505.200195,0.0,…,3442.615967,0.0,-1103.0,3442.615967,-1103.0,0.0,-1103.0,-1103.0,"""a55475b1""","""a55475b1""",0.0,1173942.0,"""c7a5ad39""","""c7a5ad39""",21.0,35.0,0.0,1.0,12.0,12.0,0.0,0.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.006061,0.0,0.0,0.0,0.006061,0.0,0.0,2019,12,1
1460648,0,,,-12077.0,,-12077.0,2.0,3.0,1.0,12.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",7.0,4.0,"""3439d993""","""a55475b1""",12.0,,,,,6.0,28051.287109,,14.0,,,12.0,15.0,0.0,27240.082031,1845.800049,0.0,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",2679212.0,761972.0,"""c7a5ad39""","""c7a5ad39""",15.0,35.0,0.0,20.0,12.0,12.0,0.0,2989.199951,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.236264,0.0,21.985693,0.0,4.037794,0.0,53783.53125,2019,7,4
1835962,0,,,,,-13235.0,0.0,1.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""a55475b1""","""a55475b1""",2.0,,,,,,,,,,14.0,0.0,1.0,0.0,88715.984375,2611.400146,0.0,0.0,…,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,4,1
917326,0,,,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,,,"""DEDUCTION_6""",,14.0,,,,,,2500.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,12,7


In [7]:
data_store:dict = {
    'df_base': SchemaGen.scan_files(TEST_DIR / 'test_base.parquet'),
    'depth_0': [
        SchemaGen.scan_files(TEST_DIR / 'test_static_cb_0.parquet'),
        SchemaGen.scan_files(TEST_DIR / 'test_static_0_*.parquet'),
    ],
    'depth_1': [
        SchemaGen.scan_files(TEST_DIR / 'test_applprev_1_*.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_a_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_b_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_c_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_a_1_*.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_b_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_other_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_person_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_deposit_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_debitcard_1.parquet', 1),
    ],
    'depth_2': [
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_a_2_*.parquet', 2),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_b_2.parquet', 2),
    ]
}

df_test:pl.DataFrame = SchemaGen.join_dataframes(**data_store).pipe(handle_dates).select([col for col in df_train.columns if col != "target"]).pipe(Utility.reduce_memory_usage, 'df_test')

del data_store
gc.collect()

print(f'Test data shape: {df_test.shape}')

File test_base loaded into memory.
File test_static_cb_0 loaded into memory.
File test_static_0_0 loaded into memory.
File test_static_0_2 loaded into memory.
File test_static_0_1 loaded into memory.
File test_applprev_1_2 loaded into memory.
File test_applprev_1_0 loaded into memory.
File test_applprev_1_1 loaded into memory.
File test_tax_registry_a_1 loaded into memory.
File test_tax_registry_b_1 loaded into memory.
File test_tax_registry_c_1 loaded into memory.
File test_credit_bureau_a_1_3 loaded into memory.
File test_credit_bureau_a_1_2 loaded into memory.
File test_credit_bureau_a_1_1 loaded into memory.
File test_credit_bureau_a_1_4 loaded into memory.
File test_credit_bureau_a_1_0 loaded into memory.
File test_credit_bureau_b_1 loaded into memory.
File test_other_1 loaded into memory.
File test_person_1 loaded into memory.
File test_deposit_1 loaded into memory.
File test_debitcard_1 loaded into memory.
File test_credit_bureau_a_2_3 loaded into memory.
File test_credit_bureau

In [8]:
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test, cat_cols)

In [10]:
class VotingModel(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators:list):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [11]:
df_subm:pd.DataFrame = pd.read_csv(ROOT / 'sample_submission.csv')
df_subm = df_subm.set_index('case_id')

device:str = 'gpu'
est_cnt:int = 6000
    
DRY_RUN = True if df_subm.shape[0] == 10 else False   
if DRY_RUN:
    device = 'cpu'
    df_train = df_train.iloc[:50000]
    est_cnt:int = 600

print(device)

cpu


In [12]:
X = df_train.drop(columns=['target', 'case_id', 'week_num'])
y = df_train['target']

weeks = df_train['week_num']

del df_train
gc.collect()

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

params1 = {
    'boosting_type': 'gbdt',
    'colsample_bynode': 0.8,
    'colsample_bytree': 0.8,
    'device': device,
    'extra_trees': True,
    'learning_rate': 0.05,
    'max_depth': 10,
    'metric': 'auc',
    'n_estimators': 2000,
    'num_leaves': 64,
    'objective': 'binary',
    'random_state': 42,
    'reg_alpha': 0.1,
    'reg_lambda': 10,
    'verbose': -1,   
}

params2 = {
    'boosting_type': 'gbdt',
    'colsample_bynode': 0.8,
    'colsample_bytree': 0.8,
    'device': device,
    'extra_trees': True,
    'learning_rate': 0.03,
    'max_depth': 8,
    'metric': 'auc',
    'n_estimators': 2000,
    'num_leaves': 50,
    'objective': 'binary',
    'random_state': 42,
    'reg_alpha': 0.1,
    'reg_lambda': 10,
    'verbose': -1,   
}

fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

iter_cnt = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
    
    clf = CatBoostClassifier(
        eval_metric='AUC',
        iterations=est_cnt,
        learning_rate=0.03,
        random_seed=3107,
        task_type='GPU',
    )
    
    clf.fit(train_pool, eval_set=val_pool, verbose=False)
    fitted_models_cat.append(clf)
    
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)

    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")
    
    if iter_cnt % 2 == 0:
        model = lgb.LGBMClassifier(**params1)
    else:
        model = lgb.LGBMClassifier(**params2)
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )
    fitted_models_lgb.append(model)

    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)
    
    iter_cnt += 1
    
model = VotingModel(fitted_models_cat + fitted_models_lgb)

print(f'\nCV AUC scores for CatBoost: {cv_scores_cat}')
print(f'Maximum CV AUC score for Catboost: {max(cv_scores_cat)}', end='\n\n')


print(f'CV AUC scores for LGBM: {cv_scores_lgb}')
print(f'Maximum CV AUC score for LGBM: {max(cv_scores_lgb)}', end='\n\n')

del X, y
gc.collect()

Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.828504
[200]	valid_0's auc: 0.840693
[300]	valid_0's auc: 0.840622
Early stopping, best iteration is:
[247]	valid_0's auc: 0.842004


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.81438
[200]	valid_0's auc: 0.823554
[300]	valid_0's auc: 0.825703
[400]	valid_0's auc: 0.825317
Early stopping, best iteration is:
[320]	valid_0's auc: 0.826116


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.805997
[200]	valid_0's auc: 0.808867
Early stopping, best iteration is:
[145]	valid_0's auc: 0.810554


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.821018
[200]	valid_0's auc: 0.827478
[300]	valid_0's auc: 0.830018
[400]	valid_0's auc: 0.829375
Early stopping, best iteration is:
[321]	valid_0's auc: 0.830431


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.820625
[200]	valid_0's auc: 0.82567
[300]	valid_0's auc: 0.826713
Early stopping, best iteration is:
[264]	valid_0's auc: 0.827956

CV AUC scores for CatBoost: [0.8174293834708759, 0.8087154145195176, 0.7767810753493966, 0.8173708047020867, 0.7979656495697244]
Maximum CV AUC score for Catboost: 0.8174293834708759

CV AUC scores for LGBM: [0.8420042727029481, 0.8261162531612474, 0.8105541655819789, 0.8304305867068018, 0.8279560865846406]
Maximum CV AUC score for LGBM: 0.8420042727029481



30

In [13]:
X_test:pd.DataFrame = df_test.drop(columns=['week_num']).set_index('case_id')
    
X_test[cat_cols] = X_test[cat_cols].astype("category")

y_pred:pd.Series = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

df_subm['score'] = y_pred

display(df_subm)
    
df_subm.to_csv("submission.csv")

del X_test, y_pred, df_subm
gc.collect()

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.014565
57549,0.029926
57551,0.00548
57552,0.025835
57569,0.088715
57630,0.015527
57631,0.026919
57632,0.010297
57633,0.018118
57634,0.025134


4