In [1]:
import gc
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import warnings

from glob import glob
from IPython.display import display
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from typing import Any

warnings.filterwarnings('ignore')

ROOT      = Path('/kaggle/input/home-credit-credit-risk-model-stability')
TRAIN_DIR = ROOT / 'parquet_files' / 'train'
TEST_DIR  = ROOT / 'parquet_files' / 'test'

In [2]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str):
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv')

        filtered_feats: pl.DataFrame = feat_defs.filter(pl.col('Variable').apply(lambda var: var.endswith(ending_with)))

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

     
    @staticmethod
    def find_index(lst: list, item: Any) -> int | None:
        try:
            return lst.index(item)
        except ValueError:
            return None

    
    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        dtype_map = {
            pl.Decimal: 'Decimal',

            pl.Float32: 'Float32',
            pl.Float64: 'Float64',

            pl.UInt8: 'UInt8',
            pl.UInt16: 'UInt16',
            pl.UInt32: 'UInt32',
            pl.UInt64: 'UInt64',

            pl.Int8: 'Int8',
            pl.Int16: 'Int16',
            pl.Int32: 'Int32',
            pl.Int64: 'Int64',

            pl.Date: 'Date',
            pl.Datetime: 'Datetime',
            pl.Duration: 'Duration',
            pl.Time: 'Time',

            pl.Array: 'Array',
            pl.List: 'List',
            pl.Struct: 'Struct',

            pl.String: 'String',
            pl.Categorical: 'Categorical',
            pl.Enum: 'Enum',
            pl.Utf8: 'Utf8',

            pl.Binary: 'Binary',
            pl.Boolean: 'Boolean',
            pl.Null: 'Null',
            pl.Object: 'Object',
            pl.Unknown: 'Unknown'
        }

        return dtype_map.get(dtype)

    
    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv').filter(pl.col('Variable').apply(lambda var: var.endswith(ending_with)))
        feat_defs.sort(by=['Variable'])

        feats: list = feat_defs['Variable'].to_list()
        feats.sort()

        occurrences: list = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for (feat, dtype) in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias('Data_Type(s)'))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias('File_Loc(s)'))

        return feat_defs
    
    
    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        print(f'Memory usage of dataframe \'{name}\' is {round(df.estimated_size("mb"), 2)} MB.')

        int_types = [pl.Int8, pl.Int16, pl.Int32, pl.Int64]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if (col_type in int_types + float_types):
                c_min = df[col].min()
                c_max = df[col].max()

                if col_type in int_types:
                    if c_min is not None and c_max is not None:
                        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                            df = df.with_columns(df[col].cast(pl.Int8))
                        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                            df = df.with_columns(df[col].cast(pl.Int16))
                        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                            df = df.with_columns(df[col].cast(pl.Int32))
                        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                            df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in float_types:
                    if c_min is not None and c_max is not None:
                        if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                            df = df.with_columns(df[col].cast(pl.Float32))
                else:
                    pass

        print(f'Memory usage of dataframe \'{name}\' became {round(df.estimated_size("mb"), 2)} MB.')

        return df


    def to_pandas(df: pl.DataFrame, cat_cols:list[str]=None) -> (pd.DataFrame, list[str]):
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes('object').columns)

        df[cat_cols] = df[cat_cols].astype('category')

        return df, cat_cols

In [3]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        for col in df.columns:
            if col in ['case_id', 'WEEK_NUM', 'num_group1', 'num_group2']:
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col == 'date_decision':
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ['P', 'A']:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] == 'D':
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ('M',):
                    df = df.with_columns(pl.col(col).cast(pl.String));
        return df


    @staticmethod
    def aggregate(df:pl.LazyFrame, depth:int=None) -> pl.LazyFrame:
        if depth in (1, 2):
            aggs: list = [];
            cols: list[str] = df.columns;

            for col in cols:
                if col[-1] in ('P', 'M', 'A', 'D', 'T', 'L') or 'num_group' in col:
                    for method in (pl.max, pl.min, pl.first, pl.last):
                        aggs.append(method(col).alias(f'{method.__name__}_{col}'))

                if col.endswith(('P', 'A', 'D')):
                    aggs.append(pl.col(col).mean().alias(f'mean_{col}'))

                if col.endswith('M') and df[col].dtype != pl.Null:
                    aggs.append(pl.col(col).drop_nulls().mode().first().alias(f'mode_{col}'))

            return df.group_by('case_id').agg(aggs)
        else:
            return df


    @staticmethod
    def scan_files(glob_path:str, depth:int=None) -> pl.LazyFrame:
        chunks: list[pl.LazyFrame] = []
        for path in glob(str(glob_path)):
            df: pl.LazyFrame = pl.scan_parquet(path, low_memory=True).pipe(SchemaGen.change_dtypes)

            if depth in [1, 2]:
                df = SchemaGen.aggregate(df)

            chunks.append(df)

        df = pl.concat(chunks, how='vertical_relaxed')
        df = df.unique(subset=['case_id'])
    
        return df
    
    
    @staticmethod
    def join_dataframes(df_base: pl.LazyFrame, depth_0: list[pl.LazyFrame], depth_1: list[pl.LazyFrame], depth_2: list[pl.LazyFrame]) -> pl.DataFrame:
        for (i, df) in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how='left', on='case_id', suffix=f'_{i}')

        return df_base.collect()
    
    

In [4]:
def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col.endswith('D'):
            df = df.with_columns(pl.col(col) - pl.col('date_decision'))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

    df = df.with_columns([pl.col('date_decision').dt.year().alias('year').cast(pl.Int16), pl.col('date_decision').dt.month().alias('month').cast(pl.UInt8), pl.col('date_decision').dt.weekday().alias('week_num').cast(pl.UInt8)])

    return df.drop('date_decision', 'MONTH', 'WEEK_NUM');


def filter_cols(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if col not in ['case_id', 'year', 'month', 'week_num', 'target']:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.75:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ['case_id', 'year', 'month', 'week_num', 'target']) & (df[col].dtype == pl.String):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df

In [5]:
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'P')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'M')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'A')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'D')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'T')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'L')
# feat_defs: pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv')
# with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
#     print(feat_defs)

In [6]:
data_store: dict = {
    'df_base': SchemaGen.scan_files(TRAIN_DIR / 'train_base.parquet'),
    'depth_0': [
        SchemaGen.scan_files(TRAIN_DIR / 'train_static_cb_0.parquet'),
        SchemaGen.scan_files(TRAIN_DIR / 'train_static_0_*.parquet'),
    ],
    'depth_1': [
        SchemaGen.scan_files(TRAIN_DIR / 'train_applprev_1_*.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_a_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_b_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_c_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_a_1_*.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_b_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_other_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_person_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_deposit_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_debitcard_1.parquet', 1),
    ],
    'depth_2': [
#         SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_a_2_*.parquet', 2),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_b_2.parquet', 2),
    ]
}

df_train: pl.LazyFrame = SchemaGen.join_dataframes(**data_store).pipe(filter_cols).pipe(handle_dates).pipe(Utility.reduce_memory_usage, 'df_train')

del data_store
gc.collect()

print(f'Train data shape: {df_train.shape}')
display(df_train.head(10))

Memory usage of dataframe 'df_train' is 2954.42 MB.
Memory usage of dataframe 'df_train' became 1728.52 MB.
Train data shape: (1526659, 290)


case_id,target,birthdate_574D,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,…,overdueamountmaxdateyear_994T,periodicityofpmts_1102L,periodicityofpmts_837L,purposeofcred_426M,purposeofcred_874M,refreshdate_3813885D,residualamount_856A,subjectrole_182M,subjectrole_93M,totalamount_6A,totalamount_996A,totaldebtoverduevalue_178A,totaldebtoverduevalue_718A,totaloutstanddebtvalue_39A,totaloutstanddebtvalue_668A,birth_259D,contaddr_matchlist_1032L,contaddr_smempladdr_334L,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,familystate_447L,incometype_1044T,language1_981M,mainoccupationinc_384A,num_group1_9,personindex_1023L,persontype_1072L,persontype_792L,role_1084L,safeguarantyflag_411L,sex_738L,type_25L,year,month,week_num
u32,i8,i16,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,str,i8,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,str,str,i16,f32,str,str,f32,f32,f32,f32,f32,f32,i16,bool,bool,str,i16,str,str,str,str,str,f32,u32,f32,f32,f32,str,bool,str,str,i16,u8,u8
27082,0,-19192.0,-19192.0,2.0,2.0,2.0,2.0,2.0,"""a55475b1""","""c8e1a1d0""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",2.0,6.0,12400.0,,14.0,,0.0,3.0,,,5985.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,,,…,,,30.0,"""96a8fdfe""","""a55475b1""",14.0,,"""a55475b1""","""a55475b1""",,86296.0,,,,,-19192,False,False,"""P97_36_170""",-2773.0,"""MORE_FIVE""","""OTHER""","""MARRIED""","""SALARIED_GOVT""","""P209_127_106""",38000.0,0,0.0,1.0,1.0,"""CL""",True,"""M""","""HOME_PHONE""",2019,8,1
708824,0,-10661.0,-10661.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,4.0,1875.75,,14.0,,0.0,0.0,,,4176.200195,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,,,…,,,,"""60c73645""","""a55475b1""",,1720.258057,"""ab3c25cf""","""a55475b1""",,,0.0,,1720.258057,,-10661,False,False,"""P97_36_170""",-3432.0,"""MORE_FIVE""","""OTHER""","""SINGLE""","""EMPLOYED""","""P209_127_106""",16000.0,0,0.0,1.0,1.0,"""EM""",True,"""M""","""PRIMARY_MOBILE…",2019,6,7
1525928,0,,-9349.0,4.0,5.0,3.0,6.0,4.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,0.0,"""a55475b1""","""a55475b1""",6.0,11.0,8932.0,"""DEDUCTION_6""",14.0,14.0,0.0,4.0,0.0,40499.800781,3366.199951,2094.600098,0.0,2.0,0.0,0.0,0.0,2.0,-6.0,-5.0,-8.0,…,2019.0,30.0,,"""60c73645""","""60c73645""",14.0,3049.0,"""a55475b1""","""a55475b1""",16982.0,,,,,,-9349,False,False,"""a55475b1""",,,,,"""SALARIED_GOVT""","""P10_39_147""",40000.0,0,0.0,1.0,1.0,"""CL""",True,"""M""","""PRIMARY_MOBILE…",2019,9,6
2661245,0,,-18742.0,3.0,5.0,2.0,12.0,2.0,"""a55475b1""","""a55475b1""","""a55475b1""",7.0,8.0,"""a7fcb6e5""","""a55475b1""",12.0,,,"""DEDUCTION_6""",,14.0,7.0,7.0,0.0,102566.210938,1298.0,5301.800293,0.0,0.0,0.0,0.0,0.0,7.0,-1.0,-6.0,-3.0,…,2014.0,30.0,30.0,"""96a8fdfe""","""60c73645""",14.0,,"""a55475b1""","""a55475b1""",10000.0,33000.0,,,,,-18742,False,False,"""a55475b1""",,,,,"""SALARIED_GOVT""","""P10_39_147""",50000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",2020,1,5
1446274,0,-12188.0,-12188.0,3.0,3.0,1.0,11.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",1.0,6.0,"""3439d993""","""a55475b1""",11.0,6.0,4815.800293,,14.0,,1.0,3.0,0.0,0.0,2452.400146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,…,2015.0,,,"""a55475b1""","""60c73645""",-158.0,,"""a55475b1""","""a55475b1""",,,,,,,-12188,False,False,"""a55475b1""",,,,,"""EMPLOYED""","""P209_127_106""",40000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",2019,7,7
650234,0,-10088.0,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,5.0,61836.96875,,14.0,,,,,,1010.400024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,…,,,,,,,,,,,,,,,,-10088,False,False,"""P33_146_175""",-273.0,"""MORE_FIVE""","""OTHER""","""SINGLE""","""PRIVATE_SECTOR…","""P209_127_106""",120000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",2019,3,5
771668,0,-15263.0,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,5.0,8424.200195,,14.0,,,,,,4182.399902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,…,,,,,,,,,,,,,,,,-15263,False,False,"""P97_36_170""",-123.0,"""LESS_ONE""","""REAL_ESTATE""","""SINGLE""","""PRIVATE_SECTOR…","""P10_39_147""",30000.0,0,0.0,1.0,1.0,"""CL""",True,"""M""","""PRIMARY_MOBILE…",2019,8,5
1453702,1,-16212.0,-16212.0,3.0,3.0,1.0,11.0,3.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",3.0,10.0,"""3439d993""","""a55475b1""",11.0,1.0,1040.0,,14.0,,4.0,5.0,0.0,32694.0,1326.200073,3269.400146,0.0,0.0,0.0,0.0,0.0,6.0,-3.0,-1.0,,…,2019.0,30.0,30.0,"""96a8fdfe""","""96a8fdfe""",14.0,,"""a55475b1""","""a55475b1""",17100.0,16000.0,,,,,-16212,False,False,"""a55475b1""",,,,,"""PRIVATE_SECTOR…","""P10_39_147""",36000.0,0,0.0,1.0,1.0,"""CL""",True,"""M""","""PRIMARY_MOBILE…",2019,7,6
1588368,0,,-13685.0,4.0,5.0,1.0,15.0,4.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",11.0,15.0,"""3439d993""","""a55475b1""",15.0,,,"""DEDUCTION_6""",,14.0,9.0,13.0,0.0,60084.414062,2266.600098,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-2.0,-1.0,…,2009.0,30.0,30.0,"""96a8fdfe""","""5065c2b8""",13.0,,"""ab3c25cf""","""ab3c25cf""",56790.601562,340000.0,7881.558105,0.0,503121.40625,0.0,-13685,False,False,"""a55475b1""",,,,,"""EMPLOYED""","""P209_127_106""",50000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",2019,10,6
872527,0,,-9550.0,1.0,2.0,1.0,8.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",9.0,6.0,"""a55475b1""","""a55475b1""",8.0,,,"""DEDUCTION_6""",,14.0,8.0,3.0,,,4660.200195,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,…,2014.0,30.0,30.0,"""96a8fdfe""","""96a8fdfe""",14.0,,"""ab3c25cf""","""ab3c25cf""",26808.800781,160000.0,0.0,0.0,135703.765625,0.0,-9550,False,False,"""P33_146_175""",-1074.0,"""MORE_ONE""","""GOVERNMENT""","""SINGLE""","""SALARIED_GOVT""","""P10_39_147""",94000.0,0,0.0,1.0,1.0,"""EM""",True,"""M""","""PRIMARY_MOBILE…",2019,11,7


In [7]:
data_store: dict = {
    'df_base': SchemaGen.scan_files(TEST_DIR / 'test_base.parquet'),
    'depth_0': [
        SchemaGen.scan_files(TEST_DIR / 'test_static_cb_0.parquet'),
        SchemaGen.scan_files(TEST_DIR / 'test_static_0_*.parquet'),
    ],
    'depth_1': [
        SchemaGen.scan_files(TEST_DIR / 'test_applprev_1_*.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_a_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_b_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_c_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_a_1_*.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_b_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_other_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_person_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_deposit_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_debitcard_1.parquet', 1),
    ],
    'depth_2': [
#         SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_a_2_*.parquet', 2),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_b_2.parquet', 2),
    ]
}

df_test: pl.DataFrame = SchemaGen.join_dataframes(**data_store).pipe(handle_dates).pipe(Utility.reduce_memory_usage, 'df_test').select([col for col in df_train.columns if col != "target"])

del data_store
gc.collect()

print(f'Test data shape: {df_test.shape}')

Memory usage of dataframe 'df_test' is 0.03 MB.
Memory usage of dataframe 'df_test' became 0.02 MB.
Test data shape: (10, 289)


In [8]:
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test, cat_cols)

gc.collect()

0

In [9]:
class VotingModel(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators: list):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [10]:
X = df_train.drop(columns=['target', 'case_id', 'week_num'])
y = df_train['target']

weeks = df_train['week_num']

del df_train
gc.collect()

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 8,
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'colsample_bytree': 0.8, 
    'colsample_bynode': 0.8,
    'verbose': -1,
    'random_state': 42,
    'device': 'gpu',
}

fitted_models = []

for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )

    fitted_models.append(model)

model = VotingModel(fitted_models)

del X, y
gc.collect()



Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.831946
[200]	valid_0's auc: 0.841497
[300]	valid_0's auc: 0.844827
[400]	valid_0's auc: 0.846256
[500]	valid_0's auc: 0.847073
[600]	valid_0's auc: 0.847384
[700]	valid_0's auc: 0.847684
[800]	valid_0's auc: 0.847926
[900]	valid_0's auc: 0.848242
[1000]	valid_0's auc: 0.848523
Did not meet early stopping. Best iteration is:
[995]	valid_0's auc: 0.848535
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.845687
[200]	valid_0's auc: 0.854444
[300]	valid_0's auc: 0.857791
[400]	valid_0's auc: 0.85895
[500]	valid_0's auc: 0.859568
[600]	valid_0's auc: 0.86011
[700]	valid_0's auc: 0.860301
[800]	valid_0's auc: 0.860625
[900]	valid_0's auc: 0.86087
[1000]	valid_0's auc: 0.861076
Did not meet early stopping. Best iteration is:
[994]	valid_0's auc: 0.861077
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.823056
[200]	valid_0's auc: 0.83231


558

In [11]:
X_test: pd.DataFrame = df_test.drop(columns=['week_num']).set_index('case_id')

y_pred: pd.Series = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

df_subm = pd.read_csv(ROOT / 'sample_submission.csv')
df_subm = df_subm.set_index('case_id')

df_subm['score'] = y_pred

with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
    print(df_subm)
    
df_subm.to_csv("submission.csv")

del X_test, y_pred, df_subm
gc.collect()

            score
case_id          
57543    0.005728
57549    0.030889
57551    0.004016
57552    0.009757
57569    0.071078
57630    0.004927
57631    0.015214
57632    0.002807
57633    0.025564
57634    0.008465


4