In [1]:
import gc
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import warnings

from catboost import CatBoostClassifier, Pool
from glob import glob
from IPython.display import display
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from typing import Any

warnings.filterwarnings('ignore')

ROOT      = Path('/kaggle/input/home-credit-credit-risk-model-stability')
TRAIN_DIR = ROOT / 'parquet_files' / 'train'
TEST_DIR  = ROOT / 'parquet_files' / 'test'

In [2]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str):
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv')

        filtered_feats: pl.DataFrame = feat_defs.filter(pl.col('Variable').apply(lambda var: var.endswith(ending_with)))

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

     
    @staticmethod
    def find_index(lst: list, item: Any) -> int | None:
        try:
            return lst.index(item)
        except ValueError:
            return None

    
    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        dtype_map = {
            pl.Decimal: 'Decimal',

            pl.Float32: 'Float32',
            pl.Float64: 'Float64',

            pl.UInt8: 'UInt8',
            pl.UInt16: 'UInt16',
            pl.UInt32: 'UInt32',
            pl.UInt64: 'UInt64',

            pl.Int8: 'Int8',
            pl.Int16: 'Int16',
            pl.Int32: 'Int32',
            pl.Int64: 'Int64',

            pl.Date: 'Date',
            pl.Datetime: 'Datetime',
            pl.Duration: 'Duration',
            pl.Time: 'Time',

            pl.Array: 'Array',
            pl.List: 'List',
            pl.Struct: 'Struct',

            pl.String: 'String',
            pl.Categorical: 'Categorical',
            pl.Enum: 'Enum',
            pl.Utf8: 'Utf8',

            pl.Binary: 'Binary',
            pl.Boolean: 'Boolean',
            pl.Null: 'Null',
            pl.Object: 'Object',
            pl.Unknown: 'Unknown'
        }

        return dtype_map.get(dtype)

    
    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv').filter(pl.col('Variable').apply(lambda var: var.endswith(ending_with)))
        feat_defs.sort(by=['Variable'])

        feats: list = feat_defs['Variable'].to_list()
        feats.sort()

        occurrences: list = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for (feat, dtype) in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias('Data_Type(s)'))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias('File_Loc(s)'))

        return feat_defs
    
    
    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        print(f'Memory usage of dataframe \'{name}\' is {round(df.estimated_size("mb"), 2)} MB.')

        int_types = [pl.Int8, pl.Int16, pl.Int32, pl.Int64]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if (col_type in int_types + float_types):
                c_min = df[col].min()
                c_max = df[col].max()

                if col_type in int_types:
                    if c_min is not None and c_max is not None:
                        if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                            df = df.with_columns(df[col].cast(pl.Int8))
                        elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                            df = df.with_columns(df[col].cast(pl.Int16))
                        elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                            df = df.with_columns(df[col].cast(pl.Int32))
                        elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                            df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in float_types:
                    if c_min is not None and c_max is not None:
                        if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                            df = df.with_columns(df[col].cast(pl.Float32))
                else:
                    pass

        print(f'Memory usage of dataframe \'{name}\' became {round(df.estimated_size("mb"), 2)} MB.')

        return df


    def to_pandas(df: pl.DataFrame, cat_cols:list[str]=None) -> (pd.DataFrame, list[str]):
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes('object').columns)

        df[cat_cols] = df[cat_cols].astype('str')

        return df, cat_cols

In [3]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        for col in df.columns:
            if col in ['case_id', 'WEEK_NUM', 'num_group1', 'num_group2']:
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col == 'date_decision':
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ['P', 'A']:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] == 'D':
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ('M',):
                    df = df.with_columns(pl.col(col).cast(pl.String));
        return df


    @staticmethod
    def aggregate(df:pl.LazyFrame, depth:int=None) -> pl.LazyFrame:
        if depth in (1, 2):
            aggs: list = [];
            cols: list[str] = df.columns;

            for col in cols:
                if col[-1] in ('P', 'M', 'A', 'D', 'T', 'L') or 'num_group' in col:
                    for method in (pl.max, pl.min, pl.first, pl.last):
                        aggs.append(method(col).alias(f'{method.__name__}_{col}'))

                if col.endswith(('P', 'A', 'D')):
                    aggs.append(pl.col(col).mean().alias(f'mean_{col}'))

                if col.endswith('M') and df[col].dtype != pl.Null:
                    aggs.append(pl.col(col).drop_nulls().mode().first().alias(f'mode_{col}'))

            return df.group_by('case_id').agg(aggs)
        else:
            return df


    @staticmethod
    def scan_files(glob_path:str, depth:int=None) -> pl.LazyFrame:
        chunks: list[pl.LazyFrame] = []
        for path in glob(str(glob_path)):
            df: pl.LazyFrame = pl.scan_parquet(path, low_memory=True).pipe(SchemaGen.change_dtypes)

            if depth in [1, 2]:
                df = SchemaGen.aggregate(df)

            chunks.append(df)

        df = pl.concat(chunks, how='vertical_relaxed')
        df = df.unique(subset=['case_id'])
    
        return df
    
    
    @staticmethod
    def join_dataframes(df_base: pl.LazyFrame, depth_0: list[pl.LazyFrame], depth_1: list[pl.LazyFrame], depth_2: list[pl.LazyFrame]) -> pl.DataFrame:
        for (i, df) in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how='left', on='case_id', suffix=f'_{i}')

        return df_base.collect()
    
    

In [4]:
def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col.endswith('D'):
            df = df.with_columns(pl.col(col) - pl.col('date_decision'))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

    df = df.with_columns([pl.col('date_decision').dt.year().alias('year').cast(pl.Int16), pl.col('date_decision').dt.month().alias('month').cast(pl.UInt8), pl.col('date_decision').dt.weekday().alias('week_num').cast(pl.UInt8)])

    return df.drop('date_decision', 'MONTH', 'WEEK_NUM');


def filter_cols(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if col not in ['case_id', 'year', 'month', 'week_num', 'target']:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ['case_id', 'year', 'month', 'week_num', 'target']) & (df[col].dtype == pl.String):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df

In [5]:
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'P')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'M')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'A')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'D')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'T')
# feat_defs: pl.DataFrame = Utility.find_feat_occur(TRAIN_DIR / 'train_*.parquet', 'L')
# feat_defs: pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv')
# with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
#     print(feat_defs)

In [6]:
data_store: dict = {
    'df_base': SchemaGen.scan_files(TRAIN_DIR / 'train_base.parquet'),
    'depth_0': [
        SchemaGen.scan_files(TRAIN_DIR / 'train_static_cb_0.parquet'),
        SchemaGen.scan_files(TRAIN_DIR / 'train_static_0_*.parquet'),
    ],
    'depth_1': [
        SchemaGen.scan_files(TRAIN_DIR / 'train_applprev_1_*.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_a_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_b_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_tax_registry_c_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_a_1_*.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_b_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_other_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_person_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_deposit_1.parquet', 1),
        SchemaGen.scan_files(TRAIN_DIR / 'train_debitcard_1.parquet', 1),
    ],
    'depth_2': [
#         SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_a_2_*.parquet', 2),
        SchemaGen.scan_files(TRAIN_DIR / 'train_credit_bureau_b_2.parquet', 2),
    ]
}

df_train: pl.LazyFrame = SchemaGen.join_dataframes(**data_store).pipe(filter_cols).pipe(handle_dates).pipe(Utility.reduce_memory_usage, 'df_train')

del data_store
gc.collect()

print(f'Train data shape: {df_train.shape}')
display(df_train.head(10))

Memory usage of dataframe 'df_train' is 3278.14 MB.
Memory usage of dataframe 'df_train' became 1900.83 MB.
Train data shape: (1526659, 330)


case_id,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,…,residualamount_856A,subjectrole_182M,subjectrole_93M,totalamount_6A,totalamount_996A,totaldebtoverduevalue_178A,totaldebtoverduevalue_718A,totaloutstanddebtvalue_39A,totaloutstanddebtvalue_668A,birth_259D,contaddr_matchlist_1032L,contaddr_smempladdr_334L,education_927M,empl_employedfrom_271D,empl_employedtotal_800L,empl_industry_691L,familystate_447L,housetype_905L,incometype_1044T,language1_981M,mainoccupationinc_384A,num_group1_9,personindex_1023L,persontype_1072L,persontype_792L,role_1084L,safeguarantyflag_411L,sex_738L,type_25L,amount_416A,num_group1_10,openingdate_313D,num_group1_11,openingdate_857D,year,month,week_num
u32,i8,i16,i8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,i8,i8,f32,f32,f32,f32,f32,f32,f32,…,f32,str,str,f32,f32,f32,f32,f32,f32,i16,bool,bool,str,i16,str,str,str,str,str,str,f32,u32,f32,f32,f32,str,bool,str,str,f32,u32,i16,u32,i16,i16,u8,u8
2545932,0,-1469.0,,-24494.0,,-24494.0,1.0,1.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,1.0,"""a55475b1""","""a55475b1""",2.0,15397.932617,,,6.0,,,,14.0,,,2.0,1.0,0.0,,7381.800293,6446.0,0.0,…,,"""ab3c25cf""","""ab3c25cf""",,23595.201172,0.0,0.0,65600.65625,0.0,-24494,False,False,"""a55475b1""",,,,,,"""RETIRED_PENSIO…","""P209_127_106""",60000.0,0,0.0,1.0,1.0,"""CL""",True,"""M""","""PRIMARY_MOBILE…",,,,,,2019,2,6
2650601,0,,14.0,,,-24574.0,1.0,1.0,1.0,1.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""3439d993""","""a55475b1""",1.0,,7223.399902,6.0,,,,"""PENSION_6""",,14.0,,0.0,0.0,0.0,47107.199219,8899.0,0.0,0.0,…,,"""a55475b1""","""ab3c25cf""",71984.804688,,,0.0,,0.0,-24574,False,False,"""a55475b1""",,,,,,"""RETIRED_PENSIO…","""P209_127_106""",25000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2019,12,5
2675211,0,,,,,-9415.0,2.0,2.0,0.0,4.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",5.0,2.0,"""a7fcb6e5""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,2.0,0.0,15689.400391,6000.0,0.0,0.0,…,,"""a55475b1""","""a55475b1""",,,,,,,-9415,False,False,"""a55475b1""",,,,,,"""PRIVATE_SECTOR…","""P10_39_147""",128000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2020,3,3
1733168,0,,,,,-19514.0,2.0,2.0,0.0,3.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",3.0,4.0,"""3439d993""","""a55475b1""",3.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,2.0,0.0,43201.199219,5002.600098,2790.800049,0.0,…,,"""a55475b1""","""a55475b1""",,,,,,,-19514,False,False,"""a55475b1""",,,,,,"""PRIVATE_SECTOR…","""P209_127_106""",80000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2020,1,6
2533338,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,9849.200195,6723.0,0.0,…,,,,,,,,,,-14930,False,False,"""a55475b1""",,,,,,"""PRIVATE_SECTOR…","""P209_127_106""",120000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2019,1,2
833879,0,,,,,-10638.0,4.0,4.0,1.0,5.0,4.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",3.0,0.0,"""a7fcb6e5""","""a55475b1""",5.0,,,,,,,"""DEDUCTION_6""",,14.0,,3.0,6.0,,,3331.0,0.0,0.0,…,,"""ab3c25cf""","""ab3c25cf""",,1108800.0,0.0,0.0,1316700.0,0.0,-10638,False,False,"""P33_146_175""",-275.0,"""LESS_ONE""","""OTHER""","""SINGLE""",,"""PRIVATE_SECTOR…","""P209_127_106""",90000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2019,10,4
1653978,0,,,,,-12995.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,2.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,0.0,0.0,14943.600586,5935.0,2134.800049,0.0,…,,"""a55475b1""","""a55475b1""",,,,,,,-12995,False,False,"""a55475b1""",,,,,,"""SALARIED_GOVT""","""P209_127_106""",38000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2019,11,5
731826,0,,,-14337.0,,-14337.0,1.0,2.0,0.0,2.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,0.0,"""a55475b1""","""a55475b1""",2.0,,,,,6.0,7000.0,,14.0,,,1.0,0.0,0.0,,3059.800049,0.0,0.0,…,,"""ab3c25cf""","""ab3c25cf""",15004.0,132000.0,0.0,0.0,131034.617188,0.0,-14337,False,False,"""P33_146_175""",-1599.0,"""MORE_FIVE""","""OTHER""","""MARRIED""",,"""PRIVATE_SECTOR…","""P209_127_106""",40000.0,0,0.0,1.0,1.0,"""EM""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2019,7,3
1521876,0,,,,,-15042.0,1.0,1.0,1.0,4.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",6.0,6.0,"""a55475b1""","""a55475b1""",4.0,,,,,3.0,2550.0,"""DEDUCTION_6""",14.0,14.0,,3.0,7.0,0.0,76355.929688,6156.200195,0.0,0.0,…,0.0,"""a55475b1""","""a55475b1""",25010.0,,,,,,-15042,False,False,"""a55475b1""",,,,,,"""PRIVATE_SECTOR…","""P209_127_106""",150000.0,0,0.0,1.0,1.0,"""CL""",True,"""F""","""PRIMARY_MOBILE…",,,,,,2019,9,5
1699528,0,,,,,-22117.0,4.0,5.0,1.0,8.0,2.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",6.0,6.0,"""3439d993""","""a55475b1""",8.0,,,,,,,"""DEDUCTION_6""",,14.0,,9.0,6.0,0.0,0.0,5025.0,0.0,0.0,…,,"""a55475b1""","""a55475b1""",,,,,,,-22117,False,False,"""a55475b1""",,,,,,"""PRIVATE_SECTOR…","""P209_127_106""",64000.0,0,0.0,1.0,1.0,"""CL""",True,"""M""","""PRIMARY_MOBILE…",220.098007,0.0,-1480.0,0.0,-1480.0,2019,12,5


In [7]:
data_store: dict = {
    'df_base': SchemaGen.scan_files(TEST_DIR / 'test_base.parquet'),
    'depth_0': [
        SchemaGen.scan_files(TEST_DIR / 'test_static_cb_0.parquet'),
        SchemaGen.scan_files(TEST_DIR / 'test_static_0_*.parquet'),
    ],
    'depth_1': [
        SchemaGen.scan_files(TEST_DIR / 'test_applprev_1_*.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_a_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_b_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_tax_registry_c_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_a_1_*.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_b_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_other_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_person_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_deposit_1.parquet', 1),
        SchemaGen.scan_files(TEST_DIR / 'test_debitcard_1.parquet', 1),
    ],
    'depth_2': [
#         SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_a_2_*.parquet', 2),
        SchemaGen.scan_files(TEST_DIR / 'test_credit_bureau_b_2.parquet', 2),
    ]
}

df_test: pl.DataFrame = SchemaGen.join_dataframes(**data_store).pipe(handle_dates).pipe(Utility.reduce_memory_usage, 'df_test').select([col for col in df_train.columns if col != "target"])

del data_store
gc.collect()

print(f'Test data shape: {df_test.shape}')

Memory usage of dataframe 'df_test' is 0.03 MB.
Memory usage of dataframe 'df_test' became 0.02 MB.
Test data shape: (10, 329)


In [8]:
df_train, cat_cols = Utility.to_pandas(df_train)
df_test, cat_cols = Utility.to_pandas(df_test, cat_cols)

0

In [9]:
class VotingModel(BaseEstimator, ClassifierMixin):
    def __init__(self, estimators: list):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

In [10]:
df_subm: pd.DataFrame = pd.read_csv(ROOT / 'sample_submission.csv')
df_subm = df_subm.set_index('case_id')

device: str = 'gpu'
est_cnt: int = 6000
    
DRY_RUN = True if df_subm.shape[0] == 10 else False   
if DRY_RUN:
    device = 'cpu'
    df_train = df_train.iloc[:50000]
    est_cnt = 600

print(device)

cpu


In [11]:
X = df_train.drop(columns=['target', 'case_id', 'week_num'])
y = df_train['target']

weeks = df_train['week_num']

del df_train
gc.collect()

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

params1 = {
    'boosting_type': 'gbdt',
    'colsample_bynode': 0.8,
    'colsample_bytree': 0.8,
    'device': device,
    'extra_trees': True,
    'learning_rate': 0.05,
    'max_depth': 10,
    'metric': 'auc',
    'n_estimators': 2000,
    'num_leaves': 64,
    'objective': 'binary',
    'random_state': 42,
    'reg_alpha': 0.1,
    'reg_lambda': 10,
    'verbose': -1,   
}

params2 = {
    'boosting_type': 'gbdt',
    'colsample_bynode': 0.8,
    'colsample_bytree': 0.8,
    'device': device,
    'extra_trees': True,
    'learning_rate': 0.03,
    'max_depth': 8,
    'metric': 'auc',
    'n_estimators': 2000,
    'num_leaves': 50,
    'objective': 'binary',
    'random_state': 42,
    'reg_alpha': 0.1,
    'reg_lambda': 10,
    'verbose': -1,   
}

fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

iter_cnt = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
    
    clf = CatBoostClassifier(
        eval_metric='AUC',
        iterations=est_cnt,
        learning_rate=0.03,
        random_seed=3107,
        task_type='GPU',
    )
    
    clf.fit(train_pool, eval_set=val_pool, verbose=False)
    fitted_models_cat.append(clf)
    
    y_pred_valid = clf.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)

    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")
    
    if iter_cnt % 2 == 0:
        model = lgb.LGBMClassifier(**params1)
    else:
        model = lgb.LGBMClassifier(**params2)
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )
    fitted_models_lgb.append(model)

    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)
    
    iter_cnt += 1
    
model = VotingModel(fitted_models_cat + fitted_models_lgb)

print(f'\nCV AUC scores for CatBoost: {cv_scores_cat}')
print(f'Maximum CV AUC score for Catboost: {max(cv_scores_cat)}', end='\n\n')


print(f'CV AUC scores for LGBM: {cv_scores_lgb}')
print(f'Maximum CV AUC score for LGBM: {max(cv_scores_lgb)}', end='\n\n')

del X, y
gc.collect()

Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.810577
[200]	valid_0's auc: 0.821606
[300]	valid_0's auc: 0.824567
Early stopping, best iteration is:
[293]	valid_0's auc: 0.824837


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.809988
[200]	valid_0's auc: 0.82054
[300]	valid_0's auc: 0.823884
[400]	valid_0's auc: 0.826834
[500]	valid_0's auc: 0.827238
[600]	valid_0's auc: 0.827127
Early stopping, best iteration is:
[557]	valid_0's auc: 0.828032


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.80412
[200]	valid_0's auc: 0.806149
[300]	valid_0's auc: 0.801991
Early stopping, best iteration is:
[226]	valid_0's auc: 0.806226


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.78759
[200]	valid_0's auc: 0.794284
[300]	valid_0's auc: 0.796944
[400]	valid_0's auc: 0.795569
Early stopping, best iteration is:
[312]	valid_0's auc: 0.797151


Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.80706
[200]	valid_0's auc: 0.810662
Early stopping, best iteration is:
[160]	valid_0's auc: 0.811968

CV AUC scores for CatBoost: [0.8003047220429536, 0.8012249111534582, 0.791684424305861, 0.792118953409276, 0.7953564651272387]
Maximum CV AUC score for Catboost: 0.8012249111534582

CV AUC scores for LGBM: [0.8248370394969058, 0.8280316502853596, 0.806225799889306, 0.7971511923124827, 0.8119676321682052]
Maximum CV AUC score for LGBM: 0.8280316502853596



13

In [13]:
X_test: pd.DataFrame = df_test.drop(columns=['week_num']).set_index('case_id')
    
X_test[cat_cols] = X_test[cat_cols].astype("category")

y_pred: pd.Series = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

df_subm['score'] = y_pred

display(df_subm)
    
df_subm.to_csv("submission.csv")

del X_test, y_pred, df_subm
gc.collect()

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.011163
57549,0.039185
57551,0.011642
57552,0.012786
57569,0.18849
57630,0.010784
57631,0.01846
57632,0.007056
57633,0.020572
57634,0.011631


387