In [1]:
import gc
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import warnings

from glob import glob
from IPython.display import display
from pathlib import Path
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from typing import Any

# Suppress all Python warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Competition input locations: the train and test parquet folders live side by side under ROOT.
ROOT = Path('/kaggle/input/home-credit-credit-risk-model-stability')
TRAIN_DIR = ROOT.joinpath('parquet_files', 'train')
TEST_DIR = ROOT.joinpath('parquet_files', 'test')

In [2]:
class DTypeHandler():
    """Static helpers for inspecting feature definitions and normalising column dtypes.

    The class holds no state; it is only a namespace for the functions below.
    """

    @staticmethod
    def get_feat_defs(ending_with: str):
        """Print every row of ``feature_definitions.csv`` whose ``Variable`` ends with ``ending_with``.

        The filtered table is printed in full (no row truncation); nothing is returned.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / 'feature_definitions.csv')

        # Native string expression instead of a per-row Python lambda.
        filtered_feats: pl.DataFrame = feat_defs.filter(pl.col('Variable').str.ends_with(ending_with))

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)


    @staticmethod
    def find_index(lst: list, item: Any) -> int | None:
        """Return the position of the first occurrence of ``item`` in ``lst``, or ``None`` if absent."""
        try:
            return lst.index(item)
        except ValueError:
            return None


    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """Return a short display name for a polars dtype.

        The lookup is done on the dtype's base class so that parametrised
        instances resolve to the same entry as the bare class. A dtype that is
        not in the table falls back to ``str(dtype)``.
        """
        dtype_map = {
            pl.Decimal: 'Decimal',

            pl.Float32: 'Float32',
            pl.Float64: 'Float64',

            pl.UInt8: 'UInt8',
            pl.UInt16: 'UInt16',
            pl.UInt32: 'UInt32',
            pl.UInt64: 'UInt64',

            pl.Int8: 'Int8',
            pl.Int16: 'Int16',
            pl.Int32: 'Int32',
            pl.Int64: 'Int64',

            pl.Date: 'Date',
            pl.Datetime: 'Datetime',
            pl.Duration: 'Duration',
            pl.Time: 'Time',

            pl.Array: 'Array',
            pl.List: 'List',
            pl.Struct: 'Struct',

            pl.String: 'String',
            pl.Categorical: 'Categorical',
            pl.Enum: 'Enum',
            pl.Utf8: 'Utf8',

            pl.Binary: 'Binary',
            pl.Boolean: 'Boolean',
            pl.Null: 'Null',
            pl.Object: 'Object',
            pl.Unknown: 'Unknown'
        }

        base = dtype.base_type() if hasattr(dtype, 'base_type') else dtype

        return dtype_map.get(base, str(dtype))


    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """Report, for each feature with the given suffix, its dtypes and the files it appears in.

        Args:
            regex_path: glob pattern (string or path-like) selecting the parquet files to scan.
            ending_with: suffix a ``Variable`` in ``feature_definitions.csv`` must end with.

        Returns:
            The filtered feature definitions, sorted by ``Variable``, with two extra
            list columns: ``Data_Type(s)`` (dtype names seen for the feature) and
            ``File_Loc(s)`` (stems of the files whose schema contains it). Both lists
            are sorted so the output is stable between runs.
        """
        # sort() returns a new frame; the result must be kept so that row i of the
        # frame and entry i of `feats` refer to the same feature.
        feat_defs: pl.DataFrame = (
            pl.read_csv(ROOT / 'feature_definitions.csv')
            .filter(pl.col('Variable').str.ends_with(ending_with))
            .sort(by=['Variable'])
        )

        feats: list = feat_defs['Variable'].to_list()

        # Name -> row position; setdefault keeps the first position for duplicated names.
        positions: dict = {}
        for i, feat in enumerate(feats):
            positions.setdefault(feat, i)

        # One (dtype names, file stems) pair of sets per feature row.
        occurrences: list = [(set(), set()) for _ in feats]

        for path in glob(str(regex_path)):
            file_stem: str = Path(path).stem

            # Only the parquet schema is read, not the data.
            for (feat, dtype) in pl.read_parquet_schema(path).items():
                index = positions.get(feat)
                if index is not None:
                    occurrences[index][0].add(DTypeHandler.dtype_to_str(dtype))
                    occurrences[index][1].add(file_stem)

        data_types: list = [sorted(types) for (types, _) in occurrences]
        file_locs: list = [sorted(files) for (_, files) in occurrences]

        return feat_defs.with_columns(
            pl.Series('Data_Type(s)', data_types),
            pl.Series('File_Loc(s)', file_locs),
        )

    @staticmethod
    def change_dtypes(df: pl.DataFrame) -> pl.DataFrame:
        """Cast columns to the dtype implied by their name.

        ``read_file`` / ``read_files`` pipe the result of ``pl.scan_parquet``
        through this function, so it is also applied to lazy frames.

        Rules, checked in order:
            * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> UInt32
            * ``date_decision`` -> Date
            * names ending in ``P`` or ``A`` -> Float64
            * names ending in ``D`` -> Date
            * names ending in ``M`` -> String

        Columns matching none of the rules are left unchanged.
        """
        casts: list = []

        for col in df.columns:
            if col in ('case_id', 'WEEK_NUM', 'num_group1', 'num_group2'):
                target = pl.UInt32
            elif col == 'date_decision':
                target = pl.Date
            # Predictors belonging to 'P - Transform DPD (Days past due)' and 'A - Transform amount' must be floats.
            elif col.endswith(('P', 'A')):
                target = pl.Float64
            # Predictors belonging to 'D - Transform date' are dates.
            elif col.endswith('D'):
                target = pl.Date
            elif col.endswith('M'):
                target = pl.String
            else:
                continue

            casts.append(pl.col(col).cast(target))

        # Apply every cast in a single projection instead of one with_columns per column.
        return df.with_columns(casts) if casts else df

In [3]:
# Tabulate, for one feature group, where each feature occurs in the training parquet files.
# The group is selected by the variable-name suffix: 'T' here; 'P', 'M', 'A', 'D' and 'L'
# select the other groups. pl.read_csv(ROOT / 'feature_definitions.csv') gives the
# unfiltered definitions instead.
feat_defs: pl.DataFrame = DTypeHandler.find_feat_occur(
    regex_path=TRAIN_DIR / 'train_*.parquet',
    ending_with='T',
)

# Widen the polars table formatting so long descriptions and file lists are not truncated.
with pl.Config(fmt_str_lengths=1000, tbl_rows=-1, tbl_width_chars=180):
    print(feat_defs)

shape: (22, 4)
┌────────────────────────────────┬─────────────────────────────────────────────────────────────────┬──────────────┬────────────────────────────────────────────────────────────────┐
│ Variable                       ┆ Description                                                     ┆ Data_Type(s) ┆ File_Loc(s)                                                    │
│ ---                            ┆ ---                                                             ┆ ---          ┆ ---                                                            │
│ str                            ┆ str                                                             ┆ list[str]    ┆ list[str]                                                      │
╞════════════════════════════════╪═════════════════════════════════════════════════════════════════╪══════════════╪════════════════════════════════════════════════════════════════╡
│ dpdmaxdatemonth_442T           ┆ Max DPD occurrence month for terminated contr

In [4]:
def handle_dates(df: pl.LazyFrame) -> pl.LazyFrame:
    """Turn date columns into day offsets and expand ``date_decision`` into calendar parts.

    Every column whose name ends with ``D`` is replaced by its difference from
    ``date_decision`` in whole days (Int32). Three columns are then derived from
    ``date_decision``: ``year`` (Int16), ``month`` (UInt8) and ``week_num`` (UInt8).

    Note that ``week_num`` is filled from ``dt.weekday()``, i.e. it holds the day
    of the week; the original ``WEEK_NUM`` column is dropped together with
    ``date_decision`` and ``MONTH``.
    """
    decision = pl.col('date_decision')

    day_offsets = [
        (pl.col(name) - decision).dt.total_days().cast(pl.Int32)
        for name in df.columns
        if name.endswith('D')
    ]
    if day_offsets:
        df = df.with_columns(day_offsets)

    calendar_parts = [
        decision.dt.year().alias('year').cast(pl.Int16),
        decision.dt.month().alias('month').cast(pl.UInt8),
        decision.dt.weekday().alias('week_num').cast(pl.UInt8),
    ]
    df = df.with_columns(calendar_parts)

    return df.drop('date_decision', 'MONTH', 'WEEK_NUM')


def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """Drop columns that are almost entirely null or have unhelpful string cardinality.

    The function indexes columns with ``df[col]``, so it needs an eager
    ``pl.DataFrame`` (the notebook calls it on the collected training frame).

    Two passes are made, and ``case_id``, ``year``, ``month``, ``week_num`` and
    ``target`` are never dropped:

    1. Columns with more than 95% nulls are removed.
    2. Among the remaining String columns, those with a single distinct value or
       more than 200 distinct values are removed.

    An empty frame is returned from the first pass unchanged instead of failing
    on an undefined null ratio.
    """
    protected = {'case_id', 'year', 'month', 'week_num', 'target'}

    # Pass 1: null ratio. Skipped for an empty frame, where the ratio is undefined.
    if df.height > 0:
        mostly_null = [
            col for col in df.columns
            if col not in protected and df[col].null_count() / df.height > 0.95
        ]
        if mostly_null:
            df = df.drop(mostly_null)

    # Pass 2: cardinality of the surviving string columns.
    bad_cardinality = []
    for col in df.columns:
        if col in protected or df[col].dtype != pl.String:
            continue

        freq = df[col].n_unique()
        if freq > 200 or freq == 1:
            bad_cardinality.append(col)

    if bad_cardinality:
        df = df.drop(bad_cardinality)

    return df


def aggregate(df: pl.LazyFrame) -> pl.LazyFrame:
    """Collapse a multi-row-per-case table to one row per ``case_id``.

    For each column, in column order:
        * names ending in P, M, A, D, T or L, or containing ``num_group``, get
          ``max_``, ``min_``, ``first_`` and ``last_`` aggregates;
        * names ending in P, A or D additionally get ``mean_``;
        * names ending in M additionally get ``mode_`` (first mode of the non-null values).
    """
    reducers = (pl.max, pl.min, pl.first, pl.last)
    feature_suffixes = ('P', 'M', 'A', 'D', 'T', 'L')

    exprs: list = []

    for name in df.columns:
        if name[-1] in feature_suffixes or 'num_group' in name:
            # The output prefix is the reducer's own function name.
            exprs.extend(fn(name).alias(f'{fn.__name__}_{name}') for fn in reducers)

        if name.endswith(('P', 'A', 'D')):
            exprs.append(pl.col(name).mean().alias(f'mean_{name}'))

        if name.endswith('M'):
            exprs.append(pl.col(name).drop_nulls().mode().first().alias(f'mode_{name}'))

    grouped = df.group_by('case_id')
    return grouped.agg(exprs)


def read_file(path: str, depth: int=None) -> pl.LazyFrame:
    """Lazily scan one parquet file, normalise its dtypes and optionally aggregate it.

    Args:
        path: location of the parquet file.
        depth: when 1 or 2 the table is reduced to one row per ``case_id`` with
            ``aggregate``; any other value leaves the rows as they are.
    """
    scanned = pl.scan_parquet(path, low_memory=True)
    typed = DTypeHandler.change_dtypes(scanned)

    return aggregate(typed) if depth in (1, 2) else typed


def read_files(regex_path: str, depth=None) -> pl.LazyFrame:
    """Lazily read every parquet file matching a glob pattern as one table.

    Each matching file goes through ``read_file`` (dtype normalisation plus
    optional aggregation for depth 1 or 2). The pieces are then stacked with
    ``how='vertical_relaxed'`` and reduced to one row per ``case_id``.

    Args:
        regex_path: glob pattern (string or path-like) selecting the files.
        depth: forwarded to ``read_file``.

    Raises:
        ValueError: if the pattern matches no file.
    """
    # glob returns matches in arbitrary order; sort so the concat order is reproducible.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise ValueError(f'No parquet files match pattern: {regex_path}')

    chunks: list[pl.LazyFrame] = [read_file(path, depth) for path in paths]

    df = pl.concat(chunks, how='vertical_relaxed')
    # NOTE(review): presumably each case_id lives in a single shard, so this only
    # guards against duplicates. If a case_id can span shards, one of its rows is
    # kept and the others are discarded. Confirm against the data.
    df = df.unique(subset=['case_id'])

    return df


def join_dataframes(df_base: pl.LazyFrame, depth_0: list[pl.LazyFrame], depth_1: list[pl.LazyFrame], depth_2: list[pl.LazyFrame]) -> pl.DataFrame:
    """Left-join every feature table onto the base table and materialise the result.

    Tables are joined on ``case_id`` in the order depth_0, depth_1, depth_2.
    Clashing column names from the i-th joined table receive the suffix ``_i``.
    Date handling (``handle_dates``) is applied to the joined lazy frame before
    it is collected.
    """
    joined = df_base

    for position, table in enumerate([*depth_0, *depth_1, *depth_2]):
        joined = joined.join(table, on='case_id', how='left', suffix=f'_{position}')

    return handle_dates(joined).collect()


def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
    """Downcast numeric columns to the narrowest dtype that holds their values.

    Signed integer columns (Int8/16/32/64) are narrowed to the smallest signed
    integer type whose range contains the column's min and max, boundary values
    included. A column is never widened. Float64 columns whose min and max lie
    strictly inside the float32 range are cast to Float32. Columns that are
    entirely null, and all other dtypes, are left untouched.

    Args:
        df: frame to shrink.
        name: label used in the printed before/after memory report.

    Returns:
        The frame with the downcast columns.
    """
    print(f'Memory usage of dataframe \'{name}\' is {round(df.estimated_size("mb"), 2)} MB.')

    # Signed integer candidates, narrowest first, with the NumPy type that supplies the bounds.
    int_ladder = [
        (pl.Int8, np.int8),
        (pl.Int16, np.int16),
        (pl.Int32, np.int32),
        (pl.Int64, np.int64),
    ]
    int_types = [pl_type for (pl_type, _) in int_ladder]
    float32_limits = np.finfo(np.float32)

    casts = []

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        # min/max are only computed for the numeric columns that can be narrowed.
        if col_type in int_types:
            c_min, c_max = series.min(), series.max()
            if c_min is None or c_max is None:
                continue

            for (target, np_type) in int_ladder:
                # Reaching the column's own dtype means nothing narrower fits.
                if target == col_type:
                    break

                bounds = np.iinfo(np_type)
                if bounds.min <= c_min and c_max <= bounds.max:
                    casts.append(pl.col(col).cast(target))
                    break
        elif col_type == pl.Float64:
            c_min, c_max = series.min(), series.max()
            if c_min is None or c_max is None:
                continue

            if c_min > float32_limits.min and c_max < float32_limits.max:
                casts.append(pl.col(col).cast(pl.Float32))

    # Apply all casts in one projection rather than rebuilding the frame per column.
    if casts:
        df = df.with_columns(casts)

    print(f'Memory usage of dataframe \'{name}\' became {round(df.estimated_size("mb"), 2)} MB.')

    return df


def to_pandas(df: pl.DataFrame, cat_cols:list[str]=None) -> tuple[pd.DataFrame, list[str]]:
    """Convert a polars frame to pandas and mark its categorical columns.

    Args:
        df: polars frame to convert.
        cat_cols: columns to cast to the pandas ``category`` dtype. When ``None``,
            every column that arrives in pandas with ``object`` dtype is used.

    Returns:
        A ``(frame, cat_cols)`` pair: the pandas frame and the list of columns
        that were treated as categorical, so the same list can be reused for
        another frame.
    """
    pdf = df.to_pandas()

    if cat_cols is None:
        cat_cols = list(pdf.select_dtypes('object').columns)

    # Nothing to convert when there are no categorical columns.
    if cat_cols:
        pdf[cat_cols] = pdf[cat_cols].astype('category')

    return pdf, cat_cols

In [5]:
# Declare every training table lazily, grouped by depth. The keys match the
# parameters of join_dataframes so the dict can be unpacked straight into it.
# Currently excluded: read_files(TRAIN_DIR / 'train_credit_bureau_a_1_*.parquet', 1)
#                 and read_files(TRAIN_DIR / 'train_credit_bureau_a_2_*.parquet', 2).
data_store: dict = dict(
    df_base=read_file(TRAIN_DIR / 'train_base.parquet'),
    depth_0=[
        read_file(TRAIN_DIR / 'train_static_cb_0.parquet'),
        read_files(TRAIN_DIR / 'train_static_0_*.parquet'),
    ],
    depth_1=[
        read_files(TRAIN_DIR / 'train_applprev_1_*.parquet', 1),
        *(
            read_file(TRAIN_DIR / file_name, 1)
            for file_name in (
                'train_tax_registry_a_1.parquet',
                'train_tax_registry_b_1.parquet',
                'train_tax_registry_c_1.parquet',
                'train_credit_bureau_b_1.parquet',
                'train_other_1.parquet',
                'train_person_1.parquet',
                'train_deposit_1.parquet',
                'train_debitcard_1.parquet',
            )
        ),
    ],
    depth_2=[
        read_file(TRAIN_DIR / 'train_credit_bureau_b_2.parquet', 2),
    ],
)

# Join everything onto the base table, collect, then downcast numeric columns.
df_train: pl.DataFrame = reduce_memory_usage(join_dataframes(**data_store), 'df_train')

# The lazy plans are no longer needed once the frame is materialised.
del data_store
gc.collect()

print(f'Train data shape:\t{df_train.shape}')

Memory usage of dataframe 'df_train' is 8169.01 MB.
Memory usage of dataframe 'df_train' became 5006.72 MB
Train data shape:	(1526659, 927)


In [6]:
# Declare the test tables lazily, grouped by depth. The keys match the
# parameters of join_dataframes so the dict can be unpacked straight into it.
# Currently excluded: read_files(TEST_DIR / 'test_credit_bureau_a_1_*.parquet', 1)
#                 and read_files(TEST_DIR / 'test_credit_bureau_a_2_*.parquet', 2).
data_store: dict = dict(
    df_base=read_file(TEST_DIR / 'test_base.parquet'),
    depth_0=[
        read_file(TEST_DIR / 'test_static_cb_0.parquet'),
        read_files(TEST_DIR / 'test_static_0_*.parquet'),
    ],
    depth_1=[
        read_files(TEST_DIR / 'test_applprev_1_*.parquet', 1),
        *(
            read_file(TEST_DIR / file_name, 1)
            for file_name in (
                'test_tax_registry_a_1.parquet',
                'test_tax_registry_b_1.parquet',
                'test_tax_registry_c_1.parquet',
                'test_credit_bureau_b_1.parquet',
                'test_other_1.parquet',
                'test_person_1.parquet',
                'test_deposit_1.parquet',
                'test_debitcard_1.parquet',
            )
        ),
    ],
    depth_2=[
        read_file(TEST_DIR / 'test_credit_bureau_b_2.parquet', 2),
    ],
)

# Join everything onto the base table and collect (no downcasting for the test frame).
df_test: pl.DataFrame = join_dataframes(**data_store)

# The lazy plans are no longer needed once the frame is materialised.
del data_store
gc.collect()

print(f'Test data shape:\t{df_test.shape}')

Test data shape:	(10, 926)


In [7]:
# Prune uninformative columns using the training frame only, then restrict the
# test frame to the surviving columns (it has no 'target' label).
df_train = filter_cols(df_train)
df_test = df_test.select([name for name in df_train.columns if name != 'target'])

print('train data shape:\t', df_train.shape)
print('test data shape:\t', df_test.shape)

train data shape:	 (1526659, 517)
test data shape:	 (10, 516)


In [8]:
# Convert both frames to pandas. The categorical column list is inferred from
# the training frame and then reused as-is for the test frame.
df_train, cat_cols = to_pandas(df_train)
df_test, cat_cols = to_pandas(df_test, cat_cols=cat_cols)

In [9]:
class VotingModel(BaseEstimator, ClassifierMixin):
    """Ensemble that averages the outputs of a list of estimators.

    The estimators are used as given; ``fit`` does not train anything.
    """

    def __init__(self, estimators: list):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def predict(self, X):
        """Return the element-wise mean of each estimator's ``predict(X)``."""
        per_model = []
        for member in self.estimators:
            per_model.append(member.predict(X))
        return np.mean(per_model, axis=0)

    def predict_proba(self, X):
        """Return the element-wise mean of each estimator's ``predict_proba(X)``."""
        per_model = []
        for member in self.estimators:
            per_model.append(member.predict_proba(X))
        return np.mean(per_model, axis=0)

In [10]:
# LightGBM settings shared by every fold.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': 8,
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'colsample_bytree': 0.8, 
    'colsample_bynode': 0.8,
    'verbose': -1,
    'random_state': 42,
    'device': 'gpu',
}

# Label, grouping key and feature matrix. The identifier and the grouping key
# are kept out of the features.
y = df_train['target']
weeks = df_train['week_num']
X = df_train.drop(columns=['target', 'case_id', 'week_num'])

# NOTE(review): 'week_num' is produced by handle_dates from dt.weekday(), so the
# folds are grouped by day of week rather than by the original WEEK_NUM - confirm
# this is the intended grouping.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

fitted_models = []

for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, X_valid = X.iloc[idx_train], X.iloc[idx_valid]
    y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]

    # One classifier per fold, with progress logged every 100 rounds and early
    # stopping after 100 rounds without improvement on the held-out fold.
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)],
    )

    fitted_models.append(model)

# Final model: the average of the per-fold classifiers.
model = VotingModel(fitted_models)



Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.821995
[200]	valid_0's auc: 0.831746
[300]	valid_0's auc: 0.835394
[400]	valid_0's auc: 0.836584
[500]	valid_0's auc: 0.837041
[600]	valid_0's auc: 0.83745
[700]	valid_0's auc: 0.83777
[800]	valid_0's auc: 0.837812
[900]	valid_0's auc: 0.838043
[1000]	valid_0's auc: 0.838167
Did not meet early stopping. Best iteration is:
[940]	valid_0's auc: 0.838174
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.832973
[200]	valid_0's auc: 0.841958
[300]	valid_0's auc: 0.845271
[400]	valid_0's auc: 0.846458
[500]	valid_0's auc: 0.84713
[600]	valid_0's auc: 0.847432
[700]	valid_0's auc: 0.847785
[800]	valid_0's auc: 0.847981
[900]	valid_0's auc: 0.848129
[1000]	valid_0's auc: 0.848168
Did not meet early stopping. Best iteration is:
[999]	valid_0's auc: 0.848178
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.815651
[200]	valid_0's auc: 0.824562

In [11]:
# Build the test feature matrix, indexed by case_id so predictions stay keyed to their case.
X_test: pd.DataFrame = df_test.drop(columns=['week_num']).set_index('case_id')

# Column 1 of predict_proba is taken as the score.
y_pred: pd.Series = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

df_subm = pd.read_csv(ROOT / 'sample_submission.csv')
df_subm = df_subm.set_index('case_id')

# Assignment aligns on the case_id index; a case_id with no prediction would become NaN.
df_subm['score'] = y_pred

# df_subm is a pandas frame, so polars' pl.Config has no effect on how it prints;
# pandas display options are used to show every row instead.
with pd.option_context('display.max_rows', None, 'display.width', 180):
    print(df_subm)
    
df_subm.to_csv("submission.csv")

            score
case_id          
57543    0.011007
57549    0.041158
57551    0.006355
57552    0.012732
57569    0.094646
57630    0.011707
57631    0.050166
57632    0.025001
57633    0.063634
57634    0.025919
