In [5]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')

# Dataset root as a plain string; the loading cells below build paths with
# `ROOT + "..."`. The name is rebound to a pathlib.Path later in the notebook.
ROOT = '/kaggle/input/homecredit/'

# Data Preprocessing

In [6]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast columns to a dtype chosen from the column name.

    Key/group columns become Int64, ``date_decision`` and names ending in
    ``D`` become Date, names ending in ``P``/``A`` become Float64 and names
    ending in ``M`` become String. Any other column is left untouched.
    """

    def target_dtype(name):
        # Exact-name rules take precedence over the suffix rules.
        if name in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
            return pl.Int64
        if name == "date_decision":
            return pl.Date
        suffix = name[-1]
        if suffix in ("P", "A"):
            return pl.Float64
        if suffix == "M":
            return pl.String
        if suffix == "D":
            return pl.Date
        return None

    casts = []
    for name in df.columns:
        dtype = target_dtype(name)
        if dtype is not None:
            casts.append(pl.col(name).cast(dtype))

    # Each cast only touches its own column, so applying them together gives
    # the same frame as applying them one at a time.
    return df.with_columns(casts) if casts else df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Turn object/string columns into ordered categoricals with an "Unknown" level.

    Every column whose dtype name is ``object`` or ``string`` is converted to
    an ordered ``category`` whose levels are the observed values plus an
    extra ``"Unknown"`` level. Other columns are not touched.

    The frame is modified in place and also returned.
    """
    for col in df.columns:
        if df[col].dtype.name not in ('object', 'string'):
            continue
        as_category = df[col].astype("string").astype('category')
        categories = as_category.cat.categories.to_list()
        # A column that already contains the literal value "Unknown" would
        # otherwise produce duplicate categories, which CategoricalDtype rejects.
        if "Unknown" not in categories:
            categories.append("Unknown")
        new_dtype = pd.CategoricalDtype(categories=categories, ordered=True)
        df[col] = as_category.astype(new_dtype)
    return df

def type_sorting(df: pl.DataFrame) -> tuple:
    """Split column names into numeric and non-numeric lists.

    A column counts as numeric when its dtype is ``pl.Int64`` or
    ``pl.Float64``; every other column goes to the second list. The
    comparison is against polars dtypes, so the argument is a polars frame
    (the previous ``pd.DataFrame`` annotation was incorrect).

    Returns:
        ``(numeric, categorical)``, two lists of column names in the frame's
        column order.
    """
    numeric_dtypes = (pl.Int64, pl.Float64)
    numeric, categorical = [], []
    for col in df.columns:
        bucket = numeric if df[col].dtype in numeric_dtypes else categorical
        bucket.append(col)
    return numeric, categorical

def drop_col_null(df, threshold=0.2):
    """Drop columns whose share of nulls is at least ``threshold``.

    Args:
        df: frame exposing ``columns``, ``df[col].null_count()``,
            ``df[col].len()`` and ``drop`` (the polars DataFrame API).
        threshold: null fraction at or above which a column is dropped.
            Defaults to 0.2, the previously hard-coded value.

    Returns:
        The result of ``df.drop`` with the offending columns removed. A frame
        with zero rows is returned with all columns kept, since a null
        fraction is undefined there (this used to raise ZeroDivisionError).
    """
    bad = []
    for col in df.columns:
        n_rows = df[col].len()
        if n_rows == 0:
            continue
        if df[col].null_count() / n_rows >= threshold:
            bad.append(col)
    return df.drop(bad)

## Depth == 0

In [8]:
# Base table. It is not passed through set_table_dtypes, so `date_decision`
# stays a string column (see the schema in the preview output below).
train_base = pl.read_parquet(ROOT + "parquet_files/train_base_table.parquet", use_pyarrow=True)

# The static depth-0 table is split across two shards; cast each shard first,
# then stack them, letting polars relax dtypes that differ between shards.
train_static = pl.concat(
    [
        pl.read_parquet(ROOT + shard).pipe(set_table_dtypes)
        for shard in (
            "parquet_files/train/train_static_0_0.parquet",
            "parquet_files/train/train_static_0_1.parquet",
        )
    ],
    how="vertical_relaxed",
)
train_static_cb = pl.read_parquet(ROOT + "parquet_files/train/train_static_cb_0.parquet").pipe(set_table_dtypes)

# Both joins use the default join strategy (inner in polars), so only
# case_ids present in every table remain in depth0.
static = train_static.join(train_static_cb, on="case_id")
depth0 = train_base.join(static, on="case_id")

In [9]:
# Preview the joined depth-0 table (the rendered output below is truncated).
depth0

case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,…,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
i64,str,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,date,date,date,str,f64,f64,f64
357,"""2019-01-11""",201901,1,0,,,7433.4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,6301.4,,2019-01-25,,,,,,
381,"""2019-01-11""",201901,1,0,0.0,,2593.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,,,,,,,,,,,,,,,,,,,"""a55475b1""","""a55475b1""",,,,,,,,6.0,4019.6,,2019-01-25,,,,,,
388,"""2019-01-14""",201901,1,0,,,5109.6,0.0,0.0,0.0,2.0,1.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,0.0,2.0,…,2.0,,,,,,,,,,,,,,,,,6.0,"""a55475b1""","""a55475b1""",10.0,,,,,,,6.0,14548.0,,2019-01-28,,,,,3.0,5.0
405,"""2019-01-08""",201901,1,0,,,10045.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,0.0,,,,,,,,,,,,,,,,,4.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,6.0,10498.24,,2019-01-21,,,,,2.0,0.0
409,"""2019-01-08""",201901,1,0,,,1556.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,"""CA""",,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,0.0,0.0,0.0,…,4.0,,,,,,,,,,,,,,,,,1.0,"""a7fcb6e5""","""a55475b1""",3.0,,,,,,,7.0,6344.8804,,2019-01-21,,,,,0.0,4.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2670550,"""2020-02-24""",202002,59,0,0.0,29812.4,2448.4001,0.0,0.0,0.0,0.0,0.0,0.0,3.0,-4.0,,-7.0,1.0,2710.2,,,,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,3.0,,,,,,,,,,,,,,,,,3.0,"""a7fcb6e5""","""a55475b1""",6.0,,,,,,,,,"""DEDUCTION_6""",,2020-03-09,,,,1.0,2.0
2670551,"""2020-02-24""",202002,59,0,0.0,106641.375,3175.0,1873.2001,0.0,0.0,0.0,0.0,0.0,10.0,-16.0,-9.0,-16.0,0.0,5514.2,38000.0,0.0,52918.746,7919.6,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,6.0,,,,,,,,,,,,,,,,,5.0,"""a55475b1""","""a55475b1""",5.0,,24682.0,,1.0,,,,,"""PENSION_6""",,2020-03-09,,,,6.0,9.0
2670552,"""2020-02-24""",202002,59,0,0.0,71477.74,5983.4,0.0,0.0,0.0,0.0,0.0,0.0,2.0,-1.0,,-1.0,0.0,7147.8003,,0.0,0.0,9543.0,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,0.0,,,,,,,,,,,,,,,,,5.0,"""3439d993""","""a55475b1""",9.0,,,,,,,,,"""DEDUCTION_6""",,2020-03-09,,,,0.0,4.0
2670553,"""2020-02-24""",202002,59,0,0.0,0.0,6596.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,,,,,,,,,"""CA""",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,…,2.0,,,,,,,,,,,,,,,,,1.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,,,"""DEDUCTION_6""",,2020-03-09,,,,1.0,0.0


## Depth == 1

In [None]:
# `dataPath` was used throughout this cell but never defined anywhere in the
# notebook, so the cell raised NameError on a fresh kernel. Build it from the
# string ROOT of the first cell, the same folder the depth-0 cell reads from.
# NOTE: this relies on ROOT still being a string, i.e. on running the notebook
# top to bottom (ROOT is rebound to a Path further down).
dataPath = ROOT + "parquet_files/train/"

# Sharded tables: cast every shard, then stack them.
train_applprev = pl.concat(
    [
        pl.read_parquet(dataPath + f"train_applprev_1_{shard}.parquet", use_pyarrow=True).pipe(set_table_dtypes)
        for shard in range(2)
    ],
    how="vertical_relaxed",
)
train_credit_bureau_a = pl.concat(
    [
        pl.read_parquet(dataPath + f"train_credit_bureau_a_1_{shard}.parquet", use_pyarrow=True).pipe(set_table_dtypes)
        for shard in range(4)
    ],
    how="vertical_relaxed",
)

# Single-file tables.
train_credit_bureau_b = pl.read_parquet(dataPath + "train_credit_bureau_b_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_debitcard = pl.read_parquet(dataPath + "train_debitcard_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_deposit = pl.read_parquet(dataPath + "train_deposit_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_person = pl.read_parquet(dataPath + "train_person_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_tax_registry_a = pl.read_parquet(dataPath + "train_tax_registry_a_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_tax_registry_b = pl.read_parquet(dataPath + "train_tax_registry_b_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_tax_registry_c = pl.read_parquet(dataPath + "train_tax_registry_c_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_other = pl.read_parquet(dataPath + "train_other_1.parquet", use_pyarrow=True).pipe(set_table_dtypes)

## Depth == 2

In [None]:
# `dataPath` was used in this cell but never defined anywhere in the notebook,
# so the cell raised NameError on a fresh kernel. Build it from the string ROOT
# of the first cell, the same folder the depth-0 cell reads from.
# NOTE: this relies on ROOT still being a string, i.e. on running the notebook
# top to bottom (ROOT is rebound to a Path further down).
dataPath = ROOT + "parquet_files/train/"

# credit_bureau_a at depth 2 is split into eleven shards (0..10), stacked in
# numeric shard order.
train_credit_bureau_a_2 = pl.concat(
    [
        pl.read_parquet(dataPath + f"train_credit_bureau_a_2_{shard}.parquet", use_pyarrow=True).pipe(set_table_dtypes)
        for shard in range(11)
    ],
    how="vertical_relaxed",
)
train_credit_bureau_b_2 = pl.read_parquet(dataPath + "train_credit_bureau_b_2.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_person_2 = pl.read_parquet(dataPath + "train_person_2.parquet", use_pyarrow=True).pipe(set_table_dtypes)

train_applprev_2 = pl.read_parquet(dataPath + "train_applprev_2.parquet", use_pyarrow=True).pipe(set_table_dtypes)

# Model Pipeline

In [4]:
# Scratch check: a function defined in a class body without `self` and without
# @staticmethod can still be called through the class, including a subclass.
class A:
    #@staticmethod
    def foo(x):
        return x
class B(A):
    def bar(x):
        return x
B.foo(2)  # looked up on the class, so `foo` is a plain function and x is 2

2

In [7]:
class Pipeline:
    """Namespace of stateless polars transforms used via ``df.pipe(Pipeline.x)``.

    The methods take no ``self``; they are declared as static methods so they
    behave the same whether reached through the class or through an instance.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from the column name.

        Key/group columns become Int64, ``date_decision`` and names ending in
        ``D`` become Date, names ending in ``P``/``A`` become Float64 and
        names ending in ``M`` become String. Other columns are left as is.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col[-1] == "M":
                casts.append(pl.col(col).cast(pl.String))
            elif col[-1] == "D":
                casts.append(pl.col(col).cast(pl.Date))
        # One pass instead of one with_columns call per column; every cast
        # only touches its own column, so the result is the same.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace every ``*D`` column by its day offset from ``date_decision``.

        After the conversion ``date_decision`` and ``MONTH`` are dropped, so
        both columns must be present in ``df``.
        """
        date_cols = [col for col in df.columns if col[-1] == "D"]
        if date_cols:
            df = df.with_columns(
                [
                    (pl.col(col) - pl.col("date_decision")).dt.total_days().alias(col)
                    for col in date_cols
                ]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df, max_null_frac=0.3, max_unique=200):
        """Drop sparse columns and uninformative string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped.

        Args:
            df: polars frame to filter.
            max_null_frac: columns whose null fraction is strictly greater
                than this are dropped (default 0.3, the former constant).
            max_unique: string columns with exactly one distinct value or
                more than this many distinct values are dropped (default 200).
        """
        protected = ("target", "case_id", "WEEK_NUM")

        too_sparse = [
            col
            for col in df.columns
            if col not in protected and df[col].is_null().mean() > max_null_frac
        ]
        df = df.drop(too_sparse)

        uninformative = []
        for col in df.columns:
            if col in protected or df[col].dtype != pl.String:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > max_unique:
                uninformative.append(col)

        return df.drop(uninformative)

In [8]:
class Aggregator:
    """Builds the polars aggregation expressions applied per ``case_id``.

    Columns are routed by name: the last character selects the family
    (``P``/``A``, ``D``, ``M``, ``T``/``L``) and any name containing
    ``num_group`` is handled by ``count_expr``. Every expression is aliased as
    ``<agg>_<column>``.
    """

    @staticmethod
    def _build(cols, aggregations):
        """Return expressions grouped by aggregation: all max, then all last, ..."""
        return [
            func(col).alias(f"{prefix}_{col}")
            for prefix, func in aggregations
            for col in cols
        ]

    @staticmethod
    def num_expr(df):
        """max / last / mean for columns ending in ``P`` or ``A``."""
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last), ("mean", pl.mean)))

    @staticmethod
    def date_expr(df):
        """max / last / mean for columns ending in ``D``."""
        # The original test was `col[-1] in ("D")`, i.e. membership in the
        # string "D" rather than in a one-element tuple; spelled out here.
        cols = [col for col in df.columns if col[-1] == "D"]
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last), ("mean", pl.mean)))

    @staticmethod
    def str_expr(df):
        """max / last for columns ending in ``M``."""
        cols = [col for col in df.columns if col[-1] == "M"]
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last)))

    @staticmethod
    def other_expr(df):
        """max / last for columns ending in ``T`` or ``L``."""
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last)))

    @staticmethod
    def count_expr(df):
        """max / last for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._build(cols, (("max", pl.max), ("last", pl.last)))

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression families in a fixed order."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

def read_file(path, depth=None):
    """Read one parquet file, cast its dtypes and optionally aggregate it.

    For ``depth`` 1 or 2 the table is grouped by ``case_id`` and reduced with
    the expressions from ``Aggregator.get_exprs``; for any other ``depth`` the
    cast table is returned as is.
    """
    table = pl.read_parquet(path).pipe(Pipeline.set_table_dtypes)
    if depth not in (1, 2):
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table))

def read_files(regex_path, depth=None):
    """Read every parquet shard matching a glob pattern and stack the results.

    Args:
        regex_path: glob pattern (despite the parameter name, it is passed to
            ``glob`` and is not a regular expression); str or path-like.
        depth: forwarded to ``read_file``; 1 or 2 aggregates each shard per
            ``case_id`` before stacking.

    Returns:
        One frame with the shards concatenated (``vertical_relaxed``) and
        de-duplicated on ``case_id``.

    Raises:
        FileNotFoundError: if the pattern matches no file. Previously this
            surfaced as an error from concatenating an empty list.
    """
    # Sort so the shard order does not depend on filesystem enumeration order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")

    # Each shard goes through exactly the same steps as a single file.
    chunks = [read_file(path, depth) for path in paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    return df.unique(subset=["case_id"])

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Join all feature tables onto the base table and convert the date columns.

    Adds ``month_decision`` and ``weekday_decision`` from ``date_decision``,
    left-joins every table of ``depth_0``, ``depth_1`` and ``depth_2`` (in
    that order) on ``case_id``, then applies ``Pipeline.handle_dates``.
    Clashing column names from the i-th joined table get the suffix ``_i``.
    """
    decision = pl.col("date_decision")
    merged = df_base.with_columns(
        month_decision=decision.dt.month(),
        weekday_decision=decision.dt.weekday(),
    )

    tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(tables):
        merged = merged.join(table, how="left", on="case_id", suffix=f"_{position}")

    return merged.pipe(Pipeline.handle_dates)

def to_pandas(df_data, cat_cols=None):
    """Convert to pandas and make the categorical columns ``category`` dtype.

    Args:
        df_data: object exposing ``to_pandas()`` (a polars frame here).
        cat_cols: columns to convert; when ``None`` every ``object`` column of
            the converted frame is used.

    Returns:
        ``(frame, cat_cols)``: the pandas frame and the column list used.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols

def reduce_mem_usage(df):
    """Downcast numeric columns to the smallest dtype whose range fits them.

    ``category`` and ``object`` columns are skipped. Columns whose dtype name
    starts with ``int`` try int8/int16/int32/int64; every other column tries
    float16, then float32, and falls back to float64. The range checks are
    strict on both ends. The frame is modified in place, returned, and the
    memory usage before/after is printed.

    NOTE(review): float16 keeps few significant digits, so values are rounded
    when a column lands there. Confirm that is acceptable for the features.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        dtype = df[col].dtype
        dtype_name = str(dtype)
        if dtype_name == "category" or dtype == object:
            continue

        lowest = df[col].min()
        highest = df[col].max()

        if dtype_name.startswith('int'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if lowest > limits.min and highest < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither reduced float width fits (or min/max are NaN).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [9]:
# NOTE: ROOT is rebound here from the string defined in the first cell to a
# pathlib.Path pointing at the parquet folder. Cells that build paths with
# `ROOT + "..."` only work before this cell has run.
ROOT            = Path("/kaggle/input/homecredit/parquet_files")
TRAIN_DIR       = ROOT / "train"
# NOTE(review): TEST_DIR also points at the "train" folder and is not used by
# any cell visible in this notebook. Confirm this is intentional.
TEST_DIR        = ROOT / "train"

In [10]:
%%time
# Load every training table, grouped by depth. The keys match the parameter
# names of feature_eng so the dict can be unpacked into it with **.
# Depth-1 and depth-2 tables are aggregated per case_id while being read
# (second argument of read_file / read_files).
data_store = {
    "df_base": read_file(ROOT / "train_base_table.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}

CPU times: user 4min 34s, sys: 57.7 s, total: 5min 32s
Wall time: 1min 56s


In [11]:
# Join all tables, then free the raw inputs before the pandas conversion.
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
print("train data shape:\t", df_train.shape)

# Group the non-categorical columns by their number of missing values:
# nans_groups maps a NaN count to the list of columns having that count.
nums=df_train.select_dtypes(exclude='category').columns
from itertools import combinations, permutations
nans_df = df_train[nums].isna()
nans_groups={}
for col in nums:
    # setdefault replaces the former try / bare-except, which would also have
    # hidden any unrelated error raised while appending.
    nans_groups.setdefault(nans_df[col].sum(), []).append(col)
del nans_df; x=gc.collect()

def reduce_group(grps, df=None):
    """Pick one representative column per group: the one with most distinct values.

    Args:
        grps: iterable of non-empty lists of column names.
        df: frame to count distinct values in. Defaults to the notebook-level
            ``df_train`` (the previous, implicit behaviour), so existing
            ``reduce_group(grps)`` calls are unaffected.

    Returns:
        List with one column name per group. On ties the earliest column of
        the group wins; a group whose columns all have zero distinct non-null
        values yields its first column.
    """
    if df is None:
        df = df_train

    use = []
    for group in grps:
        best_col, best_count = group[0], 0
        for candidate in group:
            count = df[candidate].nunique()
            if count > best_count:
                best_col, best_count = candidate, count
        use.append(best_col)
    print('Use these',use)
    return use

def group_columns_by_correlation(matrix, threshold=0.8):
    """Greedily partition the columns of ``matrix`` into correlation groups.

    The first unassigned column becomes an anchor; every later unassigned
    column whose correlation with the anchor is ``>= threshold`` joins its
    group. This repeats until no column is left. Only the signed correlation
    is compared, so strongly negative pairs are not grouped.

    Returns:
        List of groups, each a list of column names starting with its anchor.
    """
    corr = matrix.corr()
    pending = list(matrix.columns)
    groups = []
    while pending:
        anchor, rest = pending[0], pending[1:]
        members = [anchor] + [name for name in rest if corr.loc[anchor, name] >= threshold]
        groups.append(members)
        pending = [name for name in rest if name not in members]

    return groups

# Within each set of columns sharing the same NaN count, keep one column per
# correlation group; a column that is alone in its NaN-count set is kept as is.
uses = []
for nan_count, cols in nans_groups.items():
    if len(cols) > 1:
        corr_groups = group_columns_by_correlation(df_train[cols], threshold=0.8)
        uses.extend(reduce_group(corr_groups))
    else:
        uses.extend(cols)
    print('####### NAN count =',nan_count)
print(uses)
print(len(uses))

# Categorical columns were not part of the correlation pruning; keep them all.
uses.extend(df_train.select_dtypes(include='category').columns)
print(len(uses))
df_train = df_train[uses]

train data shape:	 (1219149, 861)
Memory usage of dataframe is 1803.36 MB
Memory usage after optimization is: 670.92 MB
Decreased by 62.8%
train data shape:	 (1219149, 282)
Use these ['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt12m_3712952L', 'clientscnt_100L', 'clientscnt_1022L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationi

In [12]:
sample = pd.read_csv("/kaggle/input/homecredit/sample_submission.csv")

# A 10-row sample submission switches the notebook to a small, CPU-only run
# (presumably the placeholder file of a dry run; confirm).
DRY_RUN = sample.shape[0] == 10
device, n_est = ("cpu", 600) if DRY_RUN else ("gpu", 6000)
if DRY_RUN:
    df_train = df_train.iloc[:50000]
print(device)

gpu


In [14]:
# Assemble the test-side inputs. Only the base table differs from the training
# data_store: it is read from test_base_table.parquet.
# NOTE(review): every feature table below is still loaded from TRAIN_DIR
# ("train_*" files). Presumably the test base table is a hold-out of the
# training cases; confirm this is intended.
data_store = {
    "df_base": read_file(ROOT / "test_base_table.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}

In [15]:
# Build the test features with the same joins and date handling as training.
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)
del data_store
gc.collect()
# Keep exactly the columns that survived the training-side selection, minus the label.
df_test = df_test.select([col for col in df_train.columns if col != "target"])
print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

# Reuse the training cat_cols so the same columns become pandas categoricals.
df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)

gc.collect()

test data shape:	 (307510, 860)
train data shape:	 (1219149, 236)
test data shape:	 (307510, 235)
Memory usage of dataframe is 346.10 MB
Memory usage after optimization is: 134.65 MB
Decreased by 61.1%


0

In [16]:
# Split off the label and the week index before dropping the non-feature columns.
y = df_train["target"]
weeks = df_train["WEEK_NUM"]
# NOTE: df_train is rebound without these columns, so this cell is not
# re-runnable: a second run fails on the lookups above.
df_train= df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
# `weeks` is passed as `groups` to cv.split in the training cell.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

In [17]:
# Cast the categorical columns to plain strings. The training cell passes them
# as `cat_features` to the CatBoost pools and converts them back to "category"
# before fitting LightGBM.
df_train[cat_cols] = df_train[cat_cols].astype(str)
df_test[cat_cols] = df_test[cat_cols].astype(str)

In [18]:
# LightGBM hyper-parameters, unpacked into lgb.LGBMClassifier in the training cell.
# The "verbose" key used to appear twice in this literal (same value); the
# redundant second entry is removed.
# NOTE(review): "n_estimators" is fixed at 1000 here, while `n_est` (600 on a
# dry run, 6000 otherwise) is only used for CatBoost. Confirm that is intended.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 8,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    "device": device,
}

In [None]:
%%time
from catboost import CatBoostClassifier, Pool
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

fitted_models_cat = []
fitted_models_lgb = []
fitted_models_lr = []
fitted_models_svm = []

cv_scores_cat = []
cv_scores_lgb = []
cv_scores_lr = []
cv_scores_svm = []


for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):#
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]# 
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]
    train_pool = Pool(X_train, y_train,cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid,cat_features=cat_cols)
    clf = CatBoostClassifier(
    eval_metric='AUC',
    task_type='GPU',
    learning_rate=0.03,
    iterations=n_est)
    random_seed=3107
    clf.fit(train_pool, eval_set=val_pool,verbose=300)
    fitted_models_cat.append(clf)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)
    
    
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")
    
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )
    
    fitted_models_lgb.append(model)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)
    
    # Обучение логистической регрессии
    model_lr = LogisticRegression()
    model_lr.fit(X_train, y_train)
    fitted_models_lr.append(model_lr)
    y_pred_valid_lr = model_lr.predict_proba(X_valid)[:, 1]
    auc_score_lr = roc_auc_score(y_valid, y_pred_valid_lr)
    cv_scores_lr.append(auc_score_lr)
    
    # Обучение SVM
    model_svm = SVC(probability=True)
    model_svm.fit(X_train, y_train)
    fitted_models_svm.append(model_svm)
    y_pred_valid_svm = model_svm.predict_proba(X_valid)[:, 1]
    auc_score_svm = roc_auc_score(y_valid, y_pred_valid_svm)
    cv_scores_svm.append(auc_score_svm)

    
print("CV AUC scores: ", cv_scores_cat)
print("Maximum CV AUC score: ", max(cv_scores_cat))


print("CV AUC scores: ", cv_scores_lgb)
print("Maximum CV AUC score: ", max(cv_scores_lgb))

print("CV AUC scores (Logistic Regression):", cv_scores_lr)
print("Maximum CV AUC score (Logistic Regression):", max(cv_scores_lr))

print("CV AUC scores (SVM):", cv_scores_svm)
print("Maximum CV AUC score (SVM):", max(cv_scores_svm))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6619801	best: 0.6619801 (0)	total: 440ms	remaining: 44m
300:	test: 0.8167689	best: 0.8167689 (300)	total: 2m 7s	remaining: 40m 20s
600:	test: 0.8221259	best: 0.8221259 (600)	total: 4m 10s	remaining: 37m 33s
900:	test: 0.8243622	best: 0.8243622 (900)	total: 6m 11s	remaining: 35m 1s
1200:	test: 0.8258641	best: 0.8258641 (1200)	total: 8m 10s	remaining: 32m 38s
1500:	test: 0.8267086	best: 0.8267086 (1500)	total: 10m 8s	remaining: 30m 24s
1800:	test: 0.8274796	best: 0.8274796 (1800)	total: 12m 5s	remaining: 28m 10s
2100:	test: 0.8281343	best: 0.8281343 (2100)	total: 14m 4s	remaining: 26m 6s


In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Averages the outputs of already-fitted estimators.

    The first ``n_string_estimators`` estimators receive the frame exactly as
    passed in (string-typed categorical columns); the remaining ones receive a
    copy whose ``cat_cols`` columns (the notebook-level list) are converted to
    pandas ``category`` dtype.

    Args:
        estimators: list of fitted estimators, string-consuming ones first.
        n_string_estimators: how many leading estimators take the unconverted
            frame. Defaults to 5, the previously hard-coded split.
    """

    def __init__(self, estimators, n_string_estimators=5):
        super().__init__()
        self.estimators = estimators
        self.n_string_estimators = n_string_estimators

    def fit(self, X, y=None):
        """No-op: the wrapped estimators are expected to be fitted already."""
        return self

    def _collect(self, X, method):
        """Call ``method`` on every estimator with the frame variant it expects."""
        split = self.n_string_estimators
        outputs = [getattr(estimator, method)(X) for estimator in self.estimators[:split]]

        # Convert on a copy so the caller's frame is not modified as a side
        # effect of predicting.
        X_category = X.astype(dict.fromkeys(cat_cols, "category"))
        outputs += [getattr(estimator, method)(X_category) for estimator in self.estimators[split:]]
        return outputs

    def predict(self, X):
        """Mean of the estimators' ``predict`` outputs."""
        return np.mean(self._collect(X, "predict"), axis=0)

    def predict_proba(self, X):
        """Mean of the estimators' ``predict_proba`` outputs."""
        return np.mean(self._collect(X, "predict_proba"), axis=0)

In [None]:
# Order matters: VotingModel.predict_proba hands the first five estimators the
# string-typed frame and converts the categorical columns before calling the
# rest, so the five CatBoost fold models must come first.
model = VotingModel(fitted_models_cat+fitted_models_lgb+fitted_models_lr+fitted_models_svm)

In [None]:
# Remove the week index and key the feature frame by case_id so the
# predictions carry the id as their index.
df_test = df_test.drop(columns=["WEEK_NUM"])
df_test = df_test.set_index("case_id")


# Column 1 of the averaged predict_proba output is used as the score.
y_pred = pd.Series(model.predict_proba(df_test)[:, 1], index=df_test.index)
df_subm = pd.read_csv("/kaggle/input/homecredit/sample_submission.csv")
df_subm = df_subm.set_index("case_id")

# The assignment aligns on the case_id index; ids missing from y_pred get NaN.
df_subm["score"] = y_pred
df_subm.to_csv("submission.csv")
df_subm