In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [3]:
class Pipeline:
    """Stateless polars helpers for column typing, date handling and pruning.

    Every method is a ``staticmethod`` so it can be used directly with
    ``DataFrame.pipe`` (e.g. ``df.pipe(Pipeline.set_table_dtypes)``) and also
    behaves correctly if it is ever called through an instance.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a dtype chosen from their name.

        Rules (first match wins):
          * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
          * ``date_decision`` -> Date
          * names ending in ``P`` or ``A`` -> Float64
          * names ending in ``M`` -> String
          * names ending in ``D`` -> Date

        Columns matching none of the rules are left untouched.

        Args:
            df: polars DataFrame.

        Returns:
            A DataFrame with the casts applied.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))
            elif col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
        # One with_columns call instead of one new frame per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Replace every ``*D`` column by its offset in days from ``date_decision``.

        After the conversion the ``date_decision`` and ``MONTH`` columns are
        dropped, so both must be present in ``df``.

        Args:
            df: polars DataFrame containing ``date_decision`` and ``MONTH``.

        Returns:
            A DataFrame where each column whose name ends in ``D`` holds
            ``(column - date_decision)`` expressed in whole days.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            df = df.with_columns(
                [
                    (pl.col(col) - pl.col("date_decision")).dt.total_days()
                    for col in date_cols
                ]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df, null_threshold=0.7, max_cardinality=200):
        """Drop sparse columns and uninformative / high-cardinality string columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped.

        Args:
            df: polars DataFrame.
            null_threshold: columns whose null fraction is strictly greater
                than this value are dropped.
            max_cardinality: String columns with exactly one distinct value or
                with more than this many distinct values are dropped.

        Returns:
            The pruned DataFrame.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        too_sparse = [
            col
            for col in df.columns
            if col not in protected and df[col].is_null().mean() > null_threshold
        ]
        if too_sparse:
            df = df.drop(too_sparse)

        # Cardinality is evaluated only on the columns that survived the null filter.
        bad_cardinality = []
        for col in df.columns:
            if col in protected or df[col].dtype != pl.String:
                continue
            freq = df[col].n_unique()
            if freq == 1 or freq > max_cardinality:
                bad_cardinality.append(col)
        if bad_cardinality:
            df = df.drop(bad_cardinality)

        return df


class Aggregator:
    """Builds polars aggregation expressions selected by column-name suffix.

    The expressions are meant to be passed to ``group_by(...).agg(...)``.
    Each output column is named ``<agg>_<original column>`` where ``<agg>``
    is ``max``, ``last`` or ``mean``.
    """

    @staticmethod
    def _agg_exprs(cols, kinds):
        """Return ``kind(col).alias(f"{kind}_{col}")`` for every kind, then every column.

        The result is ordered by kind first (all ``max_*``, then all
        ``last_*``, ...), matching the order callers concatenated them in.
        """
        builders = {"max": pl.max, "last": pl.last, "mean": pl.mean}
        return [
            builders[kind](col).alias(f"{kind}_{col}")
            for kind in kinds
            for col in cols
        ]

    @staticmethod
    def num_expr(df):
        """max / last / mean expressions for columns ending in ``P`` or ``A``."""
        cols = [col for col in df.columns if col.endswith(("P", "A"))]
        return Aggregator._agg_exprs(cols, ("max", "last", "mean"))

    @staticmethod
    def date_expr(df):
        """max / last / mean expressions for columns ending in ``D``."""
        cols = [col for col in df.columns if col.endswith("D")]
        return Aggregator._agg_exprs(cols, ("max", "last", "mean"))

    @staticmethod
    def str_expr(df):
        """max / last expressions for columns ending in ``M``."""
        cols = [col for col in df.columns if col.endswith("M")]
        return Aggregator._agg_exprs(cols, ("max", "last"))

    @staticmethod
    def other_expr(df):
        """max / last expressions for columns ending in ``T`` or ``L``."""
        cols = [col for col in df.columns if col.endswith(("T", "L"))]
        return Aggregator._agg_exprs(cols, ("max", "last"))

    @staticmethod
    def count_expr(df):
        """max / last expressions for columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._agg_exprs(cols, ("max", "last"))

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression groups: numeric, date, string, other, count."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

def read_file(path, depth=None):
    """Load one parquet file, normalise its dtypes and optionally aggregate it.

    Args:
        path: location of the parquet file.
        depth: when 1 or 2 the table is grouped by ``case_id`` and reduced
            with ``Aggregator.get_exprs``; any other value returns the typed
            table as read.

    Returns:
        A polars DataFrame.
    """
    table = pl.read_parquet(path).pipe(Pipeline.set_table_dtypes)
    if depth not in [1, 2]:
        return table
    return table.group_by("case_id").agg(Aggregator.get_exprs(table))

def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern into one DataFrame.

    Each file is loaded through ``read_file`` (dtype normalisation plus, for
    depth 1/2, aggregation per ``case_id``), the chunks are concatenated and
    the result is de-duplicated on ``case_id``.

    Args:
        regex_path: glob pattern (str or Path), e.g. ``dir / "train_x_*.parquet"``.
        depth: forwarded to ``read_file``.

    Returns:
        A polars DataFrame with at most one row per ``case_id``.

    Raises:
        FileNotFoundError: if the pattern matches no file (instead of an
            opaque error from concatenating an empty list).
    """
    # Sorted so the chunk order does not depend on the filesystem.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")

    chunks = [read_file(path, depth) for path in paths]
    df = pl.concat(chunks, how="vertical_relaxed")
    # NOTE(review): if a case_id occurs in several chunks only one of its rows
    # is kept here — presumably chunks are disjoint by case_id; confirm.
    return df.unique(subset=["case_id"])

def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the modelling table from the base table and all side tables.

    Adds ``month_decision`` and ``weekday_decision`` derived from
    ``date_decision``, left-joins every table from ``depth_0``, ``depth_1``
    and ``depth_2`` (in that order) on ``case_id``, then applies
    ``Pipeline.handle_dates``.

    Colliding column names coming from the joined table receive the suffix
    ``_<i>`` where ``i`` is the table's position in the combined list.
    """
    calendar_features = {
        "month_decision": pl.col("date_decision").dt.month(),
        "weekday_decision": pl.col("date_decision").dt.weekday(),
    }
    joined = df_base.with_columns(**calendar_features)

    side_tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(side_tables):
        joined = joined.join(
            table, how="left", on="case_id", suffix=f"_{position}"
        )

    return joined.pipe(Pipeline.handle_dates)

def to_pandas(df_data, cat_cols=None):
    """Convert to pandas and turn the categorical columns into ``category`` dtype.

    Args:
        df_data: object exposing ``to_pandas()`` (a polars DataFrame here).
        cat_cols: columns to convert; when ``None`` every ``object`` column
            of the converted frame is used.

    Returns:
        ``(frame, cat_cols)`` — the pandas DataFrame and the list of columns
        that were converted.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        object_columns = frame.select_dtypes("object").columns
        cat_cols = list(object_columns)
    as_category = frame[cat_cols].astype("category")
    frame[cat_cols] = as_category
    return frame, cat_cols

def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    * ``category`` and ``object`` columns are skipped.
    * Columns whose dtype name starts with ``int`` are cast to the smallest
      of int8/int16/int32/int64 whose range contains the column's min and
      max (bounds are inclusive, so e.g. -128..127 fits int8).
    * Every other column is cast to float16 if its range lies strictly
      inside float16's finite range, else float32, else float64.

    Memory usage before and after is printed.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same DataFrame object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type) == "category" or col_type == object:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values are representable too.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): float16 keeps only ~3 significant decimal digits;
            # values are rounded when this branch is taken.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN columns (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [4]:
# Dataset root as a Path. This rebinds ROOT, which the import cell defined
# earlier as a plain string with the same value.
ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")

# Directories holding the train / test parquet tables.
TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

In [5]:
%%time
# Load every training table. The keys match the parameter names of
# feature_eng, which receives this dict via ``feature_eng(**data_store)``.
# The order of the entries matters: feature_eng joins the tables in the order
# depth_0 + depth_1 + depth_2 and uses each table's position as the suffix for
# colliding column names, so reordering entries would rename features.
data_store = {
    # Base table, read without aggregation.
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    # Depth-0 tables are read with depth=None, i.e. no group_by aggregation.
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    # Depth-1 tables are aggregated per case_id (depth=1).
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    # Depth-2 tables are aggregated per case_id as well (depth=2).
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}

CPU times: user 4min 32s, sys: 1min 35s, total: 6min 7s
Wall time: 2min 9s


In [6]:
%%time
# Build the training frame: join all tables, prune columns, convert to pandas
# and downcast numeric dtypes.
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
print("train data shape:\t", df_train.shape)
nums=df_train.select_dtypes(exclude='category').columns
from itertools import combinations, permutations
#df_train=df_train[nums]

# Bucket the non-categorical columns by their number of missing values:
# nans_groups maps a NaN count to the list of columns having exactly that count.
# Counting one column at a time avoids materialising a full boolean mask of
# the whole frame, and setdefault replaces the former bare ``except:`` so real
# errors are no longer swallowed.
nans_groups={}
for col in nums:
    cur_group = df_train[col].isna().sum()
    nans_groups.setdefault(cur_group, []).append(col)
x=gc.collect()

def reduce_group(grps):
    """Pick one representative column from each group.

    For every group the column with the most distinct values in the global
    ``df_train`` is chosen; on ties (or when no column has any distinct
    value) the earliest column of the group wins.

    Args:
        grps: list of non-empty lists of ``df_train`` column names.

    Returns:
        List with one column name per group, in group order. The list is
        also printed.
    """
    chosen = []
    for members in grps:
        best_col, best_count = members[0], 0
        for candidate in members:
            distinct = df_train[candidate].nunique()
            if distinct > best_count:
                best_col, best_count = candidate, distinct
        chosen.append(best_col)
    print('Use these',chosen)
    return chosen

def group_columns_by_correlation(matrix, threshold=0.8):
    """Greedily partition the columns of ``matrix`` into correlated groups.

    The first unassigned column becomes the anchor of a new group; every
    other unassigned column whose correlation with the anchor is at least
    ``threshold`` joins it. Only the signed correlation is compared, so
    strongly negatively correlated columns are not grouped.

    Args:
        matrix: pandas DataFrame of numeric columns.
        threshold: minimum correlation with the anchor to join its group.

    Returns:
        List of groups (lists of column names); each column appears once.
    """
    # Pairwise correlation between columns.
    corr = matrix.corr()

    groups = []
    pending = list(matrix.columns)
    while pending:
        anchor, rest = pending[0], pending[1:]
        members = [anchor]
        members.extend(
            other for other in rest if corr.loc[anchor, other] >= threshold
        )
        groups.append(members)
        pending = [other for other in rest if other not in members]

    return groups

# Select the columns to keep: within each bucket of columns sharing the same
# NaN count, keep one representative per correlated group.
uses=[]
for k,v in nans_groups.items():
    if len(v)>1:
        grps = group_columns_by_correlation(df_train[v], threshold=0.8)
        use = reduce_group(grps)
        uses.extend(use)
    else:
        uses.extend(v)
    print('####### NAN count =',k)

# The key columns are numeric too, so the pruning above could drop one of them
# in favour of a correlated feature; later cells index df_train by these names,
# so make sure they always survive.
key_cols = ["case_id", "WEEK_NUM", "target"]
missing_keys = [c for c in key_cols if c in df_train.columns and c not in uses]
uses = missing_keys + uses

print(len(uses))
uses=uses+list(df_train.select_dtypes(include='category').columns)
print(len(uses))
df_train=df_train[uses]

train data shape:	 (1526659, 861)
Memory usage of dataframe is 4322.75 MB
Memory usage after optimization is: 1528.81 MB
Decreased by 64.6%
train data shape:	 (1526659, 472)
Use these ['case_id', 'WEEK_NUM', 'target', 'month_decision', 'weekday_decision', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'max_mainoccupationinc_384A', 'max_birth_259D

In [7]:
# Device passed to LightGBM through the "device" entry of `params`.
device='gpu'
#n_samples=200000
# Number of boosting iterations used for the CatBoost models.
n_est=3000

gpu


In [8]:
# Load every test table, mirroring the training data_store entry for entry.
# The order must stay identical to the training cell: feature_eng derives the
# suffix of colliding column names from each table's position, so a different
# order would produce feature names that do not match the training frame.
data_store = {
    # Base table, read without aggregation.
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    # Depth-0 tables are read with depth=None, i.e. no group_by aggregation.
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    # Depth-1 tables are aggregated per case_id (depth=1).
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    # Depth-2 tables are aggregated per case_id as well (depth=2).
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2),
        read_file(TEST_DIR / "test_person_2.parquet", 2)
    ]
}

In [9]:
# Build the test frame with the same joins as the training frame.
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)
del data_store
gc.collect()
# Pipeline.filter_cols is not applied to the test frame; instead it is aligned
# to the columns kept for training (everything except the target).
df_test = df_test.select([col for col in df_train.columns if col != "target"])
print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

# Reuse the categorical column list determined on the training frame so both
# frames convert the same columns to ``category``.
df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)

gc.collect()

test data shape:	 (10, 860)
train data shape:	 (1526659, 389)
test data shape:	 (10, 388)
Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.02 MB
Decreased by 40.3%


0

In [10]:
"""def add_custom_feature(df, feature_prefixes, new_feature_name):
    used_features = []
    custom_feature = pd.Series(0, index=df.index)

    # 遍历特征列表
    for column in df.columns:
        if any(column.startswith(prefix) for prefix in feature_prefixes):
            # 将特征添加到使用的特征列表
            used_features.append(column)
            # 相加特征并更新custom_feature
            print(column)
            custom_feature += df[column]

    # 将新的自定义特征添加到数据集中
    df[new_feature_name] = custom_feature

    # 删除原始的特征
    #df = df.drop(columns=used_features)

    return df
"""


In [11]:
# Usage example
#df_train = add_custom_feature(df_train, ["applicationscnt", "applicationcnt"], "totalapplicationscnt")
#df_test = add_custom_feature(df_test, ["applicationscnt", "applicationcnt"], "totalapplicationscnt")

In [12]:
#df_train = add_custom_feature(df_train, ["clientscnt"], "totalclientscnt")
#df_test = add_custom_feature(df_test, ["clientscnt"], "totalclientscnt")

In [13]:
drop_list = ['mean_credlmt_230A', 'mean_credamount_590A', 'max_financialinstitution_382M', 'max_classificationofcontr_13M', 'last_postype_4733339M', 'max_totalamount_6A', 'max_collater_valueofguarantee_1124L', 'opencred_647L', 'max_education_1138M', 'max_isbidproduct_390L', 'max_status_219L', 'last_cancelreason_3545846M', 'max_collater_valueofguarantee_876L', 'mean_overdueamount_659A', 'max_mainoccupationinc_437A', 'lastrejectcommoditycat_161M', 'max_description_351M', 'max_inittransactioncode_279L', 'max_debtoverdue_47A', 'max_credacc_credlmt_575A', 'max_subjectrole_182M', 'max_rejectreasonclient_4145042M', 'weekday_decision', 'disbursementtype_67L', 'totalsettled_863A', 'max_collaterals_typeofguarante_359M', 'last_inittransactioncode_279L', 'last_classificationofcontr_13M', 'last_rejectreasonclient_4145042M', 'max_purposeofcred_874M', 'max_num_group1_9', 'mean_credacc_credlmt_575A', 'last_isbidproduct_390L', 'max_financialinstitution_591M', 'max_subjectroles_name_838M', 'last_credacc_credlmt_575A', 'last_empls_economicalst_849M', 'max_classificationofcontr_400M', 'max_collaterals_typeofguarante_669M', 'max_collater_typofvalofguarant_298M', 'maxannuity_159A', 'max_subjectroles_name_541M', 'lastapprcredamount_781A', 'max_remitter_829L', 'last_downpmt_134A', 'mean_downpmt_134A', 'lastst_736L', 'max_cancelreason_3545846M', 'last_status_219L', 'avgoutstandbalancel6m_4187114A', 'max_contaddr_smempladdr_334L', 'max_collater_typofvalofguarant_407M', 'last_contractst_545M', 'lastrejectreasonclient_4145040M', 'max_downpmt_134A', 'last_empls_employer_name_740M', 'avginstallast24m_3658937A', 'amtinstpaidbefduel24m_4187115A', 'max_purposeofcred_426M', 'max_currdebt_94A', 'max_contractst_964M', 'lastapprcommoditycat_1041M', 'maritalst_893M', 'maxoutstandbalancel12m_4187113A', 'education_88M', 'max_language1_981M', 'last_financialinstitution_591M', 'max_contractst_545M', 'last_contaddr_matchlist_1032L', 'max_outstandingdebt_522A', 'last_language1_981M', 
'maxinstallast24m_3658928A', 'max_role_1084L', 'sumoutstandtotal_3546847A', 'last_subjectrole_182M', 'last_collater_typofvalofguarant_298M', 'max_conts_role_79M', 'totaldebt_9A', 'last_sex_738L', 'mean_currdebt_94A', 'avgpmtlast12m_4525200A', 'last_subjectrole_93M', 'price_1097A', 'maininc_215A', 'last_contaddr_smempladdr_334L', 'maxlnamtstart6m_4525199A', 'last_role_1084L', 'currdebt_22A', 'annuitynextmonth_57A', 'mean_outstandingdebt_522A', 'last_collater_typofvalofguarant_407M', 'maxdebt4_972A', 'sumoutstandtotalest_4493215A', 'last_collaterals_typeofguarante_359M', 'last_purposeofcred_426M', 'max_education_927M', 'last_safeguarantyflag_411L', 'last_cacccardblochreas_147M', 'downpmt_116A', 'max_conts_type_509L', 'last_empladdr_zipcode_114M', 'last_empladdr_district_926M', 'last_education_927M', 'max_empladdr_zipcode_114M', 'paytype_783L', 'max_empladdr_district_926M', 'paytype1st_925L', 'last_purposeofcred_874M', 'last_description_351M', 'last_financialinstitution_382M', 'last_subjectroles_name_838M', 'max_cacccardblochreas_147M', 'last_classificationofcontr_400M', 'last_conts_role_79M', 'last_collaterals_typeofguarante_669M', 'last_contractst_964M', 'last_subjectroles_name_541M', 'max_numberofoutstandinstls_520L', 'max_type_25L', 'max_contaddr_matchlist_1032L', 'lastrejectcommodtypec_5251769M', 'max_safeguarantyflag_411L', 'currdebtcredtyperange_828A', 'last_mainoccupationinc_384A', 'last_incometype_1044T', 'last_currdebt_94A', 'mean_overdueamount_31A', 'mean_outstandingamount_354A', 'last_outstandingdebt_522A', 'mean_totaldebtoverduevalue_718A', 'mean_totaloutstanddebtvalue_668A', 'last_type_25L']

In [14]:
#df_train = df_train.drop(columns=drop_list )
#df_test = df_test.drop(columns=drop_list )

In [15]:
#cat_cols = df_train.select_dtypes(include=['object','category']).columns.tolist()
#cat_cols

### Feature Selection

In [16]:
# Separate the label and the grouping key from the feature matrix.
y = df_train["target"]
weeks = df_train["WEEK_NUM"]
# NOTE(review): only df_train loses case_id / WEEK_NUM here; in the cells shown
# df_test still carries them — confirm they are dropped before prediction.
df_train= df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
# Stratified on the target and grouped (the training loop passes `weeks` as
# groups), so all rows of one WEEK_NUM fall into the same fold.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)


In [17]:
# Cast the categorical columns to plain strings; the CatBoost pools in the
# training loop receive these columns as cat_features. Missing values become
# the literal string "nan" through astype(str).
df_train[cat_cols] = df_train[cat_cols].astype(str)
df_test[cat_cols] = df_test[cat_cols].astype(str)

In [18]:
class FocalLoss:
    """Binary focal loss with gradient/hessian helpers for LightGBM.

    The loss for a sample is ``-at * (1 - pt) ** gamma * log(pt)`` where
    ``pt`` is the predicted probability of the true class and ``at`` the
    class weight (see ``at`` and ``pt``).

    ``init_score`` uses ``optimize`` and the ``lgb_*`` methods use ``special``;
    neither name is imported in this cell, so both (from scipy) must be
    available as globals before those methods are called.
    """

    def __init__(self, gamma, alpha=None):
        # Only these two parameters need to be set; when alpha is None the
        # class weight is 1 for every sample.
        self.alpha = alpha
        self.gamma = gamma

    def at(self, y):
        """Per-sample class weight: alpha for positives, 1 - alpha for negatives."""
        # alpha parameter: following the focal-loss definition, positive samples
        # are weighted by self.alpha and negative samples by 1 - self.alpha.
        if self.alpha is None:
            return np.ones_like(y)
        return np.where(y, self.alpha, 1 - self.alpha)

    def pt(self, y, p):
        """Probability assigned to the true class: p for positives, 1 - p otherwise."""
        # Relation between pt and p; p is clipped away from 0 and 1 so the
        # logarithms taken later stay finite.
        p = np.clip(p, 1e-15, 1 - 1e-15)
        return np.where(y, p, 1 - p)

    def __call__(self, y_true, y_pred):
        """Element-wise focal loss for labels ``y_true`` and probabilities ``y_pred``."""
        # The focal-loss formula itself.
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        return -at * (1 - pt) ** self.gamma * np.log(pt)

    def grad(self, y_true, y_pred):
        """First derivative of the loss, returned to LightGBM as the gradient.

        ``y_pred`` is a probability. Presumably the derivative is taken with
        respect to the raw score (logit), given how ``lgb_obj`` uses it — the
        formula was not re-derived here.
        """
        # First-order derivative.
        y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        g = self.gamma
        return at * y * (1 - pt) ** g * (g * pt * np.log(pt) + pt - 1)

    def hess(self, y_true, y_pred):
        """Second derivative of the loss, returned to LightGBM as the hessian."""
        # Second-order derivative.
        y = 2 * y_true - 1  # {0, 1} -> {-1, 1}
        at = self.at(y_true)
        pt = self.pt(y_true, y_pred)
        g = self.gamma

        # grad is the product u * v; du and dv are the factors' derivatives
        # with respect to pt (product rule applied in the return expression).
        u = at * y * (1 - pt) ** g
        du = -at * y * g * (1 - pt) ** (g - 1)
        v = g * pt * np.log(pt) + pt - 1
        dv = g * np.log(pt) + g + 1

        return (du * v + u * dv) * y * (pt * (1 - pt))

    def init_score(self, y_true):
        """Log-odds of the constant probability that minimises the total loss on ``y_true``."""
        # Search for the best constant starting prediction.
        res = optimize.minimize_scalar(
            lambda p: self(y_true, p).sum(),
            bounds=(0, 1),
            method='bounded'
        )
        p = res.x
        log_odds = np.log(p / (1 - p))
        return log_odds

    def lgb_obj(self, preds, train_data):
        """Objective with ``(preds, train_data)`` arguments: returns ``(grad, hess)``."""
        y = train_data.get_label()
        p = special.expit(preds)  # raw scores -> probabilities
        return self.grad(y, p), self.hess(y, p)

    def lgb_eval(self, preds, train_data):
        """Metric with ``(preds, train_data)`` arguments: ``(name, mean loss, is_higher_better)``."""
        y = train_data.get_label()
        p = special.expit(preds)
        is_higher_better = False
        return 'focal_loss', self(y, p).mean(), is_higher_better
    
    def lgb_obj_sklearn(self, labels,preds):
        """Objective with ``(labels, preds)`` arguments, as passed to ``LGBMClassifier(objective=...)``."""
        p = special.expit(preds)
        return self.grad(labels, p), self.hess(labels, p)
    
    def lgb_eval_sklearn(self, labels,preds):
        """Metric with ``(labels, preds)`` arguments, as passed to ``fit(eval_metric=...)``."""
        p = special.expit(preds)
        is_higher_better = False
        return 'focal_loss', self(labels, p).mean(), is_higher_better


In [19]:
# LightGBM hyper-parameters, unpacked into lgb.LGBMClassifier in the training
# loop. The objective is not set here; it is passed separately at construction.
# (The original literal listed "verbose" twice; the duplicate key is removed.)
params = {
    "boosting_type": "gbdt",
    #"objective": fl.lgb_obj_sklearn,
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device,
}

In [20]:
%%time
from scipy import optimize
from scipy import special
from catboost import CatBoostClassifier, Pool

# One fitted model and one validation AUC per fold, for each model family.
fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

# The focal-loss helper holds only alpha/gamma, so one instance serves all folds.
fl = FocalLoss(alpha=0.25, gamma=5)

for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]

    # ---- CatBoost (categorical columns are plain strings at this point) ----
    train_pool = Pool(X_train, y_train,cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid,cat_features=cat_cols)
    clf = CatBoostClassifier(
        eval_metric='AUC',
        task_type='GPU',
        learning_rate=0.06,
        iterations=n_est,
        # Previously written after the closing parenthesis as a stray
        # assignment, so the model was never actually seeded.
        random_seed=3107,
    )
    clf.fit(train_pool, eval_set=val_pool,verbose=300)
    fitted_models_cat.append(clf)
    proba_valid = clf.predict_proba(X_valid)  # computed once, printed and scored
    y_pred_valid = proba_valid[:,1]
    print("#######################")
    print(proba_valid)
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)

    # ---- LightGBM with focal loss (needs pandas ``category`` dtype) ----
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    # The starting score is estimated on the training labels only and reused
    # for the validation rows, so no validation label information leaks into
    # the evaluation / early stopping.
    train_init = fl.init_score(y_train)
    initScore_fit = np.full_like(y_train, train_init, dtype=float)
    initScore_val = np.full_like(y_valid, train_init, dtype=float)

    model = lgb.LGBMClassifier(objective=fl.lgb_obj_sklearn,**params)
    model.fit(
        X_train, y_train,
        eval_set = [(X_train, y_train),(X_valid, y_valid)],
        init_score= initScore_fit,
        eval_init_score =[initScore_fit,initScore_val],
        eval_metric=fl.lgb_eval_sklearn,
        callbacks = [lgb.log_evaluation(20), lgb.early_stopping(100)] )

    fitted_models_lgb.append(model)
    print("#######################")
    # With a custom objective the classifier's output is passed through the
    # sigmoid here to obtain probabilities.
    # NOTE(review): the init score is not added back before the sigmoid; the
    # AUC below is unaffected by a constant shift, but the absolute values are.
    y_pred_valid = special.expit(model.predict_proba(X_valid))

    print(y_pred_valid)
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)


# CatBoost results (the summary value is the mean over folds).
print("CV AUC scores: ", cv_scores_cat)
print("Mean CV AUC score: ", np.mean(cv_scores_cat))


# LightGBM results.
print("CV AUC scores: ", cv_scores_lgb)
print("Mean CV AUC score: ", np.mean(cv_scores_lgb))



[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 100 rounds
[20]	training's auc: 0.813464	training's focal_loss: 0.00192362	valid_1's auc: 0.803724	valid_1's focal_loss: 0.00203814
[40]	training's auc: 0.828974	training's focal_loss: 0.00181957	valid_1's auc: 0.817993	valid_1's focal_loss: 0.00193857
[60]	training's auc: 0.840187	training's focal_loss: 0.00175937	valid_1's auc: 0.827781	valid_1's focal_loss: 0.001884
[80]	training's auc: 0.84704	training's focal_loss: 0.00172164	valid_1's auc: 0.833418	valid_1's focal_loss: 0.00185201
[100]	training's auc: 0.85211	training's focal_loss: 0.00169547	valid_1's auc: 0.837166	valid_1's focal_loss: 0.00183183
[120]	training's auc: 0.855998	training's focal_loss: 0.00167617	valid_1's auc: 0.839756	valid_1's focal_loss: 0.00181832
[140]	training's auc: 0.859317	training's focal_loss: 0.00166047	valid_1's auc: 0.841642	valid_1's focal_loss: 0.0018087
[160]	training's auc: 0.862229	traini

In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Averages the predictions of already-fitted estimators.

    Args:
        estimators: list of fitted models. LightGBM models (instances of
            ``lgb.LGBMModel``) are expected to have been trained with the
            custom focal-loss objective, so their ``predict_proba`` output is
            passed through a sigmoid; every other model is expected to return
            a two-column probability matrix from ``predict_proba``.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """No-op: the wrapped estimators are already fitted."""
        return self

    def predict(self, X):
        """Mean of every estimator's ``predict(X)``."""
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):
        """Mean positive-class probability over all estimators.

        The estimators are split by type rather than by list position, so the
        result no longer depends on there being exactly five models of each
        kind. ``X`` itself is left unmodified: the ``category`` cast needed by
        LightGBM is applied to a copy, which keeps repeated calls working for
        the models that need the original string columns.

        Args:
            X: pandas DataFrame of features; the columns in the global
                ``cat_cols`` are cast to ``category`` for the LightGBM models.

        Returns:
            1-D array with one averaged score per row of ``X``.
        """
        lgb_models = [e for e in self.estimators if isinstance(e, lgb.LGBMModel)]
        other_models = [e for e in self.estimators if not isinstance(e, lgb.LGBMModel)]

        y_preds = [estimator.predict_proba(X)[:,1] for estimator in other_models]

        if lgb_models:
            X_lgb = X.copy()
            X_lgb[cat_cols] = X_lgb[cat_cols].astype("category")
            y_preds += [special.expit(estimator.predict_proba(X_lgb)) for estimator in lgb_models]

        return np.mean(y_preds, axis=0)

model = VotingModel(fitted_models_cat+fitted_models_lgb)
