In [1]:
!ls /kaggle/input/tabnet/pytorch-tabnet

MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl
colorama-0.4.6-py2.py3-none-any.whl
filelock-3.14.0-py3-none-any.whl
fsspec-2024.5.0-py3-none-any.whl
intel_openmp-2021.4.0-py2.py3-none-win_amd64.whl
jinja2-3.1.4-py3-none-any.whl
joblib-1.4.2-py3-none-any.whl
mkl-2021.4.0-py2.py3-none-win_amd64.whl
mpmath-1.3.0-py3-none-any.whl
networkx-3.3-py3-none-any.whl
numpy-1.26.4-cp311-cp311-win_amd64.whl
pytorch_tabnet-4.1.0-py3-none-any.whl
scikit_learn-1.5.0-cp311-cp311-win_amd64.whl
scipy-1.13.1-cp311-cp311-win_amd64.whl
sympy-1.12-py3-none-any.whl
tbb-2021.12.0-py3-none-win_amd64.whl
threadpoolctl-3.5.0-py3-none-any.whl
torch-2.3.0-cp311-cp311-win_amd64.whl
tqdm-4.66.4-py3-none-any.whl
typing_extensions-4.12.0-py3-none-any.whl


In [2]:
!pip install pytorch-tabnet --no-index --find-links=file:///kaggle/input/tabnet/pytorch-tabnet

Looking in links: file:///kaggle/input/tabnet/pytorch-tabnet
Processing /kaggle/input/tabnet/pytorch-tabnet/pytorch_tabnet-4.1.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [3]:
import os, glob
import gc
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from typing import Literal

from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.metrics import roc_auc_score, auc

import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
import catboost as cat
from catboost import CatBoostClassifier, Pool
import xgboost as xgb
from xgboost import XGBClassifier

import torch
import torch.nn as nn
from torch.utils import data
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer

import pickle
import random

from datetime import datetime


In [4]:
import importlib.util
import sys

def load_module_from_path(module_name, file_path):
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module

# loaded_module = load_module_from_path('model', '/kaggle/input/ensemble/other/model_fn/1/model.py')
# from model import Transformer, KAN, TransformerKAN

In [5]:
if os.path.exists('/kaggle/input'):
    PATH_DATASET = Path("/kaggle/input/home-credit-credit-risk-model-stability")
else:
    PATH_DATASET = Path("/kaggle/data/")
PATH_PARQUETS = PATH_DATASET / "parquet_files"
PATH_TRAIN = PATH_PARQUETS / "train"
PATH_TEST = PATH_PARQUETS / "test"

if os.path.exists('/kaggle/input'):
    PATH_PREPARED_DATASET = Path("/kaggle/input/dataset")
else:
    PATH_PREPARED_DATASET = Path("/kaggle/data/preprocess")

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Load dataset

In [6]:
class DatasetConstructor:
    def __init__(self, mode: Literal['train', 'test']):
        self.mode = mode
        self.path = PATH_PARQUETS / mode

    @staticmethod
    def reduce_memory_usage_pl(df):
        """ Reduce memory usage by polars dataframe {df} with name {name} by changing its data types.
            Original pandas version of this function: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 """
        print(f"Memory usage of dataframe is {round(df.estimated_size('mb'), 2)} MB")
        Numeric_Int_types = [pl.Int8,pl.Int16,pl.Int32,pl.Int64]
        Numeric_Float_types = [pl.Float32,pl.Float64]    
        for col in df.columns:
            try:
                col_type = df[col].dtype
                if col_type == pl.Categorical:
                    continue
                c_min = df[col].min()
                c_max = df[col].max()
                if col_type in Numeric_Int_types:
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df = df.with_columns(df[col].cast(pl.Int8))
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df = df.with_columns(df[col].cast(pl.Int16))
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df = df.with_columns(df[col].cast(pl.Int32))
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in Numeric_Float_types:
                    if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df = df.with_columns(df[col].cast(pl.Float32))
                    else:
                        pass
                # elif col_type == pl.Utf8:
                #     df = df.with_columns(df[col].cast(pl.Categorical))
                else:
                    pass
            except:
                pass
        print(f"Memory usage of dataframe became {round(df.estimated_size('mb'), 2)} MB")
        return df

    @staticmethod
    def detect_datetime_cols(df):
        return df.select_dtypes(object).apply(lambda x: pd.to_datetime(x, errors='ignore'), axis=0).select_dtypes(np.datetime64).columns.tolist()
                        
    def _to_pandas(self, df):
        df = df.to_pandas().set_index('case_id')
        df = df.replace([np.inf, -np.inf], np.nan)
        return df

    def merge_static(self, df):
        df_static = (
            pl.concat([pl.scan_parquet(p, low_memory=True) for p in glob.glob(str(self.path / f"{self.mode}_static_0_*"))],how="vertical_relaxed",)
            .with_columns(
                [
                    (pl.col(col).cast(pl.String).str.to_date(strict=False)) 
                    for col in [
                        'datefirstoffer_1144D', 
                        'datelastinstal40dpd_247D',
                        'datelastunpaid_3546854D', 
                        'dtlastpmtallstes_4499206D',
                        'firstclxcampaign_1125D', 
                        'firstdatedue_489D', 
                        'lastactivateddate_801D',
                       'lastapplicationdate_877D', 
                        'lastapprdate_640D', 
                        'lastdelinqdate_224D',
                       'lastrejectdate_50D', 
                        'lastrepayingdate_696D',
                       'maxdpdinstldate_3546855D', 
                        'payvacationpostpone_4187118D',
                       'validfrom_1069D'
                    ]
                ] + [
                    (pl.col(col).cast(pl.String).cast(pl.Categorical))
                    for col in [
                        'bankacctype_710L', 'cardtype_51L', 'credtype_322L',
                       'disbursementtype_67L', 'equalitydataagreement_891L',
                       'equalityempfrom_62L', 'inittransactioncode_186L',
                       'isbidproductrequest_292L', 'isdebitcard_729L',
                       'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M',
                       'lastcancelreason_561M', 'lastrejectcommoditycat_161M',
                       'lastrejectcommodtypec_5251769M', 'lastrejectreason_759M',
                       'lastrejectreasonclient_4145040M', 'lastst_736L', 'opencred_647L',
                       'paytype1st_925L', 'paytype_783L', 'previouscontdistrict_112M',
                       'twobodfilling_608L', 'typesuite_864L'
                    ]
                ]
            )
        )
        return df.join(df_static, how="left", on="case_id")
        
    def merge_static_cb(self, df):
        df_static_cb = (
            pl.scan_parquet(self.path / f"{self.mode}_static_cb_0.parquet", low_memory=True)
            .with_columns(
                [
                    (pl.col(col).cast(pl.String).str.to_date(strict=False)) 
                    for col in [
                        'assignmentdate_238D', 
                        'assignmentdate_4527235D',
                        'assignmentdate_4955616D', 
                        'birthdate_574D', 
                        'dateofbirth_337D',
                        'dateofbirth_342D', 
                        'responsedate_1012D', 
                        'responsedate_4527233D',
                        'responsedate_4917613D'
                    ] 
                ] + [
                    (pl.col(col).cast(pl.String).cast(pl.Categorical))
                    for col in [
                        'description_5085714M', 'education_1103M', 'education_88M',
                       'maritalst_385M', 'maritalst_893M', 'requesttype_4525192L',
                       'riskassesment_302T'
                    ]
                ]
            )
        )
        return df.join(df_static_cb, how="left", on="case_id")
 
    def load(self):
        df = pl.scan_parquet(self.path / f"{self.mode}_base.parquet", low_memory=True).with_columns(
            pl.col("date_decision").str.to_date()
        )
        # Depth=0
        df = self.merge_static(df)
        df = self.merge_static_cb(df)
        
        df =(
            df
            .with_columns(
                pl.col(pl.Float64).cast(pl.Float32),
                pl.col(pl.Int64).cast(pl.Int32),
            )
        )
        df = df.select(~cs.date())
        
        # Drop categorical large-dimension columns
        df = df.drop([
            'lastapprcommoditytypec_5251766M',
             'previouscontdistrict_112M',
             'district_544M',
             'profession_152M',
             'name_4527232M',
             'name_4917606M',
             'employername_160M',
             'classificationofcontr_400M',
             'financialinstitution_382M',
             'contaddr_district_15M',
             'contaddr_zipcode_807M',
             'empladdr_district_926M',
             'empladdr_zipcode_114M',
             'registaddr_district_1083M',
             'registaddr_zipcode_184M',
             'addres_district_368M',
             'addres_zip_823M'])
        df = df.collect()
        df = self.reduce_memory_usage_pl(df)
        df = self._to_pandas(df)
        return df

In [7]:
def _to_pandas(df):
    df = df.to_pandas().set_index('case_id')
    df = df.replace([np.inf, -np.inf], np.nan)
    return df

def reduce_memory_usage_pl(df):
        """ Reduce memory usage by polars dataframe {df} with name {name} by changing its data types.
            Original pandas version of this function: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 """
        print(f"Memory usage of dataframe is {round(df.estimated_size('mb'), 2)} MB")
        Numeric_Int_types = [pl.Int8,pl.Int16,pl.Int32,pl.Int64]
        Numeric_Float_types = [pl.Float32,pl.Float64]    
        for col in df.columns:
            try:
                col_type = df[col].dtype
                if col_type == pl.Categorical:
                    continue
                c_min = df[col].min()
                c_max = df[col].max()
                if col_type in Numeric_Int_types:
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df = df.with_columns(df[col].cast(pl.Int8))
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df = df.with_columns(df[col].cast(pl.Int16))
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df = df.with_columns(df[col].cast(pl.Int32))
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in Numeric_Float_types:
                    if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df = df.with_columns(df[col].cast(pl.Float32))
                    else:
                        pass
                # elif col_type == pl.Utf8:
                #     df = df.with_columns(df[col].cast(pl.Categorical))
                else:
                    pass
            except:
                pass
        print(f"Memory usage of dataframe became {round(df.estimated_size('mb'), 2)} MB")
        return df

In [8]:
test_constructor = DatasetConstructor('test')
df_test = test_constructor.load()
df_test.replace(to_replace=[None], value=np.nan, inplace=True)
df_test
# x_test.info()

Memory usage of dataframe is 0.01 MB
Memory usage of dataframe became 0.01 MB


Unnamed: 0_level_0,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,clientscnt_946L,cntincpaycont9m_3716944L,cntpmts24_3658933L,commnoinclast6m_3546845L,credamount_770A,credtype_322L,currdebt_22A,currdebtcredtyperange_828A,daysoverduetolerancedd_3976961L,deferredmnthsnum_166L,disbursedcredamount_1113A,disbursementtype_67L,downpmt_116A,eir_270L,equalitydataagreement_891L,equalityempfrom_62L,homephncnt_628L,inittransactionamount_650A,inittransactioncode_186L,interestrate_311L,interestrategrace_34L,isbidproduct_1095L,isbidproductrequest_292L,isdebitcard_729L,lastapprcommoditycat_1041M,lastapprcredamount_781A,lastcancelreason_561M,lastdependentsnum_448L,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastst_736L,maininc_215A,mastercontrelectronic_519L,mastercontrexist_109L,maxannuity_159A,maxannuity_4075009A,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdebt4_972A,maxdpdfrom6mto36m_3546853P,maxdpdinstlnum_3546846P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,mobilephncnt_593L,monthsannuity_845L,numactivecreds_622L,numactivecredschannel_414L,numactiverelcontr_750L,numcontrs3months_479L,numincomingpmts_3546848L,numinstlallpaidearly3d_817L,numinstls_657L,numinstlsallpaid_934L,numinstlswithdpd10_728L,numinstlswithdpd5_4187116L,numinstlswithoutdpd_562L,numinstmatpaidtearly2d_4499204L,numinstpaid_4499208L,numinstpaidearly3d_3546850L,numinstpaidearly3dest_4493216L,numinstpaidearly5d_1087L,numinstpaidearly5dest_4493211L,numinstpaidearly5dobd_4499205L,numinstpaidearly_338L,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,contractssum_5085716L,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1
57543,202201,100,0.0,191767.359375,3674.600098,1218.200073,0.0,0.0,0.0,0.0,0.0,9.0,1.0,2.0,1.0,1.0,16049.400391,17054.400391,2.0,14554.400391,24482.0,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,20.0,0.0,20000.0,CAL,12154.400391,0.0,8.0,,20000.0,GBA,0.0,0.34,,,0.0,,CASH,0.34,,True,,False,a55475b1,14000.0,a55475b1,,,,P109_133_183,P49_111_165,24000.0,a55475b1,a55475b1,K,34000.0,0.0,0.0,280983.5625,,2.0,3.0,3.0,231440.03125,7.0,14.0,3.0,7.0,3.0,3.0,3.0,7.0,131700.796875,16672.599609,157731.78125,16641.400391,-7.0,-7.0,2.0,66.0,1.0,0.0,0.0,1.0,112.0,34.0,14.0,66.0,0.0,6.0,79.0,37.0,96.0,34.0,34.0,0.0,0.0,25.0,25.0,25.0,1.0,31.0,96.0,96.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,False,,,0.35417,0.0,0.32292,0.07292,0.05208,6.0,0.0,0.0,0.0,0.0,0.0,5.0,12154.400391,12154.400391,12154.400391,456031.09375,17859.599609,FO,AL,151364.0,2.0,4.0,1.0,8.0,2.0,2fc785b2,6b2ae0fa,a55475b1,4.0,,,,,,,,,,,,,,,,,9.0,38c061ee,a55475b1,8.0,,,,,,,,,,,,2.0,3.0
57549,202201,100,0.0,129704.398438,5742.600098,3546.600098,0.0,2.0,0.0,0.0,0.0,10.0,0.0,0.0,-1.0,0.0,32426.201172,118964.804688,0.0,13681.713867,32426.201172,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,75000.0,CAL,10638.200195,10638.200195,61.0,,75000.0,GBA,0.0,0.227,True,,1.0,,CASH,0.227,,True,,False,a55475b1,94000.0,a55475b1,,,,a55475b1,a55475b1,160000.0,a55475b1,P30_86_84,D,44000.0,0.0,0.0,337659.8125,,0.0,0.0,0.0,34066.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,54.0,122511.398438,31820.599609,21278.0,122511.398438,-2.0,-2.0,3.0,41.0,0.0,0.0,1.0,2.0,38.0,15.0,6.0,31.0,15.0,7.0,60.0,15.0,44.0,15.0,15.0,0.0,0.0,15.0,15.0,15.0,1.0,13.0,44.0,44.0,3.0,3.0,3.0,3.0,2.0,0.0,1.0,False,,,0.34091,0.11628,0.29545,0.18605,0.13953,18.0,0.0,0.0,0.0,,2.0,7.0,10638.200195,10638.200195,10638.200195,373720.84375,126058.0,FO,,1563078.0,6.0,9.0,3.0,12.0,4.0,2fc785b2,39a0853f,a55475b1,9.0,,,,,,,,,,,,,,,,,5.0,a7fcb6e5,a55475b1,12.0,,,26815.599609,,14.0,,,,,,,8.0,2.0
57551,202201,100,0.0,71036.398438,2844.600098,0.0,0.0,1.0,0.0,0.0,0.0,2.0,-1.0,,-1.0,1.0,8357.200195,,1.0,0.0,9551.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,9.0,0.0,27095.201172,COL,0.0,0.0,4.0,0.0,27095.201172,SBA,0.0,0.45,True,,1.0,,POS,0.45,,False,,,a55475b1,200000.0,P85_114_140,,,,a55475b1,a55475b1,,a55475b1,a55475b1,T,70000.0,0.0,0.0,83400.0,,,4.0,0.0,54000.0,4.0,6.0,4.0,4.0,0.0,0.0,4.0,4.0,41783.402344,54000.0,62619.0,,-4.0,-4.0,1.0,9.0,0.0,0.0,0.0,1.0,9.0,3.0,0.0,8.0,0.0,0.0,9.0,5.0,9.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,OTHER,OTHER,0.33333,0.0,0.11111,0.11111,0.0,12.0,,,,27095.201172,1.0,2.0,0.0,0.0,0.0,75219.0,,FO,,2926195.0,1.0,3.0,1.0,4.0,1.0,2fc785b2,6b2ae0fa,a55475b1,3.0,,,,,,,,,,,,,,,,,2.0,3439d993,a55475b1,4.0,,,,,,,,,,,,5.0,5.0
57552,202201,100,0.0,183992.0,6298.800293,12155.400391,0.0,0.0,0.0,0.0,0.0,9.0,-9.0,-7.0,-9.0,0.0,7440.399902,,0.0,199322.40625,9148.400391,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,24.0,0.0,100000.0,CAL,191269.609375,191269.609375,1.0,,100000.0,GBA,0.0,0.15,,,1.0,,CASH,0.15,,True,,False,a55475b1,0.0,P94_109_143,,,,a55475b1,a55475b1,150000.0,a55475b1,P94_109_143,D,,0.0,0.0,110500.0,,-1.0,0.0,-1.0,188126.140625,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,12155.400391,104473.601562,288642.59375,12155.400391,-13.0,-13.0,1.0,23.0,2.0,0.0,1.0,0.0,37.0,26.0,62.0,31.0,0.0,0.0,33.0,27.0,32.0,26.0,26.0,0.0,0.0,24.0,24.0,24.0,0.0,0.0,32.0,32.0,30.0,30.0,22.0,22.0,1.0,0.0,1.0,False,OTHER,OTHER,0.83871,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,,0.0,6.0,191269.609375,191269.609375,191269.609375,284213.0,18889.0,BO,,747031.8,2.0,2.0,0.0,5.0,0.0,2fc785b2,a55475b1,a55475b1,3.0,,,,,,,,,,,,,,,,,2.0,a55475b1,a55475b1,5.0,,,23402.800781,,14.0,,,,,,,7.0,1.0
57569,202201,100,0.0,0.0,4682.600098,0.0,0.0,1.0,0.0,0.0,0.0,6.0,2824.0,,2824.0,2517.0,,,,,10796.400391,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,60000.0,CAL,0.0,0.0,3183.0,,60000.0,GBA,0.0,0.38,,,1.0,,CASH,0.38,,True,False,False,a55475b1,20000.0,P94_109_143,,,,a55475b1,a55475b1,40000.0,a55475b1,P94_109_143,N,6000.0,0.0,0.0,37704.0,,,2865.0,,64555.667969,2865.0,7.0,2865.0,2865.0,0.0,0.0,2865.0,2865.0,,,0.0,,2783.0,2783.0,2.0,11.0,0.0,0.0,1.0,4.0,17.0,5.0,0.0,5.0,36.0,9.0,6.0,5.0,15.0,5.0,5.0,1.0,1.0,5.0,5.0,5.0,5.0,10.0,15.0,15.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,True,,,0.33333,0.6,0.66667,0.66667,0.6,24.0,0.0,,,,2.0,3.0,0.0,0.0,0.0,95348.421875,,FO,,,4.0,4.0,1.0,4.0,4.0,2fc785b2,717ddd49,a55475b1,0.0,,,,,,,,,,,,,,,,,0.0,3439d993,a55475b1,4.0,,,17333.599609,,14.0,,,,,,,1.0,3.0
57630,202201,100,0.0,0.0,8905.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96174.0,COL,0.0,0.0,3.0,,96174.0,SBA,0.0,0.0,,,1.0,,POS,0.0,,False,,False,P148_110_5,8876.0,P198_89_166,,,,a55475b1,a55475b1,,a55475b1,a55475b1,T,12000.0,0.0,0.0,1382.800049,,,,,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,,,,,,,2.0,7.0,0.0,0.0,0.0,0.0,7.0,3.0,0.0,5.0,0.0,0.0,5.0,4.0,7.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,7.0,2.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,,,0.42857,0.0,0.28571,0.0,0.0,12.0,0.0,0.0,0.0,96174.0,0.0,1.0,0.0,0.0,0.0,9677.600586,,FO,,499975.0,1.0,2.0,1.0,5.0,1.0,2fc785b2,6b2ae0fa,a55475b1,1.0,,,,,,,,,,,,,,,,,3.0,3439d993,a55475b1,5.0,,,,,,,,,,,,4.0,1.0
57631,202201,100,0.0,,2540.600098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,24920.0,COL,0.0,0.0,,0.0,24920.0,SBA,0.0,0.39,,,1.0,,POS,0.39,,False,,,a55475b1,,P94_109_143,,,,P100_96_175,P165_57_169,46279.800781,P45_84_106,P94_109_143,D,,0.0,0.0,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,3.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,False,,,,,,,,12.0,0.0,0.0,0.0,24920.0,0.0,0.0,,,0.0,0.0,,FO,,480334.5,0.0,0.0,0.0,1.0,0.0,2fc785b2,a55475b1,a55475b1,3.0,,,,,,,,,,,,,,,,,7.0,a55475b1,a55475b1,1.0,,,,,,,,,,,,2.0,5.0
57632,202201,100,0.0,63647.402344,4732.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-7.0,-6.0,-7.0,0.0,3536.0,,0.0,10581.713867,3536.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,18.0,0.0,25998.0,COL,0.0,0.0,0.0,0.0,25998.0,SBA,0.0,0.0,,,0.0,,POS,0.0,,False,False,False,P53_45_92,50116.0,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,K,56000.0,0.0,0.0,7000.0,,-4.0,-4.0,-6.0,63647.402344,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,3536.0,63647.402344,42412.0,3536.0,-9.0,-9.0,1.0,17.0,0.0,0.0,0.0,0.0,21.0,18.0,0.0,18.0,0.0,0.0,19.0,18.0,18.0,18.0,18.0,16.0,16.0,16.0,16.0,16.0,18.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,,,1.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,25998.0,0.0,1.0,0.0,0.0,0.0,63652.0,7071.399902,FO,,17677.0,1.0,2.0,0.0,4.0,0.0,2fc785b2,a55475b1,a55475b1,1.0,,,,,,,,,,,,,,,,,1.0,a55475b1,a55475b1,4.0,,,15841.200195,,14.0,,,,,,,1.0,1.0
57633,202201,100,0.0,,8273.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,,,,,,,,,,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,0.0,200000.0,CAL,0.0,0.0,,,200000.0,GBA,0.0,0.39,,,1.0,,CASH,0.39,,False,,False,a55475b1,,P85_114_140,,,,P159_130_59,P75_90_70,64996.0,P45_84_106,P94_109_143,T,,0.0,0.0,0.0,,,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,False,,,,,,,,48.0,0.0,0.0,0.0,0.0,0.0,2.0,,,0.0,0.0,,FO,AL,6373008.0,3.0,3.0,2.0,8.0,3.0,2fc785b2,a55475b1,a55475b1,4.0,,,,,,,,,,,,,,,,,8.0,3439d993,a55475b1,8.0,,,,,,,,,,,,3.0,1.0
57634,202201,100,0.0,39948.800781,1165.800049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0,,-4.0,0.0,3994.800049,,0.0,1675.400024,3358.400146,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,10.0,0.0,12108.200195,COL,0.0,0.0,0.0,,12108.200195,SBA,0.0,0.277,,,0.0,,POS,0.277,,False,,False,P159_130_59,16494.201172,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,K,50000.0,0.0,0.0,5000.0,,,0.0,-1.0,19798.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,4949.600098,20887.201172,20150.800781,,-26.0,-26.0,2.0,10.0,0.0,0.0,0.0,0.0,10.0,4.0,0.0,10.0,0.0,0.0,11.0,6.0,10.0,4.0,4.0,0.0,0.0,2.0,2.0,2.0,6.0,0.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False,,,0.4,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,13998.0,0.0,0.0,0.0,0.0,0.0,39950.800781,,FO,,15263.65,2.0,2.0,1.0,3.0,1.0,2fc785b2,a55475b1,a55475b1,1.0,,,,,,,,,,,,,,,,,1.0,a55475b1,a55475b1,3.0,,,,,,,,,,,,0.0,1.0


In [9]:
train_constructor = DatasetConstructor('train')
df_train = train_constructor.load()
df_train
# data.info()

label = df_train['target']
df_train = df_train.drop(columns=['target'])
df_train



Memory usage of dataframe is 1168.76 MB
Memory usage of dataframe became 1160.02 MB


Unnamed: 0_level_0,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,clientscnt_946L,cntincpaycont9m_3716944L,cntpmts24_3658933L,commnoinclast6m_3546845L,credamount_770A,credtype_322L,currdebt_22A,currdebtcredtyperange_828A,daysoverduetolerancedd_3976961L,deferredmnthsnum_166L,disbursedcredamount_1113A,disbursementtype_67L,downpmt_116A,eir_270L,equalitydataagreement_891L,equalityempfrom_62L,homephncnt_628L,inittransactionamount_650A,inittransactioncode_186L,interestrate_311L,interestrategrace_34L,isbidproduct_1095L,isbidproductrequest_292L,isdebitcard_729L,lastapprcommoditycat_1041M,lastapprcredamount_781A,lastcancelreason_561M,lastdependentsnum_448L,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastst_736L,maininc_215A,mastercontrelectronic_519L,mastercontrexist_109L,maxannuity_159A,maxannuity_4075009A,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdebt4_972A,maxdpdfrom6mto36m_3546853P,maxdpdinstlnum_3546846P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,mobilephncnt_593L,monthsannuity_845L,numactivecreds_622L,numactivecredschannel_414L,numactiverelcontr_750L,numcontrs3months_479L,numincomingpmts_3546848L,numinstlallpaidearly3d_817L,numinstls_657L,numinstlsallpaid_934L,numinstlswithdpd10_728L,numinstlswithdpd5_4187116L,numinstlswithoutdpd_562L,numinstmatpaidtearly2d_4499204L,numinstpaid_4499208L,numinstpaidearly3d_3546850L,numinstpaidearly3dest_4493216L,numinstpaidearly5d_1087L,numinstpaidearly5dest_4493211L,numinstpaidearly5dobd_4499205L,numinstpaidearly_338L,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,contractssum_5085716L,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1
0,201901,0,,,1917.599976,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,30000.000000,CAL,0.000000,0.000000,,0.0,30000.000000,GBA,0.0,0.4500,,,0.0,,CASH,0.4500,,False,,,a55475b1,,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,24.0,0.0,0.0,,,0.0,0.0,,,0.000000,0.000000,,BO,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,201901,0,,,3134.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,19999.800781,CAL,0.000000,0.000000,,0.0,19999.800781,GBA,0.0,0.2999,,,0.0,,CASH,0.2999,0.0,False,,,a55475b1,,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,18.0,0.0,0.0,,,0.0,0.0,,,0.000000,0.000000,,BO,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,201901,0,,,4937.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,78000.000000,CAL,0.000000,0.000000,,0.0,78000.000000,GBA,0.0,0.4500,,,1.0,,CASH,0.4500,,False,,,a55475b1,,a55475b1,,,,a55475b1,a55475b1,10000.000000,a55475b1,a55475b1,D,,0.0,0.0,0.000000,,,,,0.000000,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,2.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,false,OTHER,OTHER,,,,,,36.0,0.0,0.0,,,0.0,0.0,,,0.000000,0.000000,,BO,AL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,201901,0,,,4643.600098,0.000000,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,40000.000000,CAL,0.000000,0.000000,,0.0,40000.000000,GBA,0.0,0.4200,true,true,0.0,,CASH,0.4200,0.0,False,,,a55475b1,,P94_109_143,,,,a55475b1,a55475b1,59999.800781,P94_109_143,a55475b1,D,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,1.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,false,OTHER,OTHER,,,,,,12.0,0.0,0.0,,,1.0,1.0,,,0.000000,0.000000,,BO,AL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,201901,0,,,3390.199951,0.000000,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,44000.000000,CAL,0.000000,0.000000,,0.0,44000.000000,GBA,0.0,0.4500,,,1.0,,CASH,0.4500,,False,,,a55475b1,,P24_27_36,,,,a55475b1,a55475b1,,a55475b1,a55475b1,T,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,false,OTHER,OTHER,,,,,,24.0,0.0,0.0,,,0.0,0.0,,,0.000000,0.000000,,BO,AL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2703450,202010,91,0.0,176561.359375,3675.400146,0.000000,0.0,0.0,0.0,0.0,0.0,10.0,-23.0,-43.0,-23.0,0.0,7356.800293,,0.0,16392.496094,6750.200195,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,24.0,0.0,30000.000000,CAL,0.000000,0.000000,8.0,0.0,30000.000000,GBA,0.0,0.4200,,,0.0,,CASH,0.4200,,True,,,P12_6_178,20020.0,a55475b1,,,,a55475b1,a55475b1,150000.000000,P94_109_143,P94_109_143,K,36000.0,0.0,0.0,75521.906250,,0.0,0.0,0.0,105019.789062,0.0,6.0,0.0,0.0,0.0,0.0,0.0,8.0,46718.199219,49651.402344,77533.757812,14346.319336,-144.0,-144.0,3.0,65.0,1.0,0.0,1.0,0.0,92.0,106.0,0.0,112.0,0.0,1.0,117.0,106.0,113.0,103.0,103.0,11.0,11.0,99.0,99.0,99.0,12.0,4.0,113.0,113.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,false,OTHER,OTHER,0.91150,0.02655,0.03540,0.03540,0.0354,12.0,0.0,0.0,0.0,0.0,0.0,8.0,0.000000,0.000000,0.000000,428159.656250,14346.319336,FO,,52863.589844,0.0,0.0,0.0,0.0,0.0,2fc785b2,a55475b1,a55475b1,0.0,,,,,,,,,,,,,,,,,1.0,a55475b1,a55475b1,0.0,,,12155.000000,,12.0,,,,,,,1.0,1.0
2703451,202010,91,0.0,301276.468750,7088.600098,6191.600098,0.0,0.0,5.0,0.0,0.0,5.0,-18.0,-12.0,-18.0,0.0,12553.200195,,0.0,105129.312500,15780.400391,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,21.0,0.0,100000.000000,CAL,68098.398438,68098.398438,2.0,0.0,40739.539062,GBA,0.0,0.4000,,,2.0,,CASH,0.4000,,True,,,a55475b1,0.0,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,A,,0.0,0.0,117251.601562,,0.0,2.0,2.0,202775.546875,0.0,7.0,2.0,2.0,0.0,2.0,2.0,2.0,40499.800781,116813.398438,250031.203125,40499.804688,-92.0,-92.0,2.0,55.0,2.0,0.0,1.0,0.0,69.0,70.0,24.0,73.0,0.0,0.0,76.0,69.0,75.0,70.0,70.0,0.0,0.0,70.0,70.0,70.0,0.0,1.0,75.0,75.0,11.0,11.0,11.0,11.0,0.0,0.0,0.0,false,OTHER,OTHER,0.94595,0.00000,0.01351,0.00000,0.0000,24.0,0.0,0.0,0.0,,0.0,3.0,68098.398438,68098.398438,68098.398438,701247.312500,40499.804688,FO,,324608.531250,0.0,0.0,0.0,0.0,0.0,2fc785b2,a55475b1,a55475b1,1.0,,,,,,,,,,,,,,,,,0.0,a55475b1,a55475b1,0.0,,,22904.599609,,12.0,,,,,,,1.0,2.0
2703452,202010,91,0.0,14232.400391,7788.800293,0.000000,0.0,0.0,0.0,0.0,0.0,3.0,-12.0,,-16.0,1.0,2662.400146,,,,1500.599976,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,60000.000000,CAL,0.000000,0.000000,4.0,0.0,60000.000000,GBA,0.0,0.4200,,,0.0,,CASH,0.4200,,True,,,P159_130_59,3998.0,P180_60_137,,,,a55475b1,a55475b1,,a55475b1,a55475b1,T,24000.0,0.0,0.0,6600.000000,,,-27.0,,17143.400391,4.0,5.0,0.0,4.0,0.0,0.0,0.0,4.0,3243.400146,4182.000000,0.000000,,-27.0,-55.0,1.0,9.0,0.0,0.0,0.0,1.0,9.0,6.0,0.0,6.0,0.0,0.0,8.0,6.0,9.0,6.0,6.0,3.0,3.0,6.0,6.0,6.0,3.0,3.0,9.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,false,OTHER,OTHER,0.66667,0.00000,0.33333,0.11111,0.0000,11.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000,0.000000,24002.000000,,BO,,102738.757812,2.0,2.0,0.0,3.0,2.0,2fc785b2,a55475b1,a55475b1,0.0,,,,,,,,,,,,,,,,,1.0,a55475b1,a55475b1,3.0,,,,,,,,,,,,0.0,4.0
2703453,202010,91,0.0,197371.578125,1195.400024,2827.199951,0.0,0.0,36.0,0.0,0.0,9.0,-33.0,-64.0,-34.0,0.0,8212.600586,,0.0,47943.062500,9921.200195,CA,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,7.0,23.0,0.0,6000.000000,CAL,46806.601562,46806.601562,38.0,0.0,6000.000000,GBA,0.0,0.4200,,,1.0,,CASH,0.4200,,True,,,a55475b1,0.0,a55475b1,,,,P159_130_59,P174_113_42,2198.000000,a55475b1,a55475b1,A,,0.0,0.0,163202.000000,,-66.0,0.0,-33.0,126780.000000,2.0,13.0,0.0,0.0,0.0,0.0,0.0,34.0,88740.804688,94265.203125,81604.601562,2827.199951,-68.0,-68.0,2.0,79.0,2.0,1.0,1.0,0.0,119.0,73.0,30.0,89.0,7.0,9.0,109.0,74.0,119.0,78.0,73.0,12.0,12.0,61.0,61.0,61.0,13.0,23.0,115.0,119.0,17.0,17.0,17.0,17.0,0.0,0.0,0.0,false,OTHER,OTHER,0.69643,0.04348,0.20536,0.10811,0.0991,6.0,0.0,0.0,0.0,0.0,0.0,4.0,46806.601562,46806.601562,46806.601562,440145.312500,5654.399902,BO,,212683.296875,2.0,2.0,1.0,4.0,1.0,2fc785b2,6b2ae0fa,a55475b1,1.0,,,,,,,,,,,,,,,,,3.0,3439d993,a55475b1,4.0,,,15792.400391,,14.0,,,,,,,2.0,1.0


In [10]:
df_all = pd.concat([df_train, df_test], axis=0)
df_all

  df_all = pd.concat([df_train, df_test], axis=0)
  df_all = pd.concat([df_train, df_test], axis=0)
  df_all = pd.concat([df_train, df_test], axis=0)
  df_all = pd.concat([df_train, df_test], axis=0)


Unnamed: 0_level_0,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_629L,applicationscnt_867L,avgdbddpdlast24m_3658932P,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,avgmaxdpdlast9m_3716943P,avgoutstandbalancel6m_4187114A,avgpmtlast12m_4525200A,bankacctype_710L,cardtype_51L,clientscnt12m_3712952L,clientscnt3m_3712950L,clientscnt6m_3712949L,clientscnt_100L,clientscnt_1022L,clientscnt_1071L,clientscnt_1130L,clientscnt_136L,clientscnt_157L,clientscnt_257L,clientscnt_304L,clientscnt_360L,clientscnt_493L,clientscnt_533L,clientscnt_887L,clientscnt_946L,cntincpaycont9m_3716944L,cntpmts24_3658933L,commnoinclast6m_3546845L,credamount_770A,credtype_322L,currdebt_22A,currdebtcredtyperange_828A,daysoverduetolerancedd_3976961L,deferredmnthsnum_166L,disbursedcredamount_1113A,disbursementtype_67L,downpmt_116A,eir_270L,equalitydataagreement_891L,equalityempfrom_62L,homephncnt_628L,inittransactionamount_650A,inittransactioncode_186L,interestrate_311L,interestrategrace_34L,isbidproduct_1095L,isbidproductrequest_292L,isdebitcard_729L,lastapprcommoditycat_1041M,lastapprcredamount_781A,lastcancelreason_561M,lastdependentsnum_448L,lastotherinc_902A,lastotherlnsexpense_631A,lastrejectcommoditycat_161M,lastrejectcommodtypec_5251769M,lastrejectcredamount_222A,lastrejectreason_759M,lastrejectreasonclient_4145040M,lastst_736L,maininc_215A,mastercontrelectronic_519L,mastercontrexist_109L,maxannuity_159A,maxannuity_4075009A,maxdbddpdlast1m_3658939P,maxdbddpdtollast12m_3658940P,maxdbddpdtollast6m_4187119P,maxdebt4_972A,maxdpdfrom6mto36m_3546853P,maxdpdinstlnum_3546846P,maxdpdlast12m_727P,maxdpdlast24m_143P,maxdpdlast3m_392P,maxdpdlast6m_474P,maxdpdlast9m_1059P,maxdpdtolerance_374P,maxinstallast24m_3658928A,maxlnamtstart6m_4525199A,maxoutstandbalancel12m_4187113A,maxpmtlast3m_4525190A,mindbddpdlast24m_3658935P,mindbdtollast24m_4525191P,mobilephncnt_593L,monthsannuity_845L,numactivecreds_622L,numactivecredschannel_414L,numactiverelcontr_750L,numcontrs3months_479L,numincomingpmts_3546848L,numinstlallpaidearly3d_817L,numinstls_657L,numinstlsallpaid_934L,numinstlswithdpd10_728L,numinstlswithdpd5_4187116L,numinstlswithoutdpd_562L,numinstmatpaidtearly2d_4499204L,numinstpaid_4499208L,numinstpaidearly3d_3546850L,numinstpaidearly3dest_4493216L,numinstpaidearly5d_1087L,numinstpaidearly5dest_4493211L,numinstpaidearly5dobd_4499205L,numinstpaidearly_338L,numinstpaidearlyest_4493214L,numinstpaidlastcontr_4325080L,numinstpaidlate1d_3546852L,numinstregularpaid_973L,numinstregularpaidest_4493210L,numinsttopaygr_769L,numinsttopaygrest_4493213L,numinstunpaidmax_3546851L,numinstunpaidmaxest_4493212L,numnotactivated_1143L,numpmtchanneldd_318L,numrejects9m_859L,opencred_647L,paytype1st_925L,paytype_783L,pctinstlsallpaidearl3d_427L,pctinstlsallpaidlat10d_839L,pctinstlsallpaidlate1d_3546856L,pctinstlsallpaidlate4d_3546849L,pctinstlsallpaidlate6d_3546844L,pmtnum_254L,posfpd10lastmonth_333P,posfpd30lastmonth_3976960P,posfstqpd30lastmonth_3976962P,price_1097A,sellerplacecnt_915L,sellerplacescnt_216L,sumoutstandtotal_3546847A,sumoutstandtotalest_4493215A,totaldebt_9A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,typesuite_864L,contractssum_5085716L,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtaverage_4955615A,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1
0,201901,0,,,1917.599976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,30000.000000,CAL,0.0,0.0,,0.0,30000.000000,GBA,0.0,0.4500,,,0.0,,CASH,0.4500,,False,,,a55475b1,,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,24.0,0.0,0.0,,,0.0,0.0,,,0.0,0.000000,,BO,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,201901,0,,,3134.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,19999.800781,CAL,0.0,0.0,,0.0,19999.800781,GBA,0.0,0.2999,,,0.0,,CASH,0.2999,0.0,False,,,a55475b1,,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,OTHER,OTHER,,,,,,18.0,0.0,0.0,,,0.0,0.0,,,0.0,0.000000,,BO,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,201901,0,,,4937.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,78000.000000,CAL,0.0,0.0,,0.0,78000.000000,GBA,0.0,0.4500,,,1.0,,CASH,0.4500,,False,,,a55475b1,,a55475b1,,,,a55475b1,a55475b1,10000.000000,a55475b1,a55475b1,D,,0.0,0.0,0.000000,,,,,0.000000,,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,2.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,false,OTHER,OTHER,,,,,,36.0,0.0,0.0,,,0.0,0.0,,,0.0,0.000000,,BO,AL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,201901,0,,,4643.600098,0.0,0.0,1.0,0.0,2.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,40000.000000,CAL,0.0,0.0,,0.0,40000.000000,GBA,0.0,0.4200,true,true,0.0,,CASH,0.4200,0.0,False,,,a55475b1,,P94_109_143,,,,a55475b1,a55475b1,59999.800781,P94_109_143,a55475b1,D,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,1.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,1.0,false,OTHER,OTHER,,,,,,12.0,0.0,0.0,,,1.0,1.0,,,0.0,0.000000,,BO,AL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,201901,0,,,3390.199951,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,44000.000000,CAL,0.0,0.0,,0.0,44000.000000,GBA,0.0,0.4500,,,1.0,,CASH,0.4500,,False,,,a55475b1,,P24_27_36,,,,a55475b1,a55475b1,,a55475b1,a55475b1,T,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,false,OTHER,OTHER,,,,,,24.0,0.0,0.0,,,0.0,0.0,,,0.0,0.000000,,BO,AL,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57630,202201,100,0.0,0.000000,8905.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,0.0,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96174.000000,COL,0.0,0.0,3.0,,96174.000000,SBA,0.0,0.0000,,,1.0,,POS,0.0000,,False,,false,P148_110_5,8876.000000,P198_89_166,,,,a55475b1,a55475b1,,a55475b1,a55475b1,T,12000.0,0.0,0.0,1382.800049,,,,,0.000000,0.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,,,,,,,2.0,7.0,0.0,0.0,0.0,0.0,7.0,3.0,0.0,5.0,0.0,0.0,5.0,4.0,7.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,7.0,2.0,7.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,false,,,0.42857,0.0,0.28571,0.0,0.0,12.0,0.0,0.0,0.0,96174.0,0.0,1.0,0.0,0.0,0.0,9677.600586,,FO,,4.999750e+05,1.0,2.0,1.0,5.0,1.0,2fc785b2,6b2ae0fa,a55475b1,1.0,,,,,,,,,,,,,,,,,3.0,3439d993,a55475b1,5.0,,,,,,,,,,,,4.0,1.0
57631,202201,100,0.0,,2540.600098,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,24920.000000,COL,0.0,0.0,,0.0,24920.000000,SBA,0.0,0.3900,,,1.0,,POS,0.3900,,False,,,a55475b1,,P94_109_143,,,,P100_96_175,P165_57_169,46279.800781,P45_84_106,P94_109_143,D,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,3.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,false,,,,,,,,12.0,0.0,0.0,0.0,24920.0,0.0,0.0,,,0.0,0.000000,,FO,,4.803345e+05,0.0,0.0,0.0,1.0,0.0,2fc785b2,a55475b1,a55475b1,3.0,,,,,,,,,,,,,,,,,7.0,a55475b1,a55475b1,1.0,,,,,,,,,,,,2.0,5.0
57632,202201,100,0.0,63647.402344,4732.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-7.0,-6.0,-7.0,0.0,3536.000000,,0.0,10581.713867,3536.000000,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,18.0,0.0,25998.000000,COL,0.0,0.0,0.0,0.0,25998.000000,SBA,0.0,0.0000,,,0.0,,POS,0.0000,,False,false,false,P53_45_92,50116.000000,a55475b1,,,,a55475b1,a55475b1,,a55475b1,a55475b1,K,56000.0,0.0,0.0,7000.000000,,-4.0,-4.0,-6.0,63647.402344,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,3536.000000,63647.402344,42412.000000,3536.0,-9.0,-9.0,1.0,17.0,0.0,0.0,0.0,0.0,21.0,18.0,0.0,18.0,0.0,0.0,19.0,18.0,18.0,18.0,18.0,16.0,16.0,16.0,16.0,16.0,18.0,0.0,18.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,false,,,1.00000,0.0,0.00000,0.0,0.0,6.0,0.0,0.0,0.0,25998.0,0.0,1.0,0.0,0.0,0.0,63652.000000,7071.399902,FO,,1.767700e+04,1.0,2.0,0.0,4.0,0.0,2fc785b2,a55475b1,a55475b1,1.0,,,,,,,,,,,,,,,,,1.0,a55475b1,a55475b1,4.0,,,15841.200195,,14.0,,,,,,,1.0,1.0
57633,202201,100,0.0,,8273.000000,0.0,0.0,0.0,0.0,0.0,0.0,3.0,,,,,,,,,,CA,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,,0.0,200000.000000,CAL,0.0,0.0,,,200000.000000,GBA,0.0,0.3900,,,1.0,,CASH,0.3900,,False,,false,a55475b1,,P85_114_140,,,,P159_130_59,P75_90_70,64996.000000,P45_84_106,P94_109_143,T,,0.0,0.0,0.000000,,,,,0.000000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,false,,,,,,,,48.0,0.0,0.0,0.0,0.0,0.0,2.0,,,0.0,0.000000,,FO,AL,6.373008e+06,3.0,3.0,2.0,8.0,3.0,2fc785b2,a55475b1,a55475b1,4.0,,,,,,,,,,,,,,,,,8.0,3439d993,a55475b1,8.0,,,,,,,,,,,,3.0,1.0


In [11]:
filter_columns = ['WEEK_NUM', 'numrejects9m_859L',
 'maxdpdinstlnum_3546846P',
 'pmtaverage_3A',
 'for3years_128L',
 'isbidproductrequest_292L',
 'formonth_535L',
 'numinstpaidlastcontr_4325080L',
 'forquarter_634L',
 'mindbdtollast24m_4525191P',
 'clientscnt_1130L',
 'lastrejectreason_759M',
 'maxdpdlast3m_392P',
 'lastrejectreasonclient_4145040M',
 'disbursementtype_67L',
 'numinstpaid_4499208L',
 'education_1103M',
 'maxdpdlast24m_143P',
 'avgoutstandbalancel6m_4187114A',
 'days90_310L',
 'disbursedcredamount_1113A',
 'numactivecredschannel_414L',
 'thirdquarter_1082L',
 'avgdbdtollast24m_4525197P',
 'currdebt_22A',
 'avgpmtlast12m_4525200A',
 'avgdbddpdlast24m_3658932P',
 'price_1097A',
 'numinstpaidearly5dobd_4499205L',
 'lastotherlnsexpense_631A',
 'lastapprcredamount_781A',
 'cntincpaycont9m_3716944L',
 'deferredmnthsnum_166L',
 'contractssum_5085716L',
 'lastdependentsnum_448L',
 'clientscnt_493L',
 'typesuite_864L',
 'applications30d_658L',
 'equalityempfrom_62L',
 'formonth_206L',
 'firstquarter_103L',
 'forweek_1077L',
 'riskassesment_940T',
 'clientscnt_887L',
 'maxpmtlast3m_4525190A',
 'foryear_818L',
 'numinstpaidlate1d_3546852L',
 'for3years_584L',
 'sellerplacescnt_216L',
 'numinsttopaygr_769L',
 'mindbddpdlast24m_3658935P',
 'lastst_736L',
 'avglnamtstart24m_4525187A',
 'pmtcount_4527229L',
 'clientscnt_533L',
 'applicationscnt_629L',
 'totaldebt_9A',
 'numinstregularpaid_973L',
 'pmtaverage_4527227A',
 'posfpd10lastmonth_333P',
 'avgdbddpdlast3m_4187120P',
 'foryear_850L',
 'pctinstlsallpaidlate1d_3546856L',
 'numactiverelcontr_750L',
 'twobodfilling_608L']

In [12]:
df_all_filtered = df_all[filter_columns]
df_all_filtered


numeric_features = df_all_filtered.select_dtypes(include=np.number).columns
df_all_filtered[numeric_features] = df_all_filtered[numeric_features].apply(
    lambda x: (x-x.mean())/(x.std()))
df_all_filtered[numeric_features] = df_all_filtered[numeric_features].fillna(0)

# cat_features = [col for col in df_all_filtered.columns if df_all_filtered[col].dtype.name == 'category' or df_all_filtered[col].dtype.name == 'object']
# df_all_filtered = pd.get_dummies(df_all_filtered, columns=cat_features, dtype=np.float32)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_filtered[numeric_features] = df_all_filtered[numeric_features].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_filtered[numeric_features] = df_all_filtered[numeric_features].fillna(0)


In [13]:
# x_test = x_test[filter_columns]
# data = data[filter_columns]

In [14]:
# x_train, x_valid, y_train, y_valid = train_test_split(data, label, test_size=0.3, shuffle=True, random_state=1293, stratify=label)
from sklearn.preprocessing import LabelEncoder
cv = StratifiedKFold(n_splits=5, shuffle=False)

types = df_all_filtered.dtypes
# df_train = data
weeks = df_train['WEEK_NUM']
df_all_filtered = df_all_filtered.drop(columns=['WEEK_NUM'])

cat_features = [col for col in df_all_filtered.columns if df_all_filtered[col].dtype.name == 'category' or df_all_filtered[col].dtype.name == 'object']

categorical_dims =  {}
for col in df_all_filtered.columns:
    if types[col] == 'object' or types[col] == 'category':
        print(col, df_all_filtered[col].nunique())
        l_enc = LabelEncoder()
#         train[col] = train[col].fillna("VV_likely")
        df_all_filtered[col] = l_enc.fit_transform(df_all_filtered[col].values)
#         categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)

features = [col for col in df_all_filtered.columns] 
cat_idxs = [i for i, f in enumerate(features) if f in cat_features]
cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in cat_features]

df_train = df_all_filtered.head(len(df_train))
df_test = df_all_filtered.tail(len(df_test))

# df_train[cat_features] = df_train[cat_features].astype(str)
# df_test[cat_features] = df_test[cat_features].astype(str)

isbidproductrequest_292L 2
lastrejectreason_759M 18
lastrejectreasonclient_4145040M 14
disbursementtype_67L 3
education_1103M 5
typesuite_864L 1
equalityempfrom_62L 2
lastst_736L 11
twobodfilling_608L 2


In [15]:
# x_all = pd.concat([x_train, x_valid], axis=0)
# cat_features = [col for col in x_all.columns if x_all[col].dtype == 'category']
# for col in cat_features:
#     x_all[col] = x_all[col].astype('category')
#     x_all[col] = x_all[col].cat.add_categories('Missing').fillna('Missing')
#     x_test[col] = x_test[col].astype('category')
#     x_test[col] = x_test[col].cat.set_categories(x_all[col].cat.categories).fillna('Missing')

# x_train = x_all.iloc[:len(x_train)]
# x_valid = x_all.iloc[len(x_train):]


In [16]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
X_train, X_valid, y_train, y_valid = train_test_split(df_train, label, test_size=0.3) 

tabnet_params = dict(
    n_d = 8,
    n_a = 8,
    n_steps = 5,
    gamma = 1.3,
    lambda_sparse = 0,
    optimizer_fn = torch.optim.Adam,
    optimizer_params = dict(lr = 2e-2, weight_decay = 1e-5),
    mask_type = "entmax",
    scheduler_params = dict(
        mode = "min", patience = 5, min_lr = 1e-5, factor = 0.9),
    scheduler_fn = ReduceLROnPlateau,
    seed = 8787,
    verbose = 2
)

unsupervised_model = TabNetPretrainer(**tabnet_params)

unsupervised_model.fit(
    X_train=X_train.values,
    eval_set=[X_valid.values],
    pretraining_ratio=0.8,
    patience=10
)



epoch 0  | loss: 576.97247| val_0_unsup_loss_numpy: 0.9545400142669678|  0:01:43s
epoch 2  | loss: 1.03936 | val_0_unsup_loss_numpy: 1.1897599697113037|  0:05:11s
epoch 4  | loss: 0.9704  | val_0_unsup_loss_numpy: 0.9103699922561646|  0:08:36s
epoch 6  | loss: 0.99055 | val_0_unsup_loss_numpy: 1.1971800327301025|  0:12:00s
epoch 8  | loss: 1.05973 | val_0_unsup_loss_numpy: 0.9663400053977966|  0:15:23s
epoch 10 | loss: 1.05916 | val_0_unsup_loss_numpy: 0.8922899961471558|  0:18:45s
epoch 12 | loss: 1.11243 | val_0_unsup_loss_numpy: 1.5497699975967407|  0:22:06s
epoch 14 | loss: 1.10183 | val_0_unsup_loss_numpy: 1.0291199684143066|  0:25:26s
epoch 16 | loss: 1.2746  | val_0_unsup_loss_numpy: 1.0593899488449097|  0:28:47s
epoch 18 | loss: 1.04496 | val_0_unsup_loss_numpy: 0.9204800128936768|  0:32:09s
epoch 20 | loss: 1.08782 | val_0_unsup_loss_numpy: 28.975709915161133|  0:35:30s

Early stopping occurred at epoch 20 with best_epoch = 10 and best_val_0_unsup_loss_numpy = 0.89228999614715



In [17]:
from pytorch_tabnet.augmentations import ClassificationSMOTE
aug = ClassificationSMOTE(p=0.2)

fitted_models_cat = []
fitted_models_lgb = []
fitted_models_xgb = []
fitted_models_tabnet = []

cv_scores_cat = []
cv_scores_lgb = []
cv_scores_xgb = []
cv_scores_tabnet = []
device = 'cuda' if torch.cuda.is_available() else 'cpu'
for idx_train, idx_valid in cv.split(df_train, label, groups=weeks):
    x_train, y_train = df_train.iloc[idx_train], label.iloc[idx_train]
    x_valid, y_valid = df_train.iloc[idx_valid], label.iloc[idx_valid]
    
    
#     x_train_d  = torch.tensor(x_train.values, dtype=torch.float32)
#     x_valid_d  = torch.tensor(x_valid.values, dtype=torch.float32)
#     y_train_d  = torch.tensor(y_train.values, dtype=torch.float32)
#     y_valid_d  = torch.tensor(y_valid.values, dtype=torch.float32)
    
#     # for catboost    
#     train_pool = Pool(x_train, y_train,cat_features=cat_features)
#     val_pool = Pool(x_valid, y_valid,cat_features=cat_features)
    
#     cat_model = CatBoostClassifier(
#         iterations=1200,                 
#         depth=12,                        
#         learning_rate=0.03,               
#         eval_metric='AUC',               
#         random_seed=42,                  
#         bootstrap_type='Bayesian',       
#         bagging_temperature=1,           
#         od_type='Iter',                  
#         od_wait=50,
#         task_type='GPU'
#     )

#     cat_model.fit(
#         x_train, y_train,
#         eval_set=(x_valid, y_valid),
#         cat_features=cat_features,
#         use_best_model=True,
#         verbose=True
#     )
    
#     fitted_models_cat.append(cat_model)
#     y_pred_valid = cat_model.predict_proba(x_valid)[:,1]
#     auc_score = roc_auc_score(y_valid, y_pred_valid)
#     cv_scores_cat.append(auc_score)
    
#     x_train[cat_features] = x_train[cat_features].astype("category")
#     x_valid[cat_features] = x_valid[cat_features].astype("category")
    
#     lgb_params = {
#         "boosting_type": "gbdt",
#         "objective": "binary",
#         "metric": "auc",
#         "max_depth": 10,
#         "num_leaves": 64,
#         "min_data_in_leaf": 10,
#         "learning_rate": 1e-3,
#         "feature_fraction": 0.5,
#         "bagging_fraction": 0.5,
#         "bagging_freq": 5,
#         "n_estimators": 1000,
#         'min_data_in_bin':1,
#         'max_bin': 64,
#         "verbose": -1,
#         "random_state": 42, 
#         'n_jobs': 10
#     }
    
#     lgb_model = lgb.LGBMClassifier(**lgb_params)
#     lgb_model.fit(
#         x_train, y_train,
#         eval_set = [(x_valid, y_valid)],
#         callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )
    
#     fitted_models_lgb.append(lgb_model)
#     y_pred_valid = lgb_model.predict_proba(x_valid)[:,1]
#     auc_score = roc_auc_score(y_valid, y_pred_valid)
#     cv_scores_lgb.append(auc_score)
    
#     xgb_model = XGBClassifier(
#         device="cuda",
#         objective='binary:logistic',
#         tree_method="hist",
#         enable_categorical=True,
#         eval_metric='auc',
#         subsample=0.8,
#         colsample_bytree=0.8,
#         min_child_weight=1,
#         max_depth=20,
#         n_estimators=1200,
#         random_state=42,
#     )

#     # Training the model on the training data
#     xgb_model.fit(
#         x_train, y_train,
#         eval_set=[(x_valid, y_valid)],
#         early_stopping_rounds=100,
#         verbose=True,
#     )
#     fitted_models_xgb.append(xgb_model)
#     y_pred_valid = xgb_model.predict_proba(x_valid)[:,1]
#     auc_score = roc_auc_score(y_valid, y_pred_valid)
#     cv_scores_xgb.append(auc_score)
    clf = TabNetClassifier(cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=4,)  #TabNetRegressor()
    
    clf.fit(
        x_train.values, y_train.values,
        eval_set=[(x_valid.values, y_valid.values)],
        eval_metric=["auc"],
#         loss_fn= nn.CrossEntropyLoss(weight=torch.tensor([1., 5.]).to(device)),
        patience=5,
        max_epochs=15,
        weights=1,
        augmentations=aug,
        from_unsupervised=unsupervised_model
    )
    fitted_models_tabnet.append(clf)
    y_pred_valid = clf.predict_proba(x_valid.values)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_tabnet.append(auc_score)
#     preds = clf.predict(X_test)

    
    
    
# print("CV AUC scores: ", cv_scores_cat)
# print("Maximum CV AUC score: ", max(cv_scores_cat))

# print("CV AUC scores: ", cv_scores_lgb)
# print("Maximum CV AUC score: ", max(cv_scores_lgb))

# print("CV AUC scores: ", cv_scores_xgb)
# print("Maximum CV AUC score: ", max(cv_scores_xgb))

print("CV AUC scores: ", cv_scores_tabnet)
print("Maximum CV AUC score: ", max(cv_scores_tabnet))


# with open("fitted_models_cat.pickle", "wb") as f:
#     pickle.dump(fitted_models_cat, f)
# with open("fitted_models_lgb.pickle", "wb") as f:
#     pickle.dump(fitted_models_lgb, f)
# with open("fitted_models_xgb.pickle", "wb") as f:
#     pickle.dump(fitted_models_xgb, f)



epoch 0  | loss: 0.58229 | val_0_auc: 0.68985 |  0:01:27s
epoch 1  | loss: 0.56877 | val_0_auc: 0.69041 |  0:02:57s
epoch 2  | loss: 0.56444 | val_0_auc: 0.69808 |  0:04:25s
epoch 3  | loss: 0.5616  | val_0_auc: 0.69258 |  0:05:52s
epoch 4  | loss: 0.56089 | val_0_auc: 0.69721 |  0:07:19s
epoch 5  | loss: 0.55907 | val_0_auc: 0.69247 |  0:08:47s
epoch 6  | loss: 0.55738 | val_0_auc: 0.6823  |  0:10:14s
epoch 7  | loss: 0.55642 | val_0_auc: 0.69113 |  0:11:41s

Early stopping occurred at epoch 7 with best_epoch = 2 and best_val_0_auc = 0.69808




epoch 0  | loss: 0.58171 | val_0_auc: 0.69228 |  0:01:27s
epoch 1  | loss: 0.56984 | val_0_auc: 0.69252 |  0:02:55s
epoch 2  | loss: 0.5668  | val_0_auc: 0.69691 |  0:04:24s
epoch 3  | loss: 0.56389 | val_0_auc: 0.69633 |  0:05:52s
epoch 4  | loss: 0.562   | val_0_auc: 0.69715 |  0:07:18s
epoch 5  | loss: 0.55956 | val_0_auc: 0.69835 |  0:08:45s
epoch 6  | loss: 0.5584  | val_0_auc: 0.69141 |  0:10:11s
epoch 7  | loss: 0.55683 | val_0_auc: 0.69919 |  0:11:38s
epoch 8  | loss: 0.55645 | val_0_auc: 0.68917 |  0:13:04s
epoch 9  | loss: 0.55594 | val_0_auc: 0.70162 |  0:14:31s
epoch 10 | loss: 0.55591 | val_0_auc: 0.70184 |  0:15:58s
epoch 11 | loss: 0.55578 | val_0_auc: 0.69704 |  0:17:24s
epoch 12 | loss: 0.55421 | val_0_auc: 0.69648 |  0:18:51s
epoch 13 | loss: 0.55457 | val_0_auc: 0.6994  |  0:20:17s
epoch 14 | loss: 0.5531  | val_0_auc: 0.69821 |  0:21:43s
Stop training because you reached max_epochs = 15 with best_epoch = 10 and best_val_0_auc = 0.70184




epoch 0  | loss: 0.60286 | val_0_auc: 0.80296 |  0:01:25s
epoch 1  | loss: 0.58943 | val_0_auc: 0.80024 |  0:02:50s
epoch 2  | loss: 0.5865  | val_0_auc: 0.80876 |  0:04:15s
epoch 3  | loss: 0.58376 | val_0_auc: 0.80497 |  0:05:41s
epoch 4  | loss: 0.58232 | val_0_auc: 0.80643 |  0:07:07s
epoch 5  | loss: 0.58069 | val_0_auc: 0.80363 |  0:08:33s
epoch 6  | loss: 0.5792  | val_0_auc: 0.80347 |  0:09:59s
epoch 7  | loss: 0.57841 | val_0_auc: 0.8024  |  0:11:25s

Early stopping occurred at epoch 7 with best_epoch = 2 and best_val_0_auc = 0.80876




epoch 0  | loss: 0.60495 | val_0_auc: 0.80662 |  0:01:27s
epoch 1  | loss: 0.5921  | val_0_auc: 0.80633 |  0:02:57s
epoch 2  | loss: 0.58865 | val_0_auc: 0.80751 |  0:04:23s
epoch 3  | loss: 0.58499 | val_0_auc: 0.80662 |  0:05:51s
epoch 4  | loss: 0.58318 | val_0_auc: 0.80794 |  0:07:18s
epoch 5  | loss: 0.58168 | val_0_auc: 0.80883 |  0:08:48s
epoch 6  | loss: 0.57995 | val_0_auc: 0.80835 |  0:10:15s
epoch 7  | loss: 0.57911 | val_0_auc: 0.80541 |  0:11:43s
epoch 8  | loss: 0.57872 | val_0_auc: 0.80797 |  0:13:11s
epoch 9  | loss: 0.57894 | val_0_auc: 0.80695 |  0:14:39s
epoch 10 | loss: 0.57777 | val_0_auc: 0.80923 |  0:16:07s
epoch 11 | loss: 0.57764 | val_0_auc: 0.80915 |  0:17:35s
epoch 12 | loss: 0.57687 | val_0_auc: 0.80851 |  0:19:03s
epoch 13 | loss: 0.57652 | val_0_auc: 0.80508 |  0:20:31s
epoch 14 | loss: 0.5755  | val_0_auc: 0.80406 |  0:21:58s
Stop training because you reached max_epochs = 15 with best_epoch = 10 and best_val_0_auc = 0.80923




epoch 0  | loss: 0.59648 | val_0_auc: 0.74945 |  0:01:24s
epoch 1  | loss: 0.5841  | val_0_auc: 0.75241 |  0:02:49s
epoch 2  | loss: 0.57909 | val_0_auc: 0.75358 |  0:04:14s
epoch 3  | loss: 0.577   | val_0_auc: 0.75036 |  0:05:38s
epoch 4  | loss: 0.57462 | val_0_auc: 0.75643 |  0:07:04s
epoch 5  | loss: 0.57284 | val_0_auc: 0.75341 |  0:08:28s
epoch 6  | loss: 0.57078 | val_0_auc: 0.74494 |  0:09:52s
epoch 7  | loss: 0.57049 | val_0_auc: 0.74589 |  0:11:18s
epoch 8  | loss: 0.57005 | val_0_auc: 0.74396 |  0:12:45s
epoch 9  | loss: 0.569   | val_0_auc: 0.7489  |  0:14:12s

Early stopping occurred at epoch 9 with best_epoch = 4 and best_val_0_auc = 0.75643




CV AUC scores:  []


ValueError: max() arg is an empty sequence

In [20]:
print("CV AUC scores: ", cv_scores_xgb)
print("Maximum CV AUC score: ", max(cv_scores_xgb))

CV AUC scores:  [0.6980845171251402, 0.7018412794533331, 0.8087590609052192, 0.8092293621649994, 0.7564286305892265]
Maximum CV AUC score:  0.8092293621649994


In [24]:
print(fitted_models_tabnet)

[TabNetClassifier(n_d=8, n_a=8, n_steps=5, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=[], n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='entmax', input_dim=64, output_dim=2, device_name='auto', n_shared_decoder=1, n_indep_decoder=1, grouped_features=[]), TabNetClassifier(n_d=8, n_a=8, n_steps=5, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=[], n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0.001, seed=0, clip_value=1, verbose=1, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02}, scheduler_fn=None, scheduler_params={}, mask_type='entmax', input_dim=64, output_dim=2, device_name='auto', n_shared_decoder=1, n_indep_decoder=1, grouped_features=[]), TabNetClassifier(n_d=8, n_a=8, n_steps=5, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=

In [27]:
# with open("fitted_models_tabnet.pickle", "wb") as f:
#     pickle.dump(fitted_models_tabnet, f)
    
# for i, m in enumerate(fitted_models_tabnet):
#     saving_path_name = f"/kaggle/working/tabnet_model_test_{i}"
#     saved_filepath = m.save_model(saving_path_name)

Successfully saved model at /kaggle/working/tabnet_model_test_0.zip
Successfully saved model at /kaggle/working/tabnet_model_test_1.zip
Successfully saved model at /kaggle/working/tabnet_model_test_2.zip
Successfully saved model at /kaggle/working/tabnet_model_test_3.zip
Successfully saved model at /kaggle/working/tabnet_model_test_4.zip


In [None]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]
        
        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[5:]]
        
        return np.mean(y_preds, axis=0)
    
    def predict_proba_with_dl(self, X1, X2):
        
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]
        
        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[5:]]
        
        # TODO: ADD DL predict     
        
        
        return np.mean(y_preds, axis=0)

model = VotingModel(fitted_models_cat+fitted_models_lgb+fitted_models_xgb)

# # method1
# catVoting = VotingModel(fitted_models_cat)
# lgbVoting = VotingModel(fitted_models_lgb)
# xgbVoting = VotingModel(fitted_models_xgb)

# RandomizedSearch(catVoting, lgbVoting, xgbVoting, DL1Voting, DL2...)

# # method2
# m = VotingModel(fitted_models_cat+fitted_models_lgb+fitted_models_xgb+fitted_models_dl1+...)

In [None]:
# dataset = data.copy()
# x_test_d = x_test.copy()
# len_dataset = len(dataset)
# print(len(dataset), len(x_test_d))
# dataset = pd.concat([dataset, x_test_d])

# numeric_features = dataset.select_dtypes(include=np.number).columns
# dataset[numeric_features] = dataset[numeric_features].apply(
#     lambda x: (x-x.mean())/(x.std()))
# dataset[numeric_features] = dataset[numeric_features].fillna(0)

# cat_features = [col for col in dataset.columns if dataset[col].dtype.name == 'category' or dataset[col].dtype.name == 'object']
# dataset = pd.get_dummies(dataset, columns=cat_features, dtype=np.float32)

In [None]:
# x_test_d = dataset[len_dataset:]
# dataset = dataset[:len_dataset]

In [None]:
# print(len(x_test_d.columns))
# print(len(dataset.columns))

In [None]:
# x_train_d, x_valid_d, y_train_d, y_valid_d = train_test_split(df_train, label, test_size=0.3, shuffle=True, stratify=label)

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# x_train_d  = torch.tensor(x_train_d.values, dtype=torch.float32).to(device)
# x_valid_d  = torch.tensor(x_valid_d.values, dtype=torch.float32).to(device)
# y_train_d  = torch.tensor(y_train_d.values, dtype=torch.float32).to(device).reshape(-1, 1)
# y_valid_d  = torch.tensor(y_valid_d.values, dtype=torch.float32).reshape(-1, 1)

In [None]:
# x_test_d = torch.tensor(x_test_d.values, dtype=torch.float32).to(device)

In [None]:
# del data, dataset
# gc.collect()

# Deep

In [None]:
# import numpy as np
# import pandas as pd
# import polars as pl
# import matplotlib.pyplot as plt

# from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
# from sklearn.metrics import roc_auc_score, accuracy_score

# import torch
# import torch.nn as nn
# from torch.utils import data

# import importlib.util
# import sys

# def load_module_from_path(module_name, file_path):
#     spec = importlib.util.spec_from_file_location(module_name, file_path)
#     module = importlib.util.module_from_spec(spec)
#     sys.modules[module_name] = module
#     spec.loader.exec_module(module)
#     return module

# loaded_module = load_module_from_path('model', '/kaggle/input/ensemble/other/model_fn/1/model.py')

# from model import Transformer, KAN, TransformerKAN
# model_name = "Transformer" # "Transformer", "KAN", "TransformerKAN"
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# PATH = f"/kaggle/working/{model_name}.pth"

In [None]:
# def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
#     gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
#         .sort_values("WEEK_NUM")\
#         .groupby("WEEK_NUM")[["target", "score"]]\
#         .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
#     x = np.arange(len(gini_in_time))
#     y = gini_in_time
#     a, b = np.polyfit(x, y, 1)
#     y_hat = a*x + b
#     residuals = y - y_hat
#     res_std = np.std(residuals)
#     avg_gini = np.mean(gini_in_time)
#     return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

In [None]:
# from tqdm import tqdm

# def load_data(data_array, batch_size, is_train=True):
#     dataset = data.TensorDataset(*data_array)
#     return data.DataLoader(dataset, batch_size, shuffle=is_train)

# def train(model, x_train, y_train, x_valid, y_valid, num_epochs, lr, batch_size, weight_decay):
#     best_validation = 0
#     train_iter = load_data((x_train, y_train), batch_size)
#     val_iter = load_data((x_valid, y_valid), batch_size, is_train=False)
#     optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
#     scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
#     # loss = nn.BCELoss()
#     # loss = nn.BCEWithLogitsLoss()
#     loss = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(5).to(device))
#     # loss = nn.CrossEntropyLoss(weight=torch.tensor(0.7).to(device))
#     for epoch in range(num_epochs):
#         model.train()
#         print("training")
#         for X, y in tqdm(train_iter):
#             # pred = torch.argmax(model(X), dim=1)
#             # pred = model(X)[:,1].reshape(-1, 1)
#             logits = model(X)
#             # print(pred.shape)
#             # print(y)
#             l = loss(logits, y)
#             optimizer.zero_grad()
#             l.backward()
#             optimizer.step()
#         model.eval()
#         y_prob = np.array([])
#         y_pred = np.array([])
#         cur_validation = 0
#         print("validation")
#         with torch.no_grad():
#             for X, y in tqdm(val_iter):
#                 # print(X.shape)
#                 # X = X.view(-1, X.shape[1])
#                 m = model.predict(X)
#                 pred, prob = m[0].flatten(), m[1].flatten()
#                 # pred = m.flatten() 
#                 y_prob = np.concatenate((y_prob, prob.cpu().detach().numpy()))
#                 y_pred = np.concatenate((y_pred, pred.cpu().detach().numpy()))
#                 # print(y_prob.shape)
#                 # print(y_pred.shape)
#             # validation_score.append(validation(y_valid, y_prob))
#             # print(y_pred)
#             cur_validation = validation(y_valid, y_prob)
#             print(f"roc_auc_score: {cur_validation}")
#             print(f"accuracy: {accuracy(y_valid, y_prob)}")
#             if cur_validation > best_validation:
#                 best_validation = cur_validation
#                 print("Save new model")
#                 torch.save(model, PATH)
#         # scheduler.step()
    
#     return y_prob, y_pred

# def validation(y_valid, y_score):
#     return roc_auc_score(y_true=y_valid, y_score=y_score)

# # only use to get test batch
# def get_batch(x, batch_size):
#     ix = torch.arange(0, len(x), batch_size)
#     return [x[i:i+batch_size] for i in ix]

# def accuracy(y_true, y_pred):
#     y_pred = np.where(y_pred > 0.5, 1., 0.)
#     # y_pred = np.argmax(y_pred, dim=-1)
#     # y_pred[y_pred > 0.5] = 1.
#     # y_pred[y_pred <= 0.5] = 0
#     # print(np.sum(y_pred))
#     return accuracy_score(y_true=y_true, y_pred=y_pred)

# def predict(model, x_test, batch_size):
#     test_iter = get_batch(x_test, batch_size)
#     res = []
#     model.eval()
#     with torch.no_grad():
#         for X in test_iter:
#             X = X.to(device)  # 確保數據位於相同設備
#             label, prob = model.predict(X)
# #             print(prob.shape)
#             res.append(prob.flatten().cpu().numpy())  # 將結果移回 CPU 並存入 res
#     return np.concatenate(res)[:len(x_test)]  # 確保輸出與輸入長度相同

In [None]:
# num_epoch, lr, batch_size, weight_decay = 5, 1e-3, 64, 1e-4

# # "Transformer", "KAN", "TransformerKAN"
# if model_name == "Transformer":
#     print("Train Transformer")
#     model = Transformer(in_features=len(x_train_d[0]), drop=0.).to(device)
# elif model_name == "KAN":
#     print("Train KAN")
#     model = KAN([len(x_train_d[0]), batch_size, 1]).to(device)
# elif model_name == "TransformerKAN":
#     print("Train TransformerKAN")
#     model = TransformerKAN(in_features=len(x_train_d[0]), drop=0.).to(device)


# prob, pred = train(model, x_train_d, y_train_d, x_valid_d, y_valid_d, num_epoch, lr, batch_size, weight_decay)
# # print(score)

In [None]:
# gc.collect()

# Load Model

In [None]:
# # with open(f'/kaggle/models/lgbm_gbdt_model_07_train_0.7837037039189849.pickle', 'rb') as f:
# #     cls = pickle.load(f)

# # with open(f'/kaggle/models/xgboost_model_07_train_0.7638036661339924.pickle', 'rb') as f:
# #     xgb_model = pickle.load(f)

# # with open(f'/kaggle/models/catboost_model_07_train_0.8088802180545341.pickle', 'rb') as f:
# #     cat_model = pickle.load(f)

# # with open(f'/kaggle/input/ensemble/scikitlearn/treebase/1/lgbm_gbdt_model_07_train_0.7837037039189849.pickle', 'rb') as f:
# #     cls = pickle.load(f)

# # with open(f'/kaggle/input/ensemble/scikitlearn/treebase/1/xgboost_model_07_train_0.7638036661339924.pickle', 'rb') as f:
# #     xgb_model = pickle.load(f)

# # with open(f'/kaggle/input/ensemble/scikitlearn/treebase/1/catboost_model_07_train_0.8088802180545341.pickle', 'rb') as f:
# #     cat_model = pickle.load(f)

# with open(f'/kaggle/input/ensemble/scikitlearn/random_features/2/lgbm_gbdt_model_07_train_0.7624031332281023.pickle', 'rb') as f:
#     cls = pickle.load(f)

# with open(f'/kaggle/input/ensemble/scikitlearn/random_features/2/xgboost_model_07_train_0.7374121828487601.pickle', 'rb') as f:
#     xgb_model = pickle.load(f)

# with open(f'/kaggle/input/ensemble/scikitlearn/random_features/2/catboost_model_07_train_0.7767447884537366.pickle', 'rb') as f:
#     cat_model = pickle.load(f)


# transformer_model = torch.load('/kaggle/input/ensemble/pytorch/dl/4/Transformer.pth')    
# kan_model = torch.load('/kaggle/input/ensemble/pytorch/dl/4/KAN.pth')
# transformerKan_model = torch.load('/kaggle/input/ensemble/pytorch/dl/4/TransformerKAN.pth')   


In [None]:
# pred = cls.predict(x_valid)
# auc = roc_auc_score(y_true=y_valid, y_score=pred)
# print(auc)

# xgb_pred = xgb_model.predict_proba(x_valid)[:,1]
# auc = roc_auc_score(y_true=y_valid, y_score=xgb_pred)
# print(auc)

# cat_pred = cat_model.predict_proba(x_valid)[:,1]
# auc = roc_auc_score(y_true=y_valid, y_score=cat_pred)
# print(auc)


In [None]:
# gc.collect()

In [None]:
# transformer_pred = predict(transformer_model, x_valid_d, 64)
# auc = roc_auc_score(y_true=y_valid_d, y_score=transformer_pred)
# print(auc)

In [None]:
# kan_pred = predict(kan_model, x_valid_d, 64)
# auc = roc_auc_score(y_true=y_valid_d, y_score=kan_pred)
# print(auc)

In [None]:
# transformerKan_pred = predict(transformerKan_model, x_valid_d, 64)
# auc = roc_auc_score(y_true=y_valid_d, y_score=transformerKan_pred)
# print(auc)

# Ensemble (3 tree + 3 dl)

In [None]:
# # use randomized search instead of linear search to save time
# def RandomizedSearch(n_init, pred1, pred2, pred3, pred4, pred5, pred6, y_true, random_state=None):
#     if random_state:
#         np.random.seed(random_state)
    
#     weight1 = np.arange(0, 20, 1)
#     weight2 = np.arange(0, 20, 1)
#     weight3 = np.arange(0, 20, 1)
#     weight4 = np.arange(0, 20, 1)
#     weight5 = np.arange(0, 20, 1)
#     weight6 = np.arange(0, 20, 1)

#     df = pd.DataFrame(columns=['weight1', 'weight2', 'weight3', 'weight4', 'weight5', 'weight6', 'score'])
#     for i in range(n_init):
#         # pick weight
#         w1 = np.random.choice(weight1, replace=True) 
#         w2 = np.random.choice(weight2, replace=True) 
#         w3 = np.random.choice(weight3, replace=True) 
#         w4 = np.random.choice(weight4, replace=True) 
#         w5 = np.random.choice(weight4, replace=True) 
#         w6 = np.random.choice(weight4, replace=True) 

#         y_ensemble = (w1*pred1 + w2*pred2 + w3*pred3 + w4*pred4 + w5*pred5 + w6*pred6)/(w1+w2+w3+w4+w5+w6)
#         score = roc_auc_score(y_true=y_true, y_score=y_ensemble)
        
#         df.loc[i] = [w1, w2, w3, w4, w5, w6, score]
        
#     return df


In [None]:
# # use randomized search instead of linear search to save time
# def RandomizedSearch(n_init, pred1, pred2, pred3, pred4, y_true, random_state=None):
#     if random_state:
#         np.random.seed(random_state)
    
#     weight1 = np.arange(0, 20, 1)
#     weight2 = np.arange(0, 20, 1)
#     weight3 = np.arange(0, 20, 1)
#     weight4 = np.arange(0, 20, 1)

#     df = pd.DataFrame(columns=['weight1', 'weight2', 'weight3', 'weight4', 'score'])
#     for i in range(n_init):
#         # pick weight
#         w1 = np.random.choice(weight1, replace=True) 
#         w2 = np.random.choice(weight2, replace=True) 
#         w3 = np.random.choice(weight3, replace=True) 
#         w4 = np.random.choice(weight4, replace=True) 

        
#         y_ensemble = (w1*pred1 + w2*pred2 + w3*pred3 + w4*pred)/(w1+w2+w3+w4)
#         score = roc_auc_score(y_true=y_true, y_score=y_ensemble)
        
#         df.loc[i] = [w1, w2, w3, w4, score]
        
#     return df

In [None]:
# # use randomized search instead of linear search to save time
# def RandomizedSearch(n_init, pred1, pred2, pred3, y_true, random_state=None):
#     if random_state:
#         np.random.seed(random_state)
    
#     weight1 = np.arange(0, 20, 1)
#     weight2 = np.arange(0, 20, 1)
#     weight3 = np.arange(0, 20, 1)

#     df = pd.DataFrame(columns=['weight1', 'weight2', 'weight3', 'score'])
#     for i in range(n_init):
#         # pick weight
#         w1 = np.random.choice(weight1, replace=True) 
#         w2 = np.random.choice(weight2, replace=True) 
#         w3 = np.random.choice(weight3, replace=True) 

        
#         y_ensemble = (w1*pred1 + w2*pred2 + w3*pred3)/(w1+w2+w3+w4)
#         score = roc_auc_score(y_true=y_true, y_score=y_ensemble)
        
#         df.loc[i] = [w1, w2, w3, score]
        
#     return df

In [None]:
# df = RandomizedSearch(n_init=100, pred1=pred, pred2=xgb_pred, pred3=cat_pred, pred4=transformer_pred, pred5=kan_pred, pred6=transformerKan_pred, y_true=y_valid, random_state=8787)
# best_weights = df.sort_values(by=['score'], ascending=False).iloc[0]
# df.sort_values(by=['score'], ascending=False)

# Test data predict (Baseline)

In [None]:
# y_test = cls.predict(x_test)
# y_test

In [None]:
# test_ensemble = (best_weights['weight1'] * cls.predict(x_test) + best_weights['weight2'] * xgb_model.predict_proba(x_test)[:,1] + best_weights['weight3'] * cat_model.predict_proba(x_test)[:,1] + best_weights['weight4'] * predict(transformer_model, x_test_d, 64) + best_weights['weight5'] * predict(kan_model, x_test_d, 64) + best_weights['weight6'] * predict(transformerKan_model, x_test_d, 64)) / (best_weights['weight1']+best_weights['weight2']+best_weights['weight3']+best_weights['weight4']+best_weights['weight5']+best_weights['weight6'])
# test_ensemble

In [None]:
# now = datetime.now()
# now = now.strftime("%Y-%m-%dT%H_%M_%S")

In [None]:
# test_df = pd.DataFrame(columns=['case_id', 'score'])
# test_df['case_id'] = x_test.index
# test_df['score'] = test_ensemble

# # test_df.to_csv(f'/kaggle/submission/submission_{now}.csv', index=False)
# test_df.to_csv('submission.csv', index=False)
# test_df