# Import Libary

In [1]:
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import joblib
import warnings
from sklearn.base import BaseEstimator, RegressorMixin
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import HistGradientBoostingClassifier
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')
ROOT = 'C:/SUPERAI/KBTG/datasets'
ROOT

  from .autonotebook import tqdm as notebook_tqdm


'C:/SUPERAI/KBTG/datasets'

# Test GPU

In [2]:
import torch
torch.cuda.is_available()

True

# Data Preparation

In [3]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

class Pipeline:
    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                df = df.with_columns(pl.col(col).dt.total_days())
        df = df.drop("date_decision", "MONTH")
        return df

    def filter_cols(df):
        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)
        return df

In [4]:
class Aggregator:
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max
    
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return  expr_max
    
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return  expr_max
    
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return  expr_max 
    
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols] 
        return  expr_max
    
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [5]:
def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df)) 
    return df

def read_files(regex_path, depth=None):
    chunks = []
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2    
    return df

## Read Datasets

### Train

In [6]:
ROOT = Path("C:/SUPERAI/KBTG/datasets")
TRAIN_DIR = ROOT / "parquet_files" / "train"

data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ]
}

In [7]:
data_store

{'df_base': shape: (1_526_659, 5)
 ┌─────────┬───────────────┬────────┬──────────┬────────┐
 │ case_id ┆ date_decision ┆ MONTH  ┆ WEEK_NUM ┆ target │
 │ ---     ┆ ---           ┆ ---    ┆ ---      ┆ ---    │
 │ i64     ┆ date          ┆ i64    ┆ i64      ┆ i64    │
 ╞═════════╪═══════════════╪════════╪══════════╪════════╡
 │ 0       ┆ 2019-01-03    ┆ 201901 ┆ 0        ┆ 0      │
 │ 1       ┆ 2019-01-03    ┆ 201901 ┆ 0        ┆ 0      │
 │ 2       ┆ 2019-01-04    ┆ 201901 ┆ 0        ┆ 0      │
 │ 3       ┆ 2019-01-03    ┆ 201901 ┆ 0        ┆ 0      │
 │ 4       ┆ 2019-01-04    ┆ 201901 ┆ 0        ┆ 1      │
 │ …       ┆ …             ┆ …      ┆ …        ┆ …      │
 │ 2703450 ┆ 2020-10-05    ┆ 202010 ┆ 91       ┆ 0      │
 │ 2703451 ┆ 2020-10-05    ┆ 202010 ┆ 91       ┆ 0      │
 │ 2703452 ┆ 2020-10-05    ┆ 202010 ┆ 91       ┆ 0      │
 │ 2703453 ┆ 2020-10-05    ┆ 202010 ┆ 91       ┆ 0      │
 │ 2703454 ┆ 2020-10-05    ┆ 202010 ┆ 91       ┆ 0      │
 └─────────┴───────────────┴────────┴─

In [8]:
df_train = feature_eng(**data_store)
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
nums = df_train.select_dtypes(exclude='category').columns
df_train

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,0,0,0,1,4,,,,,,...,,,,,,,,,,
1,1,0,0,1,4,,,,,,...,,,,,,,,,,
2,2,0,0,1,5,,,,,,...,,,,,,,,,,
3,3,0,0,1,4,,,,,,...,,,,,,,,,,
4,4,0,1,1,5,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,91,0,10,1,,,-998.0,,52863.589844,...,,,,,,,,,,
1526655,2703451,91,0,10,1,,,-5592.0,,324608.531250,...,,,,,,,,,,
1526656,2703452,91,0,10,1,,,,,102738.757812,...,,,,,,,,,,
1526657,2703453,91,0,10,1,,,-4616.0,,212683.296875,...,,,,-1956.0,1.0,,,,,


In [9]:
from itertools import combinations, permutations

nans_df = df_train[nums].isna()
nans_groups = {}

for col in nums:
    cur_group = nans_df[col].sum()
    nans_groups.setdefault(cur_group, []).append(col)

encoder = OrdinalEncoder()
df_train[cat_cols] = encoder.fit_transform(df_train[cat_cols])
df_train

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,0,0,0,1,4,,,,,,...,,,,,,,,,,
1,1,0,0,1,4,,,,,,...,,,,,,,,,,
2,2,0,0,1,5,,,,,,...,,,,,,,,,,
3,3,0,0,1,4,,,,,,...,,,,,,,,,,
4,4,0,1,1,5,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,91,0,10,1,,,-998.0,,52863.589844,...,,,,,,,,,,
1526655,2703451,91,0,10,1,,,-5592.0,,324608.531250,...,,,,,,,,,,
1526656,2703452,91,0,10,1,,,,,102738.757812,...,,,,,,,,,,
1526657,2703453,91,0,10,1,,,-4616.0,,212683.296875,...,,,,-1956.0,1.0,,,,,


In [10]:
len(df_train[df_train['target'] == 1]), len(df_train[df_train['target'] == 0])

(47994, 1478665)

In [11]:
df_majority = df_train[df_train['target'] == 0]
df_minority = df_train[df_train['target'] == 1]
n_minority = len(df_minority) + 20000
df_majority_undersampled = df_majority.sample(n=n_minority, random_state=42)
df_train_balanced = pd.concat([df_majority_undersampled, df_minority])
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
df_train_balanced

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,1510864,34,1,8,4,,,,-11384.0,,...,,,,,,,,,,
1,848281,43,0,11,6,,,,,,...,,,,,,,,,,
2,662087,12,0,3,7,,,,-11504.0,,...,,,,,,,,,,
3,1578056,40,0,10,6,,,,,,...,,,,,,,,,,
4,1766387,56,1,1,3,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115983,653619,11,1,3,3,,,,-10488.0,,...,,,,,,,,,,
115984,2537120,3,1,1,7,,,,,,...,,,,,,,,,,
115985,1749096,54,1,1,2,,,,,,...,,,,,,,,,,
115986,783010,34,0,8,2,,,,,,...,,,,,,,,,,


In [12]:
len(df_train_balanced[df_train_balanced['target'] == 1]), len(df_train_balanced[df_train_balanced['target'] == 0])

(47994, 67994)

### Test

In [13]:
ROOT_Test = Path("C:/SUPERAI/KBTG/datasets")
TEST_DIR = ROOT_Test / "test_dataset" / "transformed"

data_store = {
    "df_base": read_files(Path("C:/SUPERAI/KBTG/datasets/test.parquet")),
    "depth_0": [read_files(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_files(TEST_DIR / "test_other_1.parquet", 1),
        read_files(TEST_DIR / "test_person_1.parquet", 1),
        read_files(TEST_DIR / "test_deposit_1.parquet", 1),
        read_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ]
}

In [14]:
df_test = feature_eng(**data_store)
del data_store
gc.collect()
df_test = df_test.pipe(Pipeline.filter_cols)
df_test, _ = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)
df_test

Unnamed: 0,case_id,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,5282,11,4,,14.0,,,,-26432.0,,...,,,,-1159.0,1.0,,,,,
1,1075,11,7,,14.0,,,,-23232.0,,...,,,,,,,,,,
2,6044,11,1,,14.0,,,,-23392.0,,...,,,,,,,,,,
3,6830,11,5,,,,,,-13856.0,,...,,,,,,,,,,
4,15372,5,6,,,,,514632.000000,-14568.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,13490,4,5,,,-3120.0,,15674.370117,-24304.0,,...,,,,-2144.0,0.0,,,,,
19996,381,11,1,,,,,,-10728.0,,...,,,,,,,,,,
19997,3492,11,7,,,,,,-16200.0,,...,,,,,,,,,,
19998,259,11,5,,,,,,,,...,,,,,,,,,,


In [15]:
indexx = df_test['case_id']
indexx

0         5282
1         1075
2         6044
3         6830
4        15372
         ...  
19995    13490
19996      381
19997     3492
19998      259
19999    11397
Name: case_id, Length: 20000, dtype: int16

In [16]:
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('str').fillna('-1')

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(df_train[cat_cols])
df_test[cat_cols] = encoder.transform(df_test[cat_cols])
df_test

Unnamed: 0,case_id,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,5282,11,4,,14.0,,,,-26432.0,,...,,,,-1159.0,1.0,,,,,
1,1075,11,7,,14.0,,,,-23232.0,,...,,,,,,,,,,
2,6044,11,1,,14.0,,,,-23392.0,,...,,,,,,,,,,
3,6830,11,5,,,,,,-13856.0,,...,,,,,,,,,,
4,15372,5,6,,,,,514632.000000,-14568.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,13490,4,5,,,-3120.0,,15674.370117,-24304.0,,...,,,,-2144.0,0.0,,,,,
19996,381,11,1,,,,,,-10728.0,,...,,,,,,,,,,
19997,3492,11,7,,,,,,-16200.0,,...,,,,,,,,,,
19998,259,11,5,,,,,,,,...,,,,,,,,,,


In [17]:
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('category').cat.codes

df_test = df_test.drop(columns=['case_id'])
df_test = reduce_mem_usage(df_test)
df_test

Unnamed: 0,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,11,4,,14.0,,,,-26432.0,,0.0,...,,,,-1159.0,1.0,,,,,
1,11,7,,14.0,,,,-23232.0,,1.0,...,,,,,,,,,,
2,11,1,,14.0,,,,-23392.0,,0.0,...,,,,,,,,,,
3,11,5,,,,,,-13856.0,,2.0,...,,,,,,,,,,
4,5,6,,,,,514632.000000,-14568.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,4,5,,,-3120.0,,15674.370117,-24304.0,,3.0,...,,,,-2144.0,0.0,,,,,
19996,11,1,,,,,,-10728.0,,1.0,...,,,,,,,,,,
19997,11,7,,,,,,-16200.0,,8.0,...,,,,,,,,,,
19998,11,5,,,,,,,,,...,,,,,,,,,,


## Save to pkl

In [18]:
y = df_train_balanced["target"]
df_train_balanced = df_train_balanced.drop(columns=["target", "case_id", "WEEK_NUM"])
df_train_balanced = reduce_mem_usage(df_train_balanced)
df_train_balanced

Unnamed: 0,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,8,4,,,,-11384.0,,-11384.0,,5.0,...,,,,,,,,,,
1,11,6,,,,,,-13424.0,,2.0,...,,,,,,,,,,
2,3,7,,,,-11504.0,,-11504.0,,3.0,...,,,,,,,,,,
3,10,6,,,,,,-11968.0,,2.0,...,,,,,,,,,,
4,1,3,,,,,,-11928.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115983,3,3,,,,-10488.0,,-10488.0,,6.0,...,,,,,,,,,,
115984,1,7,,,,,,,,,...,,,,,,,,,,
115985,1,2,,,,,,-13008.0,,2.0,...,,,,,,,,,,
115986,8,2,,,,,,-18528.0,,1.0,...,,,,,,,,,,


In [19]:
joblib.dump((df_train_balanced, y, df_test), 'data.pkl')

['data.pkl']

# Modeling

## Train, Validation and Test

In [20]:
df_train, y, df_test = joblib.load('data.pkl')
df_train.shape, df_test.shape

((115988, 437), (20000, 439))

In [21]:
df_train.shape

(115988, 437)

## PCA

In [22]:
from sklearn.decomposition import PCA

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(df_train, y, test_size=0.2, random_state=54, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(df_train, y, test_size=0.2, random_state=54, stratify=y)

print("X_train shape:", X_train.shape)
print("X_validation shape:", X_validation.shape)
print("y_train shape:", y_train.shape)
print("y_validation shape:", y_validation.shape)

X_train shape: (92790, 437)
X_validation shape: (23198, 437)
y_train shape: (92790,)
y_validation shape: (23198,)


## Single Model

In [27]:
!pip install optuna

Collecting optuna
  Using cached optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Using cached alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.30-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)
  Using cached greenlet-3.0.3-cp311-cp311-win_amd64.whl.metadata (3.9 kB)
Using cached optuna-3.6.1-py3-none-any.whl (380 kB)
Using cached alembic-1.13.1-py3-none-any.whl (233 kB)
Downloading SQLAlchemy-2.0.30-cp311-cp311-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/

### LIGHTGBM + OPTUNA

In [34]:
# import lightgbm as lgb
# import optuna
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import roc_auc_score

# def objective(trial):
#     params = {
#         "boosting_type": "gbdt",
#         "colsample_bynode": trial.suggest_float("colsample_bynode", 0.6, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
#         "device": "gpu",
#         "extra_trees": trial.suggest_categorical("extra_trees", [True, False]),
#         "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.1),
#         "reg_alpha": trial.suggest_loguniform("reg_alpha", 0.1, 10.0),
#         "reg_lambda": trial.suggest_loguniform("reg_lambda", 1.0, 100.0),
#         "max_depth": trial.suggest_int("max_depth", 5, 50),
#         "n_estimators": trial.suggest_int("n_estimators", 1000, 3000),
#         "num_leaves": trial.suggest_int("num_leaves", 31, 128),
#         "objective": "binary",
#         "random_state": 54,
#         "verbose": -1,
#     }

#     model = lgb.LGBMClassifier(**params)
    
#     fit_params = {
#         "eval_set": [(X_val, y_val)],
#         "eval_metric": "auc",
#     }

#     model.fit(X_train, y_train, **fit_params)
    
#     preds = model.predict_proba(X_val)[:, 1]
#     auc = roc_auc_score(y_val, preds)
#     return auc

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

[I 2024-05-22 14:31:21,799] A new study created in memory with name: no-name-b847c951-9137-4eb4-910b-5ffef0de10ee
[I 2024-05-22 14:33:23,700] Trial 0 finished with value: 0.851859369527525 and parameters: {'colsample_bynode': 0.7221968039563266, 'colsample_bytree': 0.6355333927802314, 'extra_trees': True, 'learning_rate': 0.09609196848707582, 'reg_alpha': 0.5040511426404679, 'reg_lambda': 1.248210246709476, 'max_depth': 43, 'n_estimators': 1793, 'num_leaves': 70}. Best is trial 0 with value: 0.851859369527525.
[I 2024-05-22 14:34:42,082] Trial 1 finished with value: 0.8548970569609714 and parameters: {'colsample_bynode': 0.7139491720475039, 'colsample_bytree': 0.8433137784883445, 'extra_trees': False, 'learning_rate': 0.07245649254701114, 'reg_alpha': 4.372756904170818, 'reg_lambda': 39.31773119536354, 'max_depth': 23, 'n_estimators': 2415, 'num_leaves': 31}. Best is trial 1 with value: 0.8548970569609714.
[I 2024-05-22 14:36:43,522] Trial 2 finished with value: 0.8558806033556774 and 

KeyboardInterrupt: 

In [35]:
# best_trial = study.best_trial
# print("Best trial parameters:", best_trial.params)

Best trial parameters: {'colsample_bynode': 0.6973565809553169, 'colsample_bytree': 0.6435330359014061, 'extra_trees': False, 'learning_rate': 0.018178967431871404, 'reg_alpha': 2.0093990688645484, 'reg_lambda': 10.592751623938398, 'max_depth': 12, 'n_estimators': 2894, 'num_leaves': 103}


In [24]:
save_best_params = {'colsample_bynode': 0.6973565809553169,
                     'colsample_bytree': 0.6435330359014061,
                       'extra_trees': False,
                         'learning_rate': 0.018178967431871404,
                           'reg_alpha': 2.0093990688645484,
                             'reg_lambda': 10.592751623938398,
                               'max_depth': 12,
                                 'n_estimators': 2894,
                                   'num_leaves': 103
                                   }

In [25]:

# Train the final model with the best parameters
# best_params = best_trial.params
# best_params["device"] = "gpu"  # Ensure device is set to GPU
# model = lgb.LGBMClassifier(**best_params)
save_best_params["device"] = "gpu"  # Ensure device is set to GPU
model = lgb.LGBMClassifier(**save_best_params)
model.fit(df_train, y)

fitted_models_lgb = [model]
print("Model training with Optuna optimization success")


[LightGBM] [Info] Number of positive: 47994, number of negative: 67994
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 52691
[LightGBM] [Info] Number of data points in the train set: 115988, number of used features: 433
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3060 Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 210 dense feature groups (23.45 MB) transferred to GPU in 0.019565 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.413784 -> initscore=-0.348343
[LightGBM] [Info] Start training from score -0.348343
Model training with Optuna optimization success


## Ensemble

In [30]:
fitted_models_lgb = []
device: str = "gpu"

params1 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.05,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 20,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 54,
    "verbose": -1,
}
model_1 = lgb.LGBMClassifier(**params1)
model_1.fit(df_train, y)
fitted_models_lgb.append(model_1)
print("Model_1 Success")

model_2 = HistGradientBoostingClassifier(max_iter=300, random_state=42)
model_2.fit(df_train, y)
fitted_models_lgb.append(model_2)
print("Model_2 Success")

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_validationidation, y_validationidation)
model_3 = CatBoostClassifier(
    best_model_min_trees = 2000,
    boosting_type = "Plain",
    eval_metric = "AUC",
    learning_rate = 0.05,
    l2_leaf_reg = 10,
    max_leaves = 64,
    random_seed = 54,
    task_type = "GPU",
    use_best_model = True
)
model_3.fit(train_pool, eval_set=val_pool, verbose=False)
fitted_models_lgb.append(model_3)
print("Model_3 Success")

Model_1 Success
Model_2 Success


Default metric period is 5 because AUC is/are not implemented for GPU


Model_3 Success


# Submission

In [26]:
missing_cols = set(df_test.columns) - set(df_train.columns)
missing_cols

{'lastapprcommoditytypec_5251766M', 'max_profession_152M'}

In [27]:
df_test = df_test.drop(columns=['lastapprcommoditytypec_5251766M', 'max_profession_152M'])
df_test

Unnamed: 0,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,11,4,,14.0,,,,-26432.0,,0.0,...,,,,-1159.0,1.0,,,,,
1,11,7,,14.0,,,,-23232.0,,1.0,...,,,,,,,,,,
2,11,1,,14.0,,,,-23392.0,,0.0,...,,,,,,,,,,
3,11,5,,,,,,-13856.0,,2.0,...,,,,,,,,,,
4,5,6,,,,,514632.000000,-14568.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,4,5,,,-3120.0,,15674.370117,-24304.0,,3.0,...,,,,-2144.0,0.0,,,,,
19996,11,1,,,,,,-10728.0,,1.0,...,,,,,,,,,,
19997,11,7,,,,,,-16200.0,,8.0,...,,,,,,,,,,
19998,11,5,,,,,,,,,...,,,,,,,,,,


## Vote

In [33]:
from scipy.stats import mode

class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators, weights=None):
        super().__init__()
        self.estimators = estimators
        self.weights = weights
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        if self.weights is None:
            return mode(y_preds, axis=0)[0]
        else:
            weighted_sum = np.sum(np.array(y_preds) * self.weights.reshape(-1, 1), axis=0)
            return np.round(weighted_sum).astype(int)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        mean_proba = np.mean(y_preds, axis=0)
        threshold = 0.45
        predicted_proba = np.where(mean_proba[:, 1] > threshold, 1, 0)
        return mean_proba

model = VotingModel(fitted_models_lgb)
# model = VotingModel(fitted_models_lgb, weights=np.array([0.34, 0.33, 0.33]))
model

In [39]:
y_pred = pd.Series(model.predict_proba(df_test)[:,1], index=df_test.index)
y_pred

0        0.388301
1        0.024378
2        0.013295
3        0.246571
4        0.026155
           ...   
19995    0.550691
19996    0.046398
19997    0.028690
19998    0.163905
19999    0.034834
Length: 20000, dtype: float64

- Single Model pred
 - LIGHGBM + OPTUNA

In [28]:
single_pred = model.predict(df_test)
single_pred

array([0, 0, 0, ..., 1, 0, 0], dtype=int8)

In [29]:
single_pred_prob = model.predict_proba(df_test)
single_pred_prob

array([[0.6641349 , 0.3358651 ],
       [0.77678525, 0.22321475],
       [0.95359716, 0.04640284],
       ...,
       [0.33707971, 0.66292029],
       [0.50504647, 0.49495353],
       [0.65990422, 0.34009578]])

In [30]:
y_pred_prob = pd.Series(single_pred_prob[:,1], index=df_test.index)

In [31]:
sub = pd.DataFrame({
    "case_id": indexx, "target": y_pred_prob
})
sub

Unnamed: 0,case_id,target
0,5282,0.335865
1,1075,0.223215
2,6044,0.046403
3,6830,0.336900
4,15372,0.234048
...,...,...
19995,13490,0.037282
19996,381,0.388430
19997,3492,0.662920
19998,259,0.494954


In [32]:
df_subm = pd.read_csv("C:\SUPERAI\KBTG\datasets\sample_submission.csv")
df_subm

Unnamed: 0,case_id,target
0,16791,0.0
1,12423,0.0
2,19352,0.0
3,17099,0.0
4,7491,1.0
...,...,...
19995,1268,
19996,17450,
19997,15421,
19998,10109,


In [33]:
df_subm = df_subm.drop(columns=['target'])
merged_df = df_subm.merge(sub, on="case_id", how="left")
merged_df

Unnamed: 0,case_id,target
0,16791,0.066866
1,12423,0.769765
2,19352,0.081518
3,17099,0.307087
4,7491,0.913575
...,...,...
19995,1268,0.217359
19996,17450,0.080877
19997,15421,0.178063
19998,10109,0.028702


In [34]:
merged_df[merged_df['target'] == 1]

Unnamed: 0,case_id,target


In [35]:
merged_df.to_csv("Hello Test Model.csv", index=False)