In [2]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score, auc

import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
import catboost as cat
from catboost import CatBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'lightgbm'

In [None]:
def _to_pandas(df):
    df = df.to_pandas().set_index('case_id')
    df = df.replace([np.inf, -np.inf], np.nan)
    return df

def reduce_memory_usage_pl(df):
        """ Reduce memory usage by polars dataframe {df} with name {name} by changing its data types.
            Original pandas version of this function: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 """
        print(f"Memory usage of dataframe is {round(df.estimated_size('mb'), 2)} MB")
        Numeric_Int_types = [pl.Int8,pl.Int16,pl.Int32,pl.Int64]
        Numeric_Float_types = [pl.Float32,pl.Float64]    
        for col in df.columns:
            try:
                col_type = df[col].dtype
                if col_type == pl.Categorical:
                    continue
                c_min = df[col].min()
                c_max = df[col].max()
                if col_type in Numeric_Int_types:
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df = df.with_columns(df[col].cast(pl.Int8))
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df = df.with_columns(df[col].cast(pl.Int16))
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df = df.with_columns(df[col].cast(pl.Int32))
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in Numeric_Float_types:
                    if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df = df.with_columns(df[col].cast(pl.Float32))
                    else:
                        pass
                # elif col_type == pl.Utf8:
                #     df = df.with_columns(df[col].cast(pl.Categorical))
                else:
                    pass
            except:
                pass
        print(f"Memory usage of dataframe became {round(df.estimated_size('mb'), 2)} MB")
        return df

In [None]:
data = pl.read_parquet('dataset/train_sample_first_ten.parquet')

# data = pl.read_parquet('dataset/train_filter_features_sample_first_ten.parquet')
# get_label = pl.read_parquet('dataset/train_sample_first_ten.parquet')

# data = reduce_memory_usage_pl(data)
data = _to_pandas(data)
# label = _to_pandas(get_label)['target']

label = data['target']
data = data.drop(columns=['target'])
data

Unnamed: 0_level_0,MONTH,WEEK_NUM,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,...,pmtcount_4527229L,pmtcount_4955617L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,riskassesment_302T,riskassesment_940T,secondquarter_766L,thirdquarter_1082L
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,201901,0,,,1917.599976,0.000000,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,201901,0,,,3134.000000,0.000000,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,201901,0,,,4937.000000,0.000000,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,201901,0,,,4643.600098,0.000000,0.0,1.0,0.0,2.0,...,,,,,,,,,,
4,201901,0,,,3390.199951,0.000000,0.0,1.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198095,202001,55,0.0,74224.000000,10817.400391,12267.600586,0.0,0.0,0.0,0.0,...,6.0,,,,,PENSION_6,,,0.0,5.0
198096,202001,55,0.0,185840.578125,6963.399902,1377.599976,0.0,0.0,1.0,0.0,...,6.0,,,,,PENSION_6,,,9.0,9.0
198097,202001,55,0.0,182969.796875,2315.600098,7800.800293,0.0,3.0,9.0,0.0,...,,,,,,DEDUCTION_6,,,,
198098,202001,55,0.0,60266.601562,6512.600098,0.000000,0.0,0.0,0.0,6.0,...,,,,,,DEDUCTION_6,,,0.0,2.0


In [None]:
print(label.sum())

In [28]:
x_train, x_valid, y_train, y_valid = train_test_split(data, label, test_size=0.3, shuffle=True)


In [29]:
# cat_features = [col for col in data.columns if data[col].dtype.name == 'category' or data[col].dtype.name == 'object']
# cat_features

# onehot = pd.get_dummies(data, columns=cat_features)
# onehot

# Train LightGBM

In [30]:
lgb_train = lgb.Dataset(x_train, label=y_train)
lgb_valid = lgb.Dataset(x_valid, label=y_valid, reference=lgb_train)

lgb_params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "num_leaves": 64,
    "min_data_in_leaf": 10,
    "learning_rate": 1e-3,
    "feature_fraction": 0.5,
    "bagging_fraction": 0.5,
    "bagging_freq": 5,
    "n_estimators": 1000,
    'min_data_in_bin':1,
    'max_bin': 64,
    "verbose": -1,
    "random_state": 42, 
    'n_jobs': -1
}

cls = lgb.train(
    lgb_params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
)
pred = cls.predict(x_valid)



Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.711079
[200]	valid_0's auc: 0.713816
[300]	valid_0's auc: 0.715256
[400]	valid_0's auc: 0.716247
[500]	valid_0's auc: 0.71731
[600]	valid_0's auc: 0.718331
[700]	valid_0's auc: 0.719078
[800]	valid_0's auc: 0.720102
[900]	valid_0's auc: 0.720713
[1000]	valid_0's auc: 0.721517
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.721517


In [31]:
roc_auc_score(y_true=y_valid, y_score=pred)



0.7215174606732623

# Train XGBoost

In [32]:
xgb_model = XGBClassifier(
    device="cuda",
    objective='binary:logistic',
    tree_method="hist",
    enable_categorical=True,
    eval_metric='auc',
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1,
    max_depth=20,
    # gamma=0.7,
    # reg_alpha=0.7,
    n_estimators=1200,
    random_state=42,
)

# Training the model on the training data
xgb_model.fit(
    x_train, y_train,
    eval_set=[(x_valid, y_valid)],
    early_stopping_rounds=100,
    verbose=True,
)



[0]	validation_0-auc:0.59432
[1]	validation_0-auc:0.62403
[2]	validation_0-auc:0.64444
[3]	validation_0-auc:0.64813
[4]	validation_0-auc:0.65703
[5]	validation_0-auc:0.66202
[6]	validation_0-auc:0.66485
[7]	validation_0-auc:0.67009
[8]	validation_0-auc:0.67642
[9]	validation_0-auc:0.67721
[10]	validation_0-auc:0.67944
[11]	validation_0-auc:0.68028
[12]	validation_0-auc:0.68067
[13]	validation_0-auc:0.68104
[14]	validation_0-auc:0.68077
[15]	validation_0-auc:0.68003
[16]	validation_0-auc:0.68037
[17]	validation_0-auc:0.68074
[18]	validation_0-auc:0.68051
[19]	validation_0-auc:0.68144
[20]	validation_0-auc:0.68122
[21]	validation_0-auc:0.68215
[22]	validation_0-auc:0.68035
[23]	validation_0-auc:0.67936
[24]	validation_0-auc:0.67740
[25]	validation_0-auc:0.67506
[26]	validation_0-auc:0.67466
[27]	validation_0-auc:0.67411
[28]	validation_0-auc:0.67356
[29]	validation_0-auc:0.67293
[30]	validation_0-auc:0.67255
[31]	validation_0-auc:0.67253
[32]	validation_0-auc:0.67164
[33]	validation_0-au

In [33]:
xgb_pred = xgb_model.predict_proba(x_valid)[:,1]
roc_auc_score(y_true=y_valid, y_score=xgb_pred)

0.6821496278705355

# Train CatBoost

In [34]:
cat_features = [col for col in x_train.columns if x_train[col].dtype.name == 'category' or x_train[col].dtype.name == 'object']

for col in cat_features:
    x_train[col] = x_train[col].cat.add_categories('Missing').fillna('Missing')
    x_valid[col] = x_valid[col].cat.add_categories('Missing').fillna('Missing')

cat_model = CatBoostClassifier(
    iterations=1200,                 
    depth=12,                        
    learning_rate=0.1,               
    eval_metric='AUC',               
    random_seed=42,                  
    bootstrap_type='Bayesian',       
    bagging_temperature=1,           
    od_type='Iter',                  
    od_wait=50,
    task_type='GPU'
)

cat_model.fit(
    x_train, y_train,
    eval_set=(x_valid, y_valid),
    cat_features=cat_features,
    use_best_model=True,
    verbose=True
)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6215030	best: 0.6215030 (0)	total: 319ms	remaining: 6m 22s
1:	total: 633ms	remaining: 6m 18s
2:	total: 949ms	remaining: 6m 18s
3:	total: 1.27s	remaining: 6m 20s
4:	total: 1.58s	remaining: 6m 18s
5:	test: 0.6757965	best: 0.6757965 (5)	total: 1.89s	remaining: 6m 16s
6:	total: 2.2s	remaining: 6m 15s
7:	total: 2.53s	remaining: 6m 17s
8:	total: 2.85s	remaining: 6m 16s
9:	total: 3.17s	remaining: 6m 17s
10:	test: 0.6910733	best: 0.6910733 (10)	total: 3.48s	remaining: 6m 16s
11:	total: 3.8s	remaining: 6m 16s
12:	total: 4.12s	remaining: 6m 16s
13:	total: 4.44s	remaining: 6m 16s
14:	total: 4.78s	remaining: 6m 17s
15:	test: 0.6969195	best: 0.6969195 (15)	total: 5.11s	remaining: 6m 18s
16:	total: 5.43s	remaining: 6m 17s
17:	total: 5.74s	remaining: 6m 17s
18:	total: 6.07s	remaining: 6m 17s
19:	total: 6.39s	remaining: 6m 16s
20:	test: 0.7044605	best: 0.7047380 (19)	total: 6.72s	remaining: 6m 17s
21:	total: 7.04s	remaining: 6m 16s
22:	total: 7.37s	remaining: 6m 17s
23:	total: 7.69s	remaini

<catboost.core.CatBoostClassifier at 0x1dcc276d450>

In [35]:
cat_pred = cat_model.predict_proba(x_valid)[:,1]
roc_auc_score(y_true=y_valid, y_score=cat_pred)

0.7185297670089263

# Ensemble

In [36]:
# use randomized search instead of linear search to save time
def RandomizedSearch(n_init, pred1, pred2, pred3, y_true, random_state=None):
    if random_state:
        np.random.seed(random_state)
    
    weight1 = np.arange(1, 20, 1)
    weight2 = np.arange(1, 20, 1)
    weight3 = np.arange(1, 20, 1)
    
    df = pd.DataFrame(columns=['weight1', 'weight2', 'weight3', 'score'])
    for i in range(n_init):
        # pick weight
        w1 = np.random.choice(weight1, replace=True) 
        w2 = np.random.choice(weight2, replace=True) 
        w3 = np.random.choice(weight3, replace=True) 
        
        y_ensemble = (w1*pred1 + w2*pred2 + w3*pred3)/(w1+w2+w3)
        score = roc_auc_score(y_true=y_true, y_score=y_ensemble)
        
        df.loc[i] = [w1, w2, w3, score]
        
    return df


In [37]:
df = RandomizedSearch(n_init=100, pred1=pred, pred2=xgb_pred, pred3=cat_pred, y_true=y_valid, random_state=8787)
df.sort_values(by=['score'], ascending=False)

Unnamed: 0,weight1,weight2,weight3,score
17,16.0,4.0,6.0,0.728308
12,19.0,7.0,11.0,0.728060
19,13.0,4.0,3.0,0.727654
45,12.0,4.0,3.0,0.727618
33,17.0,7.0,17.0,0.727483
...,...,...,...,...
69,2.0,19.0,5.0,0.710792
16,4.0,18.0,2.0,0.707327
6,1.0,18.0,3.0,0.704649
28,2.0,12.0,1.0,0.703595
