In [18]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, log_loss

In [19]:
# Read the training table, then separate it into the feature frame `x`
# (everything except "Id" and the label) and the label series `y`.
train_path = './data/train.csv'
df = pd.read_csv(train_path)

x = df.drop(columns=["Id", "SeriousDlqin2yrs"])
y = df["SeriousDlqin2yrs"]

# Hold out 10% of the rows for validation; the split is stratified on the
# label and uses a fixed seed so it is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [37]:
# Columns stored as int64 / float64 are treated as the numeric feature set.
num_features = x.select_dtypes(include=["int64", "float64"]).columns

# Numeric branch: median-impute missing values, then standardise.
num_p = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Only the numeric branch is registered with the ColumnTransformer.
preprocess = ColumnTransformer(transformers=[("num", num_p, num_features)])

# Class-weighted logistic regression on top of the shared preprocessing.
log_reg = Pipeline(steps=[
    ("prep", preprocess),
    ("lr", LogisticRegression(
        solver="lbfgs",
        class_weight="balanced",
        max_iter=4000,
    )),
])

In [38]:
# Fit the preprocessing steps and the classifier on the training split only.
log_reg.fit(X=x_train, y=y_train)

0,1,2
,steps,"[('prep', ...), ('lr', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,4000


In [41]:
# Validation scores for the logistic-regression pipeline, computed from the
# second predict_proba column (index 1; presumably the positive class).
prediction = log_reg.predict_proba(x_val)[:, 1]

auc_lr, ll = roc_auc_score(y_val, prediction), log_loss(y_val, prediction)
auc_lr, ll

(0.7756620454709193, 0.5811321472946759)

In [48]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree baseline: the same `preprocess` transformer object used by the
# logistic-regression pipeline, followed by a depth- and leaf-size-limited tree.
tree = Pipeline([
    ("preprocess", preprocess),
    ("tree", DecisionTreeClassifier(
        max_depth = 10,
        min_samples_leaf = 30,
        random_state = 42
    ))
])

tree.fit(x_train, y_train)

# Score the validation split using the second predict_proba column (index 1).
# The AUC gets its own name (`auc_tree`) so it no longer overwrites the
# logistic-regression result held in `auc_lr`.
prediction = tree.predict_proba(x_val)[:,1]
auc_tree = roc_auc_score(y_val, prediction)
ll = log_loss(y_val, prediction)
auc_tree, ll

(0.8517553339302808, 0.21058193778832923)

In [52]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline: the same `preprocess` transformer object as the other
# pipelines, followed by 500 shallow trees with a minimum leaf size.
rf = Pipeline([
    ("preprocess", preprocess),
    ("rf", RandomForestClassifier(
        n_estimators = 500,
        max_depth = 5,
        min_samples_leaf = 10,
        random_state = 42))
])

rf.fit(x_train, y_train)

# Score the validation split using the second predict_proba column (index 1).
# The AUC gets its own name (`auc_rf`) instead of being written into `auc_lr`,
# which belongs to the logistic-regression cell.
prediction = rf.predict_proba(x_val)[:,1]
auc_rf = roc_auc_score(y_val, prediction)
ll = log_loss(y_val, prediction)
auc_rf, ll


(0.8651048260104364, 0.17950566313279587)

In [53]:
# Gradient-boosting pipeline: shared preprocessing, then 300 depth-3 boosting
# stages with a small learning rate and a fixed seed.
gb = Pipeline(steps=[
    ("preprocess", preprocess),
    ("gb", GradientBoostingClassifier(
        n_estimators=300,
        learning_rate=1e-2,
        max_depth=3,
        random_state=42,
    )),
])
gb.fit(X=x_train, y=y_train)
        

0,1,2
,steps,"[('preprocess', ...), ('gb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,loss,'log_loss'
,learning_rate,0.01
,n_estimators,300
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [55]:

# Score the gradient-boosting pipeline on the validation split using the second
# predict_proba column (index 1). The AUC gets its own name (`auc_gb`) instead
# of being written into `auc_lr`, which belongs to the logistic-regression cell.
prediction = gb.predict_proba(x_val)[:,1]
auc_gb = roc_auc_score(y_val, prediction)
ll = log_loss(y_val, prediction)
auc_gb, ll

(0.8639756251405988, 0.17935576335330072)

In [56]:
# Embedding experiment: PyTorch MLP on the imputed, scaled numeric features

In [91]:
# Stand-alone imputer and scaler for the PyTorch model, applied outside any
# sklearn Pipeline so the transformed arrays can be handed to torch directly.
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()

# Training split: both transformers are fitted here.
train_imputed = num_imputer.fit_transform(x_train[num_features])
Xtr_num = num_scaler.fit_transform(train_imputed)

# Validation split: reuse the fitted transformers without refitting.
val_imputed = num_imputer.transform(x_val[num_features])
Xva_num = num_scaler.transform(val_imputed)

In [92]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert the preprocessed feature arrays and the label vectors to float32
# tensors (float labels are what the BCE-with-logits loss used later expects).
Xtr_num_t, Xva_num_t, ytr_t, yva_t = (
    torch.tensor(values, dtype=torch.float32)
    for values in (Xtr_num, Xva_num, y_train.values, y_val.values)
)

# Shuffled mini-batches of 128 rows over the training tensors.
train_loader = DataLoader(
    dataset=TensorDataset(Xtr_num_t, ytr_t),
    batch_size=128,
    shuffle=True,
)

In [93]:
import torch.nn as nn
import math

class EmbNN(nn.Module):
    """Single-hidden-layer MLP that maps numeric features to one logit per row.

    Architecture: Linear(num_dim, hidden_dim) -> LayerNorm -> ReLU -> Dropout
    -> Linear(hidden_dim, 1). No sigmoid is applied, so the output is a raw
    logit suited to a logits-based loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        num_dim: Number of input features per row.
        hidden_dim: Width of the hidden layer. Defaults to 128, the value that
            was previously hard-coded.
        dropout: Dropout probability applied after the ReLU. Defaults to 0.1,
            the value that was previously hard-coded.
    """

    def __init__(self, num_dim, hidden_dim=128, dropout=0.1):
        super().__init__()

        self.mlp = nn.Sequential(
            nn.Linear(num_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x_num):
        """Return logits of shape (batch,) for ``x_num`` of shape (batch, num_dim)."""
        # squeeze(1) drops only the size-1 output axis, so a batch containing
        # a single row still comes back with shape (1,) rather than a scalar.
        return self.mlp(x_num).squeeze(1)
            

In [94]:
from sklearn.metrics import roc_auc_score

# Seed PyTorch's global RNG before building the model so that weight
# initialisation is repeatable. When the cells are run in order, the DataLoader
# shuffling and dropout masks that draw on the same global RNG are repeatable
# too. 42 matches the random_state used for the sklearn split and models.
torch.manual_seed(42)

# Input width is taken from the preprocessed training matrix.
model = EmbNN(
    num_dim = Xtr_num.shape[1])

# Adam on all model parameters; the loss takes raw logits (the model applies
# no sigmoid of its own).
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

In [95]:
# Train for a fixed number of epochs and report the validation AUC after each.
num_epoches = 20
for epoch in range(num_epoches):
    # Optimisation pass over the mini-batches (train mode enables dropout).
    model.train()
    for features, targets in train_loader:
        opt.zero_grad()
        logits = model(features)
        loss = loss_fn(logits, targets)
        loss.backward()
        opt.step()

    # Validation: one gradient-free forward pass over the whole hold-out set
    # in eval mode, then logits -> probabilities.
    model.eval()
    with torch.no_grad():
        val_logits = model(Xva_num_t)
        val_prob = val_logits.sigmoid().numpy()

    auc = roc_auc_score(y_val, val_prob)
    print(f"Epoch {epoch:02d} | AUC: {auc:.4f}")

Epoch 00 | AUC: 0.8204
Epoch 01 | AUC: 0.8275
Epoch 02 | AUC: 0.8291
Epoch 03 | AUC: 0.8326
Epoch 04 | AUC: 0.8319
Epoch 05 | AUC: 0.8346
Epoch 06 | AUC: 0.8351
Epoch 07 | AUC: 0.8348
Epoch 08 | AUC: 0.8344
Epoch 09 | AUC: 0.8318
Epoch 10 | AUC: 0.8350
Epoch 11 | AUC: 0.8359
Epoch 12 | AUC: 0.8305
Epoch 13 | AUC: 0.8341
Epoch 14 | AUC: 0.8349
Epoch 15 | AUC: 0.8349
Epoch 16 | AUC: 0.8358
Epoch 17 | AUC: 0.8344
Epoch 18 | AUC: 0.8350
Epoch 19 | AUC: 0.8357
