In [77]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score, accuracy_score

import torch
import torch.nn as nn
from torch.utils import data

from model import Transformer, KAN, TransformerKAN

model_name = "KAN" # "Transformer", "KAN", "TransformerKAN"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
PATH = f"/home/ayrtont/Desktop/coding/Kaggle-Home-Credit-Credit-Risk-Model-Stability/ckpt/{model_name}.pth"


In [78]:
# stability_results = []
# for fold, (idx, model) in enumerate(zip(cv_results['indices']['test'], cv_results['estimator'])):
#     df_res = pd.DataFrame()
    
#     df_res['WEEK_NUM'] = df_train['WEEK_NUM'].iloc[idx].values
#     df_res['target'] = df_train['target'].iloc[idx].values
#     df_res['score'] = model.predict_proba(X.iloc[idx])[:, 1]
#     df_res['fold'] = fold
#     stability_results.append(df_res)
    
# df_stability_results = pd.concat(stability_results)
# df_stability_results[df_stability_results['target'] == 0]['score'].plot(kind='hist', alpha=0.5, bins=100, label='target=0')
# df_stability_results[df_stability_results['target'] == 1]['score'].plot(kind='hist', secondary_y=True, alpha=0.5, bins=100, label='target=1')
# plt.legend();


def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

# try:
#     df_stability_results.groupby('fold').apply(gini_stability, include_groups=False)
# except:
#     pass

In [79]:
def _to_pandas(df):
    df = df.to_pandas().set_index('case_id')
    df = df.replace([np.inf, -np.inf], np.nan)
    return df

def reduce_memory_usage_pl(df):
        """ Reduce memory usage by polars dataframe {df} with name {name} by changing its data types.
            Original pandas version of this function: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 """
        print(f"Memory usage of dataframe is {round(df.estimated_size('mb'), 2)} MB")
        Numeric_Int_types = [pl.Int8,pl.Int16,pl.Int32,pl.Int64]
        Numeric_Float_types = [pl.Float32,pl.Float64]    
        for col in df.columns:
            try:
                col_type = df[col].dtype
                if col_type == pl.Categorical:
                    continue
                c_min = df[col].min()
                c_max = df[col].max()
                if col_type in Numeric_Int_types:
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df = df.with_columns(df[col].cast(pl.Int8))
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df = df.with_columns(df[col].cast(pl.Int16))
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df = df.with_columns(df[col].cast(pl.Int32))
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in Numeric_Float_types:
                    if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df = df.with_columns(df[col].cast(pl.Float32))
                    else:
                        pass
                # elif col_type == pl.Utf8:
                #     df = df.with_columns(df[col].cast(pl.Categorical))
                else:
                    pass
            except:
                pass
        print(f"Memory usage of dataframe became {round(df.estimated_size('mb'), 2)} MB")
        return df

In [80]:
# data = pl.read_parquet('dataset/train_sample_first_ten.parquet')

# dataset = pl.read_parquet('dataset/train_filter_features_sample_first_ten.parquet')
dataset = pl.read_parquet('dataset/train_filter_features_sample_random.parquet')
# get_label = pl.read_parquet('dataset/train_sample_first_ten.parquet')

# data = reduce_memory_usage_pl(data)
dataset = _to_pandas(dataset)
# label = _to_pandas(get_label)['target']

label = dataset['target']
dataset = dataset.drop(columns=['target'])
dataset

Unnamed: 0_level_0,numrejects9m_859L,maxdpdinstlnum_3546846P,pmtaverage_3A,for3years_128L,isbidproductrequest_292L,formonth_535L,numinstpaidlastcontr_4325080L,forquarter_634L,mindbdtollast24m_4525191P,clientscnt_1130L,...,applicationscnt_629L,totaldebt_9A,numinstregularpaid_973L,pmtaverage_4527227A,posfpd10lastmonth_333P,avgdbddpdlast3m_4187120P,foryear_850L,pctinstlsallpaidlate1d_3546856L,numactiverelcontr_750L,twobodfilling_608L
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1813397,0.0,2.0,,,,,3.0,,1.0,0.0,...,0.0,44572.00000,6.0,,0.0,1.0,,1.00000,0.0,FO
1449809,0.0,,,,,,3.0,,,0.0,...,0.0,0.00000,3.0,,0.0,,,0.00000,1.0,FO
1599588,0.0,6.0,,0.0,,0.0,6.0,0.0,-2.0,0.0,...,0.0,0.00000,39.0,,0.0,-1.0,2.0,0.10256,0.0,FO
1485727,0.0,4.0,,,,,6.0,,,0.0,...,0.0,0.00000,6.0,,0.0,,,0.83333,0.0,FO
1794015,0.0,,,,,,0.0,,-94.0,0.0,...,0.0,212514.40625,38.0,,0.0,-17.0,,0.00000,2.0,FO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1324856,0.0,13.0,,,,,,,,0.0,...,0.0,0.00000,24.0,,0.0,,,0.70833,0.0,FO
1910365,0.0,5.0,,,,,9.0,,3556.0,0.0,...,0.0,0.00000,9.0,,0.0,,,0.66667,0.0,FO
219338,0.0,1.0,,,,,4.0,,-30.0,0.0,...,0.0,0.00000,40.0,,0.0,0.0,,0.02500,0.0,FO
828380,0.0,,,,,,,,,0.0,...,0.0,0.00000,,7162.399902,0.0,,,,0.0,FO


In [81]:
numeric_features = dataset.select_dtypes(include=np.number).columns
dataset[numeric_features] = dataset[numeric_features].apply(
    lambda x: (x-x.mean())/(x.std()))
dataset[numeric_features] = dataset[numeric_features].fillna(0)


cat_features = [col for col in dataset.columns if dataset[col].dtype.name == 'category' or dataset[col].dtype.name == 'object']
dataset = pd.get_dummies(dataset, columns=cat_features, dtype=np.float32)

dataset

Unnamed: 0_level_0,numrejects9m_859L,maxdpdinstlnum_3546846P,pmtaverage_3A,for3years_128L,formonth_535L,numinstpaidlastcontr_4325080L,forquarter_634L,mindbdtollast24m_4525191P,clientscnt_1130L,maxdpdlast3m_392P,...,lastst_736L_K,lastst_736L_N,lastst_736L_S,lastst_736L_Q,lastst_736L_L,lastst_736L_H,lastst_736L_R,lastst_736L_P,twobodfilling_608L_BO,twobodfilling_608L_FO
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1813397,-0.296087,-1.045258,0.0,0.000000,0.000000,-0.775294,0.000000,0.010364,-0.121362,-0.040985,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1449809,-0.296087,0.000000,0.0,0.000000,0.000000,-0.775294,0.000000,0.000000,-0.121362,-0.048267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1599588,-0.296087,-0.377724,0.0,-0.016429,-0.441217,-0.285828,-0.535145,0.000428,-0.121362,-0.048267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1485727,-0.296087,-0.711491,0.0,0.000000,0.000000,-0.285828,0.000000,0.000000,-0.121362,-0.048267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1794015,-0.296087,0.000000,0.0,0.000000,0.000000,-1.264761,0.000000,-0.304260,-0.121362,-0.048267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1324856,-0.296087,0.790461,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.121362,-0.048267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1910365,-0.296087,-0.544607,0.0,0.000000,0.000000,0.203638,0.000000,11.783907,-0.121362,-0.048267,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
219338,-0.296087,-1.212142,0.0,0.000000,0.000000,-0.612139,0.000000,-0.092303,-0.121362,-0.048267,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
828380,-0.296087,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,-0.121362,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [82]:
x_train, x_valid, y_train, y_valid = train_test_split(dataset, label, test_size=0.3, shuffle=True)


x_train  = torch.tensor(x_train.values, dtype=torch.float32).to(device)
x_valid  = torch.tensor(x_valid.values, dtype=torch.float32).to(device)
y_train  = torch.tensor(y_train.values, dtype=torch.float32).to(device).reshape(-1, 1)
y_valid  = torch.tensor(y_valid.values, dtype=torch.float32).reshape(-1, 1)

In [83]:
from tqdm import tqdm

def load_data(data_array, batch_size, is_train=True):
    dataset = data.TensorDataset(*data_array)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

def train(model, x_train, y_train, x_valid, y_valid, num_epochs, lr, batch_size, weight_decay):
    best_validation = 0
    train_iter = load_data((x_train, y_train), batch_size)
    val_iter = load_data((x_valid, y_valid), batch_size, is_train=False)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
    # loss = nn.BCELoss()
    # loss = nn.BCEWithLogitsLoss()
    loss = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(5).to(device))
    # loss = nn.CrossEntropyLoss(weight=torch.tensor(0.7).to(device))
    for epoch in range(num_epochs):
        model.train()
        print("training")
        for X, y in tqdm(train_iter):
            # pred = torch.argmax(model(X), dim=1)
            # pred = model(X)[:,1].reshape(-1, 1)
            logits = model(X)
            # print(pred.shape)
            # print(y)
            l = loss(logits, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        model.eval()
        y_prob = np.array([])
        y_pred = np.array([])
        cur_validation = 0
        print("validation")
        with torch.no_grad():
            for X, y in tqdm(val_iter):
                # print(X.shape)
                # X = X.view(-1, X.shape[1])
                m = model.predict(X)
                pred, prob = m[0].flatten(), m[1].flatten()
                # pred = m.flatten() 
                y_prob = np.concatenate((y_prob, prob.cpu().detach().numpy()))
                y_pred = np.concatenate((y_pred, pred.cpu().detach().numpy()))
                # print(y_prob.shape)
                # print(y_pred.shape)
            # validation_score.append(validation(y_valid, y_prob))
            # print(y_pred)
            cur_validation = validation(y_valid, y_prob)
            print(f"roc_auc_score: {cur_validation}")
            print(f"accuracy: {accuracy(y_valid, y_prob)}")
            if cur_validation > best_validation:
                best_validation = cur_validation
                print("Save new model")
                torch.save(model, PATH)
        # scheduler.step()
    
    return y_prob, y_pred

def validation(y_valid, y_score):
    return roc_auc_score(y_true=y_valid, y_score=y_score)

# only use to get test batch
def get_batch(x, batch_size):
    ix = torch.arange(0, len(x)-batch_size+1, batch_size)
    # print(ix)
    # print(x.shape)
    x = torch.stack([x[i:i+batch_size] for i in ix])
    # print(x.shape)
    return x

def accuracy(y_true, y_pred):
    y_pred = np.where(y_pred > 0.5, 1., 0.)
    # y_pred = np.argmax(y_pred, dim=-1)
    # y_pred[y_pred > 0.5] = 1.
    # y_pred[y_pred <= 0.5] = 0
    # print(np.sum(y_pred))
    return accuracy_score(y_true=y_true, y_pred=y_pred)

def predict(model, x_test, batch_size):
    test_iter = get_batch(x_test, batch_size)
    res = []
    model.eval()
    with torch.no_grad():
        for X in test_iter:
            label, prob = model.predict(X)
            # print(prob.shape)
            res += prob.flatten()
            
    return res

In [84]:
num_epoch, lr, batch_size, weight_decay = 5, 1e-3, 64, 1e-4

# "Transformer", "KAN", "TransformerKAN"
if model_name == "Transformer":
    print("Train Transformer")
    model = Transformer(in_features=len(x_train[0]), drop=0.).to(device)
elif model_name == "KAN":
    print("Train KAN")
    model = KAN([len(x_train[0]), batch_size, 1]).to(device)
elif model_name == "TransformerKAN":
    print("Train TransformerKAN")
    model = TransformerKAN(in_features=len(x_train[0]), drop=0.).to(device)


prob, pred = train(model, x_train, y_train, x_valid, y_valid, num_epoch, lr, batch_size, weight_decay)
# print(score)

Train KAN
training


100%|██████████| 1670/1670 [00:04<00:00, 373.28it/s]


validation


100%|██████████| 716/716 [00:00<00:00, 879.13it/s]


roc_auc_score: 0.7525503677122339
accuracy: 0.9649344978165939
Save new model
training


100%|██████████| 1670/1670 [00:04<00:00, 415.63it/s]


validation


100%|██████████| 716/716 [00:00<00:00, 936.06it/s]


roc_auc_score: 0.7550212303396464
accuracy: 0.9601965065502184
Save new model
training


100%|██████████| 1670/1670 [00:04<00:00, 412.88it/s]


validation


100%|██████████| 716/716 [00:00<00:00, 958.55it/s]


roc_auc_score: 0.758073545885903
accuracy: 0.964235807860262
Save new model
training


100%|██████████| 1670/1670 [00:03<00:00, 424.41it/s]


validation


100%|██████████| 716/716 [00:00<00:00, 942.88it/s]


roc_auc_score: 0.7600473192332421
accuracy: 0.9602620087336244
Save new model
training


100%|██████████| 1670/1670 [00:03<00:00, 422.57it/s]


validation


100%|██████████| 716/716 [00:00<00:00, 909.10it/s]

roc_auc_score: 0.7580282453453269
accuracy: 0.9517685589519651





In [85]:
print(prob)

[0.29501757 0.08230102 0.13696159 ... 0.09739676 0.23899536 0.16472353]


In [86]:
print(accuracy(y_valid, prob))
count = 0
for y1, y2 in zip(prob, y_valid):
    if (y1 > 0.5 and y2 == 1):
        # print(f"prob: {y1}, label: {y2}")
        count += 1
print(count)
# print(accuracy(label, prob))

0.9517685589519651
183


In [87]:
load_PATH = "/home/ayrtont/Desktop/coding/Kaggle-Home-Credit-Credit-Risk-Model-Stability/ckpt/TransformerKAN.pth"
m = torch.load(PATH)

In [88]:
res = predict(m, x_valid, batch_size)

for pro, lab in zip(res, y_valid):
    if pro > 0.5 and lab == 1:
        print(f"prob: {pro}, label: {lab}")

prob: 0.5297530889511108, label: tensor([1.])
prob: 0.5131036639213562, label: tensor([1.])
prob: 0.7869729995727539, label: tensor([1.])
prob: 0.6509506702423096, label: tensor([1.])
prob: 0.622656524181366, label: tensor([1.])
prob: 0.6067826747894287, label: tensor([1.])
prob: 0.6738108992576599, label: tensor([1.])
prob: 0.5543125867843628, label: tensor([1.])
prob: 0.7199099659919739, label: tensor([1.])
prob: 0.6449520587921143, label: tensor([1.])
prob: 0.5598059296607971, label: tensor([1.])
prob: 0.5186902284622192, label: tensor([1.])
prob: 0.6957083344459534, label: tensor([1.])
prob: 0.6137818694114685, label: tensor([1.])
prob: 0.5435118079185486, label: tensor([1.])
prob: 0.5100193023681641, label: tensor([1.])
prob: 0.590916097164154, label: tensor([1.])
prob: 0.5828084349632263, label: tensor([1.])
prob: 0.5583236813545227, label: tensor([1.])
prob: 0.6260208487510681, label: tensor([1.])
prob: 0.5055991411209106, label: tensor([1.])
prob: 0.5734447836875916, label: ten