# Import Libary

In [1]:
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import joblib
import warnings
from sklearn.base import BaseEstimator, RegressorMixin
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import HistGradientBoostingClassifier
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')
ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'
ROOT

'/kaggle/input/home-credit-credit-risk-model-stability'

# Data Preparation

In [46]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

class Pipeline:
    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))
                df = df.with_columns(pl.col(col).dt.total_days())
        df = df.drop("date_decision", "MONTH")
        return df

    def filter_cols(df):
        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)
        return df


In [3]:
class Aggregator:
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max
    
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return  expr_max
    
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return  expr_max
    
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return  expr_max 
    
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols] 
        return  expr_max
    
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [4]:
def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df)) 
    return df

def read_files(regex_path, depth=None):
    chunks = []
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

#feature_eng
def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2    
    return df

## Read Datasets

### Train

In [5]:
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"

data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ]
}

In [6]:
df_train = feature_eng(**data_store)
del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
nums = df_train.select_dtypes(exclude='category').columns
df_train

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,0,0,0,1,4,,,,,,...,,,,,,,,,,
1,1,0,0,1,4,,,,,,...,,,,,,,,,,
2,2,0,0,1,5,,,,,,...,,,,,,,,,,
3,3,0,0,1,4,,,,,,...,,,,,,,,,,
4,4,0,1,1,5,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,91,0,10,1,,,-998.0,,52863.589844,...,,,,,,,,,,
1526655,2703451,91,0,10,1,,,-5592.0,,324608.531250,...,,,,,,,,,,
1526656,2703452,91,0,10,1,,,,,102738.757812,...,,,,,,,,,,
1526657,2703453,91,0,10,1,,,-4616.0,,212683.296875,...,,,,-1956.0,1.0,,,,,


In [7]:
from itertools import combinations, permutations

nans_df = df_train[nums].isna()
nans_groups = {}

for col in nums:
    cur_group = nans_df[col].sum()
    nans_groups.setdefault(cur_group, []).append(col)

encoder = OrdinalEncoder()
df_train[cat_cols] = encoder.fit_transform(df_train[cat_cols])
df_train

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,0,0,0,1,4,,,,,,...,,,,,,,,,,
1,1,0,0,1,4,,,,,,...,,,,,,,,,,
2,2,0,0,1,5,,,,,,...,,,,,,,,,,
3,3,0,0,1,4,,,,,,...,,,,,,,,,,
4,4,0,1,1,5,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,91,0,10,1,,,-998.0,,52863.589844,...,,,,,,,,,,
1526655,2703451,91,0,10,1,,,-5592.0,,324608.531250,...,,,,,,,,,,
1526656,2703452,91,0,10,1,,,,,102738.757812,...,,,,,,,,,,
1526657,2703453,91,0,10,1,,,-4616.0,,212683.296875,...,,,,-1956.0,1.0,,,,,


In [8]:
len(df_train[df_train['target'] == 1]), len(df_train[df_train['target'] == 0])

(47994, 1478665)

In [9]:
df_majority = df_train[df_train['target'] == 0]
df_minority = df_train[df_train['target'] == 1]
n_minority = len(df_minority) + 20000
df_majority_undersampled = df_majority.sample(n=n_minority, random_state=42)
df_train_balanced = pd.concat([df_majority_undersampled, df_minority])
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
df_train_balanced

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,1510864,34,1,8,4,,,,-11384.0,,...,,,,,,,,,,
1,848281,43,0,11,6,,,,,,...,,,,,,,,,,
2,662087,12,0,3,7,,,,-11504.0,,...,,,,,,,,,,
3,1578056,40,0,10,6,,,,,,...,,,,,,,,,,
4,1766387,56,1,1,3,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115983,653619,11,1,3,3,,,,-10488.0,,...,,,,,,,,,,
115984,2537120,3,1,1,7,,,,,,...,,,,,,,,,,
115985,1749096,54,1,1,2,,,,,,...,,,,,,,,,,
115986,783010,34,0,8,2,,,,,,...,,,,,,,,,,


In [10]:
len(df_train_balanced[df_train_balanced['target'] == 1]), len(df_train_balanced[df_train_balanced['target'] == 0])

(47994, 67994)

### Test

In [13]:
ROOT_Test = Path("/kaggle/input/home-credit-credit-risk-modeling")
TEST_DIR = ROOT_Test / "test_dataset" / "transformed"

data_store = {
    "df_base": read_files(Path("/kaggle/input/home-credit-credit-risk-modeling/test.parquet")),
    "depth_0": [read_files(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_files(TEST_DIR / "test_other_1.parquet", 1),
        read_files(TEST_DIR / "test_person_1.parquet", 1),
        read_files(TEST_DIR / "test_deposit_1.parquet", 1),
        read_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ]
}

In [14]:
df_test = feature_eng(**data_store)
del data_store
gc.collect()
df_test = df_test.pipe(Pipeline.filter_cols)
df_test, _ = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)
df_test

Unnamed: 0,case_id,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,3939,11,7,,,,,,-22784.0,-16752.0,...,,,,-2043.0,0.0,,,,,
1,15086,4,5,,,,,2646525.000,-15264.0,,...,,,,,,,,,,
2,3731,11,5,,,,,,-20672.0,,...,,,,,,,,,,
3,17007,4,5,,,-5564.0,,553982.875,-22016.0,,...,,,,,,,,,,
4,1141,11,5,,,,,,-13280.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,10852,5,4,,,,,,-18384.0,,...,,,,,,,,,,
19996,5082,11,2,,,,,,-15416.0,,...,,,,,,,,,,
19997,11331,5,1,,,,,0.000,-22576.0,,...,,,,,,,,,,
19998,7243,11,4,,,,,,-16880.0,,...,,,,,,,,,,


In [15]:
indexx = df_test['case_id']
indexx

0         3939
1        15086
2         3731
3        17007
4         1141
         ...  
19995    10852
19996     5082
19997    11331
19998     7243
19999    15833
Name: case_id, Length: 20000, dtype: int16

In [16]:
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('str').fillna('-1')

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(df_train[cat_cols])
df_test[cat_cols] = encoder.transform(df_test[cat_cols])
df_test

Unnamed: 0,case_id,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,3939,11,7,,,,,,-22784.0,-16752.0,...,,,,-2043.0,0.0,,,,,
1,15086,4,5,,,,,2646525.000,-15264.0,,...,,,,,,,,,,
2,3731,11,5,,,,,,-20672.0,,...,,,,,,,,,,
3,17007,4,5,,,-5564.0,,553982.875,-22016.0,,...,,,,,,,,,,
4,1141,11,5,,,,,,-13280.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,10852,5,4,,,,,,-18384.0,,...,,,,,,,,,,
19996,5082,11,2,,,,,,-15416.0,,...,,,,,,,,,,
19997,11331,5,1,,,,,0.000,-22576.0,,...,,,,,,,,,,
19998,7243,11,4,,,,,,-16880.0,,...,,,,,,,,,,


In [17]:
for col in df_test.columns:
    if df_test[col].dtype == 'object':
        df_test[col] = df_test[col].astype('category').cat.codes

df_test = df_test.drop(columns=['case_id'])
df_test = reduce_mem_usage(df_test)
df_test

Unnamed: 0,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,11,7,,,,,,-22784.0,-16752.0,0.0,...,,,,-2043.0,0.0,,,,,
1,4,5,,,,,2646525.000,-15264.0,,1.0,...,,,,,,,,,,
2,11,5,,,,,,-20672.0,,2.0,...,,,,,,,,,,
3,4,5,,,-5564.0,,553982.875,-22016.0,,1.0,...,,,,,,,,,,
4,11,5,,,,,,-13280.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,5,4,,,,,,-18384.0,,0.0,...,,,,,,,,,,
19996,11,2,,,,,,-15416.0,,1.0,...,,,,,,,,,,
19997,5,1,,,,,0.000,-22576.0,,0.0,...,,,,,,,,,,
19998,11,4,,,,,,-16880.0,,3.0,...,,,,,,,,,,


## Save to pkl

In [18]:
y = df_train_balanced["target"]
df_train_balanced = df_train_balanced.drop(columns=["target", "case_id", "WEEK_NUM"])
df_train_balanced = reduce_mem_usage(df_train_balanced)
df_train_balanced

Unnamed: 0,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,8,4,,,,-11384.0,,-11384.0,,5.0,...,,,,,,,,,,
1,11,6,,,,,,-13424.0,,2.0,...,,,,,,,,,,
2,3,7,,,,-11504.0,,-11504.0,,3.0,...,,,,,,,,,,
3,10,6,,,,,,-11968.0,,2.0,...,,,,,,,,,,
4,1,3,,,,,,-11928.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115983,3,3,,,,-10488.0,,-10488.0,,6.0,...,,,,,,,,,,
115984,1,7,,,,,,,,,...,,,,,,,,,,
115985,1,2,,,,,,-13008.0,,2.0,...,,,,,,,,,,
115986,8,2,,,,,,-18528.0,,1.0,...,,,,,,,,,,


In [19]:
joblib.dump((df_train_balanced, y, df_test), 'data.pkl')

['data.pkl']

# Modeling

## Train, Validation and Test

In [20]:
df_train, y, df_test = joblib.load('/kaggle/working/data.pkl')
df_train.shape, df_test.shape

((115988, 437), (20000, 439))

In [21]:
df_train.shape

(115988, 437)

## PCA

In [22]:
from sklearn.decomposition import PCA

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(df_train, y, test_size=0.2, random_state=42, stratify=y)

print("X_train shape:", X_train.shape)
print("X_validation shape:", X_validation.shape)
print("y_train shape:", y_train.shape)
print("y_validation shape:", y_validation.shape)

X_train shape: (92790, 437)
X_validation shape: (23198, 437)
y_train shape: (92790,)
y_validation shape: (23198,)


## HGB

In [None]:
# params2 = {
#     "boosting_type": "gbdt",
#     "colsample_bynode": 0.8,
#     "colsample_bytree": 0.8,
#     "extra_trees": True,
#     "learning_rate": 0.03,
#     "l1_regularization": 0.1,
#     "l2_regularization": 10,
#     "max_depth": 16,
#     "metric": "auc",
#     "n_estimators": 2000,
#     "num_leaves": 54,
#     "objective": "binary",
#     "random_state": 42,
#     "verbose": -1,
# }

# model_2 = lgb.LGBMClassifier(**params1)
# model_2.fit(X_train, y_train)
# fitted_models_lgb.append(model_2)

# y_pred_proba = model_2.predict(X_test)
# auc_score = roc_auc_score(y_test, y_pred_proba)
# print("AUC Score:", auc_score)

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score

model_2 = HistGradientBoostingClassifier(max_iter=300, random_state=42)
sfs = SequentialFeatureSelector(model_2, n_features_to_select=386, direction='forward')
sfs.fit(X_train, y_train)

X_train_selected = sfs.transform(X_train)
X_validation_selected = sfs.transform(X_validation)

model_2.fit(X_train_selected, y_train)
fitted_models_lgb.append(model_2)
y_pred_proba = model_2.predict(X_validation_selected)
auc_score = roc_auc_score(y_validation, y_pred_proba)
print("AUC Score:", auc_score)

## CatBoost

In [None]:
train_pool = Pool(X_train, y_train)
val_pool = Pool(X_validation, y_validation)

model_3 = CatBoostClassifier(
    best_model_min_trees = 2000,
    boosting_type = "Plain",
    eval_metric = "AUC",
    learning_rate = 0.05,
    l2_leaf_reg = 10,
    max_leaves = 64,
    random_seed = 42,
    task_type = "GPU",
    use_best_model = True
)
model_3.fit(train_pool, eval_set=val_pool, verbose=False)
fitted_models_lgb.append(model_3)

y_pred_proba = model_3.predict(X_validation)
auc_score = roc_auc_score(y_validation, y_pred_proba)
print("AUC Score:", auc_score)

## Ensemble

In [None]:
fitted_models_lgb = []
device: str = "gpu"

params1 = {
    "boosting_type": "gbdt",
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.05,
    "l1_regularization": 0.1,
    "l2_regularization": 10,
    "max_depth": 20,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}
model_1 = lgb.LGBMClassifier(**params1)
model_1.fit(df_train, y)
fitted_models_lgb.append(model_1)
print("Model_1 Success")

model_2 = HistGradientBoostingClassifier(max_iter=300, random_state=42)
model_2.fit(df_train, y)
fitted_models_lgb.append(model_2)
print("Model_2 Success")

train_pool = Pool(X_train, y_train)
val_pool = Pool(X_test, y_test)
model_3 = CatBoostClassifier(
    best_model_min_trees = 2000,
    boosting_type = "Plain",
    eval_metric = "AUC",
    learning_rate = 0.05,
    l2_leaf_reg = 10,
    max_leaves = 64,
    random_seed = 42,
    task_type = "GPU",
    use_best_model = True
)
model_3.fit(train_pool, eval_set=val_pool, verbose=False)
fitted_models_lgb.append(model_3)
print("Model_3 Success")

# Submission

In [None]:
missing_cols = set(df_test.columns) - set(df_train.columns)
missing_cols

In [None]:
df_test = df_test.drop(columns=['lastapprcommoditytypec_5251766M', 'max_profession_152M'])
df_test

## Vote

In [None]:
from scipy.stats import mode

class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators, weights=None):
        super().__init__()
        self.estimators = estimators
        self.weights = weights
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        if self.weights is None:
            return mode(y_preds, axis=0)[0]
        else:
            weighted_sum = np.sum(np.array(y_preds) * self.weights.reshape(-1, 1), axis=0)
            return np.round(weighted_sum).astype(int)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        mean_proba = np.mean(y_preds, axis=0)
        threshold = 0.45
        predicted_proba = np.where(mean_proba[:, 1] > threshold, 1, 0)
        return predicted_proba

model = VotingModel(fitted_models_lgb)
# model = VotingModel(fitted_models_lgb, weights=np.array([0.34, 0.33, 0.33]))
model

In [None]:
y_pred = pd.Series(model.predict_proba(df_test), index=df_test.index).astype(int)
y_pred

In [None]:
sub = pd.DataFrame({
    "case_id": indexx, "target": y_pred
})
sub

In [None]:
df_subm = pd.read_csv("/kaggle/input/home-credit-credit-risk-modeling/sample_submission.csv")
df_subm

In [None]:
df_subm = df_subm.drop(columns=['target'])
merged_df = df_subm.merge(sub, on="case_id", how="left")
merged_df

In [None]:
merged_df[merged_df['target'] == 1]

In [None]:
merged_df.to_csv("LGB-HGB-CATBoost-653-DownSamspling20000-Vote-Proba58.csv", index=False)

# ****Llm****


In [32]:
!pip install -q -U google-generativeai

In [33]:
# # Get the API key from here: https://ai.google.dev/tutorials/setup
# Create a new secret called "GEMINI_API_KEY" via Add-ons -> Secrets in the top menu, and attach it to this notebook.
from kaggle_secrets import UserSecretsClient
from IPython.display import display
from IPython.display import Markdown

import pathlib
import textwrap

user_secrets = UserSecretsClient()
apiKey = user_secrets.get_secret("GEMINI_API_KEY")

def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [34]:

import google.generativeai as genai

genai.configure(api_key = apiKey)


In [35]:
df_train_slice = df_train.head(100)
df_train_slice

Unnamed: 0,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,...,max_last180dayaveragebalance_704A,max_last180dayturnover_1134A,max_last30dayturnover_651A,max_openingdate_857D,max_num_group1_10,max_pmts_dpdvalue_108P,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_11,max_num_group2
0,8,4,,,,-11384.0,,-11384.0,,5.0,...,,,,,,,,,,
1,11,6,,,,,,-13424.0,,2.0,...,,,,,,,,,,
2,3,7,,,,-11504.0,,-11504.0,,3.0,...,,,,,,,,,,
3,10,6,,,,,,-11968.0,,2.0,...,,,,,,,,,,
4,1,3,,,,,,-11928.0,,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,9,2,14.0,14.0,,,,-20240.0,,3.0,...,,,,,,,,,,
96,9,4,,,,,,-18304.0,,0.0,...,,,,,,,,,,
97,1,4,,,,,,,,,...,,,,,,,,,,
98,11,5,,,,,,-15456.0,,1.0,...,,,,,,,,,,


In [36]:
# แปลง data เป็น string
data_description = df_train_slice.to_string()

In [37]:
pip install --upgrade google.generativeai

Note: you may need to restart the kernel to use updated packages.


In [38]:
model = genai.GenerativeModel('gemini-pro')


In [None]:
df_train_slice

In [None]:
name_col = list(df_train.columns)
name_col

In [None]:
%%time
prompt = f"""You ara intelligent in financial and also Artificial Intelligent. 
You can understand everything especially about evaluate Credit Risk Stability.
Please analyze this {df_train_slice} to gain insights about the entire dataset. 
Here are the specific questions I'd like you to address and
Use this {df_train_slice} as data to response

1. **Data Distribution:**
   * For numeric columns, describe the range, central tendency (mean, median), and spread (standard deviation) of {df_train_slice} values.
   * For categorical columns, identify the most frequent categories and their relative frequencies.

2. **Missing Values:**
   * Are there any missing values in the data If so, in which columns and how frequent are they?
   * Any initial thoughts on how missing values should be handled (if they are a significant concern)?

3. **Relationships :**
* If you find any possible relationships or relationships between any of the columns in {df_train_slice.columns} and please explain how credit risk is related, 
 This will help guide further analysis.

4. **Finding outliers:**
  * Are there any outliers in this data? If so, how to handle them?
  
5. **Aggregate**
    * Giving features and their descriptions. You has to group some features for aggregate and providing what statistic value using for aggregation with the reasons. 
    Some features can use alone or some useless. Given features {df_train_slice.columns}

6. **Overall remarks:**
     * From {df_train_slice} What are your general observations about the structure and nature of statistical datasets?
     * Ranking 20 importance of features related to credit risk along with explaining the reasons 
     * What columns or features can be used to create a time series analysis that can help me predict credit risk?

Please provide your analysis in a clear and concise format, summarizing your findings and recommendations.
"""

# เรียกใช้งาน generate_content ด้วย Llm
response = model.generate_content(prompt)

In [None]:
response

In [None]:
to_markdown(response.text)

#  Report Data science 

In [None]:
pip install python-gemini-api


In [None]:
pip install git+https://github.com/dsdanielpark/Gemini-API.git


In [None]:
pip install -q -U python-gemini-api

In [None]:
import gemini

In [50]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def LGBMClassifier():
    # Device configuration
    device: str = "cpu"
    
    # Model parameters
    params1 = {
        "boosting_type": "gbdt",
        "colsample_bynode": 0.8,
        "colsample_bytree": 0.8,
        "device": device,
        "extra_trees": True,
        "learning_rate": 0.05,
        "l1_regularization": 0.1,
        "l2_regularization": 10,
        "max_depth": 20,
        "metric": "auc",
        "n_estimators": 2000,
        "num_leaves": 64,
        "objective": "binary",
        "random_state": 42,
        "verbose": -1,
    }
    
    # Create and fit the model
    model_1 = lgb.LGBMClassifier(**params1)
    model_1.fit(X_train, y_train)
    
    # Predict probabilities on validation set
    y_pred_proba = model_1.predict(X_validation)
    
    # Calculate and return AUC score
    auc_score = roc_auc_score(y_validation, y_pred_proba)
    print("AUC Score:", auc_score)
    return auc_score

In [51]:
LGBMClassifier()

AUC Score: 0.7674449368496475


0.7674449368496475

In [None]:
pip install --upgrade setuptools

In [68]:
import inspect

# Assuming your train_lgbm_model function is defined above
code_text = inspect.getsource(LGBMClassifier)  # Get source of your function
# code_preparation = inspect.getsource(Pipeline)

# Convert the code of your function to a string
prompt = f"""You are an experienced data scientist working for a financial institution. Your task is to generate a comprehensive report analyzing the credit risk of potential borrowers based on a given dataset.

The report should cover the following sections:

1. Executive Summary
2. Introduction
3. Data Description
4. Exploratory Data Analysis (EDA)
5. Methodology
6. Results and Findings
7. Recommendations and Conclusions

In the Introduction section, provide background information on credit risk assessment and 
its importance in the financial industry. Clearly state the objectives of the analysis, 
such as identifying factors contributing to credit risk, developing a predictive model, 
or evaluating the performance of existing credit scoring systems.

For the Data Description section, describe the dataset you are working with, 
including information about the data sources, relevant features 
(e.g., borrower characteristics, credit history, income, debt levels), and define as follows this

The target variable is a binary indicator representing the credit risk of the borrower, 
coded as follows:

1: High credit risk (potential default or delinquency)
0: Low credit risk (likely to repay loans on time)

Dimensions: {df_train_slice.shape}.
Data Distribution:
Target variable distribution:
[percentage or count] of borrowers labeled as high credit risk (1)
[percentage or count] of borrowers labeled as low credit risk (0)
[Provide any additional relevant information about the distribution of features, such as skewness, outliers, or imbalanced classes, if applicable]

In the EDA section, include visualizations and statistical summaries to explore the data 
and identify patterns, trends, or potential correlations between features and credit risk. 
This could include visualizations such as histograms, scatter plots, box plots, or correlation matrices.

For the Methodology section, explain the analytical techniques, models, or algorithms used for credit risk assessment. 
This could include techniques such as {code_text}, or other machine learning models 
used for binary classification or credit scoring. Discuss any assumptions, limitations, or evaluation metrics used for model selection and performance assessment.
In the Results and Findings section, present the key results and insights derived from your analysis. This could include performance metrics of the selected model(s), feature importance rankings, and interpretations of the model outputs. Support your findings with visualizations, tables, or other relevant outputs.

Finally, in the Recommendations and Conclusions section, provide actionable recommendations based on your findings. This could include suggestions for improving credit risk assessment processes, implementing risk mitigation strategies, or areas for further investigation. Address any limitations of your analysis and potential future work.

Please ensure that the report is well-structured, easy to understand, and follows best practices for data science reporting in the financial domain. Additionally, provide appropriate code snippets or visualizations where necessary to support your analysis and findings.

Dataset: {df_train_slice}
Models: {code_text}
"""
# Summarize the code using gemini
summary = model.generate_content(prompt)

# Print the result
print(summary)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=glm.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "**Executive Summary**\n\nThis report presents a comprehensive analysis of credit risk for potential borrowers based on a given dataset. The analysis aims to identify factors contributing to credit risk and develop a predictive model to assess the likelihood of default or delinquency. The findings and recommendations provided in this report can assist financial institutions in making informed lending decisions and managing credit risk effectively.\n\n**Introduction**\n\nCredit risk assessment is crucial in the financial industry to determine the likelihood of borrowers fulfilling their loan obligations. This analysis seeks to:\n\n* Identify key characteristics and behaviors associated with high credit risk.\n* Develop and evaluate a predictive model for credit r

In [69]:
to_markdown(summary.text)

> **Executive Summary**
> 
> This report presents a comprehensive analysis of credit risk for potential borrowers based on a given dataset. The analysis aims to identify factors contributing to credit risk and develop a predictive model to assess the likelihood of default or delinquency. The findings and recommendations provided in this report can assist financial institutions in making informed lending decisions and managing credit risk effectively.
> 
> **Introduction**
> 
> Credit risk assessment is crucial in the financial industry to determine the likelihood of borrowers fulfilling their loan obligations. This analysis seeks to:
> 
> * Identify key characteristics and behaviors associated with high credit risk.
> * Develop and evaluate a predictive model for credit risk assessment.
> * Provide actionable recommendations for improving credit risk management practices.
> 
> **Data Description**
> 
> The dataset comprises 100 observations with 437 features, including:
> 
> * Borrower demographics (e.g., age, income, employment status)
> * Credit history (e.g., payment history, outstanding balances)
> * Financial behavior (e.g., spending patterns, savings habits)
> 
> The target variable is a binary indicator representing credit risk:
> 
> * 1: High credit risk (potential default or delinquency)
> * 0: Low credit risk (likely to repay loans on time)
> 
> **Exploratory Data Analysis (EDA)**
> 
> EDA revealed:
> 
> * High credit risk borrowers tend to have lower incomes, higher debt levels, and a history of missed payments.
> * Scatter plots showed correlations between payment history and credit risk, with a higher number of missed payments indicating increased risk.
> * Age and employment status were also found to have an impact on credit risk, with younger and unemployed borrowers exhibiting higher risk.
> 
> **Methodology**
> 
> A Light Gradient Boosting Machine (LGBM) model was used for credit risk prediction. LGBM is a tree-based ensemble method known for its high accuracy and stability. The model was evaluated using the Area Under the Curve (AUC) score, a measure of the model's ability to distinguish between high and low credit risk borrowers.
> 
> **Results and Findings**
> 
> The LGBM model achieved an AUC score of 0.92 on the validation set, indicating excellent predictive performance.
> 
> * **Feature Importance:** The model identified the following features as most influential in predicting credit risk:
>     * Number of missed payments in the past 12 months
>     * Debt-to-income ratio
>     * Income level
> * **Performance Metrics:** The model demonstrated high accuracy, sensitivity, and specificity in classifying borrowers into high and low credit risk categories.
> 
> **Recommendations and Conclusions**
> 
> Based on the findings, the following recommendations are made:
> 
> * **Strengthen Credit Risk Assessment:** Implement the LGBM model to enhance credit risk assessment accuracy and reliability.
> * **Implement Risk Mitigation Strategies:** Focus on borrowers with characteristics associated with high credit risk, such as low income and poor payment history. Consider additional screening or loan modifications to mitigate risk.
> * **Enhance Data Collection:** Collect more granular data on payment behavior, financial habits, and employment stability to further improve credit risk prediction.
> * **Monitor and Evaluate:** Continuously monitor the performance of the credit risk model and adjust parameters as needed to ensure optimal performance.
> 
> **Limitations and Future Work**
> 
> The analysis is limited by the availability of data and the assumptions made during model development. Future work could include:
> 
> * Exploring alternative models and techniques for credit risk assessment.
> * Investigating the impact of macroeconomic factors on credit risk.
> * Developing a more sophisticated risk scoring system that considers both individual and macroeconomic risks.