In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv
/kaggle/input/home-credit-credit-risk-model-stability/feature_definitions.csv
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_deposit_1.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_applprev_2.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_static_cb_0.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_static_0_0.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_credit_bureau_a_1_3.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_credit_bureau_a_1_2.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_tax_registry_b_1.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_static_0_2.parquet
/kaggle/input/home-credit-credit-risk-model-st

In [2]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"
InputFile = "/kaggle/input/my-home-credit-credit-risk-model-stability"



In [3]:
X_train = pd.read_csv(InputFile + "/X_train.csv")
X_valid = pd.read_csv(InputFile + "/X_valid.csv")
X_test = pd.read_csv(InputFile + "/X_test.csv")

y_train = pd.read_csv(InputFile + "/y_train.csv")
y_valid = pd.read_csv(InputFile + "/y_valid.csv")
y_test = pd.read_csv(InputFile + "/y_test.csv")

base_train = pd.read_csv(InputFile + "/base_train.csv")
base_valid = pd.read_csv(InputFile + "/base_valid.csv")
base_test = pd.read_csv(InputFile + "/base_test.csv")

In [4]:
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
            
    return df

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

# 機械学習モデルの構築・学習・予測

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

seed = 42

xgb = XGBClassifier(random_state=seed)
lgb = LGBMClassifier(random_state=seed)
cb = CatBoostClassifier(random_state=seed, verbose=0)

submit = True

# ハイパーパラメータ(optuna)

## LGBMClassifier

In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import lightgbm as lgb

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report

from optuna.samplers import TPESampler
import optuna

import warnings
warnings.filterwarnings("ignore")

In [7]:
def objective(trial):
    param = {
        "objective": "binary",
        "metric": "auc",
        "verbose": -1,
        "boosting_type": "gbdt",
#         "num_class": 7,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "random_state": 42,
    }
    
    gbm = lgb.LGBMClassifier(**param)
    gbm.fit(X_train, y_train)
    preds = gbm.predict(X_valid)
    accuracy = accuracy_score(y_valid, preds)
    
    return accuracy

In [8]:
sampler = TPESampler(seed=42)
lgb_study = optuna.create_study(study_name="lightgbm", direction="maximize", sampler=sampler)
lgb_study.optimize(objective, n_trials=100)

[I 2024-03-14 10:19:47,653] A new study created in memory with name: lightgbm
[I 2024-03-14 10:20:17,814] Trial 0 finished with value: 0.9690075065829982 and parameters: {'lambda_l1': 2.348881295853308e-05, 'lambda_l2': 3.6010467344475403, 'num_leaves': 188, 'feature_fraction': 0.759195090518222, 'bagging_fraction': 0.4936111842654619, 'bagging_freq': 2, 'min_child_samples': 10}. Best is trial 0 with value: 0.9690075065829982.
[I 2024-03-14 10:20:47,308] Trial 1 finished with value: 0.9689682051013323 and parameters: {'lambda_l1': 0.6245760287469893, 'lambda_l2': 0.002570603566117598, 'num_leaves': 182, 'feature_fraction': 0.41235069657748147, 'bagging_fraction': 0.9819459112971965, 'bagging_freq': 6, 'min_child_samples': 25}. Best is trial 0 with value: 0.9690075065829982.
[I 2024-03-14 10:21:11,650] Trial 2 finished with value: 0.9690206070768868 and parameters: {'lambda_l1': 4.329370014459266e-07, 'lambda_l2': 4.4734294104626844e-07, 'num_leaves': 79, 'feature_fraction': 0.714853858

In [9]:
print('Best parameters:', lgb_study.best_params)

Best parameters: {'lambda_l1': 1.683094921772377, 'lambda_l2': 0.002665604099762467, 'num_leaves': 21, 'feature_fraction': 0.815220742109687, 'bagging_fraction': 0.7163065619735258, 'bagging_freq': 2, 'min_child_samples': 74}


In [10]:
print('Best value:', lgb_study.best_value)

Best value: 0.9691057602871628


In [11]:
print('Best trial:', lgb_study.best_trial)

Best trial: FrozenTrial(number=81, state=TrialState.COMPLETE, values=[0.9691057602871628], datetime_start=datetime.datetime(2024, 3, 14, 10, 53, 11, 718141), datetime_complete=datetime.datetime(2024, 3, 14, 10, 53, 34, 674366), params={'lambda_l1': 1.683094921772377, 'lambda_l2': 0.002665604099762467, 'num_leaves': 21, 'feature_fraction': 0.815220742109687, 'bagging_fraction': 0.7163065619735258, 'bagging_freq': 2, 'min_child_samples': 74}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_freq': IntDistribution(high=7, log=False, low=1, step=1), 'min_child_samples': IntDistribution(hig

# XGBClassifier

In [12]:
if submit == False:
    import optuna
    from xgboost import XGBClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    # Assuming 'X' is your feature matrix and 'y' is your target variable


    def objective(trial):

        params = {
            'max_depth': trial.suggest_int('max_depth', 5, 10),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 150, 1000),
            'subsample': trial.suggest_float('subsample', 0.01, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
            'random_state': trial.suggest_categorical('random_state', [42]),
            'tree_method': 'hist',  # Use GPU for training
            'device': 'cuda',
            'njobs': -1,
            'eval_metric': 'auc',  # Evaluation metric
            'verbosity': 2,  # Set verbosity to 0 for less output
        }

        model = XGBClassifier(**params)

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)

        y_pred = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, y_pred)

        return auc

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: ', len(study.trials))
    print('Best trial:')
    trial = study.best_trial

    print('Value: ', trial.value)
    print('Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

# Catboost

In [13]:
if submit==False:

    import optuna
    from catboost import CatBoostClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    # Assuming 'X' is your feature matrix and 'y' is your target variable


    def objective(trial):

        params = {
            'iterations': trial.suggest_int('iterations', 200, 1000),
            'depth': trial.suggest_int('depth', 3, 10),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 20),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.2, log=True),
            'random_state': 42,
            'verbose': 0,
            'eval_metric': 'AUC',
        }

        model = CatBoostClassifier(**params)

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50)

        y_pred = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, y_pred)

        return auc

    cat_study = optuna.create_study(direction='maximize')
    cat_study.optimize(objective, n_trials=100)

    print('Number of finished trials: ', len(cat_study.trials))
    print('Best trial:')
    trial = cat_study.best_trial

    print('Value: ', trial.value)
    print('Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

# パラメータ

In [14]:
lgb_params = {}
xgb_params = {}
cat_params = {}

In [15]:
lgb_model = LGBMClassifier(**lgb_params)
xgb_model = XGBClassifier(**xgb_params)
cat_model = CatBoostClassifier(**cat_params)