In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/my-home-credit-credit-risk-model-stability/base_test.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/y_valid.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/X_valid.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/y_test.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/X_test.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/y_train.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/X_train.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/base_train.csv
/kaggle/input/my-home-credit-credit-risk-model-stability/base_valid.csv
/kaggle/input/home-credit-credit-risk-model-stability/sample_submission.csv
/kaggle/input/home-credit-credit-risk-model-stability/feature_definitions.csv
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_deposit_1.parquet
/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_applprev_2.parquet
/kaggle/input/home-credit-

In [2]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"
InputFile = "/kaggle/input/my-home-credit-credit-risk-model-stability"

In [3]:
X_train = pd.read_csv(InputFile + "/X_train.csv")
X_valid = pd.read_csv(InputFile + "/X_valid.csv")
X_test = pd.read_csv(InputFile + "/X_test.csv")

y_train = pd.read_csv(InputFile + "/y_train.csv")
y_valid = pd.read_csv(InputFile + "/y_valid.csv")
y_test = pd.read_csv(InputFile + "/y_test.csv")

base_train = pd.read_csv(InputFile + "/base_train.csv")
base_valid = pd.read_csv(InputFile + "/base_valid.csv")
base_test = pd.read_csv(InputFile + "/base_test.csv")

In [4]:
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
            
    return df

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

# 機械学習モデルの構築・学習・予測

## ライブラリのインポート

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 評価指標
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix, mean_absolute_error, mean_squared_error, r2_score

# 機械学習モデル
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# 実験
import datetime
import pickle

In [6]:
lgb_params = {'lambda_l1': 2.1027501459454756, 
              'lambda_l2': 5.127117846679184e-05, 
              'num_leaves': 33, 
              'feature_fraction': 0.9791930917480678, 
              'bagging_fraction': 0.5791835574482972, 
              'bagging_freq': 3, 
              'min_child_samples': 84,
              "n_estimators": 1000,
              "verbose": -1,
              "boosting_type": "gbdt",
              "objective": "binary",
              "metric": "auc",
              "max_depth": 3,
              "learning_rate": 0.05,
             }
xgb_params = {}
cat_params = {}

In [7]:
def make_model_and_eval(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # accuracy
    acc_train = accuracy_score(y_train, y_pred_train)
    acc_test = accuracy_score(y_test, y_pred_test)
    # F1_score
    f1_train = f1_score(y_train, y_pred_train, average='macro')
    f1_test = f1_score(y_test, y_pred_test, average='macro')
    # recall
    recall_train = recall_score(y_train, y_pred_train, average='macro')
    recall_test = recall_score(y_test, y_pred_test, average='macro')
    # precision
    precision_train = precision_score(y_train, y_pred_train, average='macro')
    precision_test = precision_score(y_test, y_pred_test, average='macro')
#    # AUC
    auc_train = roc_auc_score(y_train, y_pred_train)
    auc_test = roc_auc_score(y_test, y_pred_test)
   # MAE
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mae_test = mean_absolute_error(y_test, y_pred_test)
   # MSE
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
   # RMSE
    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)
   # r2_score
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)

    tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_pred_train).ravel()
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_pred_test).ravel()
    score_train = pd.DataFrame({'DataCategory':['Train'],
                               'auc':[auc_train],
                               'accuracy':[acc_train],
                               'f1':[f1_train],
                               'recall':[recall_train],
                               'precision':[precision_train],
                               'MAE':[mae_train],
                               'MSE':[mse_train],
                               'RMSE':[rmse_train],
                               'r2':[r2_train],
                               'tp':[tp_train],
                               'fn':[fn_train],
                               'fp':[fp_train],
                               'tn':[tn_train]
                               })
    score_test = pd.DataFrame({'DataCategory':['Valid'],
                              'auc':[auc_test],
                              'accuracy':[acc_test],
                              'f1':[f1_test],
                              'recall':[recall_test],
                              'precision':[precision_test],
                              'MAE':[mae_test],
                              'MSE':[mse_test],
                              'RMSE':[rmse_train],
                              'r2':[r2_train],
                              'tp':[tp_test],
                              'fn':[fn_test],
                              'fp':[fp_test],
                              'tn':[tn_test]
                              })
    score = pd.concat([score_train, score_test], ignore_index=True)
    
    importance = pd.DataFrame({'cols': X_train.columns,
                              'importance': model.feature_importances_})
    importance = importance.sort_values('importance', ascending=False)
    cols = pd.DataFrame({'X_cols': X_train.columns})
    display(score)
    
    return score, importance, model, cols

In [8]:
now = datetime.datetime.now().strftime("%Y%m%d%H")
target_output_dir = 'results_' + now

os.makedirs(target_output_dir, exist_ok=True)
print(target_output_dir)



models = {
#           'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42), 
#           'RandomForest':RandomForestClassifier(random_state=42),
#           'GradientBoostingClassifier':GradientBoostingClassifier(random_state=42),
          'lightgbm': LGBMClassifier(**lgb_params),
#           'xgboost': XGBClassifier(**xgb_params),
#           'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
#           'CatBoostClassifier': CatBoostClassifier(**cat_params),
#           'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
         }

score_all = []
importance_all = []
for model_name, model in models.items():
    print(model_name)
    score, importance, model, cols = make_model_and_eval(model, X_train, X_valid, y_train, y_valid)
    score['model_name'] = model_name
    importance['model_name'] = model_name
    
    model_names = f'model_{model_name}.pickle'
    model_path = os.path.join(target_output_dir, model_names)
    with open(model_path, mode='wb') as f:
        pickle.dump(model, f, protocol=2)
    score_all.append(score)
    importance_all.append(importance)
score_all = pd.concat(score_all, ignore_index=True)
importance_all = pd.concat(importance_all, ignore_index=True)
cols = pd.DataFrame({'X_cols':X_train.columns})

score_name = 'score.csv'
importance_name = 'importance.csv'
cols_name = 'X_cols.csv'

score_path = os.path.join(target_output_dir, score_name)
importance_path = os.path.join(target_output_dir, importance_name)
cols_path = os.path.join(target_output_dir, cols_name)

score_all.to_csv(score_path, index=False)
importance_all.to_csv(importance_path, index=False)
cols.to_csv(cols_path, index=False)

results_2024031410
lightgbm


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Unnamed: 0,DataCategory,auc,accuracy,f1,recall,precision,MAE,MSE,RMSE,r2,tp,fn,fp,tn
0,Train,0.501706,0.968474,0.495499,0.501706,0.727149,0.031526,0.031526,0.177557,-0.03276,102,28770,108,887015
1,Valid,0.501415,0.969037,0.495081,0.501415,0.690463,0.030963,0.030963,0.177557,-0.03276,28,9414,40,295850


# pickleファイルのインポート

In [9]:
import pickle

model_name = 'model_lightgbm.pickle'

model_path = os.path.join(target_output_dir, model_name)

with open(model_path, mode='rb') as f:
    model = pickle.load(f)
    
print(model)

LGBMClassifier(bagging_fraction=0.5791835574482972, bagging_freq=3,
               feature_fraction=0.9791930917480678,
               lambda_l1=2.1027501459454756, lambda_l2=5.127117846679184e-05,
               learning_rate=0.05, max_depth=3, metric='auc',
               min_child_samples=84, n_estimators=1000, num_leaves=33,
               objective='binary', verbose=-1)


In [10]:
lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

model = lgb.train(
    lgb_params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)



Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.705108
[100]	valid_0's auc: 0.724319
[150]	valid_0's auc: 0.731642
[200]	valid_0's auc: 0.735681
[250]	valid_0's auc: 0.739215
[300]	valid_0's auc: 0.741274
[350]	valid_0's auc: 0.74328
[400]	valid_0's auc: 0.745356
[450]	valid_0's auc: 0.746816
[500]	valid_0's auc: 0.748071
[550]	valid_0's auc: 0.749205
[600]	valid_0's auc: 0.7499
[650]	valid_0's auc: 0.750735
[700]	valid_0's auc: 0.751578
[750]	valid_0's auc: 0.752397
[800]	valid_0's auc: 0.752914
[850]	valid_0's auc: 0.753443
[900]	valid_0's auc: 0.753931
[950]	valid_0's auc: 0.754323
[1000]	valid_0's auc: 0.754793
Did not meet early stopping. Best iteration is:
[1000]	valid_0's auc: 0.754793


# モデルの予測

In [11]:
for base, X in [(base_train, X_train), (base_valid, X_valid), (base_test, X_test)]:
    y_pred = model.predict(X)
    base["score"] = y_pred

print(f'The AUC score on the train set is: {roc_auc_score(base_train["target"], base_train["score"])}')
print(f'The AUC score on the valid set is: {roc_auc_score(base_valid["target"], base_valid["score"])}')
print(f'The AUC score on the test set is: {roc_auc_score(base_test["target"], base_test["score"])}')

The AUC score on the train set is: 0.7694204271355157
The AUC score on the valid set is: 0.7547927334554713
The AUC score on the test set is: 0.7514004051823603


In [12]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(base_train)
stability_score_valid = gini_stability(base_valid)
stability_score_test = gini_stability(base_test)

print(f'The stability score on the train set is: {stability_score_train}')
print(f'The stability score on the valid set is: {stability_score_valid}')
print(f'The stability score on the test set is: {stability_score_test}')

The stability score on the train set is: 0.5089863457502912
The stability score on the valid set is: 0.48084241778046377
The stability score on the test set is: 0.4652965518025257


# 提出

In [13]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            
    return df

test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
[
    pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
    pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes),
    pl.read_csv(dataPath + "csv_files/test/test_static_0_2.csv").pipe(set_table_dtypes),
],
    how="vertical_relaxed",
)
test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv(dataPath + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes)
test_credit_bureau_b_2 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv").pipe(set_table_dtypes)

selected_static_cols = []
for col in test_static.columns:
    if col[-1] in ("A", "M"):
        selected_static_cols.append(col)
selected_static_cb_cols = []
for col in test_static_cb.columns:
    if col[-1] in ("A", "M"):
        selected_static_cb_cols.append(col)

test_person_1_feats_1 = test_person_1.group_by("case_id").agg(
    pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384Amax"),
    (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployed")
)

test_person_1_feats_2 = test_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
    pl.col("num_group1") == 0
).drop("num_group1").rename({"housetype_905L": "person_housetype"})

test_credit_bureau_b_2_feats = test_credit_bureau_b_2.group_by("case_id").agg(
    pl.col("pmts_pmtsoverdue_635A").max().alias("pmts_pmtsoverdue_635A_max"),
    (pl.col("pmts_dpdvalue_108P") > 31).max().alias("pmts_dpdvalue_108P_over31")
)
        
data_submission = test_basetable.join(
    test_static.select(["case_id"] + selected_static_cols), how="left", on="case_id"
).join(
    test_static_cb.select(["case_id"] + selected_static_cb_cols), how="left", on="case_id"
).join(
    test_person_1_feats_1, how="left", on="case_id"
).join(
    test_person_1_feats_2, how="left", on="case_id"
).join(
    test_credit_bureau_b_2_feats, how="left", on="case_id"
)

cols_pred = []
for col in X_train.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

In [14]:
X_submission = data_submission[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    train_categories = set(X_train[col].cat.categories)
    submission_categories = set(X_submission[col].cat.categories)
    new_categories = submission_categories - train_categories
    X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
    X_train[col] = X_train[col].astype(new_dtype)
    X_submission[col] = X_submission[col].astype(new_dtype)
    
y_submission_pred = model.predict(X_submission)

In [15]:
submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")

In [16]:
submission.head()

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.007776
57549,0.020471
57551,0.005541
57552,0.010647
57569,0.09829
