# モデルチューニング

# LightGBMのハイパーパラメータのチューニング
## ハイパーパラメータの自動チューニング

In [1]:
# Google Driveと接続を行います。これを行うことで、Driveにあるデータにアクセスできるようになります。
# 下記セルを実行すると、Googleアカウントのログインを求められますのでログインしてください。
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### ライブラリのインポート

In [2]:
import numpy as np
import pandas as pd
import os
import pickle
import gc

# 分布確認  anacondaでは、一先ず、コメントにする。
#import pandas_profiling as pdp

# 可視化
import matplotlib.pyplot as plt

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GroupKFold

# 評価指標
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# モデリング: lightgbm
import lightgbm as lgb

import numpy as np

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline


#### ファイルの読み込みと特徴量エンジニアリング

In [3]:
df_train = pd.read_csv("Bank Churn/train.csv")

### OneHotEncoderのインスタンスを作成し、sparse=Falseで密行列を指定
ohe_Gender=OneHotEncoder(sparse=False)


### OneHotEncoderを訓練データの"Gendet"列に適させる

ohe_Gender.fit(df_train[["Gender"]])

### OneHotエンコーディングされた特徴量を含むDataFrameを作成し、列名を指定
tmp_Gender = pd.DataFrame(
    ### OneHotエンコーディングを実行し、結果をDataFrameに保存
    ohe_Gender.transform(df_train[["Gender"]]),
    ### 列名設定
    columns=["Gender_{}".format(i) for i in ohe_Gender.categories_[0]],
)


### 元のデータフレームとOneHotエンコーディングされた特徴量を結合
df_train = pd.concat([df_train, tmp_Gender], axis=1)

### "Gender"列とOne-Hotエンコーディングされた列を抽出して表示
df_train[["Gender","Gender_Female","Gender_Male"]]


### OneHotEncoderのインスタンスを作成し、sparse=Falseで密行列を指定
ohe_Geography=OneHotEncoder(sparse=False)

### OneHotEncoderを訓練データの"Gendet"列に適させる
ohe_Geography.fit(df_train[["Geography"]])

### OneHotエンコーディングされた特徴量を含むDataFrameを作成し、列名を指定
tmp_Geography = pd.DataFrame(
    ### OneHotエンコーディングを実行し、結果をDataFrameに保存
    ohe_Geography.transform(df_train[["Geography"]]),
    ### 列名設定
    columns=["Geography_{}".format(i) for i in ohe_Geography.categories_[0]],
)

### 元のデータフレームとOneHotエンコーディングされた特徴量を結合
df_train = pd.concat([df_train, tmp_Geography], axis=1)

### "Geography"列とOne-Hotエンコーディングされた列を抽出して表示
df_train[["Geography","Geography_France","Geography_Spain","Geography_Germany"]]



Unnamed: 0,Geography,Geography_France,Geography_Spain,Geography_Germany
0,France,1.0,0.0,0.0
1,France,1.0,0.0,0.0
2,France,1.0,0.0,0.0
3,France,1.0,0.0,0.0
4,Spain,0.0,1.0,0.0
...,...,...,...,...
165029,Spain,0.0,1.0,0.0
165030,France,1.0,0.0,0.0
165031,France,1.0,0.0,0.0
165032,Spain,0.0,1.0,0.0


In [1]:
#GenderがMaleでかつIsActiveMemberが1の場合を1、それ以外を0とする新しい特徴量を作成する
df_train['Gender_Male_and_Active'] = ((df_train['Gender'] == 'Male') & (df_train['IsActiveMember'] == 1)).astype(int)
df_train["Gender_Male_+_Active"] = df_train["Gender_Male"] + df_train["IsActiveMember"]


NameError: name 'df_train' is not defined

In [5]:
# count=encoding
# "Surname" 列の各値の出現回数を数え、その結果を辞書に変換し、変数 ce_Surname に代入します。
ce_Surname = df_train["Surname"].value_counts().to_dict()

# 出現回数の辞書を出力します。
print(ce_Surname)

# "Surname" 列の各値を、出現回数の辞書にマッピングし、新しい列 "Surname_ce" を作成します。
df_train["Surname_ce"] = df_train["Surname"].map(ce_Surname)

df_train[["Surname_ce"]]
# "Surname" 列と新しい "Surname_ce" 列を含む DataFrame の最初の数行を表示します。
df_train[["Surname", "Surname_ce"]].head()

{'Hsia': 2456, "T'ien": 2282, 'Hs?': 1611, 'Kao': 1577, 'Maclean': 1577, "Ts'ui": 1567, "P'eng": 1503, 'H?': 1420, 'Hsueh': 1306, 'Shih': 1251, "Ch'ang": 1229, "Ch'iu": 1199, 'Mazzanti': 1192, 'Tuan': 1164, 'Fanucci': 1139, 'Onyemauchechukwu': 1138, 'Tien': 1132, 'Smith': 1129, 'Nwankwo': 1117, 'Manna': 1024, 'Teng': 1008, 'Hsiung': 988, "Ch'ien": 974, 'Genovese': 962, 'Hsieh': 939, "T'ao": 906, "Ts'ai": 900, 'Trevisani': 894, 'Martin': 875, "T'ang": 859, 'Hsing': 853, 'Y?an': 846, 'Pirozzi': 814, 'Y?': 759, "K'ung": 754, "Ch'eng": 752, 'Walker': 744, 'Tsao': 740, 'Scott': 729, 'White': 727, 'Nwachinemelu': 726, 'Toscani': 723, "Ch'in": 722, 'Pinto': 717, 'Trevisano': 714, 'Brown': 713, 'Palerma': 708, 'L?': 694, 'Lucchesi': 689, 'Lo Duca': 687, 'Hsiao': 666, 'Ting': 663, 'Nwabugwu': 662, 'Ritchie': 645, 'Liao': 644, 'K?': 632, 'Lucciano': 627, 'Chukwudi': 615, 'Mazzi': 611, 'Chiu': 608, 'Tung': 604, 'Hao': 594, 'Lu': 592, 'Lai': 591, 'Lo': 589, 'Yeh': 584, 'Cremonesi': 581, 'Hsu': 580

Unnamed: 0,Surname,Surname_ce
0,Okwudilichukwu,261
1,Okwudiliolisa,300
2,Hsueh,1306
3,Kao,1577
4,Chiemenam,246


In [6]:
#数値変数×カテゴリ変数(平均値作成)
df_train["mean_Age_by_Surname"] = df_train.groupby("Surname")["Age"].transform("mean")

df_train[["Surname", "Age", "mean_Age_by_Surname"]].head()

Unnamed: 0,Surname,Age,mean_Age_by_Surname
0,Okwudilichukwu,33.0,39.708812
1,Okwudiliolisa,33.0,39.753333
2,Hsueh,40.0,38.189127
3,Kao,34.0,37.192771
4,Chiemenam,33.0,39.089431


In [29]:
#以下、数値変数とカテゴリ変数を組み合わせての特徴量エンジニアリング

df_train["std_Age_by_Surname"] = df_train.groupby("Surname")["Age"].transform("std")
df_train["sum_Age_by_Surname"] = df_train.groupby("Surname")["Age"].transform("sum")
df_train["max_Age_by_Surname"] = df_train.groupby("Surname")["Age"].transform("max")
df_train["min_Age_by_Surname"] = df_train.groupby("Surname")["Age"].transform("min")
df_train["Balance_-_NumOfProducts"] = df_train["Balance"] - df_train["NumOfProducts"]
df_train["NumOfProducts_+_Age"] = df_train["NumOfProducts"] + df_train["Age"]
df_train["NumOfProducts_-_Age"] = df_train["NumOfProducts"] - df_train["Age"]
df_train["NumOfProducts_*_Age"] = df_train["NumOfProducts"] * df_train["Age"]
df_train["NumOfProducts_/_Age"] = df_train["NumOfProducts"] / df_train["Age"]
df_train["NumOfProducts_*_CreditScore"] = df_train["NumOfProducts"] * df_train["CreditScore"]
df_train["NumOfProducts_/_CreditScore"] = df_train["NumOfProducts"] / df_train["CreditScore"]
df_train["Tenure_+_HasCrCard"] = df_train["Tenure"] + df_train["HasCrCard"]
df_train["Tenure_*_HasCrCard"] = df_train["Tenure"] * df_train["HasCrCard"]


#### データセット作成 

In [28]:
x_train,y_train,id_train=df_train[["IsActiveMember","EstimatedSalary","CreditScore","Gender_Male","Age","Balance",
                                   "Geography_Germany",'Gender_Male_and_Active',"Gender_Male_+_Active","Surname","Surname_ce","mean_Age_by_Surname",
                                   "std_Age_by_Surname","sum_Age_by_Surname","max_Age_by_Surname","NumOfProducts","Balance_-_NumOfProducts",
                                   "NumOfProducts_+_Age","NumOfProducts_-_Age","NumOfProducts_*_Age","NumOfProducts_/_Age","NumOfProducts_*_CreditScore",
                                   "NumOfProducts_/_CreditScore","Tenure_+_HasCrCard","Tenure_*_HasCrCard"]],\
                        df_train[["Exited"]],\
                        df_train[["id"]]
print(x_train.shape, y_train.shape, id_train.shape)

(165034, 25) (165034, 1) (165034, 1)


In [9]:
# DataFrame x_train の各列に対してループを行う
for col in x_train.columns:
    # もし列のデータ型が文字列型であれば

    if x_train[col].dtype=="O":
        # その列のデータ型をカテゴリ型に変換する
        x_train[col] = x_train[col].astype("category")

#### optunaのインポート

In [10]:
pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.3-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.3 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


In [13]:
import optuna

#### 目的関数の定義

In [14]:

# 探索しないハイパーパラメータ
params_base = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "learning_rate": 0.02,
    'n_estimators': 100000,
    "bagging_freq": 1,
    "seed": 123,
}

def objective(trial):
    # 探索するハイパーパラメータ
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e2, log=True),
    }
    params_tuning.update(params_base)

    # モデル学習・評価
    list_metrics = []
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
    list_fold=[0,1,2]
    for nfold in list_fold:
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = x_train.loc[idx_tr, :], y_train.loc[idx_tr, :]
        x_va, y_va = x_train.loc[idx_va, :], y_train.loc[idx_va, :]
        model = lgb.LGBMClassifier(**params_tuning)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  #early_stopping_rounds=100,
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),
                  lgb.log_evaluation()], # コマンドライン出力用コールバック関数　修正処理
                  #verbose=10,
                  eval_metric='auc',  # 早期停止の評価メトリックスを指定するのが良いです
                 )
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred)
        list_metrics.append(metric_va)

    # 評価値の計算
    metrics = np.mean(list_metrics)

    return metrics

#### 最適化処理（探索の実行）

In [15]:

# TPESamplerを使用してサンプラーオブジェクトを作成し、乱数のシードを指定
sampler = optuna.samplers.TPESampler(seed=123)

# サンプラーを指定してStudyオブジェクトを作成し、最大化方向を設定
study = optuna.create_study(sampler=sampler, direction="maximize")

# 目的関数を最適化してハイパーパラメータを探索（30回の試行）
study.optimize(objective, n_trials=30)

Output hidden; open in https://colab.research.google.com to view.

#### 探索結果の確認

In [16]:
# Studyオブジェクトから最良のトライアル（試行）を取得し、trial変数に代入
trial = study.best_trial

# 最良のトライアルの評価値を表示し、小数点以下4桁まで表示
print("acc(best)={:.4f}".format(trial.value))

# 最良のトライアルで使用されたハイパーパラメータの値を表示
display(trial.params)

acc(best)=0.8952


{'num_leaves': 32,
 'min_data_in_leaf': 198,
 'min_sum_hessian_in_leaf': 0.000911024108147967,
 'feature_fraction': 0.5040305717020102,
 'bagging_fraction': 0.9940542446575642,
 'lambda_l1': 74.98841302101062,
 'lambda_l2': 34.144257057990224}

#### ベストなハイパーパラメータの取得

In [17]:
# 最良のトライアルで使用されたハイパーパラメータを取得し、params_best変数に代入
params_best = trial.params

# ベースのハイパーパラメータと最良のトライアルで使用されたハイパーパラメータを結合
params_best.update(params_base)

# 結合されたハイパーパラメータの値を表示
display(params_best)

{'num_leaves': 32,
 'min_data_in_leaf': 198,
 'min_sum_hessian_in_leaf': 0.000911024108147967,
 'feature_fraction': 0.5040305717020102,
 'bagging_fraction': 0.9940542446575642,
 'lambda_l1': 74.98841302101062,
 'lambda_l2': 34.144257057990224,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.02,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'seed': 123}

In [18]:
params ={'num_leaves': 32,
 'min_data_in_leaf': 198,
 'min_sum_hessian_in_leaf': 0.000911024108147967,
 'feature_fraction': 0.5040305717020102,
 'bagging_fraction': 0.9940542446575642,
 'lambda_l1': 74.98841302101062,
 'lambda_l2': 34.144257057990224,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.02,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'seed': 123}
def train_cv(input_x,
             input_y,
             input_id,
             params,
             n_splits=5,
            ):
    metrics = []
    imp = pd.DataFrame()

    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in np.arange(n_splits):
        print("-"*20, nfold, "-"*20)
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.loc[idx_tr, :], input_y.loc[idx_tr, :]
        x_va, y_va = input_x.loc[idx_va, :], input_y.loc[idx_va, :]
        print(x_tr.shape, y_tr.shape)
        print(x_va.shape, y_va.shape)
        print("y_train:{:.3f}, y_tr:{:.3f}, y_va:{:.3f}".format(
            input_y["Exited"].mean(),
            y_tr["Exited"].mean(),
            y_va["Exited"].mean(),
        ))

        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  #early_stopping_rounds=100,
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),
                  lgb.log_evaluation()], # コマンドライン出力用コールバック関数　修正処理
                  #verbose=10,
                  eval_metric='auc',  # 早期停止の評価メトリックスを指定するのが良いです
                  )


        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        print("[accuracy] tr: {:.2f}, va: {:.2f}".format(metric_tr, metric_va))
        metrics.append([nfold, metric_tr, metric_va])

        _imp = pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

    print("-"*20, "result", "-"*20)
    metrics = np.array(metrics)
    print(metrics)

    print("[cv ] tr: {:.2f}+-{:.2f}, va: {:.2f}+-{:.2f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))

    imp = imp.groupby("col")["imp"].agg(["mean", "std"])
    imp.columns = ["imp", "imp_std"]
    imp = imp.reset_index(drop=False)

    print("Done.")

    return imp, metrics

In [2]:
#ベストなハイパーパラメータの取得後のモデル学習
imp, metrics = train_cv(x_train, y_train, id_train, params, n_splits=5)

NameError: name 'train_cv' is not defined

In [27]:
#推論用データセット、特徴量エンジニアリング

df_test = pd.read_csv("Bank Churn/test.csv")

### OneHotEncoderのインスタンスを作成し、sparse=Falseで密行列を指定
ohe_Gender=OneHotEncoder(sparse=False)


### OneHotEncoderを訓練データの"Gender"列に適させる

ohe_Gender.fit(df_test[["Gender"]])

### OneHotエンコーディングされた特徴量を含むDataFrameを作成し、列名を指定
tmp_Gender = pd.DataFrame(
    ### OneHotエンコーディングを実行し、結果をDataFrameに保存
    ohe_Gender.transform(df_test[["Gender"]]),
    ### 列名設定
    columns=["Gender_{}".format(i) for i in ohe_Gender.categories_[0]],
)


### 元のデータフレームとOneHotエンコーディングされた特徴量を結合
df_test = pd.concat([df_test, tmp_Gender], axis=1)
### "Gender"列とOne-Hotエンコーディングされた列を抽出して表示
df_test[["Gender","Gender_Female","Gender_Male"]]



### OneHotEncoderのインスタンスを作成し、sparse=Falseで密行列を指定
ohe_Geography=OneHotEncoder(sparse=False)

### OneHotEncoderを訓練データの"Gendet"列に適させる
ohe_Geography.fit(df_train[["Geography"]])

### OneHotエンコーディングされた特徴量を含むDataFrameを作成し、列名を指定
tmp_Geography = pd.DataFrame(
    ### OneHotエンコーディングを実行し、結果をDataFrameに保存
    ohe_Geography.transform(df_test[["Geography"]]),
    ### 列名を "Embarked_0", "Embarked_1" などに設定
    columns=["Geography_{}".format(i) for i in ohe_Geography.categories_[0]],
)

### 元のデータフレームとOneHotエンコーディングされた特徴量を結合
df_test = pd.concat([df_test, tmp_Geography], axis=1)

### "Geography"列とOne-Hotエンコーディングされた列を抽出して表示
df_test[["Geography","Geography_France","Geography_Spain","Geography_Germany"]]


df_test['Gender_Male_and_Active'] = ((df_test['Gender'] == 'Male') & (df_test['IsActiveMember'] == 1)).astype(int)



df_test["Gender_Male_+_Active"] = df_test["Gender_Male"] + df_test["IsActiveMember"]

# "Surname" 列の各値の出現回数を数え、その結果を辞書に変換し、変数 ce_Surname に代入します。
ce_Surname = df_test["Surname"].value_counts().to_dict()

# 出現回数の辞書を出力します。
print(ce_Surname)

# "Surname" 列の各値を、出現回数の辞書にマッピングし、新しい列 "Surname_ce" を作成します。
df_test["Surname_ce"] = df_test["Surname"].map(ce_Surname)

df_test[["Surname_ce"]]
# "Surname" 列と新しい "Surname_ce" 列を含む DataFrame の最初の数行を表示します。
df_test[["Surname", "Surname_ce"]].head()

df_test["mean_Age_by_Surname"] = df_test.groupby("Surname")["Age"].transform("mean")

df_test["std_Age_by_Surname"] = df_test.groupby("Surname")["Age"].transform("std")
df_test["sum_Age_by_Surname"] = df_test.groupby("Surname")["Age"].transform("sum")
df_test["max_Age_by_Surname"] = df_test.groupby("Surname")["Age"].transform("max")
df_test["min_Age_by_Surname"] = df_test.groupby("Surname")["Age"].transform("min")
df_test["Balance_-_NumOfProducts"] = df_test["Balance"] - df_test["NumOfProducts"]


df_test["NumOfProducts_+_Age"] = df_test["NumOfProducts"] + df_test["Age"]
df_test["NumOfProducts_-_Age"] = df_test["NumOfProducts"] - df_test["Age"]
df_test["NumOfProducts_*_Age"] = df_test["NumOfProducts"] * df_test["Age"]
df_test["NumOfProducts_/_Age"] = df_test["NumOfProducts"] / df_test["Age"]

df_test["NumOfProducts_*_CreditScore"] = df_test["NumOfProducts"] * df_test["CreditScore"]
df_test["NumOfProducts_/_CreditScore"] = df_test["NumOfProducts"] / df_test["CreditScore"]
df_test["Tenure_+_HasCrCard"] = df_test["Tenure"] + df_test["HasCrCard"]
df_test["Tenure_*_HasCrCard"] = df_test["Tenure"] * df_test["HasCrCard"]



x_test = df_test[["IsActiveMember","EstimatedSalary","CreditScore","Gender_Male","Age","Balance",
                                   "Geography_Germany",'Gender_Male_and_Active',"Gender_Male_+_Active","Surname","Surname_ce","mean_Age_by_Surname",
                                   "std_Age_by_Surname","sum_Age_by_Surname","max_Age_by_Surname","NumOfProducts","Balance_-_NumOfProducts",
                  "NumOfProducts_+_Age","NumOfProducts_-_Age","NumOfProducts_*_Age","NumOfProducts_/_Age","NumOfProducts_*_CreditScore",
                  "NumOfProducts_/_CreditScore","Tenure_+_HasCrCard","Tenure_*_HasCrCard"]]
id_test = df_test[["id"]]



{'Hsia': 1606, "T'ien": 1484, 'Hs?': 1124, 'Maclean': 1042, "Ts'ui": 1017, 'Kao': 1000, "P'eng": 954, 'H?': 944, 'Hsueh': 872, 'Mazzanti': 860, "Ch'iu": 857, 'Tuan': 819, 'Nwankwo': 810, 'Shih': 793, "Ch'ang": 792, 'Smith': 775, 'Onyemauchechukwu': 769, 'Fanucci': 735, 'Tien': 728, "Ch'ien": 697, 'Manna': 687, 'Trevisani': 678, 'Hsiung': 673, 'Genovese': 649, 'Hsieh': 643, 'Teng': 614, "Ts'ai": 590, 'Hsing': 584, "T'ang": 578, "T'ao": 576, 'Pirozzi': 562, 'Y?': 553, 'Y?an': 545, "K'ung": 534, "Ch'eng": 520, 'Scott': 516, 'Toscani': 512, 'Tsao': 508, 'Hsiao': 504, 'Walker': 499, 'Nwachinemelu': 485, 'White': 483, 'Martin': 482, 'Pinto': 479, "Ch'in": 478, 'Trevisano': 477, 'Palerma': 477, 'Brown': 470, 'Liao': 461, 'Nwabugwu': 456, 'K?': 452, 'Lucchesi': 445, 'Lo Duca': 443, 'Ting': 442, 'Lucciano': 442, 'Lu': 439, 'Lo': 437, 'L?': 433, 'Mazzi': 421, 'Tung': 419, 'Ritchie': 412, 'Lai': 411, 'Lucchese': 403, 'Chukwudi': 400, 'Chiu': 394, 'Folliero': 383, 'Cremonesi': 382, 'Lettiere': 379

In [21]:
# DataFrame x_test の各列に対してループを行う
for col in x_test.columns:
    # もし列のデータ型が文字列型であれば

    if x_test[col].dtype=="O":
        # その列のデータ型をカテゴリ型に変換する
        x_test[col] = x_test[col].astype("category")

モデル学習・評価（推論用データセット作成後）

In [23]:
params ={'num_leaves': 32,
 'min_data_in_leaf': 198,
 'min_sum_hessian_in_leaf': 0.000911024108147967,
 'feature_fraction': 0.5040305717020102,
 'bagging_fraction': 0.9940542446575642,
 'lambda_l1': 74.98841302101062,
 'lambda_l2': 34.144257057990224,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'learning_rate': 0.02,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'seed': 123}

def predict_lgb(input_x,
             input_y,
             input_id,
             params,
             n_splits=5,
            ):
    metrics = []
    imp = pd.DataFrame()

    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in np.arange(n_splits):
        print("-"*20, nfold, "-"*20)
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.loc[idx_tr, :], input_y.loc[idx_tr, :]
        x_va, y_va = input_x.loc[idx_va, :], input_y.loc[idx_va, :]
        print(x_tr.shape, y_tr.shape)
        print(x_va.shape, y_va.shape)
        print("y_train:{:.3f}, y_tr:{:.3f}, y_va:{:.3f}".format(
            input_y["Exited"].mean(),
            y_tr["Exited"].mean(),
            y_va["Exited"].mean(),
        ))

        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  #early_stopping_rounds=100,
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True),
                  lgb.log_evaluation()], # コマンドライン出力用コールバック関数　修正処理
                  #verbose=10,
                  eval_metric='auc',  # 早期停止の評価メトリックスを指定するのが良いです
                  )


        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        print("[accuracy] tr: {:.2f}, va: {:.2f}".format(metric_tr, metric_va))
        metrics.append([nfold, metric_tr, metric_va])

        _imp = pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

    print("-"*20, "result", "-"*20)
    metrics = np.array(metrics)
    print(metrics)

    print("[cv ] tr: {:.2f}+-{:.2f}, va: {:.2f}+-{:.2f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))

    imp = imp.groupby("col")["imp"].agg(["mean", "std"])
    imp.columns = ["imp", "imp_std"]
    imp = imp.reset_index(drop=False)

    print("Done.")

    return model,imp, metrics

In [24]:
#モデル学習・評価（推論用データセット作成後）
model,imp, metrics = predict_lgb(x_train, y_train, id_train, params, n_splits=5)

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
[291]	training's auc: 0.904184	valid_1's auc: 0.892063
[292]	training's auc: 0.904185	valid_1's auc: 0.892066
[293]	training's auc: 0.904235	valid_1's auc: 0.892078
[294]	training's auc: 0.904293	valid_1's auc: 0.892092
[295]	training's auc: 0.904302	valid_1's auc: 0.892099
[296]	training's auc: 0.90431	valid_1's auc: 0.892104
[297]	training's auc: 0.904358	valid_1's auc: 0.892114
[298]	training's auc: 0.904369	valid_1's auc: 0.892122
[299]	training's auc: 0.904379	valid_1's auc: 0.892138
[300]	training's auc: 0.904384	valid_1's auc: 0.892142
[301]	training's auc: 0.904398	valid_1's auc: 0.89215
[302]	training's auc: 0.904451	valid_1's auc: 0.892164
[303]	training's auc: 0.904494	valid_1's auc: 0.892165
[304]	training's auc: 0.9045	valid_1's auc: 0.892169
[305]	training's auc: 0.904541	valid_1's auc: 0.892181
[306]	training's auc: 0.904546	valid_1's auc: 0.892187
[307]	training's auc: 0.904589	valid_1's auc: 0.892198
[308]	training's auc: 0

学習モデルによる推論


In [25]:
y_test_pred = model.predict_proba(x_test)



提出用ファイルの作成


In [26]:
df_submit = pd.DataFrame({"id": id_test["id"], "Exited": y_test_pred[:, 1]})
display(df_submit.head(5))
df_submit.to_csv("/content/drive/MyDrive/kaggle実践/銀行解約データセットを使用した二項分類/submission_optuna.csv", index=None)

Unnamed: 0,id,Exited
0,165034,0.011625
1,165035,0.662486
2,165036,0.01986
3,165037,0.24609
4,165038,0.402806
