In [9]:
import numpy as np # 今回は明示的には使わない
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb # LightGBM でモデルを作成する
from sklearn.metrics import accuracy_score, roc_auc_score #精度評価に使用
import pickle # モデルの保存・読み込みに使用

In [2]:
# Make the directory that holds the project's own helper modules importable,
# then load the Optuna hyperparameter-tuning helper from it.
# The path must be appended before the import below can succeed.
import sys
sys.path.append("../input/modeling-and-evaluation/")
import optuna_hypara as my_optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Notebook-wide constants.

# Random seed handed to the hyperparameter search.
RANDOM_SEED = 42

# Name of the target (label) column.
# NOTE: "COULMN" is a typo for "COLUMN"; the name is kept as-is because other
# cells reference it.
TARGET_COULMN_NAME = "Exited"

# Columns dropped from the feature matrix.
DROP_COLUMNS = [
    "id",
    "CustomerId",
    "Surname",
]

# Columns that are one-hot encoded.
ONE_HOT_ENCODING_COLUMNS = [
    "Geography",
    "Gender",
]

# Path of the best model file.
BEST_MODEL_PATH = "../working/best_lgbm_model.pkl"

In [4]:
# Load the training data.
train_csv_path = "../input/tdse-inclass-competition-0/train.csv"
train_df = pd.read_csv(train_csv_path)

In [5]:
# Target vector: the label column.
y_train_df = train_df[TARGET_COULMN_NAME]

# Feature matrix: remove the label plus the columns listed in DROP_COLUMNS
# (id, CustomerId, Surname), then one-hot encode the categorical columns.
non_feature_columns = [TARGET_COULMN_NAME] + DROP_COLUMNS
X_train_df = pd.get_dummies(
    train_df.drop(columns=non_feature_columns),
    columns=ONE_HOT_ENCODING_COLUMNS,
)

In [6]:
# Load the test data.
test_df = pd.read_csv("../input/tdse-inclass-competition-0/test.csv")

# Apply the same feature processing as for the training data:
# drop the columns listed in DROP_COLUMNS, then one-hot encode.
X_test_encoded_df = pd.get_dummies(
    test_df.drop(columns=DROP_COLUMNS),
    columns=ONE_HOT_ENCODING_COLUMNS,
)

# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding train and test separately can yield a different set or
# order of columns. Align the test matrix to the training layout:
#   - a category seen in training but absent from test becomes an all-zero column
#   - a category that appears only in test is dropped
# When the columns already match, this leaves the data unchanged.
X_test_df = X_test_encoded_df.reindex(columns=X_train_df.columns, fill_value=0)

In [7]:
# Run the LightGBM hyperparameter search from the local helper module on the
# training features and labels.
# The best model is presumably written to BEST_MODEL_PATH by the helper (a later
# cell loads it from there) -- confirm against optuna_hypara.
tuning_options = {
    "n_trials": 10,
    "seed": RANDOM_SEED,
    "model_path": BEST_MODEL_PATH,
}
# Kept as the last expression so the notebook still displays the return value.
my_optuna.optimize_lgbm_hyperparameters(X_train_df, y_train_df, **tuning_options)

[I 2024-06-09 21:09:56,913] A new study created in memory with name: no-name-195fb133-e472-4758-8318-f6797ca7eef2
[I 2024-06-09 21:09:58,727] Trial 0 finished with value: 0.8876085383841619 and parameters: {'num_leaves': 105, 'feature_fraction': 0.9168507998324879, 'bagging_fraction': 0.45873628974870784, 'bagging_freq': 5, 'min_child_samples': 57}. Best is trial 0 with value: 0.8876085383841619.
[I 2024-06-09 21:10:00,683] Trial 1 finished with value: 0.889679381490296 and parameters: {'num_leaves': 122, 'feature_fraction': 0.8738179719841908, 'bagging_fraction': 0.86149938514544, 'bagging_freq': 5, 'min_child_samples': 70}. Best is trial 1 with value: 0.889679381490296.
[I 2024-06-09 21:10:02,203] Trial 2 finished with value: 0.8906822918891522 and parameters: {'num_leaves': 80, 'feature_fraction': 0.5053463818770678, 'bagging_fraction': 0.8549444799322454, 'bagging_freq': 5, 'min_child_samples': 24}. Best is trial 2 with value: 0.8906822918891522.
[I 2024-06-09 21:10:03,833] Trial 3

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 864
[LightGBM] [Info] Number of data points in the train set: 132027, number of used features: 13
[LightGBM] [Info] Start training from score 0.211820
Best trial:
  Value: 0.8910029219887936
  Params: 
    num_leaves: 47
    feature_fraction: 0.5732580277742092
    bagging_fraction: 0.9243958355946519
    bagging_freq: 5
    min_child_samples: 31


<optuna.study.study.Study at 0x15e483250>

In [10]:
# Load the model from BEST_MODEL_PATH.
# NOTE: unpickling can execute arbitrary code, so only load files that this
# project produced itself.
with open(BEST_MODEL_PATH, mode="rb") as model_file:
    model = pickle.load(model_file)

In [11]:
# Predict on the test features with the loaded model.
# The values displayed in the next cell lie between 0 and 1, so these look like
# scores/probabilities rather than hard class labels -- confirm against the
# model type stored in the pickle.
y_pred_np = model.predict(X_test_df)

In [12]:
# Show the predictions as a quick sanity check.
y_pred_np

array([0.0280737 , 0.82196997, 0.03929901, ..., 0.00851255, 0.18852961,
       0.2208965 ])

In [13]:
# Build the submission table: one row per test record, pairing the record id
# with the model's prediction.
# The prediction column is named via TARGET_COULMN_NAME instead of repeating the
# literal, so the label name is defined in exactly one place.
submission_df = pd.DataFrame({
    "id": test_df["id"],
    TARGET_COULMN_NAME: y_pred_np,
})

In [14]:
# Preview the first rows of the submission table.
submission_df.head()

Unnamed: 0,id,Exited
0,165034,0.028074
1,165035,0.82197
2,165036,0.039299
3,165037,0.238457
4,165038,0.355002


In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv("submission.csv", index=False)