In [1]:
import numpy as np # 今回は明示的には使わない
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl # data processing like SQL
import lightgbm as lgb # LightGBM でモデルを作成する
from sklearn.metrics import accuracy_score, roc_auc_score #精度評価に使用
import pickle # モデルの保存・読み込みに使用

In [2]:
# Load the custom helper module: its directory has to be added to the module
# search path before the import below can succeed.
import sys
sys.path.append("../input/modeling-and-evaluation/")
import optuna_hypara as my_optuna

In [3]:
# Notebook-wide constants.

# Random seed
RANDOM_SEED = 42

# Name of the target column.
TARGET_COLUMN_NAME = "Exited"
# Backward-compatible alias: the original constant name contains a typo
# ("COULMN") and is still referenced by other cells, so it must keep working.
TARGET_COULMN_NAME = TARGET_COLUMN_NAME

# Columns removed from the feature matrix
DROP_COLUMNS = ["id", "CustomerId", "Surname"]

# Columns that are one-hot encoded
ONE_HOT_ENCODING_COLUMNS = ["Geography", "Gender"]

# Path of the pickled best model
BEST_MODEL_PATH = "../working/best_lgbm_model.pkl"

In [4]:
# Load the training data
train_csv_path = "../input/tdse-inclass-competition-0/train.csv"
train_df = pd.read_csv(train_csv_path)

In [5]:
# Target variable
y_train_df = train_df[TARGET_COULMN_NAME]

# Features: everything except the target and the columns listed in DROP_COLUMNS
columns_to_remove = [TARGET_COULMN_NAME, *DROP_COLUMNS]
train_features_df = train_df.drop(columns=columns_to_remove)

# One-hot encode the columns listed in ONE_HOT_ENCODING_COLUMNS
X_train_df = pd.get_dummies(train_features_df, columns=ONE_HOT_ENCODING_COLUMNS)

In [6]:
# Load the test data
test_csv_path = "../input/tdse-inclass-competition-0/test.csv"
test_df = pd.read_csv(test_csv_path)

In [7]:
# Read both CSV files with polars and keep the test-set ids of customers whose
# CustomerId, Surname and Geography match a train row with Exited == 1.
train_pl = pl.read_csv("../input/tdse-inclass-competition-0/train.csv")
test_pl = pl.read_csv("../input/tdse-inclass-competition-0/test.csv")

# Train rows labelled Exited == 1
exited_train_pl = train_pl.filter(pl.col("Exited") == 1)

# Inner join on the matching columns, then keep only the test id
match_keys = ["CustomerId", "Surname", "Geography"]
matched_test_pl = test_pl.join(exited_train_pl, on=match_keys, how="inner")
result_pl = matched_test_pl.select(["id"])

In [8]:
# Apply the same feature processing as for the training data: drop the
# columns listed in DROP_COLUMNS, then one-hot encode.
X_test_df = pd.get_dummies(
    test_df.drop(columns=DROP_COLUMNS),
    columns=ONE_HOT_ENCODING_COLUMNS,
)

# get_dummies only creates columns for the categories present in the frame it
# is given, so encoding train and test separately does not guarantee the same
# columns in the same order. Align the test matrix to the training matrix:
# categories absent from the test data become all-zero columns, and columns
# that do not exist in X_train_df are dropped.
X_test_df = X_test_df.reindex(columns=X_train_df.columns, fill_value=0)

In [9]:
# Hyperparameter search with optuna
# NOTE(review): the helper presumably stores the best model at model_path,
# since a later cell unpickles BEST_MODEL_PATH — confirm in optuna_hypara.
search_settings = {
    "n_trials": 50,
    "seed": RANDOM_SEED,
    "model_path": BEST_MODEL_PATH,
}
my_optuna.optimize_lgbm_hyperparameters(X_train_df, y_train_df, **search_settings)

[I 2024-06-25 13:28:03,013] A new study created in memory with name: no-name-f0bf3c56-2287-448a-8a4d-a6b2fbde4429
[I 2024-06-25 13:28:04,589] Trial 0 finished with value: 0.8908755954882251 and parameters: {'num_leaves': 23, 'feature_fraction': 0.5501303753303672, 'bagging_fraction': 0.6981960757903249, 'bagging_freq': 1, 'min_child_samples': 30}. Best is trial 0 with value: 0.8908755954882251.
[I 2024-06-25 13:28:08,153] Trial 1 finished with value: 0.8895805358811769 and parameters: {'num_leaves': 112, 'feature_fraction': 0.9427187851421015, 'bagging_fraction': 0.7933454274939895, 'bagging_freq': 7, 'min_child_samples': 56}. Best is trial 0 with value: 0.8908755954882251.
[I 2024-06-25 13:28:10,927] Trial 2 finished with value: 0.8906339894452096 and parameters: {'num_leaves': 91, 'feature_fraction': 0.6019999713605202, 'bagging_fraction': 0.9218960402075691, 'bagging_freq': 3, 'min_child_samples': 5}. Best is trial 0 with value: 0.8908755954882251.
[I 2024-06-25 13:28:14,451] Trial 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009959 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 864
[LightGBM] [Info] Number of data points in the train set: 132027, number of used features: 13
[LightGBM] [Info] Start training from score 0.211820
Best trial:
  Value: 0.8913662361722388
  Params: 
    num_leaves: 21
    feature_fraction: 0.7953553951582197
    bagging_fraction: 0.7701259551321165
    bagging_freq: 2
    min_child_samples: 43


<optuna.study.study.Study at 0x7e51f9c26da0>

In [10]:
# Load the model from the pickle file at BEST_MODEL_PATH
with open(BEST_MODEL_PATH, mode="rb") as model_file:
    model = pickle.load(model_file)

In [11]:
# Predict on the test data
# NOTE(review): the displayed y_pred_np holds fractional values, so these look
# like probabilities rather than class labels — confirm against the type of
# the unpickled model.
y_pred_np = model.predict(X_test_df)

In [12]:
# Display the predictions
y_pred_np

array([0.03226135, 0.84083458, 0.03233401, ..., 0.01544924, 0.15808602,
       0.18932645])

In [13]:
# Build the submission frame: one row per test id with its predicted value.
submission_pl = pl.DataFrame({
    "id": test_df["id"],
    "Exited": y_pred_np,
})

# Predictions at or above this value are eligible for the override below
# (a lower value such as 0.3 might also be worth trying).
OVERRIDE_THRESHOLD = 0.5

# result_pl was computed in an earlier cell: it holds the test ids of
# customers whose CustomerId, Surname and Geography match a train row with
# Exited == 1. It is reused here instead of re-reading both CSV files and
# repeating the identical join.
ids_in_result_pl = result_pl.select(pl.col("id")).to_series()

# For ids in result_pl whose prediction is >= OVERRIDE_THRESHOLD, replace the
# prediction with 1.0; every other row keeps its predicted value.
# The threshold is tested per row, which matches the previous id-list lookup
# as long as ids are unique in the submission — TODO confirm for this data.
submission_updated_pl = submission_pl.with_columns(
    pl.when(
        pl.col("id").is_in(ids_in_result_pl)
        & (pl.col("Exited") >= OVERRIDE_THRESHOLD)
    )
    .then(pl.lit(1.0))
    .otherwise(pl.col("Exited"))
    .alias("Exited")
)

In [14]:
# Preview the first rows of the adjusted submission
submission_updated_pl.head()

id,Exited
i64,f64
165034,0.032261
165035,0.840835
165036,0.032334
165037,0.244467
165038,0.347258


In [15]:
# Write the adjusted submission to submission.csv
submission_updated_pl.write_csv("submission.csv")