In [2]:
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
from sklearn.metrics import roc_auc_score #精度評価に使用

In [3]:
# Notebook-wide constants
# Random seed
RANDOM_SEED = 42
# Name of the target (label) column
TARGET_COULMN_NAME = "Exited"
# Columns that are removed from the feature set
DROP_COLUMNS = [
    "id",
    "CustomerId",
    "Surname",
]
# Columns that are one-hot encoded
ONE_HOT_ENCODING_COLUMNS = [
    "Geography",
    "Gender",
]
# Path of the best model
BEST_MODEL_PATH = "../working/best_lgbm_model.pkl"

In [4]:
# Evaluation metric: wrap scikit-learn's roc_auc_score as an AutoGluon scorer.
# The best attainable score is 1 and larger values are better.
# NOTE(review): needs_threshold=True is expected to make AutoGluon feed the
# scorer continuous scores instead of class labels - confirm against the
# AutoGluon custom-metric documentation.
ag_roc_auc_scorer = make_scorer(
    "roc_auc_score",
    roc_auc_score,
    optimum=1,
    needs_threshold=True,
    greater_is_better=True,
)

In [5]:
# Read the training data from the competition input directory
train_df = pd.read_csv(
    filepath_or_buffer="../input/tdse-inclass-competition-0/train.csv",
)

In [6]:
# Training frame: remove the Surname, id and CustomerId columns, then
# one-hot encode the categorical columns.
# DROP_COLUMNS does not list the target column, so it is left untouched here.
X_train_df = pd.get_dummies(
    train_df.drop(columns=DROP_COLUMNS),
    columns=ONE_HOT_ENCODING_COLUMNS,
)

In [7]:
# Read the test data
test_df = pd.read_csv(
    filepath_or_buffer="../input/tdse-inclass-competition-0/test.csv",
)
# Apply the same processing as for the training data:
# remove the unused columns, then one-hot encode the categorical columns.
X_test_df = pd.get_dummies(
    test_df.drop(columns=DROP_COLUMNS),
    columns=ONE_HOT_ENCODING_COLUMNS,
)

In [8]:
# Configure the predictor (label column, custom ROC AUC scorer, output path),
# then train it on the prepared training frame.
predictor = TabularPredictor(
    path="../working",
    label=TARGET_COULMN_NAME,
    eval_metric=ag_roc_auc_scorer,
)
# fit() returns the predictor itself, so re-binding keeps the same object.
predictor = predictor.fit(train_data=X_train_df)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.2
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.5.0: Wed May  1 20:16:51 PDT 2024; root:xnu-10063.121.3~5/RELEASE_ARM64_T8103
CPU Count:          8
Memory Avail:       3.23 GB / 16.00 GB (20.2%)
Disk Space Avail:   59.50 GB / 926.35 GB (6.4%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider setting `time_limit` to e

: 

In [None]:
# Predict probabilities instead of hard class labels: the model is evaluated
# with ROC AUC, which ranks continuous scores, and the saved submission
# output of this notebook contains probabilities.
# as_multiclass=False asks predict_proba for a single Series holding the
# positive-class probability (applies to binary classification).
y_pred = predictor.predict_proba(X_test_df, as_multiclass=False)

In [None]:
# Build the submission frame from the raw test data: "id" is listed in
# DROP_COLUMNS and is therefore no longer present in X_test_df, so indexing
# X_test_df["id"] would raise a KeyError.
submission_df = test_df[["id"]].copy()
# Store the predictions under the target column name.
submission_df[TARGET_COULMN_NAME] = y_pred

In [12]:
# Preview the first five rows of the submission
submission_df.head(n=5)

Unnamed: 0,id,Exited
0,165034,0.026149
1,165035,0.831005
2,165036,0.031366
3,165037,0.204602
4,165038,0.338391


In [13]:
# Write the submission file without the DataFrame index column
submission_df.to_csv(
    path_or_buf="submission.csv",
    index=False,
)