In [1]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.1.1 (from autogluon.core[all]==1.1.1->autogluon)
  Downloading autogluon.core-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.features==1.1.1 (from autogluon)
  Downloading autogluon.features-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.1.1 (from autogluon.tabular[all]==1.1.1->autogluon)
  Downloading autogluon.tabular-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.multimodal==1.1.1 (from autogluon)
  Downloading autogluon.multimodal-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.timeseries==1.1.1 (from autogluon.timeseries[all]==1.1.1->autogluon)
  Downloading autogluon.timeseries-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn<1.4.1,>=1.3.0 (from autogluon.core==1.1.1->autogluon.core[all]==1.1.1->autogluon)
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.m

In [2]:
import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer
from sklearn.metrics import roc_auc_score #精度評価に使用

In [3]:
# Notebook-wide constants.

# Random seed.
RANDOM_SEED = 42

# Name of the target (label) column.
TARGET_COLUMN_NAME = "Exited"
# Backward-compatible alias: the original (misspelled) name is still referenced
# by other cells, so it is kept pointing at the same value.
TARGET_COULMN_NAME = TARGET_COLUMN_NAME

# Columns removed from the data before modelling.
DROP_COLUMNS = ["id", "CustomerId", "Surname"]

# Categorical columns that get one-hot encoded.
ONE_HOT_ENCODING_COLUMNS = ["Geography", "Gender"]

# Path for the best model.
BEST_MODEL_PATH = "../working/best_lgbm_model.pkl"

In [4]:
# Evaluation metric: wrap scikit-learn's ROC AUC as an AutoGluon scorer.
ag_roc_auc_scorer = make_scorer(
    score_func=roc_auc_score,
    name="roc_auc_score",
    # Higher is better, and 1 is the best achievable score.
    greater_is_better=True,
    optimum=1,
    # The metric is evaluated on continuous scores rather than hard class labels.
    needs_threshold=True,
)

In [5]:
# Load the training data.
train_df = pd.read_csv(
    filepath_or_buffer="../input/tdse-inclass-competition-0/train.csv",
)

In [6]:
# Build the training frame: remove the identifier columns (Surname, id, CustomerId),
# then one-hot encode the categorical columns.
X_train_df = train_df.drop(columns=DROP_COLUMNS).pipe(
    pd.get_dummies,
    columns=ONE_HOT_ENCODING_COLUMNS,
)

In [7]:
# Load the test data.
test_df = pd.read_csv("../input/tdse-inclass-competition-0/test.csv")

# Mirror the training preprocessing: drop the identifier columns, then one-hot encode.
X_test_df = pd.get_dummies(
    test_df.drop(columns=DROP_COLUMNS),
    columns=ONE_HOT_ENCODING_COLUMNS,
)

# get_dummies only emits columns for the categories present in the frame it is
# given, so encoding train and test separately can yield different column sets.
# Align the test frame to the training feature columns (everything except the
# label): test-only columns are dropped, columns missing from test are added as
# all-zero indicators with the same dtype as in the training frame.
train_feature_columns = X_train_df.columns.drop(TARGET_COULMN_NAME, errors="ignore")
missing_columns = train_feature_columns.difference(X_test_df.columns)
X_test_df = X_test_df.reindex(columns=train_feature_columns, fill_value=0)
if len(missing_columns) > 0:
    X_test_df = X_test_df.astype(X_train_df[missing_columns].dtypes.to_dict())

In [8]:
# Configure the predictor: which column is the label, which metric to optimise,
# and the directory passed as the predictor's path.
predictor = TabularPredictor(
    label=TARGET_COULMN_NAME,
    eval_metric=ag_roc_auc_scorer,
    path="../working",
)
# Train on the preprocessed training frame (it still contains the label column).
# fit() returns the predictor itself, so the name keeps pointing at the same object.
predictor = predictor.fit(train_data=X_train_df)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Dec 19 13:14:11 UTC 2023
CPU Count:          4
Memory Avail:       30.14 GB / 31.36 GB (96.1%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment

In [9]:
# To reuse a previously trained model instead of refitting, load it from disk:
# predictor = TabularPredictor.load("../working")

In [10]:
# Predict class probabilities for every row of the preprocessed test frame.
y_pred = predictor.predict_proba(data=X_test_df)

In [11]:
# Inspect the predicted probabilities (one column per class label: 0 and 1).
y_pred

Unnamed: 0,0,1
0,0.971893,0.028107
1,0.151376,0.848624
2,0.971780,0.028220
3,0.781907,0.218093
4,0.623301,0.376699
...,...,...
110018,0.963312,0.036688
110019,0.891626,0.108374
110020,0.980919,0.019081
110021,0.846562,0.153438


In [12]:
# Assemble the submission: the test ids paired with the predicted probability of
# class 1. The label column name comes from the config constant rather than a
# repeated string literal, so it stays in sync with the label used for training.
submission_df = pd.DataFrame(
    {
        "id": test_df["id"],
        TARGET_COULMN_NAME: y_pred[1],
    }
)

In [13]:
# Preview the first five rows of the submission.
submission_df.head(n=5)

Unnamed: 0,id,Exited
0,165034,0.028107
1,165035,0.848624
2,165036,0.02822
3,165037,0.218093
4,165038,0.376699


In [14]:
# Write the submission file to the current directory, without the index column.
submission_df.to_csv(
    path_or_buf="submission.csv",
    index=False,
)