In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, chi2
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef, confusion_matrix

In [2]:
RANDOM_STATE = 42
TARGET_COLUMN = "ProdTaken"
DROP_COLUMNS = ["id", "MonthlyIncome", "customer_info"]
NUMERIC_FEATURES = [
    "CityTier",
    "DurationOfPitch",
    "Occupation",
    "NumberOfPersonVisiting",
    "NumberOfFollowups",
    "ProductPitched",
    "PreferredPropertyStar",
    "NumberOfTrips",
    "PitchSatisfactionScore",
    "Designation",
    "Age",
    "MonthlyIncome_numeric",
    "children",
]
CATEGORICAL_FEATURES = [
    "Gender(is_male)",
    "Passport",
    "car",
    "TypeofContact_Company Invited",
    "TypeofContact_Self Enquiry",
    "TypeofContact_unknown",
    "marriage_history_未婚",
    "marriage_history_独身",
    "marriage_history_結婚済み",
    "marriage_history_離婚済み",
]


In [3]:
# データの読み込み
# train_df = pd.read_csv("../data/20240812/train_preprocessed.csv")
# test_df = pd.read_csv("../data/20240812/test_preprocessed.csv")
from google.colab import drive
drive.mount('/content/drive')
train_df = pd.read_csv("/content/drive/MyDrive/signate_cup_2024_data/train_preprocessed.csv")
test_df = pd.read_csv("/content/drive/MyDrive/signate_cup_2024_data/test_preprocessed.csv")

# 特徴量とターゲットの分離
X = train_df.drop([TARGET_COLUMN, "id"], axis=1)
y = train_df[TARGET_COLUMN]

# データの分割 (訓練データと検証データ)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

X_test = test_df.drop("id", axis=1)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
X_train.columns

Index(['CityTier', 'DurationOfPitch', 'Occupation', 'Gender(is_male)',
       'NumberOfPersonVisiting', 'NumberOfFollowups', 'ProductPitched',
       'PreferredPropertyStar', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'Designation', 'Age', 'MonthlyIncome_numeric',
       'car', 'children', 'TypeofContact_Company Invited',
       'TypeofContact_Self Enquiry', 'TypeofContact_unknown',
       'marriage_history_未婚', 'marriage_history_独身', 'marriage_history_結婚済み',
       'marriage_history_離婚済み'],
      dtype='object')

In [5]:

# SMOTE適用前のクラス分布の確認
print(f"データ数 (SMOTE適用前): {len(X_train)} 行")
print(f"クラス 0 の数: {sum(y_train==0)}")
print(f"クラス 1 の数: {sum(y_train==1)}\n")

# SMOTEを使用してクラス不均衡を解消
smote = SMOTE(sampling_strategy=0.5, random_state=RANDOM_STATE)
X_train, y_train = smote.fit_resample(X_train, y_train)

# SMOTE適用後のクラス分布の確認
print(f"データ数 (SMOTE適用後): {len(X_train)} 行")
print(f"クラス 0 の数: {sum(y_train==0)}")
print(f"クラス 1 の数: {sum(y_train==1)}\n")


データ数 (SMOTE適用前): 2791 行
クラス 0 の数: 2385
クラス 1 の数: 406

データ数 (SMOTE適用後): 3577 行
クラス 0 の数: 2385
クラス 1 の数: 1192



In [6]:

# # 分散に基づく特徴量選択
# vars = VarianceThreshold(0.2)
# vars.fit(X_train)
# X_train_fs = pd.DataFrame(vars.transform(X_train), columns=vars.get_feature_names_out())
# X_val_fs = pd.DataFrame(vars.transform(X_val), columns=vars.get_feature_names_out())
# X_test_fs = pd.DataFrame(vars.transform(X_test), columns=vars.get_feature_names_out())


In [7]:
X_train_fs = X_train
X_val_fs = X_val
X_test_fs = X_test

In [8]:

# 数値特徴量に対するf_classifを使用した特徴量選択
nume_cols = NUMERIC_FEATURES
classif = SelectKBest(f_classif, k=4)
classif.fit(X_train_fs[nume_cols], y_train)
X_train_fnum = pd.DataFrame(classif.transform(X_train_fs[nume_cols]), columns=classif.get_feature_names_out())
X_val_fnum = pd.DataFrame(classif.transform(X_val_fs[nume_cols]), columns=classif.get_feature_names_out())
X_test_fnum = pd.DataFrame(classif.transform(X_test_fs[nume_cols]), columns=classif.get_feature_names_out())

# カテゴリカル特徴量に対するカイ二乗検定を使用した特徴量選択
cate_cols = CATEGORICAL_FEATURES
chi2 = SelectKBest(chi2, k=8)
chi2.fit(X_train_fs[cate_cols], y_train)
X_train_chi2 = pd.DataFrame(chi2.transform(X_train_fs[cate_cols]), columns=chi2.get_feature_names_out())
X_val_chi2 = pd.DataFrame(chi2.transform(X_val_fs[cate_cols]), columns=chi2.get_feature_names_out())
X_test_chi2 = pd.DataFrame(chi2.transform(X_test_fs[cate_cols]), columns=chi2.get_feature_names_out())

# 選択された特徴量の結合
all_coll = classif.get_feature_names_out().tolist() + chi2.get_feature_names_out().tolist()
X_train = pd.concat([X_train_fnum, X_train_chi2], axis=1)
X_val = pd.concat([X_val_fnum, X_val_chi2], axis=1)
X_test = pd.concat([X_test_fnum, X_test_chi2], axis=1)

# 選択された特徴量を表示
print(f"選択された特徴量: {all_coll}")

# 特徴量のスケーリング
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)


選択された特徴量: ['ProductPitched', 'Designation', 'Age', 'MonthlyIncome_numeric', 'Gender(is_male)', 'Passport', 'TypeofContact_Company Invited', 'TypeofContact_Self Enquiry', 'marriage_history_未婚', 'marriage_history_独身', 'marriage_history_結婚済み', 'marriage_history_離婚済み']


In [9]:

# XGBoostモデルの定義
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE)

# ハイパーパラメータの候補
param_grid = {
    "max_depth" : [int(x) for x in np.linspace(10, 50, 10)],
    "min_child_weight" : [int(x) for x in np.linspace(1, 10, 10)],
    "gamma" : [float(x) for x in np.linspace(0, 1, 11)],
    "tree_method" : ["auto", "exact", "approx", "hist"],
    "colsample_bytree" : [float(x) for x in np.linspace(0, 1, 11)],
    "learning_rate" : [float(x) for x in np.linspace(0, 1, 100)],
    "reg_lambda" : [float(x) for x in np.linspace(0, 1, 11)],
    "reg_alpha" : [float(x) for x in np.linspace(0, 1, 11)]
}

# グリッドサーチによるハイパーパラメータチューニング
grid_search_model = RandomizedSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', random_state=42)
grid_search_model.fit(X_train, y_train)

# モデルを保存
pickle.dump(grid_search_model, open("hyper_tuned_model.pkl", "wb"))

# 最適なハイパーパラメータの表示
best_params = grid_search_model.best_params_
print("最適なハイパーパラメータ:", best_params)

def eval_classification(model, X_train, y_train, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    y_pred_proba_train = model.predict_proba(X_train)

    accuracy = round(accuracy_score(y_test, y_pred), 4)
    precision = round(precision_score(y_test, y_pred), 4)
    recall = round(recall_score(y_test, y_pred), 4)
    f1 = round(f1_score(y_test, y_pred), 4)
    mcc = round(matthews_corrcoef(y_test, y_pred), 4)
    auc_test = round(roc_auc_score(y_test, y_pred_proba[:, 1]), 4)
    auc_train = round(roc_auc_score(y_train, y_pred_proba_train[:, 1]), 4)

    return accuracy, precision, recall, f1, mcc, auc_test, auc_train
def eval_after_hyperparameter_tuning(model, X_train, y_train, X_test, y_test):
    eval_new = eval_classification(model, X_train, y_train, X_test, y_test)
    eval_score = ["Accuracy", "Precision", "Recall", "F1 Score", "AUC (Test)", "AUC (Train)"]
    for score, e in zip(eval_score, eval_new):
        print(score + ": " + str(e))
# ベストモデルでの再評価
eval_after_hyperparameter_tuning(grid_search_model, X_train, y_train, X_val, y_val)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

最適なハイパーパラメータ: {'tree_method': 'auto', 'reg_lambda': 0.8, 'reg_alpha': 0.8, 'min_child_weight': 2, 'max_depth': 18, 'learning_rate': 0.2828282828282829, 'gamma': 0.8, 'colsample_bytree': 0.6000000000000001}
Accuracy: 0.8782
Precision: 0.5375
Recall: 0.4725
F1 Score: 0.5029
AUC (Test): 0.435
AUC (Train): 0.8462


In [10]:

# 最終モデルの学習
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test)

# 最適なパラメータを使用してXGBoostモデルの学習
params = {
    "objective": "binary:logistic",  # 二値分類タスク
    "max_depth": best_params["max_depth"],  # 最適な木の深さ
    "min_child_weight": best_params["min_child_weight"],
    "gamma": best_params["gamma"],
    "tree_method": best_params["tree_method"],
    "colsample_bytree": best_params["colsample_bytree"],
    "learning_rate": best_params["learning_rate"],
    "reg_lambda": best_params["reg_lambda"],
    "reg_alpha": best_params["reg_alpha"],
    "eval_metric": "logloss"  # 評価指標
}

evals = [(dtrain, "train"), (dval, "eval")]
bst = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)


[0]	train-logloss:0.52235	eval-logloss:0.43629
[1]	train-logloss:0.47114	eval-logloss:0.40718
[2]	train-logloss:0.42145	eval-logloss:0.37662
[3]	train-logloss:0.39446	eval-logloss:0.36137
[4]	train-logloss:0.38149	eval-logloss:0.35591
[5]	train-logloss:0.35171	eval-logloss:0.34130
[6]	train-logloss:0.33027	eval-logloss:0.32976
[7]	train-logloss:0.32184	eval-logloss:0.32386
[8]	train-logloss:0.30290	eval-logloss:0.31616
[9]	train-logloss:0.29558	eval-logloss:0.31356
[10]	train-logloss:0.28602	eval-logloss:0.31124
[11]	train-logloss:0.27421	eval-logloss:0.30665
[12]	train-logloss:0.26970	eval-logloss:0.30563
[13]	train-logloss:0.26621	eval-logloss:0.30372
[14]	train-logloss:0.26337	eval-logloss:0.30299
[15]	train-logloss:0.25726	eval-logloss:0.30391
[16]	train-logloss:0.25633	eval-logloss:0.30336
[17]	train-logloss:0.25414	eval-logloss:0.30262
[18]	train-logloss:0.24940	eval-logloss:0.30001
[19]	train-logloss:0.24786	eval-logloss:0.30022
[20]	train-logloss:0.24754	eval-logloss:0.29997
[2

In [11]:
# modelの保存
pickle.dump(bst, open("best_model.pkl", "wb"))

In [12]:


# 検証データに対する予測（確率値）
y_pred_val = bst.predict(dval)

# AUC ROCの評価
auc_roc = roc_auc_score(y_val, y_pred_val)
print(f"Validation AUC ROC: {auc_roc:.4f}")


Validation AUC ROC: 0.8462


In [14]:
# テストデータに対する予測（確率値）
predictions = bst.predict(dtest)

# 元データのIDカラムを抽出
test_ids = test_df['id']  # 'id'カラム名は実際のデータに合わせて変更してください

# 予測結果とIDを結合
submission_df = pd.DataFrame({
    'id': test_ids,
    'prediction': predictions
})

# CSVファイルとして保存
submission_df.to_csv('submission_xgboost.csv', index=False, header=None)