In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd

# CSV 読み込み & 欠損処理
df = pd.read_csv("train_data.csv")
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.dropna(inplace=True)
df.drop("customerID", axis=1, inplace=True)
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

# One-Hot 不要。CatBoostはカテゴリ列のまま扱える
categorical_features = df.select_dtypes(include=["object"]).columns.tolist()
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Label Encoding（カテゴリ列を整数に）
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])

# 学習・検証分割
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# SMOTE 適用
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)


In [2]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 100, 500),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_state": 42,
        "verbose": 0,
    }

    model = CatBoostClassifier(**params)
    f1 = cross_val_score(model, X_train_res, y_train_res, scoring="f1", cv=5).mean()
    return f1

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)
print("Best F1 score:", study.best_value)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-05-09 17:59:58,035] A new study created in memory with name: no-name-2265d1fe-2812-4910-a717-d31b78d78efa
[I 2025-05-09 18:00:04,203] Trial 0 finished with value: 0.8124680023799001 and parameters: {'iterations': 179, 'depth': 8, 'learning_rate': 0.01034377308013232, 'l2_leaf_reg': 2.5114847691188427}. Best is trial 0 with value: 0.8124680023799001.
[I 2025-05-09 18:00:14,163] Trial 1 finished with value: 0.8025365968089462 and parameters: {'iterations': 448, 'depth': 5, 'learning_rate': 0.002044339540057037, 'l2_leaf_reg': 4.719580358391238}. Best is trial 0 with value: 0.8124680023799001.
[I 2025-05-09 18:00:28,874] Trial 2 finished with value: 0.8226305076476439 and parameters: {'iterations': 246, 'depth': 9, 'learning_rate': 0.10009186980852128, 'l2_leaf_reg': 4.408129512770314}. Best is trial 2 with value: 0.8226305076476439.
[I 2025-05-09 18:00:42,258] Trial 3 finished with value: 0.8225626847290718 and parameters: {'itera

Best parameters: {'iterations': 347, 'depth': 9, 'learning_rate': 0.06955738908214144, 'l2_leaf_reg': 1.8494721612197065}
Best F1 score: 0.8299259575164288


In [3]:
best_model = CatBoostClassifier(**study.best_params)
best_model.fit(X_train_res, y_train_res)

from sklearn.metrics import classification_report, f1_score

y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print("F1 score on test data:", f1_score(y_test, y_pred))


0:	learn: 0.6445832	total: 31.4ms	remaining: 10.9s
1:	learn: 0.6088710	total: 48ms	remaining: 8.28s
2:	learn: 0.5732579	total: 69ms	remaining: 7.91s
3:	learn: 0.5409978	total: 81.1ms	remaining: 6.95s
4:	learn: 0.5187144	total: 94ms	remaining: 6.43s
5:	learn: 0.4965580	total: 108ms	remaining: 6.14s
6:	learn: 0.4793214	total: 122ms	remaining: 5.9s
7:	learn: 0.4618469	total: 131ms	remaining: 5.54s
8:	learn: 0.4493935	total: 144ms	remaining: 5.4s
9:	learn: 0.4382173	total: 157ms	remaining: 5.3s
10:	learn: 0.4285934	total: 170ms	remaining: 5.19s
11:	learn: 0.4179923	total: 182ms	remaining: 5.09s
12:	learn: 0.4099866	total: 194ms	remaining: 4.99s
13:	learn: 0.4023063	total: 207ms	remaining: 4.93s
14:	learn: 0.3968127	total: 220ms	remaining: 4.87s
15:	learn: 0.3915051	total: 233ms	remaining: 4.81s
16:	learn: 0.3863405	total: 245ms	remaining: 4.76s
17:	learn: 0.3809243	total: 258ms	remaining: 4.72s
18:	learn: 0.3742987	total: 272ms	remaining: 4.7s
19:	learn: 0.3702991	total: 286ms	remaining: 4

In [4]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

# 最適パラメータ（前ステップで得られたものを使用）
best_params = {
    'iterations': 347,
    'depth': 9,
    'learning_rate': 0.06955738908214144,
    'l2_leaf_reg': 1.8494721612197065,
    'verbose': False,
    'random_state': 42
}

# モデルを定義し、テストデータを評価用に設定
model = CatBoostClassifier(**best_params)

# early stopping で過学習を防ぐ（テストデータを eval_set に指定）
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=30,
    verbose=100
)

# テストデータでのF1スコアを計算
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("Test F1 score (with eval_set & early stopping):", f1)


0:	learn: 0.6439203	test: 0.6466745	best: 0.6466745 (0)	total: 15.6ms	remaining: 5.39s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 0.4204784872
bestIteration = 36

Shrink model to first 37 iterations.
Test F1 score (with eval_set & early stopping): 0.5714285714285714


In [10]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

model = model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=9,
    l2_leaf_reg=2,
    class_weights=[1, 2],  # ← 変更
    eval_metric='F1',
    early_stopping_rounds=100,
    random_seed=42,
    verbose=100
)


model.fit(X_train, y_train, eval_set=(X_test, y_test))

# テストデータで F1 スコアを評価
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("Test F1 Score:", f1)



0:	learn: 0.7093890	test: 0.7095710	best: 0.7095710 (0)	total: 33.3ms	remaining: 33.2s
100:	learn: 0.8278705	test: 0.7074830	best: 0.7308320 (9)	total: 1.2s	remaining: 10.7s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7308319739
bestIteration = 9

Shrink model to first 10 iterations.
Test F1 Score: 0.6372688477951636


In [11]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.03,
    depth=9,
    l2_leaf_reg=2,
    class_weights=[1, 2],
    eval_metric='F1',
    random_seed=42,
    verbose=0
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, scoring='f1', cv=skf)

print("F1 scores (CV):", scores)
print("Mean F1 score:", scores.mean())


F1 scores (CV): [0.59305994 0.59038143 0.58374793 0.61363636 0.62345679]
Mean F1 score: 0.6008564887804337


In [12]:
from catboost import CatBoostClassifier
import pandas as pd

# CatBoostClassifier の設定（すでに X_train, y_train が定義されている前提）
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.03,
    depth=9,
    l2_leaf_reg=2,
    class_weights=[1, 2],
    eval_metric='F1',
    random_seed=42,
    verbose=0
)

# モデルを学習
model.fit(X_train, y_train)

# 特徴量重要度を取得
importances = model.get_feature_importance()
features = X_train.columns

importance_df = pd.DataFrame({
    'feature': features,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(importance_df)


             feature  importance
4             tenure    9.728307
18      TotalCharges    8.480917
14          Contract    8.164364
16     PaymentMethod    8.136162
17    MonthlyCharges    7.587590
6      MultipleLines    6.108656
0             gender    5.866174
8     OnlineSecurity    5.130075
9       OnlineBackup    4.910939
11       TechSupport    4.862215
15  PaperlessBilling    4.815517
7    InternetService    4.229874
13   StreamingMovies    4.189710
12       StreamingTV    4.062276
10  DeviceProtection    4.042851
3         Dependents    3.785450
2            Partner    3.702617
1      SeniorCitizen    1.890724
5       PhoneService    0.305582


In [30]:
low_importance_features = [ 'PhoneService','SeniorCitizen']
X_train_reduced = X_train.drop(columns=low_importance_features)
X_test_reduced = X_test.drop(columns=low_importance_features)



In [32]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

model = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.01,
    depth=9,
    l2_leaf_reg=2,
    class_weights=[1, 3],
    eval_metric='F1',
    early_stopping_rounds=200,
    random_seed=42,
    verbose=100
)

model.fit(X_train_reduced, y_train, eval_set=(X_test_reduced, y_test))

# テストデータでのF1スコアを表示
y_pred = model.predict(X_test_reduced)
f1 = f1_score(y_test, y_pred)
print("Test F1 Score after feature selection:", f1)


0:	learn: 0.7837838	test: 0.7752443	best: 0.7752443 (0)	total: 15.8ms	remaining: 31.7s
100:	learn: 0.8259494	test: 0.7835821	best: 0.7913136 (21)	total: 1.11s	remaining: 21s
200:	learn: 0.8480929	test: 0.7939753	best: 0.7939753 (199)	total: 2.24s	remaining: 20.1s
300:	learn: 0.8710059	test: 0.7843137	best: 0.7965461 (238)	total: 3.4s	remaining: 19.2s
400:	learn: 0.8928806	test: 0.7788779	best: 0.7965461 (238)	total: 4.52s	remaining: 18s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.7965461414
bestIteration = 238

Shrink model to first 239 iterations.
Test F1 Score after feature selection: 0.6448230668414154


In [3]:
import pandas as pd

# CSV 読み込み
df = pd.read_csv("train_data.csv")

# 数値変換
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.dropna(inplace=True)
df.drop("customerID", axis=1, inplace=True)

# 目的変数の変換
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})
cat_features = df.select_dtypes(include="object").columns.tolist()
if "Churn" in cat_features:
    cat_features.remove("Churn")


# object 型（カテゴリカル）の列を抽出
cat_features = df.select_dtypes(include="object").columns.tolist()


from sklearn.model_selection import train_test_split

X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.03,
    depth=9,
    l2_leaf_reg=2,
    class_weights=[1, 2],  # バランス調整
    eval_metric='F1',
    early_stopping_rounds=100,
    random_seed=42,
    cat_features=cat_features,  
    verbose=100
)

model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("Test F1 Score (CatBoost with native categorical features):", f1)


0:	learn: 0.7064298	test: 0.7115544	best: 0.7115544 (0)	total: 183ms	remaining: 3m 3s
100:	learn: 0.7807109	test: 0.7290804	best: 0.7344262 (33)	total: 11.9s	remaining: 1m 45s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7344262295
bestIteration = 33

Shrink model to first 34 iterations.
Test F1 Score (CatBoost with native categorical features): 0.642754662840746
