In [1]:
import pandas as pd

# CSVファイルの読み込み
df = pd.read_csv("train_data.csv")  # パスは実際の場所に合わせて調整

# TotalCharges を float に変換（文字列になっている可能性あり）
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# 欠損値（NaN）を含む行を削除
df.dropna(inplace=True)

In [2]:
# 'customerID' は分析に不要なので削除
df.drop("customerID", axis=1, inplace=True)

# 'Churn'（目的変数）を 0/1 に変換
df["Churn"] = df["Churn"].map({"No": 0, "Yes": 1})

# One-Hot Encoding を適用（drop_first=True で多重共線性対策）
df = pd.get_dummies(df, drop_first=True)


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # ← 追加

X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# スケーリングをここで実行
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [4]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # 正則化の強さ
    'solver': ['liblinear', 'lbfgs']
}

grid = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# 最良のモデルを取得
model = grid.best_estimator_



In [5]:
from sklearn.model_selection import cross_val_score

f1_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
print("F1 scores (cross validation):", f1_scores)
print("Mean F1 score:", f1_scores.mean())


F1 scores (cross validation): [0.59436009 0.58719647 0.59722222 0.59955257 0.55172414]
Mean F1 score: 0.5860110975238516


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score

# パイプラインの作成
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000))
])

# ハイパーパラメータの設定
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear'],  
}

# グリッドサーチ
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1
)
grid.fit(X_train, y_train)

# 最良のモデル
best_model = grid.best_estimator_

# F1スコア（交差検証）
f1_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1')
print("Best parameters:", grid.best_params_)
print("F1 scores (cross validation):", f1_scores)
print("Mean F1 score:", f1_scores.mean())


Best parameters: {'clf__C': 0.001, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}
F1 scores (cross validation): [0.62357414 0.63099631 0.62598425 0.62239089 0.57370518]
Mean F1 score: 0.6153301555083542


In [7]:
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def objective(trial):
    # ハイパーパラメータの候補を定義
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    solver = trial.suggest_categorical("solver", ["liblinear", "saga"])
    
    # solver と penalty の組み合わせが無効な場合はスキップ
    if penalty == "l1" and solver not in ["liblinear", "saga"]:
        raise optuna.exceptions.TrialPruned()
    if penalty == "l2" and solver not in ["liblinear", "lbfgs", "saga"]:
        raise optuna.exceptions.TrialPruned()

    C = trial.suggest_loguniform("C", 1e-4, 1e2)

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            penalty=penalty,
            C=C,
            solver=solver,
            max_iter=1000,
            random_state=42
        ))
    ])

    score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    return score.mean()


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200)

print("Best parameters:", study.best_params)
print("Best F1 score:", study.best_value)


[I 2025-05-14 12:17:40,247] A new study created in memory with name: no-name-5c147386-033e-4cb4-b1bc-53f001531c3d
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2025-05-14 12:17:41,314] Trial 0 finished with value: 0.5765499522605225 and parameters: {'penalty': 'l1', 'solver': 'liblinear', 'C': 1.2085123035893406}. Best is trial 0 with value: 0.5765499522605225.
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2025-05-14 12:17:41,606] Trial 1 finished with value: 0.5761730078821771 and parameters: {'penalty': 'l2', 'solver': 'saga', 'C': 0.031528994903907044}. Best is trial 0 with value: 0.5765499522605225.
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2025-05-14 12:17:45,592] Trial 2 finished with value: 0.5775895785551316 and parameters: {'penalty': 'l1', 'solver': 'liblinear', 'C': 6.351121281788449}. Best is trial 2 with value: 0.5775895785551316.
  C = trial.suggest_loguniform("C", 1e-4, 1e2)
[I 2025-05-14 12:17:45,760] Trial 3 finished with value: 0.5659604060948265 and p

Best parameters: {'penalty': 'l2', 'solver': 'liblinear', 'C': 0.0007884412251228605}
Best F1 score: 0.6206794419355391


In [9]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# best C を Optuna の結果から取得
best_C = study.best_params["C"]

# L1正則化を使ったモデルで特徴量選択
model = LogisticRegression(penalty='l1', solver='saga', C=best_C, max_iter=1000)
model.fit(X_train, y_train)

# 係数が0でない特徴量を選択
selected_features = X_train.columns[np.abs(model.coef_[0]) > 1e-4]



AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
from sklearn.model_selection import cross_val_score

# 選択した特徴で再学習
model_selected = LogisticRegression(penalty='l2', solver='liblinear', C=best_C, max_iter=1000)
f1_scores = cross_val_score(model_selected, X_train_selected, y_train, cv=5, scoring='f1')

print("F1 scores:", f1_scores)
print("Mean F1 score:", f1_scores.mean())


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

xgb_model = XGBClassifier(random_state=42, eval_metric="logloss")

f1_scores = cross_val_score(xgb_model, X_train, y_train, cv=5, scoring='f1')
print("F1 scores (cross validation):", f1_scores)
print("Mean F1 score:", f1_scores.mean())



F1 scores (cross validation): [0.53521127 0.54712644 0.53828306 0.54945055 0.51627907]
Mean F1 score: 0.5372700772500493


In [None]:
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score

# Optunaの目的関数
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    model = xgb.XGBClassifier(**params)
    f1 = cross_val_score(model, X_train, y_train, cv=5, scoring='f1').mean()
    return f1

# Optunaの実行
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 結果表示
print("Best parameters:", study.best_params)
print("Best F1 score:", study.best_value)


[I 2025-04-30 11:21:54,457] A new study created in memory with name: no-name-d2c8503e-cd37-4c07-baf2-a5436688e2e8
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-04-30 11:21:55,515] Trial 0 finished with value: 0.5345753441006582 and parameters: {'max_depth': 6, 'learning_rate': 0.29311419889408585, 'n_estimators': 438, 'gamma': 0.5424241761199261, 'min_child_weight': 1, 'subsample': 0.7741159524218945, 'colsample_bytree': 0.7148926069223511}. Best is trial 0 with value: 0.5345753441006582.
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrai

Best parameters: {'max_depth': 9, 'learning_rate': 0.08566298030743864, 'n_estimators': 307, 'gamma': 3.8367175320893705, 'min_child_weight': 7, 'subsample': 0.847393462751068, 'colsample_bytree': 0.9758562300995555}
Best F1 score: 0.5889731012877728
