Connected to base (Python 3.13.5)

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, ParameterGrid, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt

from xgboost import XGBClassifier


# ======================
# 1) 커스텀 전처리
# ======================
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "date" in df.columns and "time" in df.columns:
            df["datetime"] = pd.to_datetime(
                df["date"].astype(str) + " " + df["time"].astype(str),
                errors="coerce",
                infer_datetime_format=True
            )
        df["hour"] = df["datetime"].dt.hour
        df["shift"] = df["hour"].apply(lambda h: "Day" if 8 <= h <= 19 else "Night")
        prev_count = df["count"].iloc[0]
        global_counts, accum = [], 0
        for current_count in df["count"]:
            if current_count < prev_count:
                accum += prev_count
            global_counts.append(accum + current_count)
            prev_count = current_count
        df["global_count"] = global_counts
        df["year_month"] = df["datetime"].dt.to_period("M")
        df["monthly_count"] = df.groupby("year_month").cumcount() + 1
        return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        df["speed_ratio"] = df["low_section_speed"] / df["high_section_speed"]
        df["pressure_speed_ratio"] = df["cast_pressure"] / df["high_section_speed"]
        df.loc[(df["low_section_speed"] == 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[(df["low_section_speed"] != 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[df["high_section_speed"] == 0, "pressure_speed_ratio"] = -1
        for col in ["heating_furnace", "emergency_stop", "tryshot_signal", "EMS_operation_time"]:
            if col in df.columns:
                df[col] = df[col].fillna("Unknown")
        if "molten_temp" in df.columns and df["molten_temp"].isna().any():
            df["molten_temp"] = df["molten_temp"].fillna(df["molten_temp"].mode()[0])
        if "molten_volume" in df.columns:
            df["molten_volume"] = df["molten_volume"].interpolate("linear").ffill().bfill()
        df = df.replace([np.inf, -np.inf], -1)
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols or []
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns])


# ======================
# 2) Threshold Finder
# ======================
def find_best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, t = precision_recall_curve(y_true, y_prob)
    t = np.append(t, 1.0)
    fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
    best_idx = int(np.nanargmax(fbeta))
    return float(t[best_idx]), float(fbeta[best_idx])


# ======================
# 3) Main
# ======================
if __name__ == "__main__":
    # ----- 데이터 로드
    train = pd.read_csv("./data/train.csv")
    test  = pd.read_csv("./data/test.csv")

    y_train = train["passorfail"]
    X_train = train.drop(columns=["passorfail"])
    X_test  = test.copy()

    # ----- 커스텀 전처리
    drop_cols = ["id","line","name","mold_name","date","time","registration_time",
                 "year_month","hour","datetime","real_time","working"]
    base_preproc = SkPipeline(steps=[
        ("datetime", DatetimeFeatureExtractor()),
        ("engineer", FeatureEngineer()),
        ("drop", DropColumns(drop_cols=drop_cols)),
    ])

    tmp_after = base_preproc.fit_transform(X_train)
    expected_cats = ["mold_code","heating_furnace","EMS_operation_time","shift",
                     "emergency_stop","tryshot_signal"]
    present_cats = [c for c in expected_cats if c in tmp_after.columns]

    cat_pipe = SkPipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])
    num_pipe = SkPipeline(steps=[("imp", SimpleImputer(strategy="median"))])
    num_selector = make_column_selector(dtype_include=np.number)

    model_preproc = ColumnTransformer(
        transformers=[("cat", cat_pipe, present_cats),
                      ("num", num_pipe, num_selector)],
        remainder="drop"
    )

    categorical_feature_indices = list(range(len(present_cats)))

    pipe = ImbPipeline(steps=[
        ("base", base_preproc),
        ("prep", model_preproc),
        ("smote", SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
        ("xgb", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
            n_jobs=-1,
            tree_method="hist"
        ))
    ])

    # ----- 파라미터 그리드 (축소 가능)
    ratio_grid = [1/9, 2/8, 3/7, 4/6, 1.0]
    param_grid = {
        "smote__sampling_strategy": ratio_grid,
        "smote__k_neighbors": [3, 5],
        "xgb__n_estimators": [400, 800],
        "xgb__learning_rate": [0.05, 0.08],
        "xgb__max_depth": [4, 6],
        "xgb__min_child_weight": [1, 3],
        "xgb__subsample": [0.7, 1.0],
        "xgb__colsample_bytree": [0.6, 1.0],
        "xgb__gamma": [0.0, 0.1],
        "xgb__reg_lambda": [1.0, 5.0],
        "xgb__reg_alpha": [0.0, 0.5],
        "xgb__scale_pos_weight": [1.0, 3.0]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ----- tqdm Grid Search (Recall 기준)
    results = []
    param_list = list(ParameterGrid(param_grid))
    print(f"총 탐색 조합 수: {len(param_list)}")

    start_time = time.time()
    for params in tqdm(param_list, desc="GridSearch 진행중", unit="조합"):
        pipe.set_params(**params)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1)
        results.append((params, scores.mean()))
    elapsed = time.time() - start_time

    # ----- 최적 조합 선택
    best_params, best_recall = max(results, key=lambda x: x[1])
    print("\n===== GridSearchCV 결과 (tqdm 기반) =====")
    print("Best params :", best_params)
    print("Best Recall :", best_recall)
    print(f"총 소요 시간: {elapsed/60:.2f} 분")

    # ----- 최적 모델 학습
    pipe.set_params(**best_params)
    pipe.fit(X_train, y_train)

    # ----- 확률 보정
    calib = CalibratedClassifierCV(pipe, method="isotonic", cv=5)
    calib.fit(X_train, y_train)

    # ----- Threshold 탐색 (F2 기준)
    train_proba = calib.predict_proba(X_train)[:, 1]
    best_thr, best_f2 = find_best_threshold_fbeta(y_train.values, train_proba, beta=2.0)
    print(f"\nBest threshold by F2 on train: {best_thr:.4f} (F2={best_f2:.4f})")

    # ----- 테스트 예측
    test_proba = calib.predict_proba(X_test)[:, 1]
    pd.DataFrame({"id": X_test["id"], "passorfail": test_proba}) \
        .to_csv("submission_best_xgb_proba.csv", index=False)

    test_pred = (test_proba >= best_thr).astype(int)
    pd.DataFrame({"id": X_test["id"], "prediction": test_pred}) \
        .to_csv("submission_best_xgb_label.csv", index=False)

    print("\n📁 저장 완료: submission_best_xgb_proba.csv / submission_best_xgb_label.csv")

총 탐색 조합 수: 10240


GridSearch 진행중:   0%|          | 0/10240 [00:03<?, ?조합/s]


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\pipeline.py", line 400, in _fit
    self._validate_steps()
    ~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\pipeline.py", line 289, in _validate_steps
    raise TypeError(
        "All intermediate steps of the chain should not be Pipelines"
    )
TypeError: All intermediate steps of the chain should not be Pipelines


In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, ParameterGrid, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt

from xgboost import XGBClassifier


# ======================
# 1) 커스텀 전처리
# ======================
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "date" in df.columns and "time" in df.columns:
            df["datetime"] = pd.to_datetime(
                df["date"].astype(str) + " " + df["time"].astype(str),
                errors="coerce",
                infer_datetime_format=True
            )
        df["hour"] = df["datetime"].dt.hour
        df["shift"] = df["hour"].apply(lambda h: "Day" if 8 <= h <= 19 else "Night")
        prev_count = df["count"].iloc[0]
        global_counts, accum = [], 0
        for current_count in df["count"]:
            if current_count < prev_count:
                accum += prev_count
            global_counts.append(accum + current_count)
            prev_count = current_count
        df["global_count"] = global_counts
        df["year_month"] = df["datetime"].dt.to_period("M")
        df["monthly_count"] = df.groupby("year_month").cumcount() + 1
        return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        df["speed_ratio"] = df["low_section_speed"] / df["high_section_speed"]
        df["pressure_speed_ratio"] = df["cast_pressure"] / df["high_section_speed"]
        df.loc[(df["low_section_speed"] == 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[(df["low_section_speed"] != 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[df["high_section_speed"] == 0, "pressure_speed_ratio"] = -1
        for col in ["heating_furnace", "emergency_stop", "tryshot_signal", "EMS_operation_time"]:
            if col in df.columns:
                df[col] = df[col].fillna("Unknown")
        if "molten_temp" in df.columns and df["molten_temp"].isna().any():
            df["molten_temp"] = df["molten_temp"].fillna(df["molten_temp"].mode()[0])
        if "molten_volume" in df.columns:
            df["molten_volume"] = df["molten_volume"].interpolate("linear").ffill().bfill()
        df = df.replace([np.inf, -np.inf], -1)
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols or []
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns])


# ======================
# 2) Threshold Finder
# ======================
def find_best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, t = precision_recall_curve(y_true, y_prob)
    t = np.append(t, 1.0)
    fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
    best_idx = int(np.nanargmax(fbeta))
    return float(t[best_idx]), float(fbeta[best_idx])


# ======================
# 3) Main
# ======================
if __name__ == "__main__":
    # ----- 데이터 로드
    train = pd.read_csv("./data/train.csv")
    test  = pd.read_csv("./data/test.csv")

    y_train = train["passorfail"]
    X_train = train.drop(columns=["passorfail"])
    X_test  = test.copy()

    # ----- 커스텀 전처리
    drop_cols = ["id","line","name","mold_name","date","time","registration_time",
                 "year_month","hour","datetime","real_time","working"]
    base_preproc = SkPipeline(steps=[
        ("datetime", DatetimeFeatureExtractor()),
        ("engineer", FeatureEngineer()),
        ("drop", DropColumns(drop_cols=drop_cols)),
    ])

    tmp_after = base_preproc.fit_transform(X_train)
    expected_cats = ["mold_code","heating_furnace","EMS_operation_time","shift",
                     "emergency_stop","tryshot_signal"]
    present_cats = [c for c in expected_cats if c in tmp_after.columns]

    cat_pipe = SkPipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])
    num_pipe = SkPipeline(steps=[("imp", SimpleImputer(strategy="median"))])
    num_selector = make_column_selector(dtype_include=np.number)

    model_preproc = ColumnTransformer(
        transformers=[("cat", cat_pipe, present_cats),
                      ("num", num_pipe, num_selector)],
        remainder="drop"
    )

    categorical_feature_indices = list(range(len(present_cats)))

pipe = ImbPipeline(steps=[
    ("datetime", DatetimeFeatureExtractor()),
    ("engineer", FeatureEngineer()),
    ("drop", DropColumns(drop_cols=drop_cols)),
    ("prep", model_preproc),
    ("smote", SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
    ("xgb", XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1,
        tree_method="hist"
    ))
])

    # ----- 파라미터 그리드 (축소 가능)
    ratio_grid = [1/9, 2/8, 3/7, 4/6, 1.0]
    param_grid = {
        "smote__sampling_strategy": ratio_grid,
        "smote__k_neighbors": [3, 5],
        "xgb__n_estimators": [400, 800],
        "xgb__learning_rate": [0.05, 0.08],
        "xgb__max_depth": [4, 6],
        "xgb__min_child_weight": [1, 3],
        "xgb__subsample": [0.7, 1.0],
        "xgb__colsample_bytree": [0.6, 1.0],
        "xgb__gamma": [0.0, 0.1],
        "xgb__reg_lambda": [1.0, 5.0],
        "xgb__reg_alpha": [0.0, 0.5],
        "xgb__scale_pos_weight": [1.0, 3.0]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ----- tqdm Grid Search (Recall 기준)
    results = []
    param_list = list(ParameterGrid(param_grid))
    print(f"총 탐색 조합 수: {len(param_list)}")

    start_time = time.time()
    for params in tqdm(param_list, desc="GridSearch 진행중", unit="조합"):
        pipe.set_params(**params)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1)
        results.append((params, scores.mean()))
    elapsed = time.time() - start_time

    # ----- 최적 조합 선택
    best_params, best_recall = max(results, key=lambda x: x[1])
    print("\n===== GridSearchCV 결과 (tqdm 기반) =====")
    print("Best params :", best_params)
    print("Best Recall :", best_recall)
    print(f"총 소요 시간: {elapsed/60:.2f} 분")

    # ----- 최적 모델 학습
    pipe.set_params(**best_params)
    pipe.fit(X_train, y_train)

    # ----- 확률 보정
    calib = CalibratedClassifierCV(pipe, method="isotonic", cv=5)
    calib.fit(X_train, y_train)

    # ----- Threshold 탐색 (F2 기준)
    train_proba = calib.predict_proba(X_train)[:, 1]
    best_thr, best_f2 = find_best_threshold_fbeta(y_train.values, train_proba, beta=2.0)
    print(f"\nBest threshold by F2 on train: {best_thr:.4f} (F2={best_f2:.4f})")

    # ----- 테스트 예측
    test_proba = calib.predict_proba(X_test)[:, 1]
    pd.DataFrame({"id": X_test["id"], "passorfail": test_proba}) \
        .to_csv("submission_best_xgb_proba.csv", index=False)

    test_pred = (test_proba >= best_thr).astype(int)
    pd.DataFrame({"id": X_test["id"], "prediction": test_pred}) \
        .to_csv("submission_best_xgb_label.csv", index=False)

    print("\n📁 저장 완료: submission_best_xgb_proba.csv / submission_best_xgb_label.csv")

IndentationError: unexpected indent (<ipython-input-2-6d7a7207990f>, line 150)

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, ParameterGrid, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt

from xgboost import XGBClassifier


# ======================
# 1) 커스텀 전처리
# ======================
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "date" in df.columns and "time" in df.columns:
            df["datetime"] = pd.to_datetime(
                df["date"].astype(str) + " " + df["time"].astype(str),
                errors="coerce",
                infer_datetime_format=True
            )
        df["hour"] = df["datetime"].dt.hour
        df["shift"] = df["hour"].apply(lambda h: "Day" if 8 <= h <= 19 else "Night")
        prev_count = df["count"].iloc[0]
        global_counts, accum = [], 0
        for current_count in df["count"]:
            if current_count < prev_count:
                accum += prev_count
            global_counts.append(accum + current_count)
            prev_count = current_count
        df["global_count"] = global_counts
        df["year_month"] = df["datetime"].dt.to_period("M")
        df["monthly_count"] = df.groupby("year_month").cumcount() + 1
        return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        df["speed_ratio"] = df["low_section_speed"] / df["high_section_speed"]
        df["pressure_speed_ratio"] = df["cast_pressure"] / df["high_section_speed"]
        df.loc[(df["low_section_speed"] == 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[(df["low_section_speed"] != 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[df["high_section_speed"] == 0, "pressure_speed_ratio"] = -1
        for col in ["heating_furnace", "emergency_stop", "tryshot_signal", "EMS_operation_time"]:
            if col in df.columns:
                df[col] = df[col].fillna("Unknown")
        if "molten_temp" in df.columns and df["molten_temp"].isna().any():
            df["molten_temp"] = df["molten_temp"].fillna(df["molten_temp"].mode()[0])
        if "molten_volume" in df.columns:
            df["molten_volume"] = df["molten_volume"].interpolate("linear").ffill().bfill()
        df = df.replace([np.inf, -np.inf], -1)
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols or []
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns])


# ======================
# 2) Threshold Finder
# ======================
def find_best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, t = precision_recall_curve(y_true, y_prob)
    t = np.append(t, 1.0)
    fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
    best_idx = int(np.nanargmax(fbeta))
    return float(t[best_idx]), float(fbeta[best_idx])


# ======================
# 3) Main
# ======================
if __name__ == "__main__":
    # ----- 데이터 로드
    train = pd.read_csv("./data/train.csv")
    test  = pd.read_csv("./data/test.csv")

    y_train = train["passorfail"]
    X_train = train.drop(columns=["passorfail"])
    X_test  = test.copy()

    # ----- 커스텀 전처리 후 컬럼 정의
    drop_cols = ["id","line","name","mold_name","date","time","registration_time",
                 "year_month","hour","datetime","real_time","working"]

    tmp_after = (DatetimeFeatureExtractor()
                 .fit_transform(X_train))
    tmp_after = (FeatureEngineer()
                 .fit_transform(tmp_after))
    tmp_after = DropColumns(drop_cols=drop_cols).fit_transform(tmp_after)

    expected_cats = ["mold_code","heating_furnace","EMS_operation_time","shift",
                     "emergency_stop","tryshot_signal"]
    present_cats = [c for c in expected_cats if c in tmp_after.columns]

    cat_pipe = SkPipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])
    num_pipe = SkPipeline(steps=[("imp", SimpleImputer(strategy="median"))])
    num_selector = make_column_selector(dtype_include=np.number)

    model_preproc = ColumnTransformer(
        transformers=[("cat", cat_pipe, present_cats),
                      ("num", num_pipe, num_selector)],
        remainder="drop"
    )

    categorical_feature_indices = list(range(len(present_cats)))

    # ----- 최종 파이프라인 (중첩 Pipeline 제거)
    pipe = ImbPipeline(steps=[
        ("datetime", DatetimeFeatureExtractor()),
        ("engineer", FeatureEngineer()),
        ("drop", DropColumns(drop_cols=drop_cols)),
        ("prep", model_preproc),
        ("smote", SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
        ("xgb", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
            n_jobs=-1,
            tree_method="hist"
        ))
    ])

    # ----- 파라미터 그리드
    ratio_grid = [1/9, 2/8, 3/7, 4/6, 1.0]
    param_grid = {
        "smote__sampling_strategy": ratio_grid,
        "smote__k_neighbors": [3, 5],
        "xgb__n_estimators": [400, 800],
        "xgb__learning_rate": [0.05, 0.08],
        "xgb__max_depth": [4, 6],
        "xgb__min_child_weight": [1, 3],
        "xgb__subsample": [0.7, 1.0],
        "xgb__colsample_bytree": [0.6, 1.0],
        "xgb__gamma": [0.0, 0.1],
        "xgb__reg_lambda": [1.0, 5.0],
        "xgb__reg_alpha": [0.0, 0.5],
        "xgb__scale_pos_weight": [1.0, 3.0]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ----- tqdm Grid Search (Recall 기준)
    results = []
    param_list = list(ParameterGrid(param_grid))
    print(f"총 탐색 조합 수: {len(param_list)}")

    start_time = time.time()
    for params in tqdm(param_list, desc="GridSearch 진행중", unit="조합"):
        pipe.set_params(**params)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1)
        results.append((params, scores.mean()))
    elapsed = time.time() - start_time

    # ----- 최적 조합 선택
    best_params, best_recall = max(results, key=lambda x: x[1])
    print("\n===== GridSearchCV 결과 (tqdm 기반) =====")
    print("Best params :", best_params)
    print("Best Recall :", best_recall)
    print(f"총 소요 시간: {elapsed/60:.2f} 분")

    # ----- 최적 모델 학습
    pipe.set_params(**best_params)
    pipe.fit(X_train, y_train)

    # ----- 확률 보정
    calib = CalibratedClassifierCV(pipe, method="isotonic", cv=5)
    calib.fit(X_train, y_train)

    # ----- Threshold 탐색 (F2 기준)
    train_proba = calib.predict_proba(X_train)[:, 1]
    best_thr, best_f2 = find_best_threshold_fbeta(y_train.values, train_proba, beta=2.0)
    print(f"\nBest threshold by F2 on train: {best_thr:.4f} (F2={best_f2:.4f})")

    # ----- 테스트 예측
    test_proba = calib.predict_proba(X_test)[:, 1]
    pd.DataFrame({"id": X_test["id"], "passorfail": test_proba}) \
        .to_csv("submission_best_xgb_proba.csv", index=False)

    test_pred = (test_proba >= best_thr).astype(int)
    pd.DataFrame({"id": X_test["id"], "prediction": test_pred}) \
        .to_csv("submission_best_xgb_label.csv", index=False)

    print("\n📁 저장 완료: submission_best_xgb_proba.csv / submission_best_xgb_label.csv")

총 탐색 조합 수: 10240


GridSearch 진행중:   1%|▏         | 137/10240 [18:42<23:00:10,  8.20s/조합]


KeyboardInterrupt: 

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import optuna
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve, make_scorer, recall_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt

from xgboost import XGBClassifier


# ======================
# 1) 커스텀 전처리
# ======================
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "date" in df.columns and "time" in df.columns:
            df["datetime"] = pd.to_datetime(
                df["date"].astype(str) + " " + df["time"].astype(str),
                errors="coerce",
                infer_datetime_format=True
            )
        df["hour"] = df["datetime"].dt.hour
        df["shift"] = df["hour"].apply(lambda h: "Day" if 8 <= h <= 19 else "Night")
        prev_count = df["count"].iloc[0]
        global_counts, accum = [], 0
        for current_count in df["count"]:
            if current_count < prev_count:
                accum += prev_count
            global_counts.append(accum + current_count)
            prev_count = current_count
        df["global_count"] = global_counts
        df["year_month"] = df["datetime"].dt.to_period("M")
        df["monthly_count"] = df.groupby("year_month").cumcount() + 1
        return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        df["speed_ratio"] = df["low_section_speed"] / df["high_section_speed"]
        df["pressure_speed_ratio"] = df["cast_pressure"] / df["high_section_speed"]
        df.loc[(df["low_section_speed"] == 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[(df["low_section_speed"] != 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[df["high_section_speed"] == 0, "pressure_speed_ratio"] = -1
        for col in ["heating_furnace", "emergency_stop", "tryshot_signal", "EMS_operation_time"]:
            if col in df.columns:
                df[col] = df[col].fillna("Unknown")
        if "molten_temp" in df.columns and df["molten_temp"].isna().any():
            df["molten_temp"] = df["molten_temp"].fillna(df["molten_temp"].mode()[0])
        if "molten_volume" in df.columns:
            df["molten_volume"] = df["molten_volume"].interpolate("linear").ffill().bfill()
        df = df.replace([np.inf, -np.inf], -1)
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols or []
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns])


# ======================
# 2) Threshold Finder
# ======================
def find_best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, t = precision_recall_curve(y_true, y_prob)
    t = np.append(t, 1.0)
    fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
    best_idx = int(np.nanargmax(fbeta))
    return float(t[best_idx]), float(fbeta[best_idx])


# ======================
# 3) Main
# ======================
if __name__ == "__main__":
    # ----- 데이터 로드
    train = pd.read_csv("./data/train.csv")
    test  = pd.read_csv("./data/test.csv")

    y_train = train["passorfail"]
    X_train = train.drop(columns=["passorfail"])
    X_test  = test.copy()

    # ----- 컬럼 처리
    drop_cols = ["id","line","name","mold_name","date","time","registration_time",
                 "year_month","hour","datetime","real_time","working"]

    tmp_after = DatetimeFeatureExtractor().fit_transform(X_train)
    tmp_after = FeatureEngineer().fit_transform(tmp_after)
    tmp_after = DropColumns(drop_cols=drop_cols).fit_transform(tmp_after)

    expected_cats = ["mold_code","heating_furnace","EMS_operation_time","shift",
                     "emergency_stop","tryshot_signal"]
    present_cats = [c for c in expected_cats if c in tmp_after.columns]

    cat_pipe = ColumnTransformer(
        transformers=[("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), present_cats)],
        remainder="passthrough"
    )

    categorical_feature_indices = list(range(len(present_cats)))

    # ----- 최종 파이프라인
    pipe = ImbPipeline(steps=[
        ("datetime", DatetimeFeatureExtractor()),
        ("engineer", FeatureEngineer()),
        ("drop", DropColumns(drop_cols=drop_cols)),
        ("prep", cat_pipe),
        ("smote", SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
        ("xgb", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
            n_jobs=-1,
            tree_method="hist"
        ))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ======================
    # Optuna Objective
    # ======================
    def objective(trial):
        params = {
            "smote__sampling_strategy": trial.suggest_float("smote__sampling_strategy", 0.1, 1.0),
            "smote__k_neighbors": trial.suggest_int("smote__k_neighbors", 3, 7),
            "xgb__n_estimators": trial.suggest_int("xgb__n_estimators", 300, 1200),
            "xgb__learning_rate": trial.suggest_float("xgb__learning_rate", 0.01, 0.1, log=True),
            "xgb__max_depth": trial.suggest_int("xgb__max_depth", 3, 8),
            "xgb__min_child_weight": trial.suggest_int("xgb__min_child_weight", 1, 5),
            "xgb__subsample": trial.suggest_float("xgb__subsample", 0.6, 1.0),
            "xgb__colsample_bytree": trial.suggest_float("xgb__colsample_bytree", 0.6, 1.0),
            "xgb__gamma": trial.suggest_float("xgb__gamma", 0.0, 0.3),
            "xgb__reg_lambda": trial.suggest_float("xgb__reg_lambda", 0.0, 5.0),
            "xgb__reg_alpha": trial.suggest_float("xgb__reg_alpha", 0.0, 1.0),
            "xgb__scale_pos_weight": trial.suggest_float("xgb__scale_pos_weight", 1.0, 5.0),
        }
        pipe.set_params(**params)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1)
        return scores.mean()

    # ======================
    # Optuna 실행
    # ======================
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50, show_progress_bar=True)  # trial 수 조절 가능

    print("\n===== Optuna 결과 =====")
    print("Best Params:", study.best_trial.params)
    print("Best Recall:", study.best_value)

    # ----- Top 10 Trials
    print("\nTop 10 Trials (Recall 기준):")
    top_trials = sorted(study.trials, key=lambda t: t.value, reverse=True)[:10]
    for i, t in enumerate(top_trials, 1):
        print(f"Rank {i} | Recall={t.value:.4f} | Params={t.params}")

    # ----- 최적 모델 학습
    pipe.set_params(**study.best_trial.params)
    pipe.fit(X_train, y_train)

    calib = CalibratedClassifierCV(pipe, method="isotonic", cv=5)
    calib.fit(X_train, y_train)

    train_proba = calib.predict_proba(X_train)[:, 1]
    best_thr, best_f2 = find_best_threshold_fbeta(y_train.values, train_proba, beta=2.0)
    print(f"\nBest threshold by F2 on train: {best_thr:.4f} (F2={best_f2:.4f})")

    # ----- 테스트 예측
    test_proba = calib.predict_proba(X_test)[:, 1]
    pd.DataFrame({"id": X_test["id"], "passorfail": test_proba}) \
        .to_csv("submission_best_xgb_proba.csv", index=False)
    test_pred = (test_proba >= best_thr).astype(int)
    pd.DataFrame({"id": X_test["id"], "prediction": test_pred}) \
        .to_csv("submission_best_xgb_label.csv", index=False)

    print("\n📁 저장 완료: submission_best_xgb_proba.csv / submission_best_xgb_label.csv")

ModuleNotFoundError: No module named 'optuna'

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import optuna
from tqdm import tqdm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve, make_scorer, recall_score
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import matplotlib.pyplot as plt

from xgboost import XGBClassifier


# ======================
# 1) 커스텀 전처리
# ======================
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "date" in df.columns and "time" in df.columns:
            df["datetime"] = pd.to_datetime(
                df["date"].astype(str) + " " + df["time"].astype(str),
                errors="coerce",
                infer_datetime_format=True
            )
        df["hour"] = df["datetime"].dt.hour
        df["shift"] = df["hour"].apply(lambda h: "Day" if 8 <= h <= 19 else "Night")
        prev_count = df["count"].iloc[0]
        global_counts, accum = [], 0
        for current_count in df["count"]:
            if current_count < prev_count:
                accum += prev_count
            global_counts.append(accum + current_count)
            prev_count = current_count
        df["global_count"] = global_counts
        df["year_month"] = df["datetime"].dt.to_period("M")
        df["monthly_count"] = df.groupby("year_month").cumcount() + 1
        return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        df["speed_ratio"] = df["low_section_speed"] / df["high_section_speed"]
        df["pressure_speed_ratio"] = df["cast_pressure"] / df["high_section_speed"]
        df.loc[(df["low_section_speed"] == 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[(df["low_section_speed"] != 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[df["high_section_speed"] == 0, "pressure_speed_ratio"] = -1
        for col in ["heating_furnace", "emergency_stop", "tryshot_signal", "EMS_operation_time"]:
            if col in df.columns:
                df[col] = df[col].fillna("Unknown")
        if "molten_temp" in df.columns and df["molten_temp"].isna().any():
            df["molten_temp"] = df["molten_temp"].fillna(df["molten_temp"].mode()[0])
        if "molten_volume" in df.columns:
            df["molten_volume"] = df["molten_volume"].interpolate("linear").ffill().bfill()
        df = df.replace([np.inf, -np.inf], -1)
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols or []
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns])


# ======================
# 2) Threshold Finder
# ======================
def find_best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, t = precision_recall_curve(y_true, y_prob)
    t = np.append(t, 1.0)
    fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
    best_idx = int(np.nanargmax(fbeta))
    return float(t[best_idx]), float(fbeta[best_idx])


# ======================
# 3) Main
# ======================
if __name__ == "__main__":
    # ----- 데이터 로드
    train = pd.read_csv("./data/train.csv")
    test  = pd.read_csv("./data/test.csv")

    y_train = train["passorfail"]
    X_train = train.drop(columns=["passorfail"])
    X_test  = test.copy()

    # ----- 컬럼 처리
    drop_cols = ["id","line","name","mold_name","date","time","registration_time",
                 "year_month","hour","datetime","real_time","working"]

    tmp_after = DatetimeFeatureExtractor().fit_transform(X_train)
    tmp_after = FeatureEngineer().fit_transform(tmp_after)
    tmp_after = DropColumns(drop_cols=drop_cols).fit_transform(tmp_after)

    expected_cats = ["mold_code","heating_furnace","EMS_operation_time","shift",
                     "emergency_stop","tryshot_signal"]
    present_cats = [c for c in expected_cats if c in tmp_after.columns]

    cat_pipe = ColumnTransformer(
        transformers=[("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), present_cats)],
        remainder="passthrough"
    )

    categorical_feature_indices = list(range(len(present_cats)))

    # ----- 최종 파이프라인
    pipe = ImbPipeline(steps=[
        ("datetime", DatetimeFeatureExtractor()),
        ("engineer", FeatureEngineer()),
        ("drop", DropColumns(drop_cols=drop_cols)),
        ("prep", cat_pipe),
        ("smote", SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
        ("xgb", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
            n_jobs=-1,
            tree_method="hist"
        ))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ======================
    # Optuna Objective
    # ======================
    def objective(trial):
        params = {
            "smote__sampling_strategy": trial.suggest_float("smote__sampling_strategy", 0.1, 1.0),
            "smote__k_neighbors": trial.suggest_int("smote__k_neighbors", 3, 7),
            "xgb__n_estimators": trial.suggest_int("xgb__n_estimators", 300, 1200),
            "xgb__learning_rate": trial.suggest_float("xgb__learning_rate", 0.01, 0.1, log=True),
            "xgb__max_depth": trial.suggest_int("xgb__max_depth", 3, 8),
            "xgb__min_child_weight": trial.suggest_int("xgb__min_child_weight", 1, 5),
            "xgb__subsample": trial.suggest_float("xgb__subsample", 0.6, 1.0),
            "xgb__colsample_bytree": trial.suggest_float("xgb__colsample_bytree", 0.6, 1.0),
            "xgb__gamma": trial.suggest_float("xgb__gamma", 0.0, 0.3),
            "xgb__reg_lambda": trial.suggest_float("xgb__reg_lambda", 0.0, 5.0),
            "xgb__reg_alpha": trial.suggest_float("xgb__reg_alpha", 0.0, 1.0),
            "xgb__scale_pos_weight": trial.suggest_float("xgb__scale_pos_weight", 1.0, 5.0),
        }
        pipe.set_params(**params)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1)
        return scores.mean()

    # ======================
    # Optuna 실행
    # ======================
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50, show_progress_bar=True)  # trial 수 조절 가능

    print("\n===== Optuna 결과 =====")
    print("Best Params:", study.best_trial.params)
    print("Best Recall:", study.best_value)

    # ----- Top 10 Trials
    print("\nTop 10 Trials (Recall 기준):")
    top_trials = sorted(study.trials, key=lambda t: t.value, reverse=True)[:10]
    for i, t in enumerate(top_trials, 1):
        print(f"Rank {i} | Recall={t.value:.4f} | Params={t.params}")

    # ----- 최적 모델 학습
    pipe.set_params(**study.best_trial.params)
    pipe.fit(X_train, y_train)

    calib = CalibratedClassifierCV(pipe, method="isotonic", cv=5)
    calib.fit(X_train, y_train)

    train_proba = calib.predict_proba(X_train)[:, 1]
    best_thr, best_f2 = find_best_threshold_fbeta(y_train.values, train_proba, beta=2.0)
    print(f"\nBest threshold by F2 on train: {best_thr:.4f} (F2={best_f2:.4f})")

    # ----- 테스트 예측
    test_proba = calib.predict_proba(X_test)[:, 1]
    pd.DataFrame({"id": X_test["id"], "passorfail": test_proba}) \
        .to_csv("submission_best_xgb_proba.csv", index=False)
    test_pred = (test_proba >= best_thr).astype(int)
    pd.DataFrame({"id": X_test["id"], "prediction": test_pred}) \
        .to_csv("submission_best_xgb_label.csv", index=False)

    print("\n📁 저장 완료: submission_best_xgb_proba.csv / submission_best_xgb_label.csv")

[I 2025-09-30 16:13:02,341] A new study created in memory with name: no-name-528dbf85-ae65-4eb3-a3cf-9f1c9b24cd66


  0%|          | 0/50 [00:00<?, ?it/s]

[W 2025-09-30 16:13:06,611] Trial 0 failed with parameters: {'smote__sampling_strategy': 0.786177799372898, 'smote__k_neighbors': 7, 'xgb__n_estimators': 633, 'xgb__learning_rate': 0.07975169607480741, 'xgb__max_depth': 3, 'xgb__min_child_weight': 5, 'xgb__subsample': 0.9757404786443569, 'xgb__colsample_bytree': 0.9571795863020222, 'xgb__gamma': 0.2172406839150739, 'xgb__reg_lambda': 3.2023985514589084, 'xgb__reg_alpha': 0.898851173005305, 'xgb__scale_pos_weight': 4.576981515196467} because of the following error: ValueError('\nAll the 5 fits failed.\nIt is very likely that your model is misconfigured.\nYou can try to debug the error by setting error_score=\'raise\'.\n\nBelow are more details about the failures:\n--------------------------------------------------------------------------------\n5 fits failed with the following error:\nTraceback (most recent call last):\n  File "c:\\Users\\kbk29\\miniconda3\\Lib\\site-packages\\sklearn\\model_selection\\_validation.py", line 859, in _fit

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\pipeline.py", line 440, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ~~~~~~~~~~~~~~~~~~~~~~~^
        cloned_transformer,
        ^^^^^^^^^^^^^^^^^^^
    ...<4 lines>...
        params=routed_params[name],
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\joblib\memory.py", line 326, in __call__
    return self.func(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\pipeline.py", line 1336, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\base.py", line 202, in fit_resample
    return super().fit_resample(X, y, **params)
           ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\base.py", line 105, in fit_resample
    output = self._fit_resample(X, y, **params)
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\imblearn\over_sampling\_smote\base.py", line 599, in _fit_resample
    X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\utils\validation.py", line 1105, in check_array
    _assert_all_finite(
    ~~~~~~~~~~~~~~~~~~^
        array,
        ^^^^^^
    ...<2 lines>...
        allow_nan=ensure_all_finite == "allow-nan",
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\utils\validation.py", line 120, in _assert_all_finite
    _assert_all_finite_element_wise(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        X,
        ^^
    ...<4 lines>...
        input_name=input_name,
        ^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\kbk29\miniconda3\Lib\site-packages\sklearn\utils\validation.py", line 169, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input contains NaN.


In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import time
import optuna
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
from xgboost import XGBClassifier


# ======================
# 1) 커스텀 전처리
# ======================
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "date" in df.columns and "time" in df.columns:
            df["datetime"] = pd.to_datetime(
                df["date"].astype(str) + " " + df["time"].astype(str),
                errors="coerce",
                infer_datetime_format=True
            )
        if "datetime" in df.columns:
            df["hour"] = df["datetime"].dt.hour
            df["shift"] = df["hour"].apply(lambda h: "Day" if 8 <= h <= 19 else "Night")
            prev_count = df["count"].iloc[0]
            global_counts, accum = [], 0
            for current_count in df["count"]:
                if current_count < prev_count:
                    accum += prev_count
                global_counts.append(accum + current_count)
                prev_count = current_count
            df["global_count"] = global_counts
            df["year_month"] = df["datetime"].dt.to_period("M")
            df["monthly_count"] = df.groupby("year_month").cumcount() + 1
        return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "low_section_speed" in df.columns and "high_section_speed" in df.columns:
            df["speed_ratio"] = df["low_section_speed"] / df["high_section_speed"]
            df["pressure_speed_ratio"] = df["cast_pressure"] / df["high_section_speed"]
            df.loc[(df["low_section_speed"] == 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
            df.loc[(df["low_section_speed"] != 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
            df.loc[df["high_section_speed"] == 0, "pressure_speed_ratio"] = -1

        for col in ["heating_furnace", "emergency_stop", "tryshot_signal", "EMS_operation_time"]:
            if col in df.columns:
                df[col] = df[col].fillna("Unknown")

        if "molten_temp" in df.columns and df["molten_temp"].isna().any():
            df["molten_temp"] = df["molten_temp"].fillna(df["molten_temp"].mode()[0])

        if "molten_volume" in df.columns:
            df["molten_volume"] = df["molten_volume"].interpolate("linear").ffill().bfill()

        df = df.replace([np.inf, -np.inf], -1)
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols or []
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns])


# ======================
# 2) Threshold Finder
# ======================
def find_best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, t = precision_recall_curve(y_true, y_prob)
    t = np.append(t, 1.0)
    fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
    best_idx = int(np.nanargmax(fbeta))
    return float(t[best_idx]), float(fbeta[best_idx])


# ======================
# 3) Main
# ======================
if __name__ == "__main__":
    # ----- 데이터 로드
    train = pd.read_csv("./data/train.csv")
    test  = pd.read_csv("./data/test.csv")

    y_train = train["passorfail"]
    X_train = train.drop(columns=["passorfail"])
    X_test  = test.copy()

    # ----- 드랍할 컬럼
    drop_cols = ["id","line","name","mold_name","date","time","registration_time",
                 "year_month","hour","datetime","real_time","working"]

    # ----- 카테고리/수치형 구분
    tmp_after = (DatetimeFeatureExtractor().fit_transform(X_train))
    tmp_after = (FeatureEngineer().fit_transform(tmp_after))
    tmp_after = DropColumns(drop_cols=drop_cols).fit_transform(tmp_after)

    expected_cats = ["mold_code","heating_furnace","EMS_operation_time","shift",
                     "emergency_stop","tryshot_signal"]
    present_cats = [c for c in expected_cats if c in tmp_after.columns]

    cat_pipe = SkPipeline(steps=[
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ])
    num_pipe = SkPipeline(steps=[("imp", SimpleImputer(strategy="median"))])
    num_selector = make_column_selector(dtype_include=np.number)

    model_preproc = ColumnTransformer(
        transformers=[("cat", cat_pipe, present_cats),
                      ("num", num_pipe, num_selector)],
        remainder="drop"
    )

    categorical_feature_indices = list(range(len(present_cats)))

    # ----- 최종 파이프라인
    pipe = ImbPipeline(steps=[
        ("datetime", DatetimeFeatureExtractor()),
        ("engineer", FeatureEngineer()),
        ("drop", DropColumns(drop_cols=drop_cols)),
        ("prep", model_preproc),   # 결측치 처리 포함
        ("smote", SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
        ("xgb", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
            n_jobs=-1,
            tree_method="hist"
        ))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ======================
    # Optuna Objective
    # ======================
    def objective(trial):
        params = {
            "smote__sampling_strategy": trial.suggest_float("smote__sampling_strategy", 0.1, 1.0),
            "smote__k_neighbors": trial.suggest_int("smote__k_neighbors", 3, 7),
            "xgb__n_estimators": trial.suggest_int("xgb__n_estimators", 300, 800),
            "xgb__learning_rate": trial.suggest_float("xgb__learning_rate", 0.01, 0.2, log=True),
            "xgb__max_depth": trial.suggest_int("xgb__max_depth", 3, 8),
            "xgb__min_child_weight": trial.suggest_int("xgb__min_child_weight", 1, 5),
            "xgb__subsample": trial.suggest_float("xgb__subsample", 0.5, 1.0),
            "xgb__colsample_bytree": trial.suggest_float("xgb__colsample_bytree", 0.5, 1.0),
            "xgb__gamma": trial.suggest_float("xgb__gamma", 0.0, 0.5),
            "xgb__reg_lambda": trial.suggest_float("xgb__reg_lambda", 0.5, 5.0, log=True),
            "xgb__reg_alpha": trial.suggest_float("xgb__reg_alpha", 0.0, 1.0),
            "xgb__scale_pos_weight": trial.suggest_float("xgb__scale_pos_weight", 1.0, 5.0),
        }
        pipe.set_params(**params)
        scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1)
        return scores.mean()

    # ======================
    # Optuna 실행
    # ======================
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50, show_progress_bar=True)  # trial 수 조절 가능

    # ======================
    # 결과 출력
    # ======================
    print("\n===== Optuna 결과 =====")
    print("Best Params:", study.best_trial.params)
    print("Best Recall:", study.best_value)

    print("\n===== Top 10 Trials (Recall 기준) =====")
    top_trials = sorted(study.trials, key=lambda t: t.value if t.value is not None else -1, reverse=True)[:10]
    for i, t in enumerate(top_trials, 1):
        print(f"Rank {i} | Recall={t.value:.4f} | Params={t.params}")

[I 2025-09-30 16:14:46,424] A new study created in memory with name: no-name-75caa3db-38ce-43f9-b020-78cb700a05e6


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-09-30 16:14:57,758] Trial 0 finished with value: 0.9490695401228821 and parameters: {'smote__sampling_strategy': 0.2751606068792357, 'smote__k_neighbors': 7, 'xgb__n_estimators': 658, 'xgb__learning_rate': 0.04155214197703763, 'xgb__max_depth': 3, 'xgb__min_child_weight': 2, 'xgb__subsample': 0.9597330990994736, 'xgb__colsample_bytree': 0.8650535461923865, 'xgb__gamma': 0.06963277657738065, 'xgb__reg_lambda': 2.3731210998357315, 'xgb__reg_alpha': 0.4038995452618023, 'xgb__scale_pos_weight': 3.7153452192257874}. Best is trial 0 with value: 0.9490695401228821.
[I 2025-09-30 16:15:12,153] Trial 1 finished with value: 0.9469377210947683 and parameters: {'smote__sampling_strategy': 0.4445858190561319, 'smote__k_neighbors': 7, 'xgb__n_estimators': 634, 'xgb__learning_rate': 0.1381954834916881, 'xgb__max_depth': 7, 'xgb__min_child_weight': 1, 'xgb__subsample': 0.8806395872838246, 'xgb__colsample_bytree': 0.5633127853482096, 'xgb__gamma': 0.10272571012008141, 'xgb__reg_lambda': 2.65231

In [None]:
from sklearn.model_selection import cross_val_score

# Top 10 trial들
top_trials = sorted(study.trials, key=lambda t: t.value, reverse=True)[:10]

print("\n===== Top 10 Trials (Recall/F1/Acc) =====")
for rank, t in enumerate(top_trials, 1):
    params = t.params
    pipe.set_params(**params)
    
    recall = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1).mean()
    f1     = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1).mean()
    acc    = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1).mean()
    
    print(f"Rank {rank} | Recall={recall:.4f} | F1={f1:.4f} | Acc={acc:.4f} | Params={params}")


===== Top 10 Trials (Recall/F1/Acc) =====
Rank 1 | Recall=0.9665 | F1=0.8440 | Acc=0.9841 | Params={'smote__sampling_strategy': 0.8720679052176163, 'smote__k_neighbors': 3, 'xgb__n_estimators': 536, 'xgb__learning_rate': 0.011309626065281651, 'xgb__max_depth': 6, 'xgb__min_child_weight': 5, 'xgb__subsample': 0.7817757557752542, 'xgb__colsample_bytree': 0.7610341369684775, 'xgb__gamma': 0.317690601991742, 'xgb__reg_lambda': 1.4536900190258994, 'xgb__reg_alpha': 0.30245181417777384, 'xgb__scale_pos_weight': 4.511464146520451}
Rank 2 | Recall=0.9643 | F1=0.8450 | Acc=0.9842 | Params={'smote__sampling_strategy': 0.7335966172521562, 'smote__k_neighbors': 3, 'xgb__n_estimators': 467, 'xgb__learning_rate': 0.020633047621508027, 'xgb__max_depth': 5, 'xgb__min_child_weight': 5, 'xgb__subsample': 0.5004901203245256, 'xgb__colsample_bytree': 0.650126323739448, 'xgb__gamma': 0.17305910402530378, 'xgb__reg_lambda': 3.8722801203326043, 'xgb__reg_alpha': 0.3941824431159283, 'xgb__scale_pos_weight': 

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import optuna
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
from xgboost import XGBClassifier


# ======================
# 1) 커스텀 전처리
# ======================
class DatetimeFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        if "date" in df.columns and "time" in df.columns:
            df["datetime"] = pd.to_datetime(
                df["date"].astype(str) + " " + df["time"].astype(str),
                errors="coerce",
                infer_datetime_format=True
            )
        df["hour"] = df["datetime"].dt.hour
        df["shift"] = df["hour"].apply(lambda h: "Day" if 8 <= h <= 19 else "Night")
        prev_count = df["count"].iloc[0]
        global_counts, accum = [], 0
        for current_count in df["count"]:
            if current_count < prev_count:
                accum += prev_count
            global_counts.append(accum + current_count)
            prev_count = current_count
        df["global_count"] = global_counts
        df["year_month"] = df["datetime"].dt.to_period("M")
        df["monthly_count"] = df.groupby("year_month").cumcount() + 1
        return df

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        df = X.copy()
        df["speed_ratio"] = df["low_section_speed"] / df["high_section_speed"]
        df["pressure_speed_ratio"] = df["cast_pressure"] / df["high_section_speed"]
        df.loc[(df["low_section_speed"] == 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[(df["low_section_speed"] != 0) & (df["high_section_speed"] == 0), "speed_ratio"] = -1
        df.loc[df["high_section_speed"] == 0, "pressure_speed_ratio"] = -1
        for col in ["heating_furnace", "emergency_stop", "tryshot_signal", "EMS_operation_time"]:
            if col in df.columns:
                df[col] = df[col].fillna("Unknown")
        if "molten_temp" in df.columns and df["molten_temp"].isna().any():
            df["molten_temp"] = df["molten_temp"].fillna(df["molten_temp"].mode()[0])
        if "molten_volume" in df.columns:
            df["molten_volume"] = df["molten_volume"].interpolate("linear").ffill().bfill()
        df = df.replace([np.inf, -np.inf], -1)
        return df

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols=None):
        self.drop_cols = drop_cols or []
    def fit(self, X, y=None): 
        return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns])


# ======================
# 2) Threshold Finder
# ======================
def find_best_threshold_fbeta(y_true, y_prob, beta=2.0):
    p, r, t = precision_recall_curve(y_true, y_prob)
    t = np.append(t, 1.0)
    fbeta = (1 + beta**2) * (p * r) / (beta**2 * p + r + 1e-12)
    best_idx = int(np.nanargmax(fbeta))
    return float(t[best_idx]), float(fbeta[best_idx])


# ======================
# 3) Main
# ======================
if __name__ == "__main__":
    # ----- 데이터 로드
    train = pd.read_csv("./data/train.csv")
    test  = pd.read_csv("./data/test.csv")

    y_train = train["passorfail"]
    X_train = train.drop(columns=["passorfail"])
    X_test  = test.copy()

    # ----- 커스텀 전처리 후 컬럼 정의
    drop_cols = ["id","line","name","mold_name","date","time","registration_time",
                 "year_month","hour","datetime","real_time","working"]

    tmp_after = DatetimeFeatureExtractor().fit_transform(X_train)
    tmp_after = FeatureEngineer().fit_transform(tmp_after)
    tmp_after = DropColumns(drop_cols=drop_cols).fit_transform(tmp_after)

    expected_cats = ["mold_code","heating_furnace","EMS_operation_time","shift",
                     "emergency_stop","tryshot_signal"]
    present_cats = [c for c in expected_cats if c in tmp_after.columns]

    cat_pipe = SimpleImputer(strategy="most_frequent")
    num_pipe = SimpleImputer(strategy="median")
    num_selector = make_column_selector(dtype_include=np.number)

    model_preproc = ColumnTransformer(
        transformers=[("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), present_cats),
                      ("num", num_pipe, num_selector)],
        remainder="drop"
    )

    categorical_feature_indices = list(range(len(present_cats)))

    pipe = ImbPipeline(steps=[
        ("datetime", DatetimeFeatureExtractor()),
        ("engineer", FeatureEngineer()),
        ("drop", DropColumns(drop_cols=drop_cols)),
        ("prep", model_preproc),
        ("smote", SMOTENC(categorical_features=categorical_feature_indices, random_state=42)),
        ("xgb", XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            use_label_encoder=False,
            random_state=42,
            n_jobs=-1,
            tree_method="hist"
        ))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ======================
    # Optuna objective (F1 최적화)
    # ======================
    def objective(trial):
        params = {
            "smote__sampling_strategy": trial.suggest_float("smote__sampling_strategy", 0.1, 1.0),
            "smote__k_neighbors": trial.suggest_int("smote__k_neighbors", 3, 7),
            "xgb__n_estimators": trial.suggest_int("xgb__n_estimators", 200, 800),
            "xgb__learning_rate": trial.suggest_float("xgb__learning_rate", 0.01, 0.3),
            "xgb__max_depth": trial.suggest_int("xgb__max_depth", 3, 8),
            "xgb__min_child_weight": trial.suggest_int("xgb__min_child_weight", 1, 6),
            "xgb__subsample": trial.suggest_float("xgb__subsample", 0.6, 1.0),
            "xgb__colsample_bytree": trial.suggest_float("xgb__colsample_bytree", 0.6, 1.0),
            "xgb__gamma": trial.suggest_float("xgb__gamma", 0.0, 0.5),
            "xgb__reg_lambda": trial.suggest_float("xgb__reg_lambda", 0.0, 5.0),
            "xgb__reg_alpha": trial.suggest_float("xgb__reg_alpha", 0.0, 1.0),
            "xgb__scale_pos_weight": trial.suggest_float("xgb__scale_pos_weight", 1.0, 5.0),
        }
        pipe.set_params(**params)

        # 교차검증 점수 계산
        f1     = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="f1", n_jobs=-1).mean()
        recall = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="recall", n_jobs=-1).mean()
        acc    = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1).mean()

        trial.set_user_attr("recall", recall)
        trial.set_user_attr("accuracy", acc)

        return f1   # F1을 최적화 기준으로 사용

    # ======================
    # Optuna 실행
    # ======================
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50, show_progress_bar=True)

    # ======================
    # Top 10 출력 (F1 기준)
    # ======================
    trials = sorted(study.trials, key=lambda t: t.value, reverse=True)[:10]
    print("\n===== Top 10 Trials (F1 기준, Recall/Acc 포함) =====")
    for rank, t in enumerate(trials, 1):
        print(f"Rank {rank} | F1={t.value:.4f} | Recall={t.user_attrs['recall']:.4f} | "
              f"Acc={t.user_attrs['accuracy']:.4f} | Params={t.params}")

[I 2025-09-30 17:22:29,205] A new study created in memory with name: no-name-37da6441-fe5a-400b-981e-d49262d9bdb0


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-09-30 17:23:19,088] Trial 0 finished with value: 0.9274846689280454 and parameters: {'smote__sampling_strategy': 0.5123153125572841, 'smote__k_neighbors': 4, 'xgb__n_estimators': 735, 'xgb__learning_rate': 0.2681302634628144, 'xgb__max_depth': 6, 'xgb__min_child_weight': 3, 'xgb__subsample': 0.7080121461970799, 'xgb__colsample_bytree': 0.9503672472420313, 'xgb__gamma': 0.01835207105159231, 'xgb__reg_lambda': 2.4345386568714247, 'xgb__reg_alpha': 0.9748765493561431, 'xgb__scale_pos_weight': 2.659892366437899}. Best is trial 0 with value: 0.9274846689280454.
[I 2025-09-30 17:23:49,632] Trial 1 finished with value: 0.9189639837763176 and parameters: {'smote__sampling_strategy': 0.9796673696001282, 'smote__k_neighbors': 7, 'xgb__n_estimators': 233, 'xgb__learning_rate': 0.22064148407980677, 'xgb__max_depth': 4, 'xgb__min_child_weight': 3, 'xgb__subsample': 0.8450044250447621, 'xgb__colsample_bytree': 0.9015613476941459, 'xgb__gamma': 0.28553033056082033, 'xgb__reg_lambda': 1.129097

In [None]:
X_train.columns

Index(['id', 'line', 'name', 'mold_name', 'time', 'date', 'count', 'working',
       'emergency_stop', 'molten_temp', 'facility_operation_cycleTime',
       'production_cycletime', 'low_section_speed', 'high_section_speed',
       'molten_volume', 'cast_pressure', 'biscuit_thickness',
       'upper_mold_temp1', 'upper_mold_temp2', 'upper_mold_temp3',
       'lower_mold_temp1', 'lower_mold_temp2', 'lower_mold_temp3',
       'sleeve_temperature', 'physical_strength', 'Coolant_temperature',
       'EMS_operation_time', 'registration_time', 'tryshot_signal',
       'mold_code', 'heating_furnace'],
      dtype='object')