In [1]:
import pandas as pd
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split


# =====================
# Load Data
# =====================
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv"
raw_df = pd.read_csv(url)  # kept untouched so re-running this cell is idempotent

# Columns removed before modelling. `errors="ignore"` tolerates a CSV that
# already lacks some of them.
# NOTE(review): "UDI"/"Product ID" look like identifiers and TWF/HDF/PWF/OSF/RNF
# look like per-failure-mode flags that would leak the target — confirm against
# the dataset description.
drop_cols = ["UDI", "Product ID", "TWF", "HDF", "PWF", "OSF", "RNF"]
df = raw_df.drop(columns=drop_cols, errors="ignore")

# Binary target and feature matrix.
y = df["Machine failure"].astype(int)
X = df.drop(columns=["Machine failure"])


# =====================
# Detect Columns
# =====================
# Partition the features into numeric / non-numeric so that every column lands
# in exactly one list. Matching on concrete dtype names ("int64", "float64",
# "object") would let any other dtype (int32, float32, nullable, category,
# string) fall into neither list and never reach a preprocessing pipeline.
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(exclude="number").columns.tolist()


# =====================
# Feature Engineering (numeric columns only, applied after imputation)
# =====================
class NumericFeatureEngineer(BaseEstimator, TransformerMixin):
    """Append two derived columns to a numeric feature matrix.

    The appended columns, in output order after the input columns, are:

    * ``Temp_Diff`` = ``Process temperature [K]`` - ``Air temperature [K]``
    * ``Power``     = ``Torque [Nm]`` * ``Rotational speed [rpm]``

    Parameters
    ----------
    numeric_feature_names : sequence of str
        Names of the columns of the matrix given to ``transform``, in
        positional order. Must contain the four source columns listed above.
    """

    # Source columns read by the derived features.
    _REQUIRED_COLUMNS = (
        "Process temperature [K]",
        "Air temperature [K]",
        "Torque [Nm]",
        "Rotational speed [rpm]",
    )
    # Names of the derived columns, in the order they are appended.
    _ENGINEERED_COLUMNS = ("Temp_Diff", "Power")

    def __init__(self, numeric_feature_names):
        # Stored unmodified so that get_params()/clone() round-trip the value.
        self.numeric_feature_names = numeric_feature_names

    def fit(self, X, y=None):
        """Stateless fit: nothing is learned from ``X``. Returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived columns appended, as a NumPy array.

        Parameters
        ----------
        X : array-like or pandas.DataFrame
            Either a 2-D array whose columns are positionally described by
            ``numeric_feature_names``, or a DataFrame containing those columns.

        Returns
        -------
        numpy.ndarray
            The input columns followed by ``Temp_Diff`` and ``Power``.

        Raises
        ------
        KeyError
            If ``numeric_feature_names`` lacks a required source column, or a
            DataFrame input lacks one of the configured columns.
        """
        names = list(self.numeric_feature_names)

        missing = [c for c in self._REQUIRED_COLUMNS if c not in names]
        if missing:
            raise KeyError(
                f"numeric_feature_names is missing required column(s): {missing}"
            )

        if isinstance(X, pd.DataFrame):
            # pd.DataFrame(df, columns=...) would reindex and silently create
            # all-NaN columns for anything absent, so check explicitly.
            absent = [c for c in names if c not in X.columns]
            if absent:
                raise KeyError(f"input DataFrame is missing column(s): {absent}")
            # Copy so the caller's frame is never touched by the assignments below.
            frame = X.loc[:, names].copy()
        else:
            frame = pd.DataFrame(X, columns=names)

        frame["Temp_Diff"] = frame["Process temperature [K]"] - frame["Air temperature [K]"]
        frame["Power"] = frame["Torque [Nm]"] * frame["Rotational speed [rpm]"]
        return frame.values

    def get_feature_names_out(self, input_features=None):
        """Return the output column names (input names plus derived names).

        ``input_features`` is accepted for scikit-learn API compatibility and
        is not used; the names come from ``numeric_feature_names``.
        """
        # Local import: NumPy is not imported at the top of this notebook.
        import numpy as np

        names = list(self.numeric_feature_names)
        # A derived name already present in the input is overwritten in place
        # by transform(), not appended, so it must not be listed twice.
        appended = [c for c in self._ENGINEERED_COLUMNS if c not in names]
        return np.asarray(names + appended, dtype=object)


# =====================
# Numeric branch: impute missing values -> engineer features -> standardize
# =====================
numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # Runs after imputation, so it receives a plain array and needs the
        # column names passed in explicitly.
        ("feat_eng", NumericFeatureEngineer(num_cols)),
        ("scaler", StandardScaler()),
    ]
)

# =====================
# Categorical branch: impute missing values -> one-hot encode (no scaling)
# =====================
# NOTE(review): with drop="first" together with handle_unknown="ignore", a
# category unseen during fit is encoded as all zeros, which is the same
# encoding as the dropped first category — confirm this is acceptable.
categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

# =====================
# ColumnTransformer: route each column group to its branch and concatenate
# =====================
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)


# =====================
# Train / test split (stratified on the target, fixed seed)
# =====================
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# =====================
# XGBoost model (previously selected best hyper-parameters)
# =====================
xgb_params = {
    # Fixed training setup
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "tree_method": "hist",
    "n_jobs": -1,
    "random_state": 42,
    # Selected hyper-parameters
    "n_estimators": 200,
    "max_depth": 8,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    # NOTE(review): presumably the negative/positive ratio of the training
    # split — confirm where this value came from.
    "scale_pos_weight": 28.52029520295203,
}
best_xgb = XGBClassifier(**xgb_params)


# =====================
# Final pipeline: preprocessing followed by the classifier
# =====================
final_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", best_xgb),
    ]
)


# =====================
# Train & save
# =====================
final_pipeline.fit(X_train, y_train)

# The pickle contains a NumericFeatureEngineer instance, so that class must be
# defined in whatever session later loads this file.
joblib.dump(final_pipeline, "ai4i_xgb_pipeline_final.pkl")
print(" Pipeline 已儲存為 ai4i_xgb_pipeline_final.pkl")


 Pipeline 已儲存為 ai4i_xgb_pipeline_final.pkl


模型推論

In [12]:
import pandas as pd
import joblib
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Load the fitted pipeline from the same relative path the training cell wrote
# it to ("ai4i_xgb_pipeline_final.pkl"). An absolute "/content/..." path only
# resolves when the working directory happens to be /content.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files you
# created or trust. The NumericFeatureEngineer class must already be defined
# in this kernel for unpickling to succeed.
pipeline = joblib.load("ai4i_xgb_pipeline_final.pkl")

# X_test / y_test are the hold-out split created by the training cell
# (test_size=0.2, stratify=y, random_state=42). Run that cell first; evaluating
# on any other split would score rows the model was trained on.
y_proba = pipeline.predict_proba(X_test)[:, 1]

# Decision threshold on the positive-class probability. It is stricter than
# the default 0.5, so fewer rows are flagged as failures.
threshold = 0.7
y_pred = (y_proba >= threshold).astype(int)


print("=== Confusion Matrix  ===")
print(confusion_matrix(y_test, y_pred))
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=4))

=== Confusion Matrix  ===
[[1924    8]
 [  13   55]]

=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9933    0.9959    0.9946      1932
           1     0.8730    0.8088    0.8397        68

    accuracy                         0.9895      2000
   macro avg     0.9332    0.9023    0.9171      2000
weighted avg     0.9892    0.9895    0.9893      2000

