In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter
from lightgbm import LGBMClassifier  # LightGBM 的分类器

def load_and_merge_data(directory, filenames):
    all_dfs = []
    for file in filenames:
        path = os.path.join(directory, file)
        df = pd.read_csv(path)
        # 定义故障标签：如果 f17 或 f18 为 1，则为故障（fault=1），否则为正常（fault=0）
        df['fault'] = np.where((df['f17'] == 1) | (df['f18'] == 1), 1, 0)
        all_dfs.append(df)
    merged_df = pd.concat(all_dfs, ignore_index=True)
    print("数据合并完成，总行数：", len(merged_df))
    print("故障标签分布：\n", merged_df['fault'].value_counts())
    return merged_df

def build_features(df):
    # 去掉 f1, f2（比如时间和编号）以及 f17, f18（用于创建 fault 标签）列
    feature_cols = df.columns.difference(['f1', 'f2', 'f17', 'f18', 'fault'])
    X = df[feature_cols]
    y = df['fault']
    # 标准化，转换为均值为0、方差为1的数据
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

def train_and_evaluate(X, y):
    print("原始类别分布：", Counter(y))
    
    # 使用 SMOTE 方法生成合成少数类样本以平衡数据
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print("SMOTE后类别分布：", Counter(y_resampled))
    
    # 拆分数据集，80% 作为训练集，20% 作为测试集，使用 stratify 保持标签比例
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
    )
    
    # 使用 LightGBM 分类器，设置 100 棵树，最大深度为 12，固定随机种子保证结果可复现
    clf = LGBMClassifier(n_estimators=100, max_depth=11, random_state=42)
    clf.fit(X_train, y_train)
    
    y_test_pred = clf.predict(X_test)
    
    print("\n--- 优化后的分类报告 ---")
    # 输出评估报告，包含 precision, recall, f1-score 和 support
    report_dict = classification_report(y_test, y_test_pred, output_dict=True)
    print("{:<12} {:>10} {:>10} {:>10} {:>10}".format(
        "", "precision", "recall", "f1-score", "support"))
    for label in ["0", "1"]:
        row = report_dict[label]
        print("{:<12} {:10.4f} {:10.4f} {:10.4f} {:10.0f}".format(
            label, row["precision"], row["recall"], row["f1-score"], row["support"]))

def main():
    data_dir = r"E:\Datahw2"
    file_list = [f"tube{i}.csv" for i in range(1, 6)]
    df = load_and_merge_data(data_dir, file_list)
    X, y = build_features(df)
    train_and_evaluate(X, y)

if __name__ == "__main__":
    main()


数据合并完成，总行数： 942045
故障标签分布：
 fault
0    941946
1        99
Name: count, dtype: int64
原始类别分布： Counter({0: 941946, 1: 99})
SMOTE后类别分布： Counter({0: 941946, 1: 941946})
[LightGBM] [Info] Number of positive: 753557, number of negative: 753556
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3306
[LightGBM] [Info] Number of data points in the train set: 1507113, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000001
[LightGBM] [Info] Start training from score 0.000001





--- 优化后的分类报告 ---
              precision     recall   f1-score    support
0                0.9994     0.9931     0.9962     188390
1                0.9931     0.9994     0.9963     188389
