In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from collections import Counter

def load_and_merge_data(directory, filenames):
    all_dfs = []
    for file in filenames:
        path = os.path.join(directory, file)
        df = pd.read_csv(path)
        df['fault'] = np.where((df['f17'] == 1) | (df['f18'] == 1), 1, 0)
        all_dfs.append(df)
    merged_df = pd.concat(all_dfs, ignore_index=True)
    print("数据合并完成，总行数：", len(merged_df))
    print("故障标签分布：\n", merged_df['fault'].value_counts())
    return merged_df

def build_features(df):
    feature_cols = df.columns.difference(['f1', 'f2', 'f17', 'f18', 'fault'])
    X = df[feature_cols]
    y = df['fault']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

def train_and_evaluate(X, y):
    print("原始类别分布：", Counter(y))

    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print("SMOTE后类别分布：", Counter(y_resampled))

    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
    )

    clf = RandomForestClassifier(n_estimators=100, max_depth=12, random_state=42)
    clf.fit(X_train, y_train)

    y_test_pred = clf.predict(X_test)

    print("\n--- 优化后的分类报告 ---")
    report_dict = classification_report(y_test, y_test_pred, output_dict=True)
    print("{:<12} {:>10} {:>10} {:>10} {:>10}".format(
        "", "precision", "recall", "f1-score", "support"))
    for label in ["0", "1"]:
        row = report_dict[label]
        print("{:<12} {:10.4f} {:10.4f} {:10.4f} {:10.0f}".format(
            label, row["precision"], row["recall"], row["f1-score"], row["support"]))

def main():
    data_dir = r"E:\Datahw2"
    file_list = [f"tube{i}.csv" for i in range(1, 6)]
    df = load_and_merge_data(data_dir, file_list)
    X, y = build_features(df)
    train_and_evaluate(X, y)

if __name__ == "__main__":
    main()


数据合并完成，总行数： 942045
故障标签分布：
 fault
0    941946
1        99
Name: count, dtype: int64
原始类别分布： Counter({0: 941946, 1: 99})
SMOTE后类别分布： Counter({0: 941946, 1: 941946})

--- 优化后的分类报告 ---
              precision     recall   f1-score    support
0                0.9997     0.9854     0.9925     188390
1                0.9856     0.9997     0.9926     188389
