In [17]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. 加载并合并数据：读取 E:\Datahw2 目录下的 tube1.csv～tube5.csv
data_list = []
data_dir = r"E:\Datahw2"
for i in range(1, 6):
    file_path = os.path.join(data_dir, f"tube{i}.csv")
    df = pd.read_csv(file_path)
    data_list.append(df)
data = pd.concat(data_list, ignore_index=True)
print(f"合并后数据形状: {data.shape}")

# 2. 对 f1 到 f16 进行标准化
cols = [f"f{i}" for i in range(1, 17)]
scaler = StandardScaler()
data_std = scaler.fit_transform(data[cols])
data_std = pd.DataFrame(data_std, columns=cols)
data.update(data_std)

# 3. 构造新特征
epsilon = 1e-6

# 3.1 功率特征：power = f7 * f8
data['power'] = data['f7'] * data['f8']

# 3.2 部件B热效应特征：q_b = (f10)^2 * f14
data['q_b'] = (data['f10'] ** 2) * data['f14']

# 3.3 部件C Arrhenius 特征：采用改进公式
# arrhenius_c = ln(f16 + ε) - 1/(f15 + ε)
data['arrhenius_c'] = np.log(data['f16'] + epsilon) - 1 / (data['f15'] + epsilon)

# 4. 定义异常判断函数（收紧阈值因子至 0.5，使异常样本减少）
def mark_abnormal(series, factor=1):
    mean_val = series.mean()
    std_val = series.std()
    low, high = mean_val - factor * std_val, mean_val + factor * std_val
    abnormal_flag = ~series.between(low, high)  # 超出区间认为异常
    return abnormal_flag, mean_val, std_val, low, high

# 4.1 对功率特征判断异常
data['power_abnormal'], power_mean, power_std, power_low, power_high = mark_abnormal(data['power'], factor=0.5)
# 4.2 对 q_b 特征判断异常
data['q_b_abnormal'], qb_mean, qb_std, qb_low, qb_high = mark_abnormal(data['q_b'], factor=0.5)
# 4.3 对 arrhenius_c 特征判断异常
data['arrhenius_abnormal'], arr_mean, arr_std, arr_low, arr_high = mark_abnormal(data['arrhenius_c'], factor=0.5)

print(f"【功率特征】均值: {power_mean:.4f}, 标准差: {power_std:.4f}, 异常区间: <{power_low:.4f}, {power_high:.4f}>")
print(f"【部件B热效应】q_b 均值: {qb_mean:.4f}, 标准差: {qb_std:.4f}, 异常区间: <{qb_low:.4f}, {qb_high:.4f}>")
print(f"【部件C Arrhenius】均值: {arr_mean:.4f}, 标准差: {arr_std:.4f}, 异常区间: <{arr_low:.4f}, {arr_high:.4f}>")

# 5. 综合故障标签构造
# ① 基本故障指标：原始 f17 和 f18 指示异常，原逻辑为 ((f17==1)|(f18==1))，但这里我们希望故障标签为 0，所以取反
data['basic_normal'] = 1 - ((data['f17'] == 1) | (data['f18'] == 1)).astype(int)
# ② 综合各新特征的异常指标；如果任一新特征异常，则认为存在故障（即非正常）
#    这里用 or 逻辑得到一个异常标记（1 表示异常）
data['new_abnormal'] = data[['power_abnormal', 'q_b_abnormal', 'arrhenius_abnormal']].any(axis=1).astype(int)
# ③ 综合判断：设定全局标签为 1 表示正常，0 表示故障。
#     为了使故障样本很少，我们定义：只有当基本指标也正常且各新特征都正常时，才认定为正常，否则判定为故障。
#     即：global_fault = 1 if (basic_normal==1 and new_abnormal==0) else 0
data['global_fault'] = ((data['basic_normal'] == 1) & (data['new_abnormal'] == 0)).astype(int)
# 故障样本为 0，正常样本为 1
print("综合故障标签分布：\n", data['global_fault'].value_counts())

# 6. 选取所有特征：将原始特征 f1–f16 与新构造特征加入
feature_cols = [f"f{i}" for i in range(1, 17)] + ['power', 'q_b', 'arrhenius_c']
X = data[feature_cols]
y = data['global_fault']

# 7. 对所有选取特征进行标准化（保证特征在同一尺度上）
scaler2 = StandardScaler()
X_scaled = scaler2.fit_transform(X)

# 8. 划分训练集与测试集（不使用SMOTE，保持原始分布）
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# 9. 使用 LightGBM 模型进行训练和预测
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

# 10. 模型评估
from sklearn.metrics import classification_report, confusion_matrix
print("混淆矩阵：\n", confusion_matrix(y_test, y_pred))
print("分类报告：\n", classification_report(y_test, y_pred))



合并后数据形状: (942045, 18)


  result = getattr(ufunc, method)(*inputs, **kwargs)


【功率特征】均值: -0.1322, 标准差: 1.0158, 异常区间: <-0.6401, 0.3757>
【部件B热效应】q_b 均值: -0.0980, 标准差: 1.8813, 异常区间: <-1.0387, 0.8426>
【部件C Arrhenius】均值: -2.7169, 标准差: 10.4448, 异常区间: <-7.9393, 2.5055>
综合故障标签分布：
 global_fault
0    783881
1    158164
Name: count, dtype: int64
[LightGBM] [Info] Number of positive: 126531, number of negative: 627105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020246 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2838
[LightGBM] [Info] Number of data points in the train set: 753636, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.167894 -> initscore=-1.600627
[LightGBM] [Info] Start training from score -1.600627




混淆矩阵：
 [[156772      4]
 [     2  31631]]
分类报告：
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    156776
           1       1.00      1.00      1.00     31633

    accuracy                           1.00    188409
   macro avg       1.00      1.00      1.00    188409
weighted avg       1.00      1.00      1.00    188409

