In [None]:
# Show the working directory the kernel started in.
import os

print(os.getcwd())


# Mount Google Drive (Colab only) and make the project folder the working
# directory, so that relative paths resolve inside it.
from google.colab import drive

drive.mount('/content/drive')

path = "/content/drive/MyDrive/ML"
os.chdir(path)
print(os.getcwd())

首先要对数据进行预处理。
这里先按故障类型分组，求出每个特征在各故障类型内的平均值，然后用该平均值填补对应故障类型样本中的缺失值，并将结果另存为 train_10000_filled.csv。

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw training data.
df = pd.read_csv('./train_10000.csv')

# Impute missing values: every NaN in a feature column is replaced by the mean
# of that feature over the rows that share the same label.
# One groupby replaces the former nested loop over 6 labels x 107 features
# (each iteration rebuilt a boolean mask over the whole frame) and no longer
# hard-codes which label values exist.
# NOTE(review): the fill value depends on the label and is computed over all
# rows, including those that land in the validation/test splits below --
# confirm this is acceptable for the evaluation.
feature_cols = [f'feature{i}' for i in range(107)]
# Only touch columns that actually contain NaN so complete columns keep their dtype.
cols_with_missing = [col for col in feature_cols if df[col].isna().any()]
if cols_with_missing:
    class_means = df.groupby('label')[cols_with_missing].transform('mean')
    df[cols_with_missing] = df[cols_with_missing].fillna(class_means)

# Persist the imputed table.
df.to_csv('./train_10000_filled.csv', index=False)

# 70 % train, then the remaining 30 % is halved into validation and test.
# NOTE(review): neither split passes `stratify`, so class proportions may
# differ between the three sets.
train, validation_and_test = train_test_split(df, test_size=0.3, random_state=42)
validation, test = train_test_split(validation_and_test, test_size=0.5, random_state=42)

# Persist the three splits.
train.to_csv('./train_set.csv', index=False)
validation.to_csv('./validation_set.csv', index=False)
test.to_csv('./test_set.csv', index=False)


然后用随机森林，求出各个特征对每一种故障类型的重要性。

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Load the imputed data set.
df = pd.read_csv('./train_10000_filled.csv')

# Features: every column except the first and the last (the original note
# says these are feature0..feature106). Target: the 'label' column.
X = df.iloc[:, 1:-1]
y = df['label']

# Result table: one row per feature; one column per fault type is added below.
all_importances = pd.DataFrame(index=X.columns)

# Stratified 5-fold splitter; a forest is refit on each fold's training part.
skf = StratifiedKFold(n_splits=5)

for fault_type in range(6):
    # One-vs-rest target: 1 for the current fault type, 0 for all others.
    y_binary = (y == fault_type).astype(int)

    # Fit a fresh forest per training fold and keep only its importances;
    # the held-out part of each fold is not used.
    fold_importances = [
        RandomForestClassifier(
            n_estimators=100, random_state=0, max_depth=5, min_samples_split=10
        )
        .fit(X.iloc[fold_train], y_binary.iloc[fold_train])
        .feature_importances_
        for fold_train, _ in skf.split(X, y_binary)
    ]

    # Average the importances over the folds.
    all_importances[f'FaultType {fault_type}'] = np.mean(fold_importances, axis=0)

# Persist the importance table.
all_importances.to_csv('./all_feature_importances.csv')



创建热力图，更直观地看出哪些特征的影响力大。

In [None]:
# Heat map of the importance table (rows: features, columns: fault types).
fig, ax = plt.subplots(figsize=(10, 20))
sns.heatmap(all_importances, cmap='Blues', ax=ax)
ax.set_title('Feature Importance for Each Fault Type')
plt.show()

下面开始构建模型，首先选出影响力大的所有特征

In [None]:
# Reload the importance table (rows: features, columns: fault types).
all_importances = pd.read_csv('./all_feature_importances.csv', index_col=0)

# For every fault type keep the names of its 6 highest-scoring features.
important_features_per_fault = {
    fault_type: all_importances[fault_type].sort_values(ascending=False).index[:6].tolist()
    for fault_type in all_importances.columns
}

# Report the per-fault selection.
for fault_type, features in important_features_per_fault.items():
    print(f"For {fault_type}, the most important features are {features}")

# Union of all per-fault picks, stored as a sorted list rather than a set:
#  * pandas >= 2.0 raises TypeError when a DataFrame is indexed with a set;
#  * set iteration order for strings varies between interpreter sessions, so
#    the column order fed to the network would not be reproducible.
selected_features = sorted(set().union(*important_features_per_fault.values()))

print("最终选择的特征有：",selected_features)

然后开始构建模型训练

In [None]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler

# Make weight initialisation and batch shuffling repeatable across runs.
torch.manual_seed(42)

# Load the three splits.
df_train = pd.read_csv('./train_set.csv')
df_val = pd.read_csv('./validation_set.csv')
df_test = pd.read_csv('./test_set.csv')

# Freeze the feature order in a list. Indexing a DataFrame with a set raises
# TypeError on pandas >= 2.0, and a set's iteration order is not stable between
# sessions, which would silently permute the network inputs.
feature_cols = sorted(selected_features)

X_train = df_train[feature_cols].values
y_train = df_train['label'].values
X_val = df_val[feature_cols].values
y_val = df_val['label'].values
X_test = df_test[feature_cols].values
y_test = df_test['label'].values

# Standardize using statistics fitted on the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Batch iterators; only the training loader is shuffled.
train_loader = DataLoader(TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train)), batch_size=32, shuffle=True)
val_loader = DataLoader(TensorDataset(torch.Tensor(X_val), torch.Tensor(y_val)), batch_size=32)
test_loader = DataLoader(TensorDataset(torch.Tensor(X_test), torch.Tensor(y_test)), batch_size=32)

# Two hidden layers of 64 units; 6 output logits, one per fault type.
model = nn.Sequential(
    nn.Linear(len(feature_cols), 64),
    nn.ReLU(),
    nn.Linear(64, 64),
    nn.ReLU(),
    nn.Linear(64, 6),
)

# Loss and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.005)

# Training loop. Batch variables get their own names so the notebook-level
# `X` / `y` defined in an earlier cell are not overwritten.
for epoch in range(500):
    for batch_X, batch_y in train_loader:
        # Forward pass (labels are stored as float tensors, hence .long()).
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.long())

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation loss averaged per sample. The previous version summed the
    # per-batch means, so the value scaled with the number of batches, and it
    # crashed on an empty loader because sum() then returns the int 0.
    total_loss, n_samples = 0.0, 0
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_loss = criterion(model(batch_X), batch_y.long())
            total_loss += batch_loss.item() * batch_X.size(0)
            n_samples += batch_X.size(0)
    val_loss = total_loss / n_samples if n_samples else float('nan')
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}")

# 在测试集上检查性能
from sklearn.metrics import accuracy_score, f1_score

# Accuracy and macro-F1 of the notebook-level `model` on one data loader.
def calculate_metrics(loader):
    """Score the global `model` on every batch yielded by `loader`.

    Args:
        loader: iterable yielding (features, labels) tensor batches.

    Returns:
        Tuple (accuracy, macro_f1) computed with `accuracy_score` and
        `f1_score(average='macro')`.
    """
    predicted_labels, true_labels = [], []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for features, targets in loader:
            # Predicted class = index of the largest logit in each row.
            predicted_labels.extend(model(features).argmax(dim=1).numpy())
            true_labels.extend(targets.numpy())

    return (
        accuracy_score(true_labels, predicted_labels),
        f1_score(true_labels, predicted_labels, average='macro'),
    )

In [None]:
import pickle  # Python object serialization ("pickling")

model_filename = './ufo-model.pkl'  # destination file for the serialized model

# Serialize the whole trained model object. The context manager guarantees the
# file is flushed and closed; the previous bare open() handle was never closed.
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Load the pickled model back; `with` closes the handle that the previous bare
# open() call leaked.
# SECURITY: pickle.load can execute arbitrary code stored in the file -- only
# load pickle files you created yourself.
with open('ufo-model.pkl', 'rb') as model_file:
    test_model = pickle.load(model_file)

In [None]:
# Spot check: print the first 10 rows of X_train, each followed by the class
# index that `test_model` predicts for it.
for sample in X_train[:10]:
    print(sample)
    logits = test_model(torch.as_tensor(sample, dtype=torch.float32))
    # Single sample -> 1-D logits, so the arg-max is taken along dim 0.
    print(logits.argmax(dim=0).item())

In [None]:
# Save only the learned parameters (state dict) of the trained model.
torch.save(model.state_dict(), './save.pt')

# Build a network with the same layer layout to receive those parameters.
n_inputs = len(selected_features)
mymodel = nn.Sequential(
    nn.Linear(n_inputs, 64),
    nn.ReLU(),
    nn.Linear(64, 64),
    nn.ReLU(),
    nn.Linear(64, 6),
)

# Read the parameters back from disk and copy them into the new network.
saved_state = torch.load('./save.pt')
mymodel.load_state_dict(saved_state)
# Switch to evaluation mode; as the cell's last expression this also displays
# the module summary.
mymodel.eval()

In [None]:
# Accuracy and macro-F1 of the unpickled `test_model` on one data loader.
def calculate_metrics(loader):
    """Score the global `test_model` on every batch yielded by `loader`.

    Args:
        loader: iterable yielding (features, labels) tensor batches.

    Returns:
        Tuple (accuracy, macro_f1) computed with `accuracy_score` and
        `f1_score(average='macro')`.
    """
    all_preds, all_true = [], []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch_features, batch_labels in loader:
            # Predicted class = index of the largest logit in each row.
            class_ids = test_model(batch_features).argmax(dim=1)
            all_preds += list(class_ids.numpy())
            all_true += list(batch_labels.numpy())

    acc = accuracy_score(all_true, all_preds)
    f1 = f1_score(all_true, all_preds, average='macro')
    return acc, f1

# Evaluate the three splits in order (train, validation, test), then report
# accuracy and macro-F1 for each.
(train_acc, train_f1), (val_acc, val_f1), (test_acc, test_f1) = (
    calculate_metrics(split_loader)
    for split_loader in (train_loader, val_loader, test_loader)
)

print(f"Training Accuracy: {train_acc}, Training F1 Score: {train_f1}")
print(f"Validation Accuracy: {val_acc}, Validation F1 Score: {val_f1}")
print(f"Test Accuracy: {test_acc}, Test F1 Score: {test_f1}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predict a class for every test sample with `model` (labels from the loader
# are ignored here; the true labels come from `y_test`).
y_test_pred = []
with torch.no_grad():
    for features, _labels in test_loader:
        # Predicted class = index of the largest logit in each row.
        y_test_pred.extend(model(features).argmax(dim=1).numpy())

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_test_pred)

# Draw it as an annotated heat map.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()


我们发现对故障类型1和2的预测不够好。第一个思路就是看看是不是还有特征对1和2影响较大，尝试把它们加入进我们的训练特征集中。
但试过之后发现并没有什么效果(╯▔皿▔)╯
但是我发现在原数据集中，出现0型故障的最多，超过了总数的一半。所以这是一个数据集不平衡的问题。我们可能要采用过采样的方法。
## 总结
    1.在数据预处理的时候，用每一种故障类型内的特征平均值来填补缺失值，可能不够科学？
    2.数据集不平衡
    3.如何更好的预测1，2类型的故障
    4.如何提高F1的分值，现在的分还是太低了