In [None]:
import os
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from tqdm import tqdm

# 方法1：使用SimHei字体
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号

In [None]:
# 设置随机种子以确保结果可复现
torch.manual_seed(42)
np.random.seed(42)

In [None]:
# 配置参数
BATCH_SIZE = 64
EPOCHS = 100
LEARNING_RATE = 1e-3
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {DEVICE}")

In [None]:
# 自编码器模型定义
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, latent_dim=32):
        super(AutoEncoder, self).__init__()
        
        # 编码器结构
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            
            nn.Linear(64, latent_dim),
            nn.BatchNorm1d(latent_dim),
            nn.ReLU()
        )
        
        # 解码器结构
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            
            nn.Linear(64, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(256, input_dim)
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
    
    def encode(self, x):
        return self.encoder(x)

In [None]:
# 实用函数：计算重构误差
def compute_reconstruction_error(model, data_loader, device):
    model.eval()
    all_errors = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            
            # 计算每个样本的MSE重构误差
            errors = torch.mean((outputs - inputs) ** 2, dim=1).cpu().numpy()
            all_errors.extend(errors)
            all_labels.extend(labels.numpy())
    
    return np.array(all_errors), np.array(all_labels)

In [None]:
# 加载数据函数
def load_cicids_data(data_dir, first_day_name, second_day_name, window_size=10000, step_size=1000):
    """
    加载CICIDS数据集的两天数据
    
    参数:
    - data_dir: 包含处理后窗口数据的目录
    - first_day_name: 第一天的数据目录名
    - second_day_name: 第二天的数据目录名
    
    返回:
    - train_data: 第一天的窗口特征数据
    - train_labels: 第一天的窗口标签（0=正常，1=异常）
    - test_data: 第二天的窗口特征数据
    - test_labels: 第二天的窗口标签（0=正常，1=异常）
    - feature_names: 特征名称列表
    """
    # 加载第一天数据
    day1_dir = os.path.join(data_dir, first_day_name)
    X_train = np.load(os.path.join(day1_dir, f'X_windows_w{window_size}_s{step_size}.npy'))
    
    # 加载元数据以获取特征名称和窗口标签信息
    with open(os.path.join(day1_dir, f'metadata_w{window_size}_s{step_size}.pkl'), 'rb') as f:
        day1_metadata = pickle.load(f)
    
    # 获取特征名称
    feature_names = day1_metadata['feature_names']
    
    # 从元数据中提取窗口级别的标签（0=正常，1=异常）
    train_labels = np.array([m['is_malicious'] for m in day1_metadata['window_metadata']])
    
    # 加载第二天数据
    day2_dir = os.path.join(data_dir, second_day_name)
    X_test = np.load(os.path.join(day2_dir, f'X_windows_w{window_size}_s{step_size}.npy'))
    
    with open(os.path.join(day2_dir, f'metadata_w{window_size}_s{step_size}.pkl'), 'rb') as f:
        day2_metadata = pickle.load(f)
    
    # 从元数据中提取窗口级别的标签
    test_labels = np.array([m['is_malicious'] for m in day2_metadata['window_metadata']])
    
    print(f"加载完成: {first_day_name} 包含 {len(train_labels)} 个窗口, {second_day_name} 包含 {len(test_labels)} 个窗口")
    print(f"特征维度: {X_train.shape}")
    
    return X_train, train_labels, X_test, test_labels, feature_names


In [None]:
# 预处理数据函数
def preprocess_data(X_train, y_train, X_test, y_test):
    """
    对窗口数据进行预处理，包括重塑和标准化
    
    参数:
    - X_train: 训练窗口数据 [n_windows, window_size, n_features]
    - y_train: 训练标签
    - X_test: 测试窗口数据 [n_windows, window_size, n_features]
    - y_test: 测试标签
    
    返回:
    - 预处理后的数据
    """
    n_train_windows, window_size, n_features = X_train.shape
    n_test_windows = X_test.shape[0]
    
    # 计算每个窗口的特征均值，简化为[n_windows, n_features]形状
    # 这是一个简单的方法来处理窗口中的时间序列数据
    X_train_mean = np.mean(X_train, axis=1)
    X_test_mean = np.mean(X_test, axis=1)
    
    # 标准化
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_mean)
    X_test_scaled = scaler.transform(X_test_mean)
    
    print(f"预处理后形状: 训练集 {X_train_scaled.shape}, 测试集 {X_test_scaled.shape}")
    
    return X_train_scaled, y_train, X_test_scaled, y_test, scaler

In [None]:
# 训练函数
def train_model(model, train_loader, val_loader, epochs, learning_rate, device):
    """
    训练自编码器模型
    
    参数:
    - model: 自编码器模型
    - train_loader: 训练数据加载器
    - val_loader: 验证数据加载器
    - epochs: 训练轮数
    - learning_rate: 学习率
    - device: 训练设备
    
    返回:
    - 训练历史
    """
    # 损失函数为MSE均方损失函数
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5, verbose=True)
    
    history = {
        'train_loss': [],
        'val_loss': []
    }
    
    # 获取只包含正常样本的训练加载器
    benign_train_loader = train_loader
    
    for epoch in range(epochs):
        # 训练阶段
        model.train()
        train_loss = 0
        
        for data, _ in tqdm(benign_train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            data = data.to(device)
            
            # 前向传播
            outputs = model(data)
            loss = criterion(outputs, data)
            
            # 反向传播和优化
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        train_loss /= len(benign_train_loader)
        
        # 验证阶段
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for data, _ in val_loader:
                data = data.to(device)
                outputs = model(data)
                loss = criterion(outputs, data)
                val_loss += loss.item()
        
        val_loss /= len(val_loader)
        
        # 更新学习率
        scheduler.step(val_loss)
        
        # 记录历史
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        
        # 打印进度
        print(f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")
        
    return history



In [None]:
# 评估函数
def evaluate_model(model, test_loader, y_test, device):
    """
    评估模型性能
    
    参数:
    - model: 训练好的自编码器模型
    - test_loader: 测试数据加载器
    - y_test: 测试标签
    - device: 设备
    """
    # 计算重构误差
    reconstruction_errors, _ = compute_reconstruction_error(model, test_loader, device)
    
    # 使用重构误差作为异常分数
    y_score = reconstruction_errors
    
    # 计算ROC和PR曲线
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    pr_auc = average_precision_score(y_test, y_score)
    
    # 绘制ROC曲线
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, 'b', label=f'AUC = {roc_auc:.2f}')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    
    # 绘制PR曲线
    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, 'g', label=f'AP = {pr_auc:.2f}')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')
    
    plt.tight_layout()
    plt.savefig('evaluation_curves.png')
    plt.show()
    
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    print(f"PR-AUC Score: {pr_auc:.4f}")
    
    # 找出最佳阈值（使用F1分数）
    thresholds = np.linspace(min(y_score), max(y_score), 100)
    best_f1 = 0
    best_threshold = 0
    
    for threshold in thresholds:
        y_pred = (y_score >= threshold).astype(int)
        TP = np.sum((y_pred == 1) & (y_test == 1))
        FP = np.sum((y_pred == 1) & (y_test == 0))
        FN = np.sum((y_pred == 0) & (y_test == 1))
        
        precision = TP / (TP + FP) if TP + FP > 0 else 0
        recall = TP / (TP + FN) if TP + FN > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    
    # 使用最佳阈值进行预测
    y_pred = (y_score >= best_threshold).astype(int)
    
    # 计算混淆矩阵
    TP = np.sum((y_pred == 1) & (y_test == 1))
    TN = np.sum((y_pred == 0) & (y_test == 0))
    FP = np.sum((y_pred == 1) & (y_test == 0))
    FN = np.sum((y_pred == 0) & (y_test == 1))
    
    # 计算各种评估指标
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    
    print(f"\n最佳阈值: {best_threshold:.6f}")
    print(f"准确率: {accuracy:.4f}")
    print(f"精确率: {precision:.4f}")
    print(f"召回率: {recall:.4f}")
    print(f"F1分数: {f1:.4f}")
    
    # 分析错误分类的窗口
    error_indices = np.where(y_pred != y_test)[0]
    print(f"\n错误分类的窗口数量: {len(error_indices)}")
    
    if len(error_indices) > 0:
        # 显示一些错误分类窗口的重构误差
        print("\n错误分类窗口的重构误差:")
        for i in range(min(5, len(error_indices))):
            idx = error_indices[i]
            print(f"窗口 {idx}: 真实标签 = {y_test[idx]}, 预测标签 = {y_pred[idx]}, 重构误差 = {y_score[idx]:.6f}")
    
    # 绘制重构误差分布
    plt.figure(figsize=(10, 6))
    
    benign_errors = y_score[y_test == 0]
    malicious_errors = y_score[y_test == 1]
    
    plt.hist(benign_errors, bins=50, alpha=0.7, label='正常', density=True)
    plt.hist(malicious_errors, bins=50, alpha=0.7, label='异常', density=True)
    plt.axvline(best_threshold, color='r', linestyle='--', label=f'阈值 = {best_threshold:.4f}')
    
    plt.xlabel('重构误差')
    plt.ylabel('密度')
    plt.title('重构误差分布')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('reconstruction_error_distribution.png')
    plt.show()
    
    return y_score, best_threshold


In [None]:
# 数据目录设置
data_dir = r"cicids2017/flow_windows_w10000_s1000"  # 修改为你的数据目录
first_day = "Monday-WorkingHours"
second_day = "Tuesday-WorkingHours"
window_size = 10000
step_size = 1000

In [None]:
# 加载数据
X_train, y_train, X_test, y_test, feature_names = load_cicids_data(
    data_dir, first_day, second_day, 
    window_size=window_size, step_size=step_size
)
    
# 数据预处理
X_train_processed, y_train, X_test_processed, y_test, scaler = preprocess_data(
    X_train, y_train, X_test, y_test
)

In [None]:
# 分离正常样本进行训练 (仅使用正常样本进行训练)
benign_indices = np.where(y_train == 0)[0]
X_train_benign = X_train_processed[benign_indices]
    

In [None]:
# 准备数据加载器
# 训练集（只使用正常样本）
train_dataset = TensorDataset(torch.FloatTensor(X_train_benign), 
                                  torch.FloatTensor(np.zeros(len(X_train_benign))))
    

In [None]:
# 将训练集拆分为训练和验证集
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [None]:
# 测试集（包含正常和异常样本）
test_dataset = TensorDataset(torch.FloatTensor(X_test_processed), 
                                torch.FloatTensor(y_test))
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# 初始化模型
input_dim = X_train_processed.shape[1]
latent_dim = 16
model = AutoEncoder(input_dim=input_dim, latent_dim=latent_dim).to(DEVICE)
print(model)
    
# 训练模型
history = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    device=DEVICE
)
    
# 绘制损失曲线
plt.figure(figsize=(10, 6))
plt.plot(history['train_loss'], label='训练损失')
plt.plot(history['val_loss'], label='验证损失')
plt.xlabel('Epoch')
plt.ylabel('MSE损失')
plt.title('训练和验证损失')
plt.legend()
plt.grid(True, alpha=0.3)
plt.savefig('training_history.png')
plt.show()
    
# 评估模型
y_score, threshold = evaluate_model(model, test_loader, y_test, DEVICE)
    
# 保存模型和阈值
torch.save({
    'model_state_dict': model.state_dict(),
    'input_dim': input_dim,
    'latent_dim': latent_dim,
    'threshold': threshold,
    'scaler': scaler
}, 'cicids_autoencoder.pth')
    
print("模型已保存至 'cicids_autoencoder.pth'")

In [32]:
def main():
    # 数据目录设置
    data_dir = "cicids2017/flow_windows_w10000_s1000"  # 修改为你的数据目录
    first_day = "Monday-WorkingHours"
    second_day = "Tuesday-WorkingHours"
    window_size = 1000
    step_size = 100
    
    # 加载数据
    X_train, y_train, X_test, y_test, feature_names = load_cicids_data(
        data_dir, first_day, second_day, 
        window_size=window_size, step_size=step_size
    )
    
    # 数据预处理
    # X_train_processed, y_train, X_test_processed, y_test, scaler = preprocess_data(
    #     X_train, y_train, X_test, y_test
    # )
    
    # 分离正常样本进行训练 (仅使用正常样本进行训练)
    benign_indices = np.where(y_train == 0)[0]
    X_train_benign = X_train[benign_indices]
    
    # 准备数据加载器
    # 训练集（只使用正常样本）
    train_dataset = TensorDataset(torch.FloatTensor(X_train_benign), 
                                  torch.FloatTensor(np.zeros(len(X_train_benign))))
    
    # 将训练集拆分为训练和验证集
    train_size = int(0.8 * len(train_dataset))
    val_size = len(train_dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
    
    # 测试集（包含正常和异常样本）
    test_dataset = TensorDataset(torch.FloatTensor(X_test), 
                                torch.FloatTensor(y_test))
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
    
    # 初始化模型
    input_dim = X_train.shape[1]
    latent_dim = 16
    model = AutoEncoder(input_dim=input_dim, latent_dim=latent_dim).to(DEVICE)
    print(model)
    
    # 训练模型
    history = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        device=DEVICE
    )
    
    # 绘制损失曲线
    plt.figure(figsize=(10, 6))
    plt.plot(history['train_loss'], label='训练损失')
    plt.plot(history['val_loss'], label='验证损失')
    plt.xlabel('Epoch')
    plt.ylabel('MSE损失')
    plt.title('训练和验证损失')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('training_history.png')
    plt.show()
    
    # 评估模型
    y_score, threshold = evaluate_model(model, test_loader, y_test, DEVICE)
    
    # 保存模型和阈值
    torch.save({
        'model_state_dict': model.state_dict(),
        'input_dim': input_dim,
        'latent_dim': latent_dim,
        'threshold': threshold,
        'scaler': scaler
    }, 'cicids_autoencoder.pth')
    
    print("模型已保存至 'cicids_autoencoder.pth'")


if __name__ == "__main__":
    main()

加载完成: Monday-WorkingHours 包含 5017 个窗口, Tuesday-WorkingHours 包含 4207 个窗口
特征维度: (5017, 1000, 68)
AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=1000, out_features=256, bias=True)
    (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.2, inplace=False)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.2, inplace=False)
    (8): Linear(in_features=128, out_features=64, bias=True)
    (9): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Linear(in_features=64, out_features=16, bias=True)
    (12): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): BatchNorm1d(64, eps

Epoch 1/100:   0%|          | 0/63 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64000x68 and 1000x256)