In [None]:
# CTDAPD网络攻击检测模型比较实验 (Colab版本)

本notebook包含了在CTDAPD数据集上进行的网络攻击检测模型比较实验。

## 实验流程
1. 检查GPU可用性
2. 安装必要的包
3. 导入数据
4. 训练和评估模型

## 使用说明
1. 点击"运行时" -> "更改运行时类型" -> 选择"GPU"
2. 按顺序运行所有单元格
3. 观察不同模型的性能比较


In [None]:
# 检查GPU是否可用
import torch
print("PyTorch版本:", torch.__version__)
print("CUDA是否可用:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU型号:", torch.cuda.get_device_name(0))
    print("GPU数量:", torch.cuda.device_count())

# 安装必要的包
!pip install -q pandas numpy scikit-learn xgboost lightgbm imbalanced-learn


In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import time

# Seed NumPy's global generator and PyTorch with the same fixed value
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)


In [None]:
# CNN model definition
class CNNModel(torch.nn.Module):
    """1-D convolutional binary classifier over flat feature vectors.

    The input is a 2-D tensor ``[batch_size, features]``; ``forward`` adds a
    single channel dimension, applies two Conv1d -> BatchNorm1d -> ReLU ->
    MaxPool1d blocks, flattens, and passes the result through three linear
    layers. The output is a ``[batch_size, 1]`` tensor of sigmoid activations.

    Args:
        input_size: Number of features per sample. Must be at least 4,
            because the two stride-2 poolings keep ``input_size // 4``
            positions per channel.

    Raises:
        ValueError: If ``input_size`` is smaller than 4.
    """

    def __init__(self, input_size):
        super().__init__()

        # With fewer than 4 features the flattened size below would be 0,
        # producing a zero-width linear layer and a failure only at forward time.
        if input_size < 4:
            raise ValueError(
                f"input_size 必须至少为 4（两次池化后需保留至少一个特征），当前为 {input_size}"
            )

        # First convolution block
        self.conv1 = torch.nn.Conv1d(1, 32, kernel_size=3, padding=1)
        self.bn1 = torch.nn.BatchNorm1d(32)

        # Second convolution block
        self.conv2 = torch.nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.bn2 = torch.nn.BatchNorm1d(64)

        # Input width of the first linear layer: 64 channels times the
        # feature length after it has been halved (floor) by each of the two poolings.
        self.feature_size = input_size
        self.fc_input_size = 64 * (self.feature_size // 4)

        # Fully connected head
        self.fc1 = torch.nn.Linear(self.fc_input_size, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, 1)

        # Dropout shared by both places it is applied in forward()
        self.dropout = torch.nn.Dropout(0.3)

        # Pooling shared by both convolution blocks
        self.pool = torch.nn.MaxPool1d(kernel_size=2, stride=2)

    def forward(self, x):
        """Return sigmoid outputs of shape ``[batch_size, 1]`` for input ``[batch_size, features]``."""
        # Add the channel dimension expected by Conv1d: [batch_size, 1, features]
        x = x.unsqueeze(1)

        # First convolution block
        x = self.pool(torch.relu(self.bn1(self.conv1(x))))

        # Second convolution block
        x = self.pool(torch.relu(self.bn2(self.conv2(x))))

        # Flatten everything except the batch dimension
        x = torch.flatten(x, 1)

        # Fully connected head, with dropout before each of the first two layers
        x = torch.relu(self.fc1(self.dropout(x)))
        x = torch.relu(self.fc2(self.dropout(x)))
        return torch.sigmoid(self.fc3(x))

# Data preparation helper (defined once; the earlier duplicate definition was removed)
def prepare_data_for_cnn(X):
    """Convert tabular input to a float32 tensor.

    Args:
        X: A pandas DataFrame or Series, a NumPy array, or a nested sequence
            of numbers.

    Returns:
        A new ``torch.float32`` tensor holding a copy of the values of ``X``.
    """
    # np.asarray takes the underlying values positionally, so a pandas object
    # with a non-default index converts the same way as a plain array.
    return torch.tensor(np.asarray(X), dtype=torch.float32)


In [None]:
# Mount Google Drive so the dataset stored there can be read
import os

from google.colab import drive
drive.mount('/content/drive')

# Dataset location (assumes the folder has already been created in Google Drive)
DRIVE_PATH = '/content/drive/MyDrive/Cybersecurity-Analytics'
DATASET_PATH = f'{DRIVE_PATH}/CTDAPD_cleaned.csv'

# Create the folder if it is missing, then stop with a clear error if the CSV is not there
if not os.path.exists(DRIVE_PATH):
    print(f"创建目录: {DRIVE_PATH}")
    os.makedirs(DRIVE_PATH, exist_ok=True)

if not os.path.exists(DATASET_PATH):
    print(f"警告: 数据集文件不存在于 {DATASET_PATH}")
    print("请确保已将CTDAPD_cleaned.csv复制到Google Drive的Cybersecurity-Analytics文件夹中")
    raise FileNotFoundError(f"找不到数据集文件: {DATASET_PATH}")

# Read the dataset
print(f"从Google Drive读取数据集: {DATASET_PATH}")
data = pd.read_csv(DATASET_PATH)

# Show dataset information
print("\n数据集基本信息:")
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
data.info()

print("\n特征列表:")
feature_columns = [col for col in data.columns if col != 'Label']
print(feature_columns)

print("\n数据集形状:", data.shape)
print("\n类别分布:\n", data['Label'].value_counts())

# Summary statistics of the numeric columns
print("\n数值特征的基本统计信息:")
print(data.describe())


In [None]:
# Data preprocessing: split, scale, then rebalance the training portion only
print("开始数据预处理...")

# Separate features and label
X = data.drop('Label', axis=1)
y = data['Label']

# Train/test split.
# stratify=y keeps the class proportions the same in both parts; without it an
# imbalanced label can leave the test set with a distorted class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling: statistics are fitted on the training part and reused for the test part
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the classes with SMOTE (applied to the training data only; the test set is left as is)
print("\n应用SMOTE平衡数据集...")
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

print("训练集形状:", X_train_balanced.shape)
print("测试集形状:", X_test_scaled.shape)
print("\n平衡后的类别分布:\n", pd.Series(y_train_balanced).value_counts())


In [None]:
# Evaluation helper
def evaluate_model(y_true, y_pred, model_name, classifier_type, train_time, predict_time,
                   y_score=None):
    """Compute, print and return binary-classification metrics for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted hard labels.
        model_name: Display name of the model.
        classifier_type: Display name of the classifier type.
        train_time: Training time in seconds.
        predict_time: Prediction time in seconds.
        y_score: Optional continuous scores (e.g. predicted probabilities of
            the positive class) used for ROC AUC. When omitted, ROC AUC is
            computed from ``y_pred`` exactly as before; note that an AUC
            computed from hard labels only reflects a single operating point.

    Returns:
        A dict with the keys ``model_name``, ``classifier_type``, ``accuracy``,
        ``precision``, ``recall``, ``f1``, ``auc_roc``, ``train_time``,
        ``predict_time`` and ``confusion_matrix``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    # Prefer continuous scores for the ROC curve when the caller supplies them
    auc_roc = roc_auc_score(y_true, y_pred if y_score is None else y_score)

    print(f"\n{model_name} 结果:")
    print(f"分类器类型: {classifier_type}")
    print(f"准确率: {accuracy:.4f}")
    print(f"精确率: {precision:.4f}")
    print(f"召回率: {recall:.4f}")
    print(f"F1分数: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")
    print(f"训练时间: {train_time:.2f}s")
    print(f"预测时间: {predict_time:.2f}s")

    print("\n分类报告:")
    print(classification_report(y_true, y_pred))

    print("混淆矩阵:")
    conf_matrix = confusion_matrix(y_true, y_pred)
    # NOTE(review): confusion_matrix orders rows/columns by sorted label value,
    # so the names below assume the smaller label means "attack" and the larger
    # one "normal" — confirm against the dataset's label encoding.
    print(pd.DataFrame(conf_matrix,
                      columns=['预测:攻击', '预测:正常'],
                      index=['实际:攻击', '实际:正常']))

    return {
        'model_name': model_name,
        'classifier_type': classifier_type,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc_roc': auc_roc,
        'train_time': train_time,
        'predict_time': predict_time,
        'confusion_matrix': conf_matrix
    }


In [None]:
# Fit every classical classifier on the balanced training set and score it on the test set
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Neural Network': MLPClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42)
}

results = []

for name, model in models.items():
    print(f"\n训练 {name}...")

    # Wall-clock time spent fitting
    tic = time.time()
    model.fit(X_train_balanced, y_train_balanced)
    train_time = time.time() - tic

    # Wall-clock time spent predicting the test set
    tic = time.time()
    y_pred = model.predict(X_test_scaled)
    predict_time = time.time() - tic

    # Collect the metrics of this model
    result = evaluate_model(
        y_test,
        y_pred,
        name,
        type(model).__name__,
        train_time,
        predict_time,
    )
    results.append(result)

# Pick the result with the highest F1 score (the first one wins on ties) and summarise it
best_model = max(results, key=lambda res: res['f1'])
print(
    "\n最佳模型性能总结",
    "-" * 50,
    f"模型: {best_model['model_name']}",
    f"分类器类型: {best_model['classifier_type']}",
    f"准确率: {best_model['accuracy']:.4f}",
    f"F1分数: {best_model['f1']:.4f}",
    f"训练时间: {best_model['train_time']:.2f}s",
    sep="\n",
)


In [None]:
# Train and evaluate the CNN model
print("\n评估CNN模型...")
print(f"训练数据形状: {X_train_balanced.shape}")
print(f"测试数据形状: {X_test_scaled.shape}")

# Use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

# Prepare the tensors
print("准备数据...")
X_train_cnn = prepare_data_for_cnn(X_train_balanced)
X_test_cnn = prepare_data_for_cnn(X_test_scaled)
# Labels are converted through NumPy: y_test is a pandas Series coming out of
# train_test_split, so its index is no longer 0..n-1. Taking the values
# positionally avoids depending on how torch iterates a Series.
y_train_tensor = torch.as_tensor(np.asarray(y_train_balanced), dtype=torch.float32)
y_test_tensor = torch.as_tensor(np.asarray(y_test), dtype=torch.float32)

# Move the data to the selected device
X_train_cnn = X_train_cnn.to(device)
X_test_cnn = X_test_cnn.to(device)
y_train_tensor = y_train_tensor.to(device)
y_test_tensor = y_test_tensor.to(device)

print(f"CNN输入数据形状: {X_train_cnn.shape}")

# Build the data loaders.
# Validation data is held out from the TRAINING data. Using the test set to
# drive the LR schedule, early stopping and checkpoint selection would leak
# test information into the model that is later scored on that same set.
print("创建数据加载器...")
VAL_FRACTION = 0.1
full_train_dataset = torch.utils.data.TensorDataset(X_train_cnn, y_train_tensor)
n_val = max(1, int(len(full_train_dataset) * VAL_FRACTION))
n_train = len(full_train_dataset) - n_val
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(42),  # fixed split across runs
)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)

# Create the model, loss, optimizer and LR scheduler
print("初始化CNN模型...")
model = CNNModel(X_train_balanced.shape[1])
model = model.to(device)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# `verbose=True` is deprecated in recent PyTorch releases; LR changes are logged manually below.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Training loop with early stopping
print("开始训练...")
NUM_EPOCHS = 50
CHECKPOINT_PATH = 'best_model.pth'
start_time = time.time()
best_val_loss = float('inf')
patience = 10
patience_counter = 0

for epoch in range(NUM_EPOCHS):
    # Training phase
    model.train()
    total_train_loss = 0
    correct_train = 0
    total_train = 0

    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    for batch_idx, (batch_X, batch_y) in enumerate(train_loader):
        optimizer.zero_grad()
        # squeeze(1) drops only the output-feature axis. A bare squeeze() would
        # also drop the batch axis for a final batch of size 1 and make the
        # output shape differ from the target shape.
        outputs = model(batch_X).squeeze(1)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

        # Training accuracy at the 0.5 threshold
        predicted = (outputs > 0.5).float()
        correct_train += (predicted == batch_y).sum().item()
        total_train += batch_y.size(0)

        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx}/{len(train_loader)}, "
                  f"Loss: {loss.item():.4f}, "
                  f"Progress: {batch_idx/len(train_loader)*100:.1f}%")

    # Validation phase
    model.eval()
    total_val_loss = 0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            outputs = model(batch_X).squeeze(1)
            loss = criterion(outputs, batch_y)
            total_val_loss += loss.item()

            # Validation accuracy at the 0.5 threshold
            predicted = (outputs > 0.5).float()
            correct_val += (predicted == batch_y).sum().item()
            total_val += batch_y.size(0)

    # Average the per-batch losses and compute accuracies
    avg_train_loss = total_train_loss / len(train_loader)
    avg_val_loss = total_val_loss / len(val_loader)
    train_accuracy = correct_train / total_train
    val_accuracy = correct_val / total_val

    print(f"Epoch {epoch+1} 结果:")
    print(f"训练损失: {avg_train_loss:.4f}, 训练准确率: {train_accuracy:.4f}")
    print(f"验证损失: {avg_val_loss:.4f}, 验证准确率: {val_accuracy:.4f}")

    # Learning-rate scheduling; report a reduction when it happens
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(avg_val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after < lr_before:
        print(f"学习率调整: {lr_before:.6f} -> {lr_after:.6f}")

    # Early-stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Keep the weights of the best epoch so far
        torch.save(model.state_dict(), CHECKPOINT_PATH)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

train_time = time.time() - start_time

# Evaluate the best checkpoint rather than whatever the last epoch left behind.
# The guard makes sure the file was written during this run.
if best_val_loss < float('inf'):
    model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

# Predict on the test set
print("\n开始预测...")
start_time = time.time()
model.eval()
with torch.no_grad():
    y_pred = model(X_test_cnn).squeeze(1).cpu().numpy()
y_pred_binary = (y_pred > 0.5).astype(int)
predict_time = time.time() - start_time

# Evaluate the CNN
cnn_result = evaluate_model(y_test, y_pred_binary, 'CNN', 'Deep Learning',
                          train_time, predict_time)

# Compare the CNN with the best classical model
print("\nCNN vs 最佳传统模型比较")
print("-" * 50)
print(f"CNN - F1分数: {cnn_result['f1']:.4f}, 训练时间: {cnn_result['train_time']:.2f}s")
print(f"最佳传统模型 ({best_model['model_name']}) - F1分数: {best_model['f1']:.4f}, "
      f"训练时间: {best_model['train_time']:.2f}s")
