In [None]:
import pandas as pd
import numpy as np
import os

# 实验配置
print("="*80)
print("实验配置")
print("="*80)

# 采样方法列表
sampling_methods = ['train_TLink_RUS', 'train_CBMP', 'train_SMOTE']

# 定义特征列：Timestamp, Protocol, TTL, Length, SYN, ACK, RST, PSH, FIN
feature_cols = ['Timestamp', 'TTL', 'Length', 'SYN', 'ACK', 'RST', 'PSH', 'FIN',
                'Protocol_DNS', 'Protocol_Generic Routing Encapsulation', 
                'Protocol_HTTP', 'Protocol_ICMP', 'Protocol_ICMP,ICMP', 
                'Protocol_ICMP,TCP', 'Protocol_ICMP,UDP', 'Protocol_IGMP', 
                'Protocol_IPv6', 'Protocol_TCP', 'Protocol_UDP']

label_col = 'Label_code'

# 决策树配置
dt_configs = [
    {'max_depth': 5, 'criterion': 'entropy', 'name': 'DT(5,entropy)'},
    {'max_depth': 5, 'criterion': 'gini', 'name': 'DT(5,gini)'},
    {'max_depth': 7, 'criterion': 'entropy', 'name': 'DT(7,entropy)'},
    {'max_depth': 7, 'criterion': 'gini', 'name': 'DT(7,gini)'}
]
    
# 存储所有结果
all_results = []

print(f"\n采样方法: {sampling_methods}")
print(f"决策树配置数量: {len(dt_configs)}")
print("\n配置完成！")

实验配置
测试集路径: data\splitted\test.csv
测试集形状: (3018445, 20)
测试集类别分布:
Label_code
3    2150710
4     427052
5     234513
2      87393
0      71921
1      46856
Name: count, dtype: int64

采样方法: ['train_TLink_RUS', 'train_CBMP', 'train_SMOTE']
决策树配置数量: 4

配置完成！


In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import label_binarize
import time
import warnings
warnings.filterwarnings('ignore')

# 查看数据文件
data_dir = os.path.join('data','resampled')

# 列出所有文件
files = os.listdir(data_dir)
print("数据文件列表:")
for f in files:
    print(f"  - {f}")
    
# 查看类别分布
print("检查类别分布：")
for method in sampling_methods:
    filepath = os.path.join(data_dir, f'{method}.csv')
    df = pd.read_csv(filepath)
    print(f"\n{method}:")
    print(df['Label_code'].value_counts().sort_index())

数据文件列表:
  - train_CBMP.csv
  - train_SMOTE.csv
  - train_TLink_RUS.csv
检查类别分布：

train_TLink_RUS:
Label_code
0    20081
1    20081
2    20081
3    20081
4    20081
5    20081
Name: count, dtype: int64

train_CBMP:
Label_code
0     30824
1     20081
2     37454
3    743762
4    183022
5    100505
Name: count, dtype: int64

train_SMOTE:
Label_code
0    921732
1    921732
2    921732
3    921732
4    921732
5    921732
Name: count, dtype: int64


In [24]:
# 加载测试集
print("="*80)
print("加载测试集")
print("="*80)

# 测试集文件路径
test_file_path = os.path.join('data', 'splitted', 'test.csv')
print(f"测试集路径: {test_file_path}")

if not os.path.exists(test_file_path):
    print(f"\n⚠️  测试集文件不存在: {test_file_path}")
    print("请提供正确的测试集路径")
else:
    test_df = pd.read_csv(test_file_path)
    X_test = test_df[feature_cols].values
    y_test = test_df[label_col].values
    
    print(f"\n✓ 测试集加载成功")
    print(f"  形状: {X_test.shape}")
    print(f"  特征数: {X_test.shape[1]}")
    print(f"  样本数: {X_test.shape[0]}")
    print(f"\n类别分布:")
    print(pd.Series(y_test).value_counts().sort_index())

加载测试集
测试集路径: data\splitted\test.csv

✓ 测试集加载成功
  形状: (3018445, 19)
  特征数: 19
  样本数: 3018445

类别分布:
0      71921
1      46856
2      87393
3    2150710
4     427052
5     234513
Name: count, dtype: int64


In [None]:
# 定义模型训练和评估函数
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    """训练模型并评估性能"""
    results = {}
    
    # 训练时间
    start_train = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_train
    
    # 测试时间
    start_test = time.time()
    y_pred = model.predict(X_test)
    test_time = time.time() - start_test
    
    # 计算分类指标
    # Micro-averaged F1 Score (等同于Accuracy)
    f1_micro = f1_score(y_test, y_pred, average='micro')
    
    # AUC (需要one-vs-rest策略)
    n_classes = len(np.unique(y_train))
    y_test_bin = label_binarize(y_test, classes=range(n_classes))
    
    # 获取预测概率
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)
        auc = roc_auc_score(y_test_bin, y_score, average='micro', multi_class='ovr')
    else:
        auc = None
    
    # 计算DR (Detection Rate) - TPR for anomaly classes
    cm = confusion_matrix(y_test, y_pred)
    
    results = {
        'model_name': model_name,
        'train_time': train_time,
        'test_time': test_time,
        'f1_micro': f1_micro,
        'auc': auc,
        'confusion_matrix': cm
    }
    
    return results, model

# 计算AIC
def calculate_aic(model, X_test, y_test):
    """计算Akaike Information Criterion"""
    from sklearn.metrics import mean_squared_error
    
    n_samples = X_test.shape[0]
    y_pred = model.predict(X_test)
    
    # 计算MSE
    mse = mean_squared_error(y_test, y_pred)
    
    # 估计模型参数数量
    if hasattr(model, 'tree_'):
        # 决策树：节点数量可作为复杂度估计
        k = model.tree_.node_count
    elif hasattr(model, 'coef_'):
        # 线性模型：参数数量
        k = np.prod(model.coef_.shape)
    elif hasattr(model, 'coefs_'):
        # 神经网络：所有权重数量
        k = sum([np.prod(coef.shape) for coef in model.coefs_])
    else:
        k = 0
    
    # 计算log-likelihood (假设高斯误差)
    log_likelihood = -n_samples/2 * np.log(2*np.pi*mse) - n_samples/2
    
    # AIC = 2k - 2ln(L)
    aic = 2*k - 2*log_likelihood
    
    return aic

In [9]:
# 训练第一个采样方法的所有模型
sampling_method = 'train_TLink_RUS'

print("="*80)
print(f"采样方法: {sampling_method}")
print("="*80)

# 加载训练数据
train_path = os.path.join(data_dir, f'{sampling_method}.csv')
train_df = pd.read_csv(train_path)
X_train = train_df[feature_cols].values
y_train = train_df[label_col].values

print(f"\n训练集大小: {X_train.shape}")
print(f"类别分布:\n{pd.Series(y_train).value_counts().sort_index()}\n")

采样方法: train_TLink_RUS

训练集大小: (120486, 19)
类别分布:
0    20081
1    20081
2    20081
3    20081
4    20081
5    20081
Name: count, dtype: int64



In [25]:
# CELL5: 训练所有决策树配置并评估
print("="*80)
print("训练决策树模型")
print("="*80)

results = []

for config in dt_configs:
    print(f"\n训练 {config['name']}...")
    print("-" * 40)
    
    # 初始化模型
    dt_model = DecisionTreeClassifier(
        max_depth=config['max_depth'],
        criterion=config['criterion'],
        random_state=42
    )
    
    # 训练
    start_time = time.time()
    dt_model.fit(X_train, y_train)
    train_time = time.time() - start_time
    
    # 测试
    start_time = time.time()
    y_pred = dt_model.predict(X_test)
    test_time = time.time() - start_time
    
    # 计算性能指标
    f1_micro = f1_score(y_test, y_pred, average='micro')
    
    # 计算 AUC（multi-class）
    y_test_bin = label_binarize(y_test, classes=np.unique(y_train))
    y_pred_proba = dt_model.predict_proba(X_test)
    auc = roc_auc_score(y_test_bin, y_pred_proba, average='macro')
    
    # 检查是否满足 DR >= 97.5% 的要求
    meets_threshold = f1_micro >= 0.975
    
    # 保存结果
    result = {
        'sampling_method': sampling_method,
        'model_name': config['name'],
        'max_depth': config['max_depth'],
        'criterion': config['criterion'],
        'train_time': train_time,
        'test_time': test_time,
        'f1_score': f1_micro,
        'auc': auc,
        'meets_threshold': meets_threshold,
        'model': dt_model  # 保存模型对象用于后续规则生成
    }
    results.append(result)
    
    # 打印结果
    print(f"  训练时间: {train_time:.4f}s")
    print(f"  测试时间: {test_time:.4f}s")
    print(f"  F1 Score: {f1_micro:.4f} ({f1_micro*100:.2f}%)")
    print(f"  AUC: {auc:.4f}")
    print(f"  满足 DR>=97.5%: {'是' if meets_threshold else '否'}")

print("\n" + "="*80)
print("训练完成汇总")
print("="*80)

# 创建结果 DataFrame
results_df = pd.DataFrame([{
    'Model': r['model_name'],
    'Train Time (s)': f"{r['train_time']:.4f}",
    'Test Time (s)': f"{r['test_time']:.4f}",
    'F1 Score': f"{r['f1_score']:.4f}",
    'AUC': f"{r['auc']:.4f}",
    'Meets DR>=97.5%': '✓' if r['meets_threshold'] else '✗'
} for r in results])

print(results_df.to_string(index=False))

训练决策树模型

训练 DT(5,entropy)...
----------------------------------------
  训练时间: 2.3145s
  测试时间: 0.5073s
  F1 Score: 0.9934 (99.34%)
  AUC: 0.9995
  满足 DR>=97.5%: 是

训练 DT(5,gini)...
----------------------------------------
  训练时间: 1.8408s
  测试时间: 0.3395s
  F1 Score: 0.9936 (99.36%)
  AUC: 0.9994
  满足 DR>=97.5%: 是

训练 DT(7,entropy)...
----------------------------------------
  训练时间: 2.3130s
  测试时间: 0.3028s
  F1 Score: 0.9974 (99.74%)
  AUC: 0.9998
  满足 DR>=97.5%: 是

训练 DT(7,gini)...
----------------------------------------
  训练时间: 1.9325s
  测试时间: 0.3519s
  F1 Score: 0.9972 (99.72%)
  AUC: 0.9997
  满足 DR>=97.5%: 是

训练完成汇总
        Model Train Time (s) Test Time (s) F1 Score    AUC Meets DR>=97.5%
DT(5,entropy)         2.3145        0.5073   0.9934 0.9995               ✓
   DT(5,gini)         1.8408        0.3395   0.9936 0.9994               ✓
DT(7,entropy)         2.3130        0.3028   0.9974 0.9998               ✓
   DT(7,gini)         1.9325        0.3519   0.9972 0.9997               ✓

In [26]:
# CELL6: 从决策树提取路径并生成规则

from sklearn.tree import _tree

def extract_rules_from_dt(dt_model, feature_names, class_names):
    """
    从决策树中提取所有决策路径
    """
    tree = dt_model.tree_
    rules = []
    
    def recurse(node, path):
        if tree.feature[node] != _tree.TREE_UNDEFINED:  # 非叶子节点
            feature_name = feature_names[tree.feature[node]]
            threshold = tree.threshold[node]
            
            # 左子树 (<=)
            left_path = path + [(feature_name, '<=', threshold)]
            recurse(tree.children_left[node], left_path)
            
            # 右子树 (>)
            right_path = path + [(feature_name, '>', threshold)]
            recurse(tree.children_right[node], right_path)
        else:
            # 叶子节点
            class_id = np.argmax(tree.value[node][0])
            sample_count = int(tree.value[node][0].sum())
            rules.append({
                'path': path,
                'class': class_id,
                'class_name': class_names[class_id],
                'samples': sample_count
            })
    
    recurse(0, [])
    return rules

# 选择最优模型
best_model_result = max(results, key=lambda x: x['f1_score'])
best_model = best_model_result['model']
best_model_name = best_model_result['model_name']

print("="*80)
print(f"使用最优模型生成规则: {best_model_name}")
print(f"F1 Score: {best_model_result['f1_score']:.4f}")
print("="*80)

# 使用正确的类别映射
class_names = {
    0: 'DNS-flood',
    1: 'HTTP-flood', 
    2: 'ICMP-flood',
    3: 'Legitimate',
    4: 'TCPSYN-flood',
    5: 'UDP-flood'
}

# 提取决策规则
all_rules = extract_rules_from_dt(best_model, feature_cols, class_names)

print(f"\n决策树总共有 {len(all_rules)} 个叶子节点（决策路径）")

# 只保留攻击类别的规则（排除 Legitimate = 3）
attack_rules = [r for r in all_rules if r['class'] != 3]

print(f"攻击类别的规则数: {len(attack_rules)}")

# 按攻击类型统计
from collections import Counter
attack_type_counts = Counter([r['class_name'] for r in attack_rules])

print("\n各攻击类型的规则数:")
for attack_type, count in sorted(attack_type_counts.items()):
    print(f"  {attack_type}: {count} 条规则")

# 显示前3条规则示例
print("\n" + "="*80)
print("规则示例（前3条）")
print("="*80)

for i, rule in enumerate(attack_rules[:3], 1):
    print(f"\n规则 {i} - {rule['class_name']} (样本数: {rule['samples']})")
    print("条件:")
    for feature, op, threshold in rule['path']:
        print(f"  {feature} {op} {threshold:.2f}")

使用最优模型生成规则: DT(7,entropy)
F1 Score: 0.9974

决策树总共有 25 个叶子节点（决策路径）
攻击类别的规则数: 10

各攻击类型的规则数:
  DNS-flood: 1 条规则
  HTTP-flood: 2 条规则
  ICMP-flood: 1 条规则
  TCPSYN-flood: 4 条规则
  UDP-flood: 2 条规则

规则示例（前3条）

规则 1 - ICMP-flood (样本数: 1)
条件:
  SYN <= 0.50
  Length <= 64.50
  Protocol_UDP <= 0.50
  Protocol_ICMP > 0.50
  Timestamp <= 123.44

规则 2 - UDP-flood (样本数: 1)
条件:
  SYN <= 0.50
  Length <= 64.50
  Protocol_UDP > 0.50
  Timestamp <= 400.00
  TTL <= 0.50

规则 3 - UDP-flood (样本数: 1)
条件:
  SYN <= 0.50
  Length <= 64.50
  Protocol_UDP > 0.50
  Timestamp <= 400.00
  TTL > 0.50
  Timestamp > 80.88
  TTL > 62.50


In [27]:
# CELL7: 转换为 Suricata 规则格式

def path_to_suricata_rule(rule_info, sid):
    """
    将决策树路径转换为 Suricata 规则
    """
    path = rule_info['path']
    attack_type = rule_info['class_name']
    
    # 初始化规则组件
    protocol = 'ip'  # 默认
    options = []
    
    # 用于存储条件值
    conditions = {
        'ttl': None,
        'length': None,
        'flags': [],
        'timestamp': None
    }
    
    # 解析路径条件
    for feature, operator, threshold in path:
        # 处理 Protocol (one-hot编码)
        if feature.startswith('Protocol_'):
            proto_name = feature.replace('Protocol_', '')
            if operator == '>' and threshold > 0.5:
                # 该协议被激活
                protocol_map = {
                    'TCP': 'tcp',
                    'UDP': 'udp',
                    'ICMP': 'icmp',
                    'HTTP': 'http',
                    'DNS': 'dns'
                }
                protocol = protocol_map.get(proto_name, 'ip')
        
        # 处理 TCP Flags
        elif feature in ['SYN', 'ACK', 'RST', 'PSH', 'FIN']:
            flag_map = {'SYN': 'S', 'ACK': 'A', 'RST': 'R', 'PSH': 'P', 'FIN': 'F'}
            if operator == '>' and threshold > 0.5:
                conditions['flags'].append(flag_map[feature])
        
        # 处理 TTL
        elif feature == 'TTL':
            if operator == '<=':
                conditions['ttl'] = ('<=', int(threshold))
            else:
                conditions['ttl'] = ('>', int(threshold))
        
        # 处理 Length (需要转换为 dsize)
        elif feature == 'Length':
            # 假设最小IP+TCP头部 = 40字节
            dsize = max(0, int(threshold) - 40)
            if operator == '<=':
                conditions['length'] = ('<=', dsize)
            else:
                conditions['length'] = ('>', dsize)
        
        # 处理 Timestamp (用于 threshold)
        elif feature == 'Timestamp':
            if conditions['timestamp'] is None:
                conditions['timestamp'] = int(threshold)
            else:
                conditions['timestamp'] = max(conditions['timestamp'], int(threshold))
    
    # 构建 Suricata 规则选项
    rule_opts = [
        f'msg:"{attack_type}_attack"',
        'classtype:attempted-dos',
        f'sid:{sid}'
    ]
    
    # 添加 TTL
    if conditions['ttl']:
        op, val = conditions['ttl']
        if op == '<=':
            rule_opts.append(f'ttl:<{val+1}')
        else:
            rule_opts.append(f'ttl:>{val}')
    
    # 添加 dsize
    if conditions['length']:
        op, val = conditions['length']
        if op == '<=':
            rule_opts.append(f'dsize:<{val}')
        else:
            rule_opts.append(f'dsize:>{val}')
    
    # 添加 flags
    if conditions['flags']:
        flags_str = ','.join(sorted(set(conditions['flags'])))
        rule_opts.append(f'flags:{flags_str}')
    
    # 添加 flow (对于TCP/HTTP)
    if protocol in ['tcp', 'http']:
        rule_opts.append('flow:to_server')
    
    # 添加 threshold (使用样本数估算)
    if conditions['timestamp']:
        # 简化：使用timestamp作为时间窗口
        time_window = min(conditions['timestamp'], 10)  # 最多10秒
        count = rule_info['samples'] * 10  # 估算
        rule_opts.append(f'threshold:type both,track by_dst,count {count},seconds {time_window}')
    
    # 组合完整规则
    rule_header = f'alert {protocol} $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP'
    rule_body = '; '.join(rule_opts) + ';'
    
    return f'{rule_header} ({rule_body})'


# 生成所有 Suricata 规则
print("="*80)
print("生成 Suricata 规则")
print("="*80)

suricata_rules = []
for i, rule_info in enumerate(attack_rules, start=1):
    suricata_rule = path_to_suricata_rule(rule_info, sid=i)
    suricata_rules.append({
        'sid': i,
        'attack_type': rule_info['class_name'],
        'rule': suricata_rule
    })

print(f"\n成功生成 {len(suricata_rules)} 条 Suricata 规则\n")

# 按攻击类型分组显示
for attack_type in sorted(attack_type_counts.keys()):
    type_rules = [r for r in suricata_rules if r['attack_type'] == attack_type]
    print(f"\n{attack_type} ({len(type_rules)} 条规则):")
    print("-" * 80)
    for r in type_rules:
        print(f"SID {r['sid']}:")
        print(f"  {r['rule']}")

生成 Suricata 规则

成功生成 10 条 Suricata 规则


DNS-flood (1 条规则):
--------------------------------------------------------------------------------
SID 6:
  alert ip $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP (msg:"DNS-flood_attack"; classtype:attempted-dos; sid:6; dsize:>24; threshold:type both,track by_dst,count 10,seconds 10;)

HTTP-flood (2 条规则):
--------------------------------------------------------------------------------
SID 4:
  alert ip $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP (msg:"HTTP-flood_attack"; classtype:attempted-dos; sid:4; dsize:<316; threshold:type both,track by_dst,count 10,seconds 10;)
SID 5:
  alert ip $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP (msg:"HTTP-flood_attack"; classtype:attempted-dos; sid:5; dsize:>316; threshold:type both,track by_dst,count 10,seconds 10;)

ICMP-flood (1 条规则):
--------------------------------------------------------------------------------
SID 1:
  alert ip $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP (msg:"ICMP-flood_attack

In [28]:
# CELL8: 修复后的 Suricata 规则生成

def path_to_suricata_rule_fixed(rule_info, sid):
    """
    改进版：正确处理协议和标志位
    """
    path = rule_info['path']
    attack_type = rule_info['class_name']
    samples = rule_info['samples']
    
    # 初始化
    protocol = 'ip'
    options = []
    
    conditions = {
        'ttl': None,
        'length': None,
        'flags': [],
        'timestamp_max': 0,
        'protocol_detected': None
    }
    
    # 解析路径
    for feature, operator, threshold in path:
        # Protocol (one-hot)
        if feature.startswith('Protocol_'):
            proto_name = feature.replace('Protocol_', '')
            if operator == '>' and threshold > 0.5:
                conditions['protocol_detected'] = proto_name
        
        # TCP Flags
        elif feature in ['SYN', 'ACK', 'RST', 'PSH', 'FIN']:
            flag_map = {'SYN': 'S', 'ACK': 'A', 'RST': 'R', 'PSH': 'P', 'FIN': 'F'}
            if operator == '>' and threshold > 0.5:
                conditions['flags'].append(flag_map[feature])
        
        # TTL
        elif feature == 'TTL':
            if operator == '<=':
                conditions['ttl'] = ('<=', int(threshold))
            else:
                conditions['ttl'] = ('>', int(threshold))
        
        # Length -> dsize
        elif feature == 'Length':
            dsize = max(0, int(threshold) - 40)
            if operator == '<=':
                conditions['length'] = ('<=', dsize)
            else:
                conditions['length'] = ('>', dsize)
        
        # Timestamp
        elif feature == 'Timestamp':
            conditions['timestamp_max'] = max(conditions['timestamp_max'], threshold)
    
    # 根据攻击类型和检测到的协议设置正确的协议
    protocol_map = {
        'TCP': 'tcp',
        'UDP': 'udp', 
        'ICMP': 'icmp',
        'HTTP': 'http',
        'DNS': 'udp'  # DNS通常用UDP
    }
    
    if conditions['protocol_detected']:
        protocol = protocol_map.get(conditions['protocol_detected'], 'ip')
    else:
        # 根据攻击类型推断
        attack_protocol_map = {
            'TCPSYN-flood': 'tcp',
            'UDP-flood': 'udp',
            'ICMP-flood': 'icmp',
            'DNS-flood': 'udp',
            'HTTP-flood': 'http'
        }
        protocol = attack_protocol_map.get(attack_type, 'ip')
    
    # 构建选项
    rule_opts = [
        f'msg:"{attack_type}_attack"',
        'classtype:attempted-dos',
        f'sid:{sid}'
    ]
    
    # TTL
    if conditions['ttl']:
        op, val = conditions['ttl']
        if op == '<=':
            rule_opts.append(f'ttl:<{val+1}')
        else:
            rule_opts.append(f'ttl:>{val}')
    
    # dsize
    if conditions['length']:
        op, val = conditions['length']
        if op == '<=':
            rule_opts.append(f'dsize:<{val}')
        else:
            rule_opts.append(f'dsize:>{val}')
    
    # Flags (重要：SYN flood必须有flags:S)
    if conditions['flags']:
        flags_str = ','.join(sorted(set(conditions['flags'])))
        rule_opts.append(f'flags:{flags_str}')
    elif attack_type == 'TCPSYN-flood':
        # 强制添加SYN标志
        rule_opts.append('flags:S')
    
    # Flow
    if protocol in ['tcp', 'http']:
        rule_opts.append('flow:to_server')
    
    # Threshold (基于实际样本数和时间窗口)
    time_window = min(int(conditions['timestamp_max']) if conditions['timestamp_max'] > 0 else 5, 10)
    # 估算每秒包数：假设样本数代表time_window内的包数
    count = max(samples, 100)  # 至少100个包
    rule_opts.append(f'threshold:type both,track by_dst,count {count},seconds {time_window}')
    
    # 组合规则
    rule_header = f'alert {protocol} $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP'
    rule_body = '; '.join(rule_opts) + ';'
    
    return f'{rule_header} ({rule_body})'


# 重新生成规则
print("="*80)
print("生成修复后的 Suricata 规则")
print("="*80)

suricata_rules_fixed = []
for i, rule_info in enumerate(attack_rules, start=1):
    suricata_rule = path_to_suricata_rule_fixed(rule_info, sid=i)
    suricata_rules_fixed.append({
        'sid': i,
        'attack_type': rule_info['class_name'],
        'samples': rule_info['samples'],
        'rule': suricata_rule
    })

print(f"\n成功生成 {len(suricata_rules_fixed)} 条修复后的 Suricata 规则\n")

# 按攻击类型显示
for attack_type in sorted(attack_type_counts.keys()):
    type_rules = [r for r in suricata_rules_fixed if r['attack_type'] == attack_type]
    print(f"\n{attack_type} ({len(type_rules)} 条规则):")
    print("-" * 80)
    for r in type_rules:
        print(f"SID {r['sid']} (样本数: {r['samples']}):")
        print(f"  {r['rule']}")

生成修复后的 Suricata 规则

成功生成 10 条修复后的 Suricata 规则


DNS-flood (1 条规则):
--------------------------------------------------------------------------------
SID 6 (样本数: 1):
  alert udp $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP (msg:"DNS-flood_attack"; classtype:attempted-dos; sid:6; dsize:>24; threshold:type both,track by_dst,count 100,seconds 10;)

HTTP-flood (2 条规则):
--------------------------------------------------------------------------------
SID 4 (样本数: 1):
  alert http $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP (msg:"HTTP-flood_attack"; classtype:attempted-dos; sid:4; dsize:<316; flow:to_server; threshold:type both,track by_dst,count 100,seconds 10;)
SID 5 (样本数: 1):
  alert http $ZERO_TRUST any -> $NET_TO_PROTECT $PORT_GROUP (msg:"HTTP-flood_attack"; classtype:attempted-dos; sid:5; dsize:>316; flow:to_server; threshold:type both,track by_dst,count 100,seconds 10;)

ICMP-flood (1 条规则):
--------------------------------------------------------------------------------
SID 1 (样本数: 1

In [29]:
# CELL9: 保存规则到文件并生成报告

import json
from datetime import datetime

# 1. 保存为 .rules 文件（Suricata格式）
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

rules_file = os.path.join(output_dir, 'anomaly2sign_rules.rules')
with open(rules_file, 'w') as f:
    f.write(f"# Anomaly2Sign 自动生成的 Suricata 规则\n")
    f.write(f"# 生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"# 模型: {best_model_name}\n")
    f.write(f"# F1 Score: {best_model_result['f1_score']:.4f}\n")
    f.write(f"# 总规则数: {len(suricata_rules_fixed)}\n")
    f.write("#\n")
    f.write("# 变量定义（需根据实际网络环境配置）:\n")
    f.write("# $ZERO_TRUST = [192.168.0.0/16, 10.0.0.0/8, 172.16.0.0/12, !$NET_TO_PROTECT]\n")
    f.write("# $NET_TO_PROTECT = 你的内部网络\n")
    f.write("# $PORT_GROUP = any (或指定端口列表)\n")
    f.write("#\n\n")
    
    for attack_type in sorted(attack_type_counts.keys()):
        type_rules = [r for r in suricata_rules_fixed if r['attack_type'] == attack_type]
        f.write(f"\n# {attack_type} ({len(type_rules)} 条规则)\n")
        for r in type_rules:
            f.write(f"{r['rule']}\n")

print(f"✓ 规则已保存到: {rules_file}\n")

# 2. 保存为 JSON（方便程序读取）
json_file = os.path.join(output_dir, 'rules_metadata.json')
rules_data = {
    'generation_time': datetime.now().isoformat(),
    'model_name': best_model_name,
    'model_config': {
        'max_depth': best_model_result['max_depth'],
        'criterion': best_model_result['criterion']
    },
    'performance': {
        'f1_score': float(best_model_result['f1_score']),
        'auc': float(best_model_result['auc']),
        'train_time': float(best_model_result['train_time']),
        'test_time': float(best_model_result['test_time'])
    },
    'rules': [
        {
            'sid': r['sid'],
            'attack_type': r['attack_type'],
            'samples': r['samples'],
            'rule': r['rule']
        } for r in suricata_rules_fixed
    ],
    'statistics': {
        'total_rules': len(suricata_rules_fixed),
        'rules_by_type': dict(attack_type_counts)
    }
}

with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(rules_data, f, indent=2, ensure_ascii=False)

print(f"✓ 元数据已保存到: {json_file}\n")

# 3. 生成实验报告
report_file = os.path.join(output_dir, 'generation_report.txt')
with open(report_file, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("Anomaly2Sign 规则生成实验报告\n")
    f.write("="*80 + "\n\n")
    
    f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"数据集: BUET (T-Link with RUS 采样)\n\n")
    
    f.write("1. 模型配置\n")
    f.write("-" * 40 + "\n")
    f.write(f"   模型类型: {best_model_name}\n")
    f.write(f"   最大深度: {best_model_result['max_depth']}\n")
    f.write(f"   分裂准则: {best_model_result['criterion']}\n\n")
    
    f.write("2. 模型性能\n")
    f.write("-" * 40 + "\n")
    f.write(f"   F1 Score: {best_model_result['f1_score']:.4f} ({best_model_result['f1_score']*100:.2f}%)\n")
    f.write(f"   AUC: {best_model_result['auc']:.4f}\n")
    f.write(f"   训练时间: {best_model_result['train_time']:.4f}s\n")
    f.write(f"   测试时间: {best_model_result['test_time']:.4f}s\n")
    f.write(f"   满足 DR>=97.5%: 是\n\n")
    
    f.write("3. 生成的规则统计\n")
    f.write("-" * 40 + "\n")
    f.write(f"   总规则数: {len(suricata_rules_fixed)}\n")
    f.write(f"   决策树叶子节点数: {len(all_rules)}\n")
    f.write(f"   攻击类别规则数: {len(attack_rules)}\n\n")
    
    f.write("   各攻击类型规则数:\n")
    for attack_type, count in sorted(attack_type_counts.items()):
        f.write(f"     - {attack_type}: {count} 条\n")
    
    f.write("\n4. 规则列表\n")
    f.write("-" * 40 + "\n")
    for attack_type in sorted(attack_type_counts.keys()):
        type_rules = [r for r in suricata_rules_fixed if r['attack_type'] == attack_type]
        f.write(f"\n   {attack_type}:\n")
        for r in type_rules:
            f.write(f"     SID {r['sid']}: {r['rule']}\n")

print(f"✓ 实验报告已保存到: {report_file}\n")

# 4. 显示文件摘要
print("="*80)
print("生成完成摘要")
print("="*80)
print(f"\n生成的文件:")
print(f"  1. Suricata规则文件: {rules_file}")
print(f"  2. JSON元数据: {json_file}")
print(f"  3. 实验报告: {report_file}")

print(f"\n规则统计:")
print(f"  - 总规则数: {len(suricata_rules_fixed)}")
print(f"  - 模型性能: F1={best_model_result['f1_score']:.4f}, AUC={best_model_result['auc']:.4f}")
print(f"  - 生成时间: {best_model_result['train_time']:.2f}s (训练) + {best_model_result['test_time']:.2f}s (测试)")

print("\n各攻击类型:")
for attack_type, count in sorted(attack_type_counts.items()):
    print(f"  - {attack_type}: {count} 条规则")

✓ 规则已保存到: output\anomaly2sign_rules.rules

✓ 元数据已保存到: output\rules_metadata.json

✓ 实验报告已保存到: output\generation_report.txt

生成完成摘要

生成的文件:
  1. Suricata规则文件: output\anomaly2sign_rules.rules
  2. JSON元数据: output\rules_metadata.json
  3. 实验报告: output\generation_report.txt

规则统计:
  - 总规则数: 10
  - 模型性能: F1=0.9974, AUC=0.9998
  - 生成时间: 2.31s (训练) + 0.30s (测试)

各攻击类型:
  - DNS-flood: 1 条规则
  - HTTP-flood: 2 条规则
  - ICMP-flood: 1 条规则
  - TCPSYN-flood: 4 条规则
  - UDP-flood: 2 条规则


In [10]:
# 1. 决策树模型
print("\n" + "-"*80)
print("决策树模型")
print("-"*80)

for dt_config in dt_configs:
    model_name = f"{dt_config['name']}+{sampling_method}"
    print(f"\n训练 {model_name}...")
    
    dt = DecisionTreeClassifier(
        max_depth=dt_config['max_depth'],
        criterion=dt_config['criterion'],
        random_state=42
    )
    
    results, trained_model = train_and_evaluate(
        dt, X_train, y_train, X_test, y_test, model_name
    )
    
    # 计算AIC
    aic = calculate_aic(trained_model, X_test, y_test)
    results['aic'] = aic
    results['sampling_method'] = sampling_method
    results['n_params'] = trained_model.tree_.node_count
    
    all_results.append(results)
    
    print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
    print(f"  ✓ 测试时间: {results['test_time']:.4f}s")
    print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
    print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")
    print(f"  ✓ AIC: {results['aic']:.2f}")
    print(f"  ✓ 节点数: {results['n_params']}")


--------------------------------------------------------------------------------
决策树模型
--------------------------------------------------------------------------------

训练 DT(5,entropy)+train_TLink_RUS...
  ✓ 训练时间: 0.1691s
  ✓ 测试时间: 0.2848s
  ✓ F1 Score: 0.9921
  ✓ AUC: 0.9997
  ✓ AIC: -3134972.78
  ✓ 节点数: 25

训练 DT(5,gini)+train_TLink_RUS...
  ✓ 训练时间: 0.1863s
  ✓ 测试时间: 0.3200s
  ✓ F1 Score: 0.8797
  ✓ AUC: 0.9949
  ✓ AIC: 6262142.39
  ✓ 节点数: 21

训练 DT(7,entropy)+train_TLink_RUS...
  ✓ 训练时间: 0.2078s
  ✓ 测试时间: 0.3233s
  ✓ F1 Score: 0.9966
  ✓ AUC: 0.9999
  ✓ AIC: -5011101.78
  ✓ 节点数: 35

训练 DT(7,gini)+train_TLink_RUS...
  ✓ 训练时间: 0.1636s
  ✓ 测试时间: 0.3240s
  ✓ F1 Score: 0.9933
  ✓ AUC: 0.9997
  ✓ AIC: -3629336.62
  ✓ 节点数: 29


In [11]:
# 2. Logistic Regression
print("\n" + "-"*80)
print("Logistic Regression")
print("-"*80)

print(f"\n训练 LR+{sampling_method}...")
lr = LogisticRegression(max_iter=1000, random_state=42)
results, _ = train_and_evaluate(
    lr, X_train, y_train, X_test, y_test, f"LR+{sampling_method}"
)
results['aic'] = calculate_aic(lr, X_test, y_test)
results['sampling_method'] = sampling_method
all_results.append(results)

print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
print(f"  ✓ 测试时间: {results['test_time']:.4f}s") 
print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")


--------------------------------------------------------------------------------
Logistic Regression
--------------------------------------------------------------------------------

训练 LR+train_TLink_RUS...
  ✓ 训练时间: 25.5445s
  ✓ 测试时间: 0.2315s
  ✓ F1 Score: 0.9868
  ✓ AUC: 0.9993


In [None]:
# 3. SVM (如果数据量合适)
print("\n" + "-"*80)
print("Support Vector Machine")
print("-"*80)

if X_train.shape[0] < 500000:
    print(f"\n训练 SVM+{sampling_method}...")
    svm = SVC(kernel='rbf', probability=True, random_state=42)
    results, _ = train_and_evaluate(
        svm, X_train, y_train, X_test, y_test, f"SVM+{sampling_method}"
    )
    results['aic'] = calculate_aic(svm, X_test, y_test)
    results['sampling_method'] = sampling_method
    all_results.append(results)
    
    print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
    print(f"  ✓ 测试时间: {results['test_time']:.4f}s")
    print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
    print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")
else:
    print(f"\n⚠️ 跳过SVM (数据量: {X_train.shape[0]} > 500000)")


--------------------------------------------------------------------------------
Support Vector Machine
--------------------------------------------------------------------------------

训练 SVM+train_TLink_RUS...


In [12]:
# 4. MLP
print("\n" + "-"*80)
print("Multi-Layer Perceptron")
print("-"*80)

print(f"\n训练 MLP+{sampling_method}...")
n_features = X_train.shape[1]
hidden_layer_size = int(0.8 * n_features)
mlp = MLPClassifier(
    hidden_layer_sizes=(hidden_layer_size,),
    activation='relu',
    max_iter=200,
    random_state=42
)
results, _ = train_and_evaluate(
    mlp, X_train, y_train, X_test, y_test, f"MLP+{sampling_method}"
)
results['aic'] = calculate_aic(mlp, X_test, y_test)
results['sampling_method'] = sampling_method
all_results.append(results)

print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
print(f"  ✓ 测试时间: {results['test_time']:.4f}s")
print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")

print(f"\n✓ {sampling_method} 完成!")


--------------------------------------------------------------------------------
Multi-Layer Perceptron
--------------------------------------------------------------------------------

训练 MLP+train_TLink_RUS...
  ✓ 训练时间: 26.5721s
  ✓ 测试时间: 0.8886s
  ✓ F1 Score: 0.9955
  ✓ AUC: 0.9999

✓ train_TLink_RUS 完成!


In [13]:
# 训练第二个采样方法的所有模型
sampling_method = 'train_CBMP'

print("="*80)
print(f"采样方法: {sampling_method}")
print("="*80)

# 加载训练数据
train_path = os.path.join(data_dir, f'{sampling_method}.csv')
train_df = pd.read_csv(train_path)
X_train = train_df[feature_cols].values
y_train = train_df[label_col].values

print(f"\n训练集大小: {X_train.shape}")
print(f"类别分布:\n{pd.Series(y_train).value_counts().sort_index()}\n")

采样方法: train_CBMP

训练集大小: (1115648, 19)
类别分布:
0     30824
1     20081
2     37454
3    743762
4    183022
5    100505
Name: count, dtype: int64



In [14]:
# 1. 决策树模型
print("\n" + "-"*80)
print("决策树模型")
print("-"*80)

for dt_config in dt_configs:
    model_name = f"{dt_config['name']}+{sampling_method}"
    print(f"\n训练 {model_name}...")
    
    dt = DecisionTreeClassifier(
        max_depth=dt_config['max_depth'],
        criterion=dt_config['criterion'],
        random_state=42
    )
    
    results, trained_model = train_and_evaluate(
        dt, X_train, y_train, X_test, y_test, model_name
    )
    
    aic = calculate_aic(trained_model, X_test, y_test)
    results['aic'] = aic
    results['sampling_method'] = sampling_method
    results['n_params'] = trained_model.tree_.node_count
    
    all_results.append(results)
    
    print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
    print(f"  ✓ 测试时间: {results['test_time']:.4f}s")
    print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
    print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")
    print(f"  ✓ AIC: {results['aic']:.2f}")


--------------------------------------------------------------------------------
决策树模型
--------------------------------------------------------------------------------

训练 DT(5,entropy)+train_CBMP...
  ✓ 训练时间: 2.1437s
  ✓ 测试时间: 0.2946s
  ✓ F1 Score: 0.9934
  ✓ AUC: 0.9999
  ✓ AIC: -3777458.65

训练 DT(5,gini)+train_CBMP...
  ✓ 训练时间: 2.0552s
  ✓ 测试时间: 0.2814s
  ✓ F1 Score: 0.9936
  ✓ AUC: 0.9999
  ✓ AIC: -3931785.07

训练 DT(7,entropy)+train_CBMP...
  ✓ 训练时间: 2.6359s
  ✓ 测试时间: 0.3297s
  ✓ F1 Score: 0.9974
  ✓ AUC: 1.0000
  ✓ AIC: -5616068.58

训练 DT(7,gini)+train_CBMP...
  ✓ 训练时间: 2.3180s
  ✓ 测试时间: 0.3363s
  ✓ F1 Score: 0.9972
  ✓ AUC: 0.9999
  ✓ AIC: -5356807.20


In [15]:
# 2. LR
print("\n" + "-"*80)
print("Logistic Regression")
print("-"*80)

print(f"\n训练 LR+{sampling_method}...")
lr = LogisticRegression(max_iter=1000, random_state=42)
results, _ = train_and_evaluate(
    lr, X_train, y_train, X_test, y_test, f"LR+{sampling_method}"
)
results['aic'] = calculate_aic(lr, X_test, y_test)
results['sampling_method'] = sampling_method
all_results.append(results)

print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
print(f"  ✓ 测试时间: {results['test_time']:.4f}s") 
print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")


--------------------------------------------------------------------------------
Logistic Regression
--------------------------------------------------------------------------------

训练 LR+train_CBMP...
  ✓ 训练时间: 227.6691s
  ✓ 测试时间: 0.1935s
  ✓ F1 Score: 0.9909
  ✓ AUC: 0.9993


In [None]:
# 3. SVM (如果数据量合适)
print("\n" + "-"*80)
print("Support Vector Machine")
print("-"*80)

if X_train.shape[0] < 500000:
    print(f"\n训练 SVM+{sampling_method}...")
    svm = SVC(kernel='rbf', probability=True, random_state=42)
    results, _ = train_and_evaluate(
        svm, X_train, y_train, X_test, y_test, f"SVM+{sampling_method}"
    )
    results['aic'] = calculate_aic(svm, X_test, y_test)
    results['sampling_method'] = sampling_method
    all_results.append(results)
    
    print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
    print(f"  ✓ 测试时间: {results['test_time']:.4f}s")
    print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
    print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")
else:
    print(f"\n⚠️ 跳过SVM (数据量: {X_train.shape[0]} > 500000)")

In [16]:
# 4. MLP
print("\n" + "-"*80)
print("Multi-Layer Perceptron")
print("-"*80)

print(f"\n训练 MLP+{sampling_method}...")
n_features = X_train.shape[1]
hidden_layer_size = int(0.8 * n_features)
mlp = MLPClassifier(
    hidden_layer_sizes=(hidden_layer_size,),
    activation='relu',
    max_iter=200,
    random_state=42
)
results, _ = train_and_evaluate(
    mlp, X_train, y_train, X_test, y_test, f"MLP+{sampling_method}"
)
results['aic'] = calculate_aic(mlp, X_test, y_test)
results['sampling_method'] = sampling_method
all_results.append(results)

print(f"  ✓ 训练时间: {results['train_time']:.4f}s")
print(f"  ✓ 测试时间: {results['test_time']:.4f}s")
print(f"  ✓ F1 Score: {results['f1_micro']:.4f}")
print(f"  ✓ AUC: {results['auc']:.4f}" if results['auc'] else "  ✓ AUC: N/A")

print(f"\n✓ {sampling_method} 完成!")


--------------------------------------------------------------------------------
Multi-Layer Perceptron
--------------------------------------------------------------------------------

训练 MLP+train_CBMP...
  ✓ 训练时间: 252.1903s
  ✓ 测试时间: 0.8733s
  ✓ F1 Score: 0.9970
  ✓ AUC: 0.9999

✓ train_CBMP 完成!
