In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [99]:
#=================================================
# 1. 读取数据并做逗号分割
#=================================================
data_file = "NSL-KDD-Test.csv"  # 请修改为你的实际文件路径
df = pd.read_csv(data_file)

# 假设文件至少包含 'flow' 列和 'class' 列
flows = df["flow"].astype(str).tolist()
labels = df["class"].tolist()

# flows 每一行示例: "13,tcp,telnet,SF,118,2425,..." 
# 先用逗号切分成列表
split_flows = [flow_str.split(",") for flow_str in flows]
# 现在 split_flows[i] 可能是 ["13", "tcp", "telnet", "SF", ...]

In [100]:
#=================================================
# 2. 建立 token 词表（跳过逗号）
#=================================================
# 收集所有出现的 token
all_tokens = set()
for token_list in split_flows:
    for tk in token_list:
        all_tokens.add(tk.strip())

# 可以加入特殊 PAD/UNK 等标记
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

all_tokens = list(all_tokens)
all_tokens.sort()  # 为了保证可复现，先排序
all_tokens = [PAD_TOKEN, UNK_TOKEN] + all_tokens

token2idx = {tk: i for i, tk in enumerate(all_tokens)}
idx2token = {i: tk for tk, i in token2idx.items()}

vocab_size = len(all_tokens)
print("Vocabulary size:", vocab_size)

Vocabulary size: 4714


In [101]:
#=================================================
# 3. 编码 & 解码 函数
#=================================================
# 可以设置一个最大 token 长度(依据数据分布进行调整)
max_length = 50

def encode_tokens(token_list):
    """将 token 列表编码为固定长度的索引序列"""
    idx_list = []
    for tk in token_list[:max_length]:
        if tk in token2idx:
            idx_list.append(token2idx[tk])
        else:
            idx_list.append(token2idx[UNK_TOKEN])
    # 不足补PAD
    while len(idx_list) < max_length:
        idx_list.append(token2idx[PAD_TOKEN])
    return idx_list

def decode_tokens(idx_list):
    """将索引序列还原为 token 列表（去掉末尾的 PAD）"""
    tokens = []
    for idx in idx_list:
        if idx == token2idx[PAD_TOKEN]:
            break
        tokens.append(idx2token[idx])
    return tokens

encoded_flows = [encode_tokens(tk_list) for tk_list in split_flows]
encoded_flows = np.array(encoded_flows, dtype=np.int64)

In [102]:
#=================================================
# 4. 构建 PyTorch Dataset
#=================================================
class FlowDataset(Dataset):
    def __init__(self, flows_idx):
        self.flows_idx = flows_idx

    def __len__(self):
        return len(self.flows_idx)

    def __getitem__(self, idx):
        return self.flows_idx[idx]

dataset = FlowDataset(encoded_flows)
batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [104]:
#=================================================
# 5. 定义生成器 (Generator) 和 判别器 (Discriminator)
#=================================================
class Generator(nn.Module):
    """
    简易序列生成器：将随机噪声映射到 token 序列
    用一个 LSTM 将噪声输入映射到 vocab_size 维度
    """
    def __init__(self, latent_dim, hidden_dim, vocab_size):
        super(Generator, self).__init__()
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

        self.lstm = nn.LSTM(latent_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, z):
        """
        z: [batch_size, max_length, latent_dim]
        """
        h, _ = self.lstm(z)    # -> [batch_size, max_length, hidden_dim]
        logits = self.fc(h)    # -> [batch_size, max_length, vocab_size]
        return logits


class Discriminator(nn.Module):
    """
    简易序列判别器：判断序列是真/假
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super(Discriminator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=token2idx[PAD_TOKEN])
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        x: [batch_size, max_length] 的 token 索引序列
        """
        emb = self.embedding(x)  # -> [batch_size, max_length, embed_dim]
        h, _ = self.lstm(emb)    # -> [batch_size, max_length, hidden_dim]
        # 取最后时刻 hidden state 进行二分类
        out = h[:, -1, :]
        logit = self.fc(out)     # -> [batch_size, 1]
        prob = self.sigmoid(logit)
        return prob

In [105]:
#=================================================
# 6. 初始化模型与训练参数
#=================================================
latent_dim = 32
g_hidden_dim = 64
d_hidden_dim = 64
embed_dim = 32
num_epochs = 5
lr = 0.0002

generator = Generator(latent_dim, g_hidden_dim, vocab_size).to(device)
discriminator = Discriminator(vocab_size, embed_dim, d_hidden_dim).to(device)

criterion = nn.BCELoss()
optimizer_g = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
optimizer_d = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))

In [106]:
#=================================================
# 7. 训练GAN
#=================================================
for epoch in range(num_epochs):
    for real_batch in dataloader:
        real_batch = real_batch.to(device)  # [batch_size, max_length]

        #======= 训练判别器 D =======#
        optimizer_d.zero_grad()

        # 对真实样本打上label=1
        real_labels = torch.ones((real_batch.size(0), 1), device=device)
        # 对生成样本打上label=0
        fake_labels = torch.zeros((real_batch.size(0), 1), device=device)

        # (1) 判别真实样本
        real_output = discriminator(real_batch)
        d_loss_real = criterion(real_output, real_labels)

        # (2) 生成“假”样本并判别
        z = torch.randn(real_batch.size(0), max_length, latent_dim, device=device)
        fake_logits = generator(z)  # -> [batch_size, max_length, vocab_size]

        # 从 fake_logits 中采样或 argmax
        fake_probs = torch.softmax(fake_logits, dim=-1)
        fake_indices = torch.argmax(fake_probs, dim=-1)  # -> [batch_size, max_length]

        fake_output = discriminator(fake_indices)
        d_loss_fake = criterion(fake_output, fake_labels)

        d_loss = d_loss_real + d_loss_fake
        d_loss.backward()
        optimizer_d.step()

        #======= 训练生成器 G =======#
        optimizer_g.zero_grad()
        z = torch.randn(real_batch.size(0), max_length, latent_dim, device=device)
        fake_logits = generator(z)
        fake_probs = torch.softmax(fake_logits, dim=-1)
        fake_indices = torch.argmax(fake_probs, dim=-1)

        # 生成器希望骗过判别器，因此希望判别器输出接近1
        g_output = discriminator(fake_indices)
        g_loss = criterion(g_output, real_labels)

        g_loss.backward()
        optimizer_g.step()

    print(f"Epoch [{epoch+1}/{num_epochs}] | d_loss: {d_loss.item():.4f}, g_loss: {g_loss.item():.4f}")

Epoch [1/5] | d_loss: 0.0251, g_loss: 4.1674
Epoch [2/5] | d_loss: 0.0071, g_loss: 5.4155
Epoch [3/5] | d_loss: 0.0034, g_loss: 6.1989
Epoch [4/5] | d_loss: 0.0020, g_loss: 6.8289
Epoch [5/5] | d_loss: 0.0011, g_loss: 7.2434


In [None]:
#=================================================
# 8. 用训练好的生成器生成新flow并写回CSV
#=================================================
generator.eval()

new_flows = []
gen_batch_size = 64
total_samples = len(df)

with torch.no_grad():
    i = 0
    while i < total_samples:
        cur_batch_size = min(gen_batch_size, total_samples - i)
        z = torch.randn(cur_batch_size, max_length, latent_dim, device=device)
        fake_logits = generator(z)
        fake_probs = torch.softmax(fake_logits, dim=-1)
        fake_indices = torch.argmax(fake_probs, dim=-1)

        # 解码为 token 列表，再用逗号拼接
        for idx_seq in fake_indices:
            idx_seq = idx_seq.cpu().numpy().tolist()
            tokens = decode_tokens(idx_seq)
            flow_str = ",".join(tokens)  # 用逗号拼接回来
            new_flows.append(flow_str)

        i += cur_batch_size

# 与原数据长度一致
assert len(new_flows) == len(df)

# 组合成新的数据
new_df = pd.DataFrame({
    "flow": new_flows,
    "class": labels
})

output_file = "NSL-KDD-GAN-Adversarial.csv"
new_df.to_csv(output_file, index=False)
print(f"已生成新的对抗流量，并保存到: {output_file}")

已生成新的对抗流量，并保存到: NSL-KDD_GAN_generated.csv
