In [1]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
from scipy.stats import entropy
import torch
from seqCGAN.generator import Generator  # 假设你有一个定义好的 Discriminator 类
import json
import random
import math
import struct
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from scipy.stats import wasserstein_distance
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from dtaidistance import dtw

In [2]:
TOTAL_LEN = 114
# NPRINT_REAL_WIDTH = 50*8
NPRINT_REAL_WIDTH = 22*8
# LABEL_DICT = {'facebook': 0, 'skype': 1, 'aim': 2, 'email': 3, 'voipbuster': 4, 'hangouts': 5, 'youtube': 6, 'sftp': 7, 'icq': 8,  'ftps': 9, 'vimeo': 10, 'spotify': 11, 'netflix': 12, 'bittorrent': 13}
LABEL_DICT = {'facebook': 0, 'skype': 1}
# LABEL_DICT = {'facebook': 0, 'skype': 1, 'email': 2, 'voipbuster': 3, 'hangouts': 4, 'youtube': 5, 'ftps': 6, 'vimeo': 7, 'spotify': 8, 'netflix': 9, 'bittorrent': 10}
# LABEL_DICT = {'facebook': 0, 'skype': 1, 'email': 2, 'voipbuster': 3, 'youtube': 4, 'ftps': 5, 'vimeo': 6, 'spotify': 7, 'netflix': 8, 'bittorrent': 9}
# LABEL_DICT = {'email': 0, 'youtube': 1, 'ftps': 2, 'vimeo': 3, 'spotify': 4, 'netflix': 5, 'bittorrent': 6}

SEQ_DIM = 2
MAX_PKT_LEN = 3001
MAX_TIME = 10000
MAX_PORT = 65535
MAX_SEQ_LEN = 16
WORD_VEC_SIZE = 8
# SEQ_DIM = WORD_VEC_SIZE * 2 + 1

label_dim = len(LABEL_DICT) 
image_dim = (1, NPRINT_REAL_WIDTH, NPRINT_REAL_WIDTH)  # 生成单通道图像
noise_dim = 100  # 噪声维度
batch_size = 128
source_name = '../data/vpn_data_small.json'
source_dir = '../data/netshare'

In [3]:
def get_real_data(file_name):
    with open(file_name,'r') as f:
        json_data = json.load(f)['data']
    data_dic = {}
    
    for item in json_data:
        label_str = item['labels'][0]
        if label_str not in data_dic:
            data_dic[label_str] = []

        packets_len = item['meta']['packets']
        im = bytes.fromhex(item['nprint'])

        line = im[0:TOTAL_LEN]
        tcp_dport = line[32:34]
        udp_dport = line[92:94]

        dport = bytearray(a | b for a, b in zip(tcp_dport, udp_dport))
        dport = int.from_bytes(dport, 'big')
    
        data_item = []
        for i in range(min(16,packets_len)):
            line = im[i*TOTAL_LEN:i*TOTAL_LEN+TOTAL_LEN]
            # time_h, time_l, pl = struct.unpack("IIh", line[:10])
            # time_l //= 1e4
            # time = time_h + time_l/100
            time_h,time_l, pkt_len = struct.unpack("IIh", line[:10])
            time_l //= 1e4
            time = time_h + time_l/100
            time = time_h % 1000 + time_l / 10
            
            data_item.append([time, pkt_len])
            
        data_dic[label_str].append(data_item)
        
    return data_dic

def get_fake_data(source_dir, label_dict):
    final_seqs = {}
    for label_name in label_dict.keys():
        filename = source_dir + '/' + label_name + '.csv'
        df = pd.read_csv(filename)
    
        flows = {}      
        for index, row in df.iterrows():
            flow_o = (row['srcip'], row['srcport'], row['dstip'], row['dstport'], row['proto'])
        # flow_p = (row['dstip'], row['dstport'], row['srcip'], row['srcport'], row['proto'])
        
            time = round(row['time'])/1e6
            pkt_len = round(row['pkt_len'])
        
            if flow_o not in flows:
                flows[flow_o] = [[time,pkt_len]]
                continue
            if flow_o in flows:
                flows[flow_o].append([time,pkt_len])
                continue
    
        final_seqs[label_name] = list(flows.values())
        for i in range(len(final_seqs[label_name])):
            tmp = final_seqs[label_name][i][0][0]
            now = 0
            for j in range(len(final_seqs[label_name][i])):
                now = final_seqs[label_name][i][j][0]
                final_seqs[label_name][i][j][0] -= tmp
                tmp = now
    return final_seqs

def pad(sequence, target_length, pad_value=np.nan):
    seq_len = len(sequence)
    if seq_len < target_length:
        padding = [[pad_value] * len(sequence[0])] * (target_length - seq_len)
        return sequence + padding  # 填充
    return sequence

def get_training_dataset(data_dic):
    x = []
    y = []
    label_count = 0
    for label, data in data_dic.items():
        x.extend(data)  # 添加所有序列
        y.extend([label_count] * len(data))
        label_count += 1
    return x,y

In [4]:
real_datas = get_real_data(source_name)

fake_datas = get_fake_data(source_dir, LABEL_DICT)

# fake_datas = {}
# for label, data in real_datas.items():
#     fake_data = get_fake_data(data,label)
#     fake_datas[label] = fake_data
    
x_real, y_real = get_training_dataset(real_datas)
x_fake, y_fake = get_training_dataset(fake_datas)
# print(x_real)
# print(y_real)
# print(x_fake)
# print(y_fake)

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

# 提取统计特征（均值、方差、最大值、最小值等）
def extract_features(X):
    features = []
    for seq in X:
        seq = np.array(seq)
        mean_feat = np.mean(seq, axis=0)
        std_feat = np.std(seq, axis=0)
        max_feat = np.max(seq, axis=0)
        min_feat = np.min(seq, axis=0)
        first_feat = seq[0]
        length = len(seq)
        features.append(np.hstack([mean_feat, std_feat, max_feat, min_feat, first_feat, length]))
    return np.array(features)

def test_xgboost(X_train, Y_train, X_test, Y_test):
    X_train_features = extract_features(X_train)
    X_test_features = extract_features(X_test)
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train_features, Y_train)
    Y_pred = model.predict(X_test_features)
    accuracy = accuracy_score(Y_test, Y_pred)
    return accuracy

In [6]:
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

class TrafficDataset(Dataset):
    def __init__(self, X, Y):
        self.X = [torch.tensor(x, dtype=torch.float32) for x in X]
        self.Y = torch.tensor(Y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
    
def collate_fn(batch):
    X, Y = zip(*batch)
    X_padded = pad_sequence(X, batch_first=True, padding_value=0)  # 填充变长序列
    lengths = torch.tensor([len(x) for x in X])  # 记录原始长度
    return X_padded, lengths, torch.tensor(Y)

class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes, device):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.device = device

    def forward(self, x, lengths):
        packed_x = nn.utils.rnn.pack_padded_sequence(x, lengths.to('cpu'), batch_first=True, enforce_sorted=False).to(self.device)
        _, (hidden, _) = self.lstm(packed_x)
        return self.fc(hidden[-1])  # 取最终隐藏状态
    
def test_lstm(X_train, Y_train, X_test, Y_test):
    train_dataset = TrafficDataset(X_train, Y_train)
    train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn, shuffle=True)
    test_dataset = TrafficDataset(X_test, Y_test)
    test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn)
    
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # print(device)
    model = LSTMClassifier(SEQ_DIM, 128, label_dim, device).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(100):
        model.train()
        for X, lengths, Y in train_loader:
            X, Y, lengths = X.to(device), Y.to(device), lengths.to(device)
            Y_pred = model(X, lengths)
            loss = criterion(Y_pred, Y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for X, lengths, Y in test_loader:
                X, Y, lengths = X.to(device), Y.to(device), lengths.to(device)
                Y_pred = model(X, lengths)
                _, predicted = torch.max(Y_pred, 1)
                total += Y.to('cpu').size(0)
                correct += (predicted == Y).sum().item()
        accuracy = correct / total
    return accuracy

In [7]:
class CNNClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, device):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv1d(input_dim, 64, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(1)  # 全局池化
        self.fc = nn.Linear(128, num_classes)
        self.device = device

    def forward(self, x):
        x = x.permute(0, 2, 1)  # 交换维度以适应 Conv1d (batch, feature_dim, seq_len)
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = self.pool(x).squeeze(-1)
        return self.fc(x)
    
def test_cnn(X_train, Y_train, X_test, Y_test):
    train_dataset = TrafficDataset(X_train, Y_train)
    train_loader = DataLoader(train_dataset, batch_size=128, collate_fn=collate_fn, shuffle=True)
    test_dataset = TrafficDataset(X_test, Y_test)
    test_loader = DataLoader(test_dataset, batch_size=128, collate_fn=collate_fn)
    
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # print(device)
    model = CNNClassifier(SEQ_DIM, label_dim, device).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(100):
        model.train()
        for X, lengths, Y in train_loader:
            X, Y, lengths = X.to(device), Y.to(device), lengths.to(device)
            Y_pred = model(X)
            loss = criterion(Y_pred, Y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for X, lengths, Y in test_loader:
                X, Y, lengths = X.to(device), Y.to(device), lengths.to(device)
                Y_pred = model(X)
                _, predicted = torch.max(Y_pred, 1)
                total += Y.to('cpu').size(0)
                correct += (predicted == Y).sum().item()
        accuracy = correct / total
    return accuracy
    

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(x_real, y_real, test_size=0.2, random_state=42)
X_train_fake, X_test_fake, Y_train_fake, Y_test_fake = train_test_split(x_fake, y_fake, test_size=0.2, random_state=42)

methods = [test_xgboost, test_cnn, test_lstm]

for func in methods:
    accuracy_real = func(X_train, Y_train, X_test, Y_test)
    accuracy_fake = func(X_train_fake, Y_train_fake, X_test, Y_test)
    accuracy_mix = func(X_train_fake + X_train, Y_train_fake + Y_train, X_test, Y_test)
    print(f'Accuracy on real data of {func.__name__}:', accuracy_real)
    print(f'Accuracy on fake data of {func.__name__}:', accuracy_fake)
    print(f'Accuracy on mixed data of {func.__name__}:', accuracy_mix)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy on real data of test_xgboost: 0.930715935334873
Accuracy on fake data of test_xgboost: 0.2702078521939954
Accuracy on mixed data of test_xgboost: 0.9284064665127021
Accuracy on real data of test_cnn: 0.8579676674364896
Accuracy on fake data of test_cnn: 0.3787528868360277
Accuracy on mixed data of test_cnn: 0.848729792147806
Accuracy on real data of test_lstm: 0.8845265588914549
Accuracy on fake data of test_lstm: 0.3579676674364896
Accuracy on mixed data of test_lstm: 0.8822170900692841
