In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import ast
import pickle
import json

# # 设置CUDA_VISIBLE_DEVICES环境变量
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

seed = 11415
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Desired_Length = 100
batch_size = 16
class YtData(object):

    def __init__(self, batch_size, file_name1, file_name2):
        data1 = pd.read_csv(file_name1, header=None)
        data2 = pd.read_csv(file_name2, header=None)
        self.desired_length = Desired_Length
        
        
        self._encoder = {
            'label':    LabelEncoder()
        }
        self.batch_size = batch_size

        target = np.array(data1.iloc[:, 0])
        features = data1.iloc[:, -1]
        features = features.apply(lambda x: [int(float(i)) for i in x.split('/') if i]).to_list()
        features = np.array([self.pad_or_truncate(lst) for lst in features])

        features = np.array(features)
        data = np.array(features)
        data = np.array(data)
        data = np.abs(data)
        print(data.shape)


        data_X, data_y = self.__encode_data(data, target)
        self.train_dataset = TensorDataset(
            torch.from_numpy(data_X.astype(np.float32)),
            torch.from_numpy(data_y.astype(np.int64))
        )

        target = np.array(data2.iloc[:, 0])
        features = data2.iloc[:, -1]
        features = features.apply(lambda x: [int(float(i)) for i in x.split('/') if i]).to_list()
        features = np.array([self.pad_or_truncate(lst) for lst in features])

        features = np.array(features)
        data = np.array(features)
        data = np.array(data)
        data = np.abs(data)
        print(data.shape)

        
        data_X, data_y = self.__encode_data(data, target)
        self.test_dataset = TensorDataset(
            torch.from_numpy(data_X.astype(np.float32)),
            torch.from_numpy(data_y.astype(np.int64))
        )

 
        self.train_dataloader = DataLoader(self.train_dataset, self.batch_size, shuffle=True)
        self.test_dataloader = DataLoader(self.test_dataset, self.batch_size, shuffle=True)

    def pad_or_truncate(self, lst):
        if len(lst) < self.desired_length:
            return lst + [0] * (self.desired_length - len(lst))
        else:
            return lst[:self.desired_length]
        
    def __encode_data(self, data_X, data_y):
        label_encoder = self._encoder['label']
        # 保存 label_encoder 到文件
        label_encoder.fit(list(set(data_y)))
        with open('label_encoder.pkl', 'wb') as file:
            pickle.dump(label_encoder, file)

        # 获取标签和对应的数值
        label_to_num = {label: idx for idx, label in enumerate(label_encoder.classes_)}

        # 保存到 JSON 文件
        with open('label_encoder.json', 'w') as file:
            json.dump(label_to_num, file)

        data_X = np.pad(data_X, ((0, 0), (0, self.desired_length - len(data_X[0]))), 'constant').reshape(-1, 1, self.desired_length)
        data_y = label_encoder.transform(data_y)
        return data_X, data_y

    """将数据拆分为训练集和测试集，并转换为TensorDataset对象"""
    def __split_data_to_tensor(self, data_X, data_y):
        X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.3)
        train_dataset = TensorDataset(
            torch.from_numpy(X_train.astype(np.float32)),
            torch.from_numpy(y_train.astype(np.int64))
        )
        test_dataset = TensorDataset(
            torch.from_numpy(X_test.astype(np.float32)),
            torch.from_numpy(y_test.astype(np.int64))
        )
        return train_dataset, test_dataset


dataset = YtData(batch_size, r'E:\ZLJ_code\yt-url-har-pcap\data\quic_chunk\train_chunk.csv', r'E:\ZLJ_code\yt-url-har-pcap\data\quic_chunk\test_chunk.csv')

(379, 100)
(220, 100)


In [57]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm
import os

# 设置CUDA_VISIBLE_DEVICES环境变量为0，这将使得CUDA程序只能看到第0个GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# 神经网络参数
batch_size = 16
learning_rate = 1e-2
num_epoches = 100
USE_GPU = False
num_class = 191

# 定义Self-Attention模块
class SelfAttention(nn.Module):
    def __init__(self, in_dim):
        super(SelfAttention, self).__init__()
        self.query_conv = nn.Conv1d(in_dim, in_dim // 8, kernel_size=1)
        self.key_conv = nn.Conv1d(in_dim, in_dim // 8, kernel_size=1)
        self.value_conv = nn.Conv1d(in_dim, in_dim, kernel_size=1)
        self.gamma = nn.Parameter(torch.zeros(1))
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        batch_size, C, width = x.size()
        query = self.query_conv(x).view(batch_size, -1, width).permute(0, 2, 1)
        key = self.key_conv(x).view(batch_size, -1, width)
        energy = torch.bmm(query, key)
        attention = self.softmax(energy)
        value = self.value_conv(x).view(batch_size, -1, width)
        out = torch.bmm(value, attention.permute(0, 2, 1))
        out = out.view(batch_size, C, width)
        out = self.gamma * out + x
        return out

# 定义包含Self-Attention的CNN模型
class CNN(nn.Module):
    def __init__(self, in_dim, num_class):
        super(CNN, self).__init__()

        self.conv1 = nn.Sequential(       
            nn.Conv1d(1, 32, 5, 1, 2),
            nn.BatchNorm1d(32), 
            nn.ELU(),                     
            nn.Conv1d(32, 32, 5, 1, 2),
            nn.BatchNorm1d(32), 
            nn.ELU(),           
            nn.MaxPool1d(3, 3, 0),
            nn.Dropout(0.1)
        )
        self.attention1 = SelfAttention(32)

        self.conv2 = nn.Sequential(       
            nn.Conv1d(32, 64, 5, 1, 2),
            nn.BatchNorm1d(64), 
            nn.ReLU(),                     
            nn.Conv1d(64, 64, 5, 1, 2),
            nn.BatchNorm1d(64), 
            nn.ReLU(),           
            nn.MaxPool1d(3, 3, 0),
            nn.Dropout(0.1)
        )
        self.attention2 = SelfAttention(64)

        self.out1 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(704, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.1)
        )
        self.out2 = nn.Sequential(
            nn.Linear(256, num_class)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.attention1(x)
        x = self.conv2(x)
        x = self.attention2(x)
        output = self.out1(x)
        output = self.out2(output)
        return output

# 实例化模型
model = CNN(in_dim=1, num_class=191)

def train():
    global model
    best_acc = 0.0
    patience = 20
    patience_counter = 0

    if USE_GPU:
        model = model.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    for epoch in range(num_epoches):
        print('epoch {}'.format(epoch + 1))
        print('*' * 10)
        running_loss = 0.0
        running_acc = 0.0
        model.train()  # 设置模型为训练模式
        for i, data in tqdm(enumerate(dataset.train_dataloader, 1)):
            img, label = data
            if USE_GPU:
                img = img.cuda()
                label = label.cuda()
            img = Variable(img)
            label = Variable(label)
            # 向前传播
            out = model(img)
            loss = criterion(out, label)
            running_loss += loss.item() * label.size(0)
            _, pred = torch.max(out, 1)
            num_correct = (pred == label).sum()
            accuracy = (pred == label).float().mean()
            running_acc += num_correct.item()
            # 向后传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print('Finish {} epoch, Loss: {:.6f}, Acc: {:.6f}'.format(
            epoch + 1, running_loss / (len(dataset.train_dataset)), running_acc / (len(
                dataset.train_dataset))))
        
        model.eval()  # 设置模型为评估模式
        eval_loss = 0.0
        eval_acc = 0.0
        for data in dataset.test_dataloader:
            img, label = data
            if USE_GPU:
                img = img.cuda()
                label = label.cuda()
            with torch.no_grad():
                out = model(img)
                loss = criterion(out, label)
            eval_loss += loss.item() * label.size(0)
            _, pred = torch.max(out, 1)
            num_correct = (pred == label).sum()
            eval_acc += num_correct.item()
        print('Validation Loss: {:.6f}, Acc: {:.6f}'.format(
            eval_loss / (len(dataset.test_dataloader)), eval_acc / (len(dataset.test_dataloader))))
        
        # 保存最好的模型
        if eval_acc > best_acc:
            best_acc = eval_acc
            torch.save(model.state_dict(), r'E:\ZLJ_code\yt-url-har-pcap\data\model\model_p100.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            # 如果验证准确率在10个epoch内没有提高，停止训练
            if patience_counter >= patience:
                print('Early stopping')
                break

train()

# 创建一个示例输入张量，假设输入为单通道1D数据
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
dummy_input = torch.randn(batch_size, 1, 100)  # 调整为符合实际输入维度
dummy_input = dummy_input.to(device)

# 保存为ONNX格式
onnx_model_path = r"E:\ZLJ_code\yt-url-har-pcap\data\model\model_p100.onnx"
torch.onnx.export(model, dummy_input, onnx_model_path,
                  input_names=['input'], output_names=['output'],
                  dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}})

print(f"Model saved in ONNX format at {onnx_model_path}")


epoch 1
**********


24it [00:00, 39.65it/s]


Finish 1 epoch, Loss: 5.359348, Acc: 0.002639
Validation Loss: 82.613076, Acc: 0.071429
epoch 2
**********


24it [00:00, 40.99it/s]


Finish 2 epoch, Loss: 5.206365, Acc: 0.015831
Validation Loss: 82.118307, Acc: 0.000000
epoch 3
**********


24it [00:00, 41.56it/s]


Finish 3 epoch, Loss: 5.095560, Acc: 0.018470
Validation Loss: 82.026542, Acc: 0.000000
epoch 4
**********


24it [00:00, 36.80it/s]


Finish 4 epoch, Loss: 4.992971, Acc: 0.026385
Validation Loss: 81.574766, Acc: 0.142857
epoch 5
**********


24it [00:00, 38.63it/s]


Finish 5 epoch, Loss: 4.928925, Acc: 0.044855
Validation Loss: 81.402228, Acc: 0.071429
epoch 6
**********


24it [00:00, 37.60it/s]


Finish 6 epoch, Loss: 4.850379, Acc: 0.058047
Validation Loss: 80.930936, Acc: 0.142857
epoch 7
**********


24it [00:00, 38.94it/s]


Finish 7 epoch, Loss: 4.763184, Acc: 0.060686
Validation Loss: 80.687536, Acc: 0.357143
epoch 8
**********


24it [00:00, 39.84it/s]


Finish 8 epoch, Loss: 4.701479, Acc: 0.092348
Validation Loss: 80.769786, Acc: 0.071429
epoch 9
**********


24it [00:00, 39.84it/s]


Finish 9 epoch, Loss: 4.594361, Acc: 0.121372
Validation Loss: 80.541719, Acc: 0.142857
epoch 10
**********


24it [00:00, 40.04it/s]


Finish 10 epoch, Loss: 4.551549, Acc: 0.102902
Validation Loss: 80.403694, Acc: 0.000000
epoch 11
**********


24it [00:00, 35.70it/s]


Finish 11 epoch, Loss: 4.443057, Acc: 0.131926
Validation Loss: 80.417292, Acc: 0.214286
epoch 12
**********


24it [00:00, 39.45it/s]


Finish 12 epoch, Loss: 4.388747, Acc: 0.142480
Validation Loss: 80.208597, Acc: 0.142857
epoch 13
**********


24it [00:00, 39.64it/s]


Finish 13 epoch, Loss: 4.331512, Acc: 0.163588
Validation Loss: 80.220732, Acc: 0.214286
epoch 14
**********


24it [00:00, 40.24it/s]


Finish 14 epoch, Loss: 4.231518, Acc: 0.205805
Validation Loss: 79.715177, Acc: 0.142857
epoch 15
**********


24it [00:00, 39.64it/s]


Finish 15 epoch, Loss: 4.177845, Acc: 0.189974
Validation Loss: 79.712695, Acc: 0.285714
epoch 16
**********


24it [00:00, 41.14it/s]


Finish 16 epoch, Loss: 4.072469, Acc: 0.253298
Validation Loss: 79.424798, Acc: 0.357143
epoch 17
**********


24it [00:00, 39.91it/s]


Finish 17 epoch, Loss: 4.021356, Acc: 0.277045
Validation Loss: 79.903173, Acc: 0.142857
epoch 18
**********


24it [00:00, 40.31it/s]


Finish 18 epoch, Loss: 3.972945, Acc: 0.292876
Validation Loss: 79.859862, Acc: 0.285714
epoch 19
**********


24it [00:00, 39.71it/s]


Finish 19 epoch, Loss: 3.930997, Acc: 0.271768
Validation Loss: 80.029800, Acc: 0.357143
epoch 20
**********


24it [00:00, 39.00it/s]


Finish 20 epoch, Loss: 3.842627, Acc: 0.319261
Validation Loss: 79.613483, Acc: 0.285714
epoch 21
**********


24it [00:00, 39.97it/s]


Finish 21 epoch, Loss: 3.769375, Acc: 0.319261
Validation Loss: 79.339731, Acc: 0.357143
epoch 22
**********


24it [00:00, 40.31it/s]


Finish 22 epoch, Loss: 3.714817, Acc: 0.361478
Validation Loss: 79.315769, Acc: 0.428571
epoch 23
**********


24it [00:00, 33.94it/s]


Finish 23 epoch, Loss: 3.623687, Acc: 0.424802
Validation Loss: 79.110590, Acc: 0.357143
epoch 24
**********


24it [00:00, 40.51it/s]


Finish 24 epoch, Loss: 3.574956, Acc: 0.416887
Validation Loss: 79.095878, Acc: 0.428571
epoch 25
**********


24it [00:00, 39.97it/s]


Finish 25 epoch, Loss: 3.510256, Acc: 0.451187
Validation Loss: 79.162334, Acc: 0.428571
epoch 26
**********


24it [00:00, 40.04it/s]


Finish 26 epoch, Loss: 3.436777, Acc: 0.440633
Validation Loss: 79.156963, Acc: 0.428571
epoch 27
**********


24it [00:00, 39.51it/s]


Finish 27 epoch, Loss: 3.374279, Acc: 0.459103
Validation Loss: 79.211061, Acc: 0.285714
epoch 28
**********


24it [00:00, 39.91it/s]


Finish 28 epoch, Loss: 3.334465, Acc: 0.488127
Validation Loss: 79.438284, Acc: 0.357143
epoch 29
**********


24it [00:00, 40.65it/s]


Finish 29 epoch, Loss: 3.276384, Acc: 0.503958
Validation Loss: 79.187579, Acc: 0.285714
epoch 30
**********


24it [00:00, 38.94it/s]


Finish 30 epoch, Loss: 3.202941, Acc: 0.530343
Validation Loss: 79.473958, Acc: 0.357143
epoch 31
**********


24it [00:00, 39.13it/s]


Finish 31 epoch, Loss: 3.104949, Acc: 0.601583
Validation Loss: 79.123076, Acc: 0.357143
epoch 32
**********


24it [00:00, 39.26it/s]


Finish 32 epoch, Loss: 3.066061, Acc: 0.567282
Validation Loss: 79.265842, Acc: 0.428571
epoch 33
**********


24it [00:00, 37.48it/s]


Finish 33 epoch, Loss: 2.982900, Acc: 0.588391
Validation Loss: 79.259558, Acc: 0.285714
epoch 34
**********


24it [00:00, 39.00it/s]


Finish 34 epoch, Loss: 2.931835, Acc: 0.609499
Validation Loss: 79.032227, Acc: 0.428571
epoch 35
**********


24it [00:00, 39.13it/s]


Finish 35 epoch, Loss: 2.876575, Acc: 0.630607
Validation Loss: 79.120670, Acc: 0.357143
epoch 36
**********


24it [00:00, 39.13it/s]


Finish 36 epoch, Loss: 2.830942, Acc: 0.643799
Validation Loss: 79.294799, Acc: 0.357143
epoch 37
**********


24it [00:00, 38.32it/s]


Finish 37 epoch, Loss: 2.735603, Acc: 0.643799
Validation Loss: 79.142222, Acc: 0.428571
epoch 38
**********


24it [00:00, 39.58it/s]


Finish 38 epoch, Loss: 2.668643, Acc: 0.680739
Validation Loss: 79.143026, Acc: 0.357143
epoch 39
**********


24it [00:00, 41.92it/s]


Finish 39 epoch, Loss: 2.627559, Acc: 0.691293
Validation Loss: 78.963112, Acc: 0.357143
epoch 40
**********


24it [00:00, 38.88it/s]


Finish 40 epoch, Loss: 2.559274, Acc: 0.701847
Validation Loss: 79.564207, Acc: 0.357143
epoch 41
**********


24it [00:00, 39.51it/s]


Finish 41 epoch, Loss: 2.479206, Acc: 0.733509
Validation Loss: 78.993339, Acc: 0.285714
epoch 42
**********


24it [00:00, 39.58it/s]


Finish 42 epoch, Loss: 2.438726, Acc: 0.725594
Validation Loss: 78.922773, Acc: 0.428571
Early stopping
Model saved in ONNX format at E:\ZLJ_code\yt-url-har-pcap\data\model\model_p100.onnx


In [58]:
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# 加载最好的模型
model.load_state_dict(torch.load(r'E:\ZLJ_code\yt-url-har-pcap\data\model\model_p100.pth'))
# model = model.cuda()

model.eval()  # 设置模型为评估模式
y_true = []
y_pred = []
for data in dataset.train_dataloader:
    img, label = data
    if True:
        # img = img.cuda()
        # label = label.cuda()
        ...
    with torch.no_grad():
        out = model(img)
    _, pred = torch.max(out, 1)
    y_true.extend(label.cpu().numpy())
    y_pred.extend(pred.cpu().numpy())

# 生成分类报告
report = classification_report(y_true, y_pred, output_dict=True)

# 获取f1-score的macro avg和accuracy
f1_score_macro_avg = report['macro avg']['f1-score']
accuracy = report['accuracy']


# 输出混淆矩阵和分类报告
print('Confusion Matrix:')
print(confusion_matrix(y_true, y_pred))
print('Classification Report:')
print(classification_report(y_true, y_pred, digits=4))

Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 3 ... 0 0 0]
 ...
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]]
Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.5000    0.6667         2
           1     0.0000    0.0000    0.0000         1
           2     0.2727    1.0000    0.4286         3
           3     1.0000    0.5000    0.6667         2
           4     1.0000    0.3333    0.5000         3
           5     0.0000    0.0000    0.0000         2
           6     1.0000    0.6667    0.8000         3
           7     0.0000    0.0000    0.0000         1
           8     1.0000    0.5000    0.6667         2
           9     1.0000    1.0000    1.0000         1
          10     0.0000    0.0000    0.0000         1
          11     1.0000    0.5000    0.6667         2
          12     0.0000    0.0000    0.0000         2
          13     0.5000    0.7500    0.6000         4
          14     1.0000    1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
