# 简介

在这个Notebook里面，我将尝试利用现有技术实现一个基于Transformer的入侵检测系统并尽可能对其进行改进。首先，我将描述本架构的各个组成部分，在阐明其原理后对模型进行训练与评估，并在最后在相同问题下将该架构与其他架构进行对比。

以下是为训练此模型而导入的包：

In [1]:
# !pip install einops
# !pip install numpy
# !pip install pandas
# !pip install scikit-learn
# !pip install imbalanced-learn
# !pip install scikit-image
# !pip install torchsummary
# !pip install rtdl_num_embeddings
!pip install seaborn

# 结构

## 嵌入(Embedding)

在该部分里，数值信息将被转换为可被Transformer识别的特征值并提供给后面的结构进行学习。

$ReLU(Linear(Periodic(x_i)))$

In [2]:
from torch import nn
import torch
from einops import rearrange

class Embedder(nn.Module):
    def __init__(self, dim, num_numerical_types=25):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(num_numerical_types, dim))
        self.biases = nn.Parameter(torch.randn(num_numerical_types, dim))

    def forward(self, x):
        x = rearrange(x, 'b n -> b n 1')
        return x * self.weights + self.biases

## 位置编码(Positional Encoding)

由于嵌入产生的信息并不带有位置信息，故需要在学习前将位置信息写入训练数据中。

$$PE(pos, 2i) = \sin(pos/1000^{2i/d\_model})$$
$$PE(pos, 2i+1) = \cos(pos/1000^{2i/d\_model})$$

In [3]:
import torch
from torch import nn
from torch.autograd import Variable
from math import cos, sin, sqrt

class Positional_Encoder(nn.Module):
    def __init__(self, d_model, max_seq_len = 25):
        super().__init__()
        self.d_model = d_model

        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0,d_model,2):
                pe[pos,i] = sin(pos / (1000**(i/d_model)))
                pe[pos,i+1] = cos(pos / (1000**(i/d_model)))

        pe = pe.unsqueeze(0)
        self.register_buffer("pe",pe)

    def forward(self, x):
        x = x * sqrt(self.d_model)
        seq_len = x.size(1)
        y = Variable(self.pe[:,:seq_len],requires_grad=False).cuda()
        x = x + y
        return x

## 注意力机制(Attention)

本部分用于对输入向量进行处理以获取输入与输出间的对应关系。具体计算过程如下：

$$Attention(Q,K,V) = softmax\left(M+\frac{QK^T}{\sqrt{d_k}}\right)$$

In [4]:
import torch
from torch import nn
from torch.nn import functional as F
import math

def attention(q, k, v, d_k, mask=None, dropout=None):
    scores = torch.matmul(q, k.transpose(-2, -1)) /  math.sqrt(d_k)
    if mask is not None:
      shape = scores.shape
      mask = get_mask(shape[0], 8, 10)
      scores = scores.masked_fill(mask == 0, -1e9)
    scores = F.softmax(scores, dim=-1)

    if dropout is not None:
        scores = dropout(scores)

    output = torch.matmul(scores, v)
    return output

class Multi_Head_Attention(nn.Module):
    def __init__(self, heads, d_model, dropout):
        super().__init__()
        self.d_model = d_model
        self.d_k = d_model // heads
        self.h = heads

        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):

        bs = q.size(0)

        # perform linear operation and split into h heads
        k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
        q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
        v = self.v_linear(v).view(bs, -1, self.h, self.d_k)

        # transpose to get dimensions bs * h * sl * d_model
        k = k.transpose(1,2)
        q = q.transpose(1,2)
        v = v.transpose(1,2)

        scores = attention(q, k, v, self.d_k, mask, self.dropout)

        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.d_model)

        output = self.out(concat)
        return output

## 掩蔽(Masks)

该部分用于对输入解码器(Decoder)的向量进行掩蔽来使得解码器能够根据上下文来推算结果。

In [5]:
import numpy as np
import torch
from torch.autograd import Variable

def get_mask(batch_size, heads, seq_size):
    mask_prob = 0.2
    mask = torch.rand((batch_size, heads, seq_size, seq_size)) > mask_prob
    return mask.cuda()

## 前馈网络(Feed-Forward Network)

该部分主要用于记忆注意力机制计算所产生的的关系。具体原理如下：

$$FFN(x) = \max(0,xW_1 + b_1)W_2 + b_2$$

In [6]:
from torch import nn
from torch.nn import functional as F

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=1024, dropout = 0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

## 层归一化(Layer Norm)

主要用于使不同范围的数据归一化到[0, 1]区间内，方便模型进行处理，具体原理如下：

$$LN(x) = \frac{x - \mu}{\delta}\cdot \alpha + \beta$$

In [None]:
from torch import nn
import torch

class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()

        self.size = d_model

        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
        / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

## 编码器(Encoder)

整个编码器由数个编码器嵌入器、数个编码器层与层归一化层组成。其中，一个编码器层由层归一化、注意力机制与前馈神经网络层组成，数据在经过这些处理后由dropout方法进行输出。

In [None]:
from torch import nn
import copy

def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

class Encoder_Layer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = Multi_Head_Attention(heads, d_model, dropout)
        self.feedf = FeedForward(d_model).cuda()
        self.dropout_1 = nn.Dropout(dropout).cuda()
        self.dropout_2 = nn.Dropout(dropout).cuda()

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.feedf(x2))
        return x

class Encoder(nn.Module):
    def __init__(self, d_model, N, heads, dropout = 0.1):
        super().__init__()
        self.N = N
        self.embed = Embedder(d_model)
        self.pe = Positional_Encoder(d_model)
        self.layers = get_clones(Encoder_Layer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask = None):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x,mask)
        return self.norm(x)

## 解码器

In [9]:
import torch
from torch import nn
import copy

def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

class Decoder_Layer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.attn = Multi_Head_Attention(heads, d_model, dropout)
        self.msk_attn = Multi_Head_Attention(heads, d_model, dropout)
        self.feedf = FeedForward(d_model).cuda()
        self.dropout_1 = nn.Dropout(dropout).cuda()
        self.dropout_2 = nn.Dropout(dropout).cuda()
        self.dropout_3 = nn.Dropout(dropout).cuda()

    def forward(self, x, e_outputs, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.msk_attn(x2,x2,x2,mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn(x2,e_outputs,e_outputs,mask=None))
        x2 = self.norm_3(x)
        x = x +self.dropout_3(self.feedf(x2))
        return x

class Decoder(nn.Module):
    def __init__(self, d_model, N, heads, dropout = 0.1):
        super().__init__()
        self.N = N
        self.embed = Embedder(d_model)
        self.pe = Positional_Encoder(d_model,1)
        self.layers = get_clones(Decoder_Layer(d_model, heads, dropout), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, mask = None):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x,e_outputs,mask)
        return self.norm(x)


## 整体架构定义

In [10]:
import torch
from torch import nn

class Transformer(nn.Module):
    def __init__(self, trg_vocab, d_model, N, heads, dropout = 0.1):
        super().__init__()
        self.encoder = Encoder(d_model, N , heads, dropout)
        self.decoder = Decoder(d_model, N , heads, dropout)
        self.out = nn.Linear(25*d_model, trg_vocab)

    def forward(self, src, trg_mask=None):
        e_outputs = self.encoder(src, None)
        d_output = self.decoder(src, e_outputs, trg_mask)
        d_intermediate = d_output.view(d_output.size(0), -1)
        output = self.out(d_intermediate)
        output = torch.softmax(output,dim=1)
        return output

# 数据集与数据处理

## NF-CEC-CIC-IDS2018-v2
下载数据集:https://rdm.uq.edu.au/files/ce5161d0-ef9c-11ed-827d-e762de186848

昆士兰大学Sarhan团队利用了CSE-CIC-IDS2018数据集的原始pcap文件生成了一个基于NetFlow的数据集，称为NF-CSE-CIC-IDS2018-v2。总的流量数量为18,893,708，其中2,258,141（11.95%）是攻击样本，16,635,567（88.05%）是良性样本。

## 数据处理
本部分用于对数据进行预处理。由于本数据集的特殊性——正常流量远多于异常攻击流量，因此需要对数据进行下采样与SMOTE处理，前者用于限制数量较多的样本的数量，后者用于对数量较少的样本进行插值处理以增加其数量。

In [14]:
import pandas as pd
import os
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from PIL import Image
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from skimage.feature import local_binary_pattern
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
import glob

# 声明全局变量
directory_path = 'SIDD/'


# 定义数据集类
class CICIDSDataset(Dataset):
    def __init__(self, data):
        self.features = data[:, :-3]  # 特征列
        self.at_type = data[:, -3]  # 攻击类型列
        self.labels = data[:, -2:]  # 1 Hot Encoded Label

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        feature = torch.from_numpy(self.features[idx]).float()
        label = self.labels[idx]
        at_type = self.at_type[idx]

        return feature, label, at_type


def data_process(data, label_mapping):
    # Undersampling & SMOTE
    max_class_size = 100000  # Size of all Classes for Undersampling
    class_counts = data['attack_type'].value_counts()
    classes_to_undersample = class_counts[class_counts > max_class_size]

    under_sampler = RandomUnderSampler(sampling_strategy={
        label: 7 * max_class_size if label_mapping[
                                         label] == "Benign" else max_class_size if label in classes_to_undersample else
        class_counts[label] for label in np.unique(data['attack_type'])
    }, random_state=42)
    nn_estimator = NearestNeighbors(n_neighbors=5, n_jobs=-1)
    smote = SMOTE(sampling_strategy={
        label: 7 * max_class_size if label_mapping[label] == "Benign" else max_class_size for label in
        np.unique(data['attack_type'])
    }, k_neighbors=nn_estimator, random_state=42)
    scaler = MinMaxScaler()

    features = data.drop(columns=['attack_type', 'labels'])
    labels = data['attack_type']

    print('Under sampling...')
    sampled_features, sampled_labels = under_sampler.fit_resample(features, labels)
    print('Using SMOTE...')
    balanced_features, balanced_labels = smote.fit_resample(sampled_features, sampled_labels)
    print('Scaling...')
    scaled_data = scaler.fit_transform(balanced_features)

    data = pd.DataFrame(data=scaled_data, columns=features.columns)
    data = pd.concat([data, balanced_labels], axis=1)

    # 1 Hot Encoding
    data['Attack_Label'] = data['attack_type'].apply(lambda x: "Attack" if label_mapping[x] != "Benign" else "Benign")
    print('Label encoding...')
    encoded_labels = pd.get_dummies(data['Attack_Label'], prefix='', prefix_sep='')
    data = pd.concat([data, encoded_labels], axis=1)
    data.drop('Attack_Label', axis=1, inplace=True)

    label_counts = data['attack_type'].value_counts()
    print(label_counts)
    return data


def pca(feature, label, target_variance=0.95):
    print('Using PCA...')
    scaler = StandardScaler()
    feature = np.nan_to_num(scaler.fit_transform(feature))
    Iter = PCA(svd_solver='randomized')
    feature = Iter.fit_transform(feature)
    explained_variance_ratio = Iter.explained_variance_ratio_
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    n_components = np.argmax(cumulative_variance_ratio >= target_variance) + 1
    Iter = PCA(n_components=n_components, svd_solver='randomized')
    feature = pd.DataFrame(Iter.fit_transform(feature))
    data = pd.concat([feature, label], axis=1)
    return data


# 定义提取 LBP 特征的函数
def lbp_feature_extraction(image):
    # 将 PIL 图像转换为 NumPy 数组
    image_array = np.array(image)

    # 计算 LBP 特征
    lbp_image = local_binary_pattern(image_array, 8, 1, method='uniform')

    # 将 LBP 图像展平成一维向量
    lbp_feature = lbp_image.ravel()

    return lbp_feature


def feature_extraction():
    # 初始化特征、标签和攻击类型列表
    features = []
    labels = []
    attack_types = []

    # 图像预处理
    transform = transforms.Compose([
        transforms.Grayscale(),  # 转为灰度图
        transforms.Resize((48, 48)),  # 调整大小为48x48
    ])

    # 遍历基础文件夹中的 nxxx 文件夹
    cnt = 1
    for root, dirs, files in os.walk(directory_path):
        for dir_name in dirs:
            # 检查文件夹名称是否符合 nxxx 结构
            if dir_name.startswith('n'):
                nxxx_folder = os.path.join(root, dir_name)

                # 寻找 pcap 文件夹
                pcap_folder = os.path.join(nxxx_folder, 'pcap')

                if os.path.exists(pcap_folder):
                    # 遍历 pcap 文件夹中的 nxxx_xxxx 文件夹
                    for pcap_subfolder in os.listdir(pcap_folder):
                        subfolder_path = os.path.join(pcap_folder, pcap_subfolder)

                        # 确定攻击类型
                        attack_type = int(pcap_subfolder.split('_')[-1])  # 提取最后一位数字并转换为整数

                        # 寻找 dataset 文件夹中的图片文件
                        dataset_folder = os.path.join(subfolder_path, 'dataset')
                        benign_folder = os.path.join(dataset_folder, 'benign')
                        malicious_folder = os.path.join(dataset_folder, 'malicious')


                        # 提取 benign 文件夹中的图片特征并标注数据类型
                        if os.path.exists(benign_folder):
                            for image_name in os.listdir(benign_folder):
                                image_path = os.path.join(benign_folder, image_name)
                                image = Image.open(image_path)

                                # 图像预处理
                                image = transform(image)

                                # 使用 LBP 提取特征
                                lbp_feature = lbp_feature_extraction(image)

                                # 添加特征和标签到列表中
                                features.append(lbp_feature.tolist())
                                labels.append(0)  # 标注为 0
                                attack_types.append(2)

                        # 提取 malicious 文件夹中的图片特征并标注数据类型
                        if os.path.exists(malicious_folder):
                            for image_name in os.listdir(malicious_folder):
                                image_path = os.path.join(malicious_folder, image_name)
                                image = Image.open(image_path)

                                # 图像预处理
                                image = transform(image)

                                # 使用 LBP 提取特征
                                lbp_feature = lbp_feature_extraction(image)

                                # 添加特征和标签到列表中
                                features.append(lbp_feature.tolist())
                                attack_types.append(attack_type)
                                labels.append(1)  # 标注为 1

                        # 创建包含唯一列名的特征列列表

                features = np.array(features)
                features.reshape(-1, 2304)
                feature_columns = [f'feature_{i}' for i in range(features.shape[1])]

                # 将每个特征向量作为单独的列添加到 DataFrame 中
                data = pd.DataFrame(features, columns=feature_columns)

                # 添加标签和攻击类型列
                data['labels'] = labels
                data['attack_type'] = attack_types
                data.to_csv(directory_path + "preprocessed/data" + str(cnt) + ".csv.gz", index=False,
                            compression='gzip')

                features = []
                labels = []
                attack_types = []
                data = []

    data = glob.glob(directory_path + 'preprocessed/' + '*.gz')
    dataframes = []
    for file in data:
        dataframe = pd.read_csv(file, compression='gzip')
        dataframe = pca(dataframe.drop(columns=['labels', 'attack_type']), dataframe[['labels', 'attack_type']])
        dataframes.append(dataframe)
    data = pd.concat(dataframes, ignore_index=True)
    del dataframes

    return data

    # 将 DataFrame 保存为 gzip 压缩的 CSV 文件
    '''if not os.path.exists(directory_path + 'preprocessed'):
        os.makedirs(directory_path + 'preprocessed')
    data.to_csv(output_file, index=False, compression='gzip')
    print("saved")'''


# 定义加载数据的函数
def load_data():
    if os.path.exists(directory_path + "preprocessed/data.csv.gz"):
        print("Loading Preprocessed Data")
        data = pd.read_csv(directory_path + "preprocessed/data.csv.gz", compression='gzip')

        data.iloc[:, :-3] = data.iloc[:, :-3].astype('float64')
        data.iloc[:, -3:] = data.iloc[:, -3:].astype('int')

    else:
        atk_dict = {1: 'SMB attack', 3: 'SYN flood', 2: 'Benign'}

        # data = feature_extraction()
        data = glob.glob(directory_path + 'preprocessed/' + '*.gz')
        dataframes = []
        for file in data:
            dataframe = pd.read_csv(file, compression='gzip')
            
            dataframes.append(dataframe)
        data = pd.concat(dataframes, ignore_index=True)
        del dataframes
        
        data = pca(data.drop(columns=['labels', 'attack_type']), data[['labels', 'attack_type']])
        
        # 调用 data_process 函数对 PCA 后的数据进行处理
        data = data_process(data, atk_dict)

        if not os.path.exists(directory_path + 'preprocessed'):
            os.makedirs(directory_path + 'preprocessed')
        data.to_csv(directory_path + "preprocessed/data.csv.gz", index=False, compression='gzip')
        print("saved")

        data.iloc[:, :-3] = data.iloc[:, :-3].astype('float64')
        data.iloc[:, -3:] = data.iloc[:, -3:].astype('int')

    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, val_data = train_data.values, val_data.values

    return train_data, val_data

def get_data_loader(data, batch_size):
    cicids_dataset = CICIDSDataset(data)

    return DataLoader(cicids_dataset, batch_size=batch_size, shuffle=True)

# 模型训练

定义训练与评估函数

In [12]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
import time
import os
import torch
from torch.nn import functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, r2_score, \
    mean_squared_error, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay


def plot_metrics(losses, accuracies, roc_auc_scores, confusion_matrix):
    # 绘制损失和准确率曲线
    epochs = range(1, len(losses) + 1)
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, losses, 'r', label='Training Loss')
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs, accuracies, 'b', label='Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # 绘制ROC曲线
    plt.figure(figsize=(6, 6))
    for i in range(len(roc_auc_scores)):
        fpr, tpr, _ = roc_auc_scores[i]
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc_scores[i][-1]:0.2f})')
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # 绘制混淆矩阵
    plt.figure(figsize=(8, 6))
    sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.show()


def eval_model(model, loader):
    model.cuda()
    model.eval()
    losses = []
    correct = 0
    predictions = []
    targets = []

    with torch.no_grad():
        for data, target, _ in loader:
            data, target = data.to("cuda"), target.to("cuda")
            output = model(data)
            loss = F.cross_entropy(output, target).item()
            losses.append(loss)
            pred = torch.argmax(output, dim=1)
            correct += torch.sum(pred == target).item()
            predictions.extend(pred.cpu().numpy())
            targets.extend(target.cpu().numpy())

    eval_loss = np.mean(losses)
    eval_acc = correct / len(loader.dataset)

    # 计算精确率、召回率、F1 分数
    precision = precision_score(targets, predictions, average='weighted')
    recall = recall_score(targets, predictions, average='weighted')
    f1 = f1_score(targets, predictions, average='weighted')

    # 计算 ROC 曲线和 AUC
    try:
        auc = roc_auc_score(targets, predictions, average='weighted')
    except ValueError:
        auc = None

    # 计算混淆矩阵
    conf_matrix = confusion_matrix(targets, predictions)

    # 计算 R-squared
    r_squared = r2_score(targets, predictions)

    # 计算均方误差
    mse = mean_squared_error(targets, predictions)

    # 计算对数损失
    try:
        logloss = log_loss(targets, predictions)
    except ValueError:
        logloss = None

    print("Loss:", eval_loss, "Accuracy:", eval_acc)
    print("Precision:", precision, "Recall:", recall, "F1 Score:", f1)
    print("AUC:", auc)
    print("Confusion Matrix:")
    print(conf_matrix)
    print("R-squared:", r_squared)
    print("Mean Squared Error:", mse)
    print("Log Loss:", logloss)

    return eval_loss, eval_acc, precision, recall, f1, auc, conf_matrix, r_squared, mse, logloss


def train_model(model, opt, epochs, data, eval_data, path, print_every=100):
    model.cuda()

    pretrained_path = "SIDD/pretrained"
    top_acc = 0.

    if os.path.exists(pretrained_path + "/" + path):
        print("Loading Pretrained Model")
        state = torch.load(pretrained_path + "/" + path)
        model.load_state_dict(state["model_state_dict"])
        start_epoch = state["epoch"] + 1
        losses = state["ep_loss"]
        accs = state["ep_acc"]
        top_acc = max(accs)
    else:
        start_epoch = 0
        losses, accs = [], []
        try:
            os.mkdir(pretrained_path)
        except OSError as error:
            pass

    start = time.time()
    temp = start

    for epoch in range(start_epoch, epochs):
        model.train()
        total_loss = 0
        for i, batch in enumerate(data):
            src, trg, _ = batch
            src, trg = src.cuda(), trg.cuda()

            if isinstance(model, Transformer):
                trg_mask = get_mask(128, 8, 25)
            else:
                trg_mask = None

            preds = model(src, trg_mask)
            opt.zero_grad()
            loss = F.cross_entropy(preds, trg)
            loss.backward()
            opt.step()

            total_loss += loss.data
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, \
                %ds per %d iters" % ((time.time() - start) // 60,
                                     epoch + 1, i + 1, loss_avg, time.time() - temp,
                                     print_every))
                total_loss = 0
                temp = time.time()
        eval_loss, eval_acc, precision, recall, f1, auc, conf_matrix, r_squared, mse, logloss = eval_model(model, eval_data)

        losses.append(ep_loss)
        accs.append(ep_acc)
        if ep_acc > top_acc:
            top_state = {
                "model_state_dict": model.state_dict(),
                "epoch": epoch
            }
            torch.save(top_state, pretrained_path + "/max_" + path)
        state = {
            'model_state_dict': model.state_dict(),
            'epoch': epoch,
            'ep_loss': losses,
            'ep_acc': accs
        }
        torch.save(state, pretrained_path + "/" + path)

设置参数并训练模型

In [None]:
import torch
from torch import nn
from torchsummary import summary

def main():
    learning_rate = 5e-4
    batch_size = 128
    epochs = 30
    dropout_rate = 0.5
    d_model = 32
    heads = 8
    N = 6
    trg_vocab = 2

    train_data, val_data = load_data()

    train_loader = get_data_loader(train_data, batch_size)
    val_loader = get_data_loader(val_data, batch_size)

    model = Transformer(trg_vocab, d_model, N, heads, dropout_rate)
    save_path = "pretrained.pt"

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    model = model.to("cuda:0")
    summary(model, (25, ))

    optim = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_model(model, optim, epochs, train_loader, val_loader, save_path)

main()

Loading Preprocessed Data


1         1
2         1
3         1
4         1
         ..
899995    1
899996    1
899997    1
899998    1
899999    1
Name: Attack, Length: 900000, dtype: int64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  data.iloc[:, -3:] = data.iloc[:, -3:].astype('int')
1         0
2         0
3         0
4         0
         ..
899995    0
899996    0
899997    0
899998    0
899999    0
Name: Benign, Length: 900000, dtype: int64' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  data.iloc[:, -3:] = data.iloc[:, -3:].astype('int')


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
          Embedder-1               [-1, 25, 32]               0
Positional_Encoder-2               [-1, 25, 32]               0
              Norm-3               [-1, 25, 32]              32
            Linear-4               [-1, 25, 32]           1,056
            Linear-5               [-1, 25, 32]           1,056
            Linear-6               [-1, 25, 32]           1,056
           Dropout-7            [-1, 8, 25, 25]               0
            Linear-8               [-1, 25, 32]           1,056
Multi_Head_Attention-9               [-1, 25, 32]               0
          Dropout-10               [-1, 25, 32]               0
             Norm-11               [-1, 25, 32]              32
           Linear-12             [-1, 25, 1024]          33,792
          Dropout-13             [-1, 25, 1024]               0
           Linear-14               [-