In [3]:
import torch
import pandas as pd

In [4]:
import pandas as pd  
import numpy as np  
from torch.utils.data import Dataset, DataLoader, random_split
 

# 自定义数据集类，继承自Dataset类    
class CustomDataset(Dataset):
    def __init__(self, data, targets, num_classes):
        self.data = data.astype(np.float32)
        self.targets = targets.astype(np.int64)
        self.num_classes = num_classes

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x = self.data[idx]
        y = torch.zeros(self.num_classes)
        y[self.targets[idx]] = 1
        return x, y  


class GetDataObj():
    def __init__(self):
        pass
    
    # 根据dataframe创建创建DataLoader对象
    def get_dataloader(self, df, num_classes=2, batch_size=32):
        
        # 将DataFrame转换为NumPy数组  
        data = df.drop('label', axis=1).values.astype(np.float32)
        targets = df['label'].values.astype(np.int64) 
        
        # 创建自定义数据集对象  
        dataset = CustomDataset(data, targets, num_classes)  
        
        # 创建DataLoader对象，使用自定义数据集  
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        
        return dataloader
    
    def get_splited_dataloader(self, df, num_classes, batch_size=32, train_ratio=0.8):
        
        data_loader = self.get_dataloader(df, num_classes=2, batch_size=32)
        
        # 确定训练集和测试集的划分比例    
        test_ratio = 1 - train_ratio  

        # 获取数据集的总大小  
        total_size = len(data_loader.dataset)  

        # 计算训练集和测试集的大小  
        train_size = int(total_size * train_ratio)  
        test_size = total_size - train_size  

        # 使用random_split划分数据集  
        train_dataset, test_dataset = random_split(data_loader.dataset, [train_size, test_size])  

        # 重新创建DataLoader以使用新的数据集  
        train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)  
        test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
        
        return train_dataloader, test_dataloader
    
    # 根据csv文件生成有标签dataframe
    def get_df_from_featured_csv_add_label(self, featured_csv_path, label):
        
        df = pd.read_csv(featured_csv_path)
        
        # 添加label列，并赋值
        if label=='good':
            df['label'] = 0
        elif label=='bad':
            df['label'] = 1
        elif label=='unknown':
            df['label'] = 2
        else:
            raise Exception('label error')
        
        # print(df.head())
        
        return df
    
    # 根据csv文件生成无标签dataframe
    def get_df_from_featured_csv(self, featured_csv_path):
        
        df = pd.read_csv(featured_csv_path)
        
        # print(df.head())
        
        return df
    

In [5]:
op = GetDataObj()
df1 = op.get_df_from_featured_csv_add_label('data/featured_csv/good.csv', 'good')
df2 = op.get_df_from_featured_csv_add_label('data/featured_csv/bad.csv', 'bad')

df = pd.concat([df1, df2])
df

Unnamed: 0,Unnamed: 1,len_mean,len_std,time_mean,time_std,num_unkown,IP,UDP,DNS ANS,DNS Qry,IPV6,label
90.000000,0.000000,0.000000,0.000000e+00,0,1,0,0,0,0,1,0,0
78.000000,0.000000,0.000000,0.000000e+00,0,0,0,0,0,0,0,0,0
70.000000,0.000000,4.000000,1.066667e+01,0,2,0,0,0,0,2,0,0
42.000000,0.000000,1009.636364,1.010099e+07,0,0,0,0,0,0,0,0,0
344.000000,0.000000,2.000000,4.000000e+00,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1472.000000,0.000000,0.000000,0.000000e+00,0,0,0,0,0,0,0,0,1
1472.000000,0.000000,0.000000,0.000000e+00,0,0,0,0,0,0,0,0,1
1472.000000,0.000000,0.000000,0.000000e+00,0,0,0,0,0,0,0,0,1
195.185185,293.922960,728.759259,3.362227e+05,0,53,0,0,0,0,0,21,1


In [6]:
dataloader = op.get_dataloader(df, num_classes=2, batch_size=32)
train_dl,test_dl = op.get_splited_dataloader(df, num_classes=2, batch_size=32, train_ratio=0.8)

In [7]:
# 测试划分后的dataloader
# 将 DataLoader 中的数据转换成列表
data_list = []
target_list = []
for batch in train_dl:
    data, target = batch
    data_list.append(data)
    target_list.append(target)

# 将列表转换成 DataFrame
df_show = pd.DataFrame({
    'data': data_list,
    'target': target_list
})
df_show

Unnamed: 0,data,target
0,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...","[[tensor(0.), tensor(1.)], [tensor(0.), tensor..."
1,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...","[[tensor(0.), tensor(1.)], [tensor(0.), tensor..."
2,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...","[[tensor(0.), tensor(1.)], [tensor(0.), tensor..."
3,"[[tensor(3245.9077), tensor(10301687.), tensor...","[[tensor(1.), tensor(0.)], [tensor(0.), tensor..."
4,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...","[[tensor(0.), tensor(1.)], [tensor(0.), tensor..."
...,...,...
195,"[[tensor(35.7534), tensor(2092.9529), tensor(0...","[[tensor(1.), tensor(0.)], [tensor(1.), tensor..."
196,"[[tensor(35.5211), tensor(2187.0383), tensor(0...","[[tensor(1.), tensor(0.)], [tensor(0.), tensor..."
197,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...","[[tensor(0.), tensor(1.)], [tensor(0.), tensor..."
198,"[[tensor(0.), tensor(0.), tensor(0.), tensor(0...","[[tensor(0.), tensor(1.)], [tensor(0.), tensor..."


In [8]:
from torch import nn

class Net(nn.Module):
    def __init__(self, indim):
        super(Net, self).__init__()
        self.fc1 = nn.Sequential(
                                nn.Linear(indim, 32),
                                nn.BatchNorm1d(32),
                                nn.ReLU(),
                                nn.Dropout(p=0.2)
                                )
        self.fc2 = nn.Sequential(
                                nn.Linear(32, 16),
                                nn.ReLU(),
                                nn.Dropout(p=0.2)
                                )
        self.fc3 = nn.Sequential(
                                nn.Linear(16, 2),
                                )
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        

    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x



In [9]:
import torch.optim as optim
import torch.nn.functional as F

model = Net(indim=23)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)  # 使用Adam优化器 
num_epochs = 3
model.train()
for epoch in range(num_epochs): # num_epochs表示训练的轮数
    for i, (inputs, targets) in enumerate(train_dl): # 前向传播

        outputs = model(inputs)
        
        loss = criterion(outputs, targets)

        # 反向传播  
        optimizer.zero_grad()  
        loss.backward()  
        optimizer.step()  

        if (i + 1) % 100 == 0:  
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'  
                  % (epoch + 1, num_epochs, i + 1, len(train_dl), loss.item()))
torch.save(model.state_dict(), 'model/model.pth') 
            
# 训练循环结束后，进行测试  
model.eval()
test_loss = 0  # 用于累积测试损失  
correct = 0  # 用于累积正确预测的数目  
  
with torch.no_grad(): 
    for inputs, targets in test_dl: # 对于 test_dl 中的每个批次   
        outputs = model(inputs)  # 前向传播  
        
        loss = criterion(outputs, targets)  # 计算损失  
        test_loss += loss.item()  # 累积损失  
        
        # 对模型输出进行调整，将输出转换为一个包含两个元素的向量
        # 模型的最后一层没有softmax
        outputs = F.softmax(outputs, dim=1)
        # 获取最大概率的索引作为预测结果
        pred = outputs.argmax(dim=1, keepdim=True)
        
        correct += pred.eq(targets.argmax(dim=1, keepdim=True)).sum().item()  # 计算正确预测的数目
        
    test_loss /= len(test_dl)  # 计算平均测试损失    
    accuracy = 100. * correct / len(test_dl.dataset)  # 计算准确率并转化为百分比形式  
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(    
        test_loss, correct, len(test_dl.dataset), accuracy))  # 打印测试损失和准确率

Epoch [1/3], Iter [100/200] Loss: 0.4153
Epoch [1/3], Iter [200/200] Loss: 0.6206
Epoch [2/3], Iter [100/200] Loss: 0.5543
Epoch [2/3], Iter [200/200] Loss: 0.5234
Epoch [3/3], Iter [100/200] Loss: 0.5591
Epoch [3/3], Iter [200/200] Loss: 0.4656

Test set: Average loss: 0.5763, Accuracy: 1187/1595 (74%)



In [10]:
mymodel = Net(indim=23)
mymodel.load_state_dict(torch.load('model/model.pth'))

<All keys matched successfully>

In [11]:
now_df = pd.read_csv('data/featured_csv/bad.csv')
# 给定一个没有标签的featured_csv，这里用bad.csv模拟
now_df = now_df[:500]
now_df

Unnamed: 0,Unnamed: 1,len_mean,len_std,time_mean,time_std,num_unkown,IP,UDP,DNS ANS,DNS Qry,IPV6
106.000000,0.000000,2633.138941,2.277275e+06,0,8463,8463,0,0,0,0,0
126.000000,0.000000,2633.138941,2.277275e+06,0,8463,8463,0,0,0,0,0
113.751785,29.235540,3476.239852,2.088881e+07,0,31089,31076,0,15569,0,0,0
51.000000,9.000000,9117.109375,2.601011e+07,0,0,0,0,0,0,0,0
70.000000,0.000000,5196.714286,2.631596e+07,0,13,0,0,0,0,13,0
...,...,...,...,...,...,...,...,...,...,...,...
64.000000,0.000000,36530.307692,1.297428e+09,0,25,25,0,0,0,0,0
114.873303,39.805776,50155.950226,7.299693e+08,0,220,220,0,99,0,0,0
86.000000,0.000000,44806.752488,6.752191e+08,0,3918,0,0,0,0,3918,0
59.333333,5.497474,28799.888889,1.658733e+09,0,17,0,0,0,0,0,0


In [12]:
input_tensor = torch.tensor(now_df.values, dtype=torch.float32)

In [13]:
model.eval()
with torch.no_grad():
    output = mymodel(input_tensor)
probabilities = F.softmax(output, dim=1)
_, predicted_classes = torch.max(probabilities, dim=1)
predicted_labels = predicted_classes.tolist()
predicted_labels

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [14]:
dfg = pd.read_csv('data/featured_csv/good.csv')
# 给定一个没有标签的featured_csv，这里用good.csv模拟
# now_df1 = now_df1[:500]
input = torch.tensor(dfg.values, dtype=torch.float32)
model.eval()
with torch.no_grad():
    y = model(input)
p= F.softmax(y, dim=1)
_, predicted = torch.max(p, dim=1)
p_labels = predicted.tolist()
p_labels

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [16]:
import pandas as pd
'''
基于流的特征提取
'''
be_df = pd.read_csv("data/featured_csv/benign_small_flow.csv")
be_df["label"] = 0
be_df

Unnamed: 0,src,sport,dst,dport,fiat_mean,fiat_min,fiat_max,fiat_std,biat_mean,biat_min,...,fwd_urg_cnt,bwd_pst_cnt,bwd_urg_cnt,fp_hdr_len,bp_hdr_len,dp_hdr_len,f_ht_len,b_ht_len,d_ht_len,label
0,192.168.10.14,50167,54.230.38.215,443,1.713385,0.000048,5.110408,2.402088,5.047341,0.000214,...,0,0,0,216,216,432,0.812030,0.853755,0.835590,0
1,192.168.10.14,50095,13.107.4.50,80,0.000546,0.000001,0.476635,0.003788,0.000413,0.000000,...,0,2839,0,6615000,8756370,15371370,0.894015,0.023431,0.040333,0
2,192.168.10.16,37572,104.16.165.179,443,5.612169,0.000044,10.240052,4.729539,6.731596,0.000003,...,0,0,0,378,324,702,0.847534,0.897507,0.872050,0
3,192.168.10.16,55396,52.87.48.187,443,2.243377,0.000235,10.128269,3.521390,2.243628,0.000245,...,0,12,0,1458,1458,2916,0.067400,0.087980,0.076331,0
4,192.168.10.14,50178,104.88.14.9,443,3.734520,0.000011,10.031254,4.142035,3.728795,0.000213,...,0,1,0,270,270,540,0.828221,0.798817,0.815710,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,192.168.10.12,42320,139.162.9.91,443,0.112712,0.000593,0.257570,0.124699,0.112864,0.000001,...,0,3,0,432,432,864,0.286853,0.392727,0.331797,0
500,192.168.10.12,42322,139.162.9.91,443,0.300776,0.000657,0.993505,0.364367,0.250075,0.000003,...,0,2,0,324,378,702,0.332991,0.591549,0.436025,0
501,192.168.10.14,50354,192.168.10.3,445,0.016580,0.000000,0.166783,0.039613,0.018427,0.000003,...,0,12,0,1134,864,1998,0.129556,0.217687,0.157075,0
502,192.168.10.14,50355,131.253.61.80,443,0.022642,0.000003,0.110316,0.031733,0.022623,0.000044,...,0,3,0,810,810,1620,0.120914,0.045833,0.066475,0


In [17]:
ma_df = pd.read_csv("data/featured_csv/malicious_small_flow.csv")
ma_df["label"] = 1
ma_df

Unnamed: 0,src,sport,dst,dport,fiat_mean,fiat_min,fiat_max,fiat_std,biat_mean,biat_min,...,fwd_urg_cnt,bwd_pst_cnt,bwd_urg_cnt,fp_hdr_len,bp_hdr_len,dp_hdr_len,f_ht_len,b_ht_len,d_ht_len,label
0,192.168.10.5,49693,202.153.190.10,443,0.437834,0.000129,10.230919,1.998893,0.314706,0.000001,...,0,17,0,8100,11286,19386,0.487922,0.029014,0.047797,1
1,192.168.10.5,49695,202.153.190.10,443,0.501965,0.000018,10.221469,2.101407,0.349602,0.000048,...,0,18,0,6048,8694,14742,0.438452,0.031970,0.051594,1
2,192.168.10.5,49697,202.153.190.10,443,0.500303,0.000098,10.225926,2.092228,0.362922,0.000001,...,0,18,0,6102,8424,14526,0.362912,0.029848,0.048576,1
3,192.168.10.5,49699,202.153.190.10,443,0.359024,0.000016,10.228289,1.773288,0.235860,0.000002,...,0,19,0,8640,13176,21816,0.525707,0.031765,0.050590,1
4,192.168.10.15,49533,172.217.3.110,443,0.000000,0.000000,0.000000,0.000000,0.000003,0.000003,...,0,0,0,54,108,162,0.885246,0.892562,0.900000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,192.168.10.19,47304,209.85.201.109,465,0.056251,0.000431,0.220984,0.052601,0.045476,0.000049,...,0,9,0,756,864,1620,0.406670,0.161616,0.224906,1
1221,192.168.10.25,49482,178.172.160.2,443,0.038362,0.000003,0.153047,0.065932,0.030811,0.000004,...,0,1,0,702,594,1296,0.486824,0.038690,0.077175,1
1222,192.168.10.15,49551,23.217.25.139,443,0.034178,0.034178,0.034178,0.000000,0.000002,0.000002,...,0,1,0,108,108,216,0.892562,0.739726,0.815094,1
1223,192.168.10.25,49483,178.172.160.2,443,0.075924,0.000587,0.151261,0.075337,0.000000,0.000000,...,0,0,0,162,54,216,0.462857,0.720000,0.510638,1


In [18]:
dataf = pd.concat([be_df, ma_df])
dataf = dataf.drop(["src", "sport", "dst", "dport"], axis=1)
dataf

Unnamed: 0,fiat_mean,fiat_min,fiat_max,fiat_std,biat_mean,biat_min,biat_max,biat_std,diat_mean,diat_min,...,fwd_urg_cnt,bwd_pst_cnt,bwd_urg_cnt,fp_hdr_len,bp_hdr_len,dp_hdr_len,f_ht_len,b_ht_len,d_ht_len,label
0,1.713385,0.000048,5.110408,2.402088,5.047341,0.000214,10.032021,4.095706,2.163149,0.000017,...,0,0,0,216,216,432,0.812030,0.853755,0.835590,0
1,0.000546,0.000001,0.476635,0.003788,0.000413,0.000000,0.498814,0.003681,0.000235,0.000000,...,0,2839,0,6615000,8756370,15371370,0.894015,0.023431,0.040333,0
2,5.612169,0.000044,10.240052,4.729539,6.731596,0.000003,10.242423,4.396443,2.806085,0.000003,...,0,0,0,378,324,702,0.847534,0.897507,0.872050,0
3,2.243377,0.000235,10.128269,3.521390,2.243628,0.000245,10.183908,3.548535,1.101608,0.000005,...,0,12,0,1458,1458,2916,0.067400,0.087980,0.076331,0
4,3.734520,0.000011,10.031254,4.142035,3.728795,0.000213,10.032032,4.148724,1.659808,0.000011,...,0,1,0,270,270,540,0.828221,0.798817,0.815710,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,0.056251,0.000431,0.220984,0.052601,0.045476,0.000049,0.131596,0.038765,0.025216,0.000007,...,0,9,0,756,864,1620,0.406670,0.161616,0.224906,1
1221,0.038362,0.000003,0.153047,0.065932,0.030811,0.000004,0.152897,0.060986,0.020015,0.000003,...,0,1,0,702,594,1296,0.486824,0.038690,0.077175,1
1222,0.034178,0.034178,0.034178,0.000000,0.000002,0.000002,0.000002,0.000000,0.011393,0.000002,...,0,1,0,108,108,216,0.892562,0.739726,0.815094,1
1223,0.075924,0.000587,0.151261,0.075337,0.000000,0.000000,0.000000,0.000000,0.050616,0.000177,...,0,0,0,162,54,216,0.462857,0.720000,0.510638,1


In [19]:
loader = op.get_dataloader(dataf, num_classes=2, batch_size=32)
train_loader,test_loader = op.get_splited_dataloader(dataf, num_classes=2, batch_size=32, train_ratio=0.8)
len(train_loader.dataset)

1383

In [20]:
import torch.optim as optim
import torch.nn.functional as F

ctx = Net(indim=72)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ctx.parameters(), lr=0.01)  # 使用Adam优化器 
num_epochs = 5

ctx.train()
for epoch in range(num_epochs): # num_epochs表示训练的轮数
    for i, (inputs, targets) in enumerate(train_loader): # 前向传播

        outputs = ctx(inputs)
        loss = criterion(outputs, targets)

        # 反向传播  
        optimizer.zero_grad()  
        loss.backward()  
        optimizer.step()  

        if (i + 1) % 10 == 0:  
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'  
                  % (epoch + 1, num_epochs, i + 1, len(train_loader), loss.item()))

torch.save(ctx.state_dict(), 'model/flow_model.pth') 
            
# 训练循环结束后，进行测试  
ctx.eval()
test_loss = 0  # 用于累积测试损失  
correct = 0  # 用于累积正确预测的数目  
  
with torch.no_grad(): 
    for inputs, targets in test_loader: # 对于 test_loader 中的每个批次   
        outputs = ctx(inputs)  # 前向传播  
        
        loss = criterion(outputs, targets)  # 计算损失  
        test_loss += loss.item()  # 累积损失  
        
        # 对模型输出进行调整，将输出转换为一个包含两个元素的向量
        # 模型的最后一层没有softmax
        outputs = F.softmax(outputs, dim=1)
        # 获取最大概率的索引作为预测结果
        pred = outputs.argmax(dim=1, keepdim=True)
        
        correct += pred.eq(targets.argmax(dim=1, keepdim=True)).sum().item()  # 计算正确预测的数目
        
    test_loss /= len(test_loader)  # 计算平均测试损失    
    accuracy = 100. * correct / len(test_loader.dataset)  # 计算准确率并转化为百分比形式  
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(    
        test_loss, correct, len(test_loader.dataset), accuracy))  # 打印测试损失和准确率

Epoch [1/5], Iter [10/44] Loss: 0.6471
Epoch [1/5], Iter [20/44] Loss: 0.4939
Epoch [1/5], Iter [30/44] Loss: 0.5335
Epoch [1/5], Iter [40/44] Loss: 0.6402
Epoch [2/5], Iter [10/44] Loss: 0.5033
Epoch [2/5], Iter [20/44] Loss: 0.5946
Epoch [2/5], Iter [30/44] Loss: 0.4887
Epoch [2/5], Iter [40/44] Loss: 0.5527
Epoch [3/5], Iter [10/44] Loss: 0.5302
Epoch [3/5], Iter [20/44] Loss: 0.5469
Epoch [3/5], Iter [30/44] Loss: 0.6856
Epoch [3/5], Iter [40/44] Loss: 0.4743
Epoch [4/5], Iter [10/44] Loss: 0.5242
Epoch [4/5], Iter [20/44] Loss: 0.5334
Epoch [4/5], Iter [30/44] Loss: 0.7727
Epoch [4/5], Iter [40/44] Loss: 0.5487
Epoch [5/5], Iter [10/44] Loss: 0.5399
Epoch [5/5], Iter [20/44] Loss: 0.5422
Epoch [5/5], Iter [30/44] Loss: 0.5918
Epoch [5/5], Iter [40/44] Loss: 0.5793

Test set: Average loss: 0.8860, Accuracy: 235/346 (68%)



In [25]:
model1 = Net(indim=72)
model1.load_state_dict(torch.load('model/flow_model.pth'))

dfg = pd.read_csv('data/featured_csv/malicious_small_flow.csv')
dfg = dfg.drop(["src", "sport", "dst", "dport"], axis=1)
# 给定一个没有标签的featured_csv
input = torch.tensor(dfg.values, dtype=torch.float32)

model1.eval()
with torch.no_grad():
    output = model1(input)
probabilities = F.softmax(output, dim=1)
_, predicted_classes = torch.max(probabilities, dim=1)
predicted_labels = predicted_classes.tolist()
len(predicted_labels)

1225

In [27]:
df5 = pd.read_csv('data/featured_csv/benign_small_flow.csv')
df5 = df5.drop(["src", "sport", "dst", "dport"], axis=1)
# 给定一个没有标签的featured_csv
input2 = torch.tensor(df5.values, dtype=torch.float32)

model1.eval()
with torch.no_grad():
    output = model1(input2)
probabilities = F.softmax(output, dim=1)
_, predicted_classes = torch.max(probabilities, dim=1)
predicted_labels = predicted_classes.tolist()
probabilities = probabilities.tolist()

In [28]:
predicted_labels

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [31]:
mal_prob = [sublist[1] for sublist in probabilities]
mal_prob 

[0.5998033285140991,
 1.0,
 0.5981110334396362,
 0.5958721041679382,
 0.5996516346931458,
 0.5996467471122742,
 0.5995774865150452,
 0.5996344685554504,
 0.59964919090271,
 0.5997447371482849,
 0.5981811881065369,
 0.5887157917022705,
 0.5987817049026489,
 0.5987361073493958,
 0.5996536612510681,
 0.5997799038887024,
 0.5997793674468994,
 0.5995563864707947,
 0.599733829498291,
 0.5984660983085632,
 0.5996293425559998,
 0.5997580885887146,
 0.5994274020195007,
 0.5989457368850708,
 0.5997504591941833,
 0.5996533632278442,
 0.5994929671287537,
 0.5994398593902588,
 0.5995774865150452,
 0.5994390845298767,
 0.5991519689559937,
 0.5996775031089783,
 0.599745512008667,
 0.5997883081436157,
 0.5992980003356934,
 0.5996870398521423,
 0.5997840762138367,
 0.59978848695755,
 0.5997298359870911,
 0.5994771122932434,
 0.6073434948921204,
 0.599405825138092,
 0.5993848443031311,
 0.5998468995094299,
 0.6128939390182495,
 0.5995858907699585,
 0.5997452139854431,
 0.5997452139854431,
 0.61280590295

In [1]:
import numpy as np
import pandas as pd
# 创建示例DataFrame对象
df1 = pd.read_csv("data/featured_csv/benign_small_flow.csv")
df2 = pd.read_csv("data/featured_csv/malicious_small_flow.csv")
df = pd.concat([df1, df2])
df = df.drop(["src", "sport", "dst", "dport"], axis=1)
df

Unnamed: 0,fiat_mean,fiat_min,fiat_max,fiat_std,biat_mean,biat_min,biat_max,biat_std,diat_mean,diat_min,...,fwd_pst_cnt,fwd_urg_cnt,bwd_pst_cnt,bwd_urg_cnt,fp_hdr_len,bp_hdr_len,dp_hdr_len,f_ht_len,b_ht_len,d_ht_len
0,1.713385,0.000048,5.110408,2.402088,5.047341,0.000214,10.032021,4.095706,2.163149,0.000017,...,1,0,0,0,216,216,432,0.812030,0.853755,0.835590
1,0.000546,0.000001,0.476635,0.003788,0.000413,0.000000,0.498814,0.003681,0.000235,0.000000,...,65,0,2839,0,6615000,8756370,15371370,0.894015,0.023431,0.040333
2,5.612169,0.000044,10.240052,4.729539,6.731596,0.000003,10.242423,4.396443,2.806085,0.000003,...,1,0,0,0,378,324,702,0.847534,0.897507,0.872050
3,2.243377,0.000235,10.128269,3.521390,2.243628,0.000245,10.183908,3.548535,1.101608,0.000005,...,11,0,12,0,1458,1458,2916,0.067400,0.087980,0.076331
4,3.734520,0.000011,10.031254,4.142035,3.728795,0.000213,10.032032,4.148724,1.659808,0.000011,...,1,0,1,0,270,270,540,0.828221,0.798817,0.815710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1220,0.056251,0.000431,0.220984,0.052601,0.045476,0.000049,0.131596,0.038765,0.025216,0.000007,...,10,0,9,0,756,864,1620,0.406670,0.161616,0.224906
1221,0.038362,0.000003,0.153047,0.065932,0.030811,0.000004,0.152897,0.060986,0.020015,0.000003,...,4,0,1,0,702,594,1296,0.486824,0.038690,0.077175
1222,0.034178,0.034178,0.034178,0.000000,0.000002,0.000002,0.000002,0.000000,0.011393,0.000002,...,0,0,1,0,108,108,216,0.892562,0.739726,0.815094
1223,0.075924,0.000587,0.151261,0.075337,0.000000,0.000000,0.000000,0.000000,0.050616,0.000177,...,1,0,0,0,162,54,216,0.462857,0.720000,0.510638


In [2]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df.values)
# 将标准化后的数据转换回 DataFrame
df_scaled = pd.DataFrame(data_scaled, columns=df.columns)
df_scaled

Unnamed: 0,fiat_mean,fiat_min,fiat_max,fiat_std,biat_mean,biat_min,biat_max,biat_std,diat_mean,diat_min,...,fwd_pst_cnt,fwd_urg_cnt,bwd_pst_cnt,bwd_urg_cnt,fp_hdr_len,bp_hdr_len,dp_hdr_len,f_ht_len,b_ht_len,d_ht_len
0,-0.201078,-0.252088,-0.106361,0.284861,0.226467,-0.298677,0.477968,1.196054,0.330172,-0.285901,...,-0.238743,0.0,-0.073502,0.0,-0.037494,-0.037193,-0.037322,1.086398,1.323792,1.259530
1,-0.462430,-0.252095,-0.647621,-0.646415,-0.486475,-0.298708,-0.648345,-0.676655,-0.541065,-0.287099,...,6.542158,0.0,31.686405,0.0,32.065905,31.701778,31.858208,1.439893,-1.199432,-1.282027
2,0.393814,-0.252088,0.492819,1.188626,0.464389,-0.298707,0.502826,1.333686,0.589151,-0.286888,...,-0.238743,0.0,-0.073502,0.0,-0.036708,-0.036801,-0.036762,1.239481,1.456747,1.376052
3,-0.120210,-0.252059,0.479762,0.719494,-0.169593,-0.298673,0.495913,0.945642,-0.097424,-0.286747,...,0.820772,0.0,0.060742,0.0,-0.031466,-0.032691,-0.032168,-2.124226,-1.003277,-1.166981
4,0.107315,-0.252093,0.468430,0.960494,0.040206,-0.298677,0.477969,1.220318,0.127423,-0.286324,...,-0.238743,0.0,-0.062315,0.0,-0.037232,-0.036997,-0.037098,1.156209,1.156844,1.195995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,-0.453931,-0.252028,-0.677482,-0.627460,-0.480109,-0.298701,-0.691730,-0.660598,-0.531002,-0.286606,...,0.714821,0.0,0.027181,0.0,-0.034873,-0.034844,-0.034857,-0.661394,-0.779509,-0.692151
1725,-0.456660,-0.252095,-0.685418,-0.622284,-0.482181,-0.298707,-0.689213,-0.650429,-0.533097,-0.286888,...,0.079111,0.0,-0.062315,0.0,-0.035135,-0.035822,-0.035529,-0.315794,-1.153062,-1.164284
1726,-0.457299,-0.246791,-0.699303,-0.647886,-0.486533,-0.298708,-0.707277,-0.678339,-0.536570,-0.286958,...,-0.344695,0.0,-0.062315,0.0,-0.038018,-0.037584,-0.037771,1.433629,0.977276,1.194027
1727,-0.450929,-0.252004,-0.685627,-0.618632,-0.486533,-0.298708,-0.707278,-0.678339,-0.520771,-0.274624,...,-0.238743,0.0,-0.073502,0.0,-0.037756,-0.037780,-0.037771,-0.419133,0.917332,0.221018
