In [1]:
import time

import torch

from botdet.data.dataset_botnet import BotnetDataset
from botdet.data.dataloader import GraphDataLoader

### 数据含义
- `batch`：节点所属图的编号，用于区分一个批次中的不同图
- `edge_index`：边的连接关系
- `x`：节点特征，取值全部为 1
- `y`：标签
- `edge_y`：含义待确认（TODO）

In [2]:
# Load the train / val / test splits of the 'chord' botnet data in 'pyg' graph format.
botnet_dataset_train, botnet_dataset_val, botnet_dataset_test = (
    BotnetDataset(name='chord', split=split_name, graph_format='pyg')
    for split_name in ('train', 'val', 'test')
)

# One loader per split, none of them shuffled: training batches hold two graphs,
# validation and test batches hold a single graph.
train_loader = GraphDataLoader(
    botnet_dataset_train, batch_size=2, shuffle=False, num_workers=0
)
val_loader = GraphDataLoader(
    botnet_dataset_val, batch_size=1, shuffle=False, num_workers=0
)
test_loader = GraphDataLoader(
    botnet_dataset_test, batch_size=1, shuffle=False, num_workers=0
)

### 图神经网络构建

In [3]:
# ---- Data ------------------------------------------------------------------
data_name = 'chord'     # botnet topology name; one of ['chord', 'debru', 'kadem', 'leet', 'c2', 'p2p']
batch_size = 2
in_memory = True
shuffle = False
data_dir = './data/botnet'

# ---- Runtime ---------------------------------------------------------------
device = 0
devid = 0   # device index (the original note says "CPU number"; presumably the GPU id -- confirm)
seed = 0    # random seed

# ---- Model -----------------------------------------------------------------
in_channels = 1                         # number of input node features
enc_sizes = [32 for _ in range(10)]     # feature dimension of each encoding layer

num_classes = 2         # number of classes
act = 'relu'            # non-linearity applied after the residual; one of ['none', 'lrelu', 'relu', 'elu']
layer_act = 'none'      # layer-wise non-linearity (passed as non_linear_layer_wise); one of ['none', 'lrelu', 'relu', 'elu']
residual_hop = 1        # residual connection every layer
nodemodel = 'additive'  # node model type; one of ['additive', 'mlp', 'attention']
edge_gate = 'none'      # separate edge gate? (meaning unconfirmed in the original notes)
final = 'proj'          # final output layer; one of ['none', 'proj']
nheads = [1 for _ in range(10)]     # attention heads; a list of length 1 or of length == number of layers
att_act = 'lrelu'       # non-linearity inside multi-head attention; one of ['none', 'lrelu', 'relu', 'elu']
att_dropout = 0         # dropout inside multi-head attention
att_dir = 'in'          # attention direction; one of ['in', 'out']
att_combine = 'cat'     # how the attention heads are merged; one of ['cat', 'add', 'mean']

deg_norm = 'rw'         # normalization method; one of ['none', 'sm', 'rw']
aggr = 'add'            # feature aggregation method; one of ['add', 'mean', 'max']

dropout = 0.0
bias = True

# ---- Optimisation / checkpointing -------------------------------------------
learning_rate = 0.005
weight_decay = 0.0005
num_epochs = 50
early_stop = True
save_dir = './saved_models'
save_name = 'temp.pt'

# Configuration forwarded to the last layer of the model.
final_layer_config = dict(att_combine=att_combine)

In [4]:
from botdet.models_pyg.gcn_model import GCNModel

In [5]:
# Build the GCN model from the hyper-parameters set in the configuration cell.
# The three positional arguments are the input feature size, the list of
# per-layer sizes and the number of classes; everything else is passed by keyword.
model = GCNModel(
    in_channels,
    enc_sizes,
    num_classes,
    non_linear=act,
    non_linear_layer_wise=layer_act,
    residual_hop=residual_hop,
    dropout=dropout,
    final_layer_config=final_layer_config,
    final_type=final,
    pred_on='node',
    nodemodel=nodemodel,
    deg_norm=deg_norm,
    edge_gate=edge_gate,
    aggr=aggr,
    bias=bias,
    nheads=nheads,
    att_act=att_act,
    att_dropout=att_dropout,
    att_dir=att_dir,
    att_combine=att_combine,
)

# Move the model to `device`; as the cell's last expression this also displays
# the module structure.
model.to(device)

GCNModel(
  (gcn_net): ModuleList(
    (0): GCNLayer(
      (gcn): NodeModelAdditive (in_channels: 1, out_channels: 32, in_edgedim: None, deg_norm: rw, edge_gate: NoneType,aggr: add | number of parameters: 64)
      (non_linear): Identity()
    )
    (1): GCNLayer(
      (gcn): NodeModelAdditive (in_channels: 32, out_channels: 32, in_edgedim: None, deg_norm: rw, edge_gate: NoneType,aggr: add | number of parameters: 1056)
      (non_linear): Identity()
    )
    (2): GCNLayer(
      (gcn): NodeModelAdditive (in_channels: 32, out_channels: 32, in_edgedim: None, deg_norm: rw, edge_gate: NoneType,aggr: add | number of parameters: 1056)
      (non_linear): Identity()
    )
    (3): GCNLayer(
      (gcn): NodeModelAdditive (in_channels: 32, out_channels: 32, in_edgedim: None, deg_norm: rw, edge_gate: NoneType,aggr: add | number of parameters: 1056)
      (non_linear): Identity()
    )
    (4): GCNLayer(
      (gcn): NodeModelAdditive (in_channels: 32, out_channels: 32, in_edgedim: None, deg_

In [13]:
import os
import parser
import argparse

from botdet.eval.evaluation import eval_metrics, eval_predictor, PygModelPredictor
from botdet.optim.earlystop import EarlyStopping


def _time_since(start):
    """Return the wall-clock time elapsed since ``start`` (a ``time.time()`` stamp) as 'Hh Mm Ss'."""
    elapsed = int(time.time() - start)
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours}h {minutes}m {seconds}s'


def _format_metrics(metrics):
    """Render a metric dict as one indented 'name: value' log line."""
    return ' ' * 10 + ', '.join(['{}: {:.5f}'.format(k, v) for k, v in metrics.items()])


def train(model, args, train_loader, val_dataset, test_dataset, optimizer, criterion,
          scheduler=None, logger=None):
    """
    Train ``model``, validate after every epoch, then evaluate the best model on the test set.

    Args:
        model: network to train; it is called as ``model(batch.x, batch.edge_index)``.
        args: namespace providing
            epochs: number of training epochs,
            log_interval: training metrics are logged whenever the number of graphs seen
                in the epoch is a multiple of this value (and always on the last batch),
            early_stop: whether early stopping on the validation loss is applied,
            save_dir / save_name: directory and file name of the saved checkpoint.
        train_loader: loader yielding the training batches.
        val_dataset: validation data handed to ``eval_predictor``.
        test_dataset: test data handed to ``eval_predictor``.
        optimizer: optimizer updating the model parameters.
        criterion: loss, called as ``criterion(output, batch.y.long())``.
        scheduler: optional LR scheduler, stepped with the validation loss after each epoch.
        logger: optional logger; ``logger.info`` is used when given, ``print`` otherwise.
    """
    logging = print if logger is None else logger.info

    device = next(model.parameters()).device
    predictor = PygModelPredictor(model)

    early_stopper = EarlyStopping(patience=5, mode='min', verbose=True, logger=logger)

    # Make sure the checkpoint directory exists, otherwise the first torch.save fails.
    if args.save_dir:
        os.makedirs(args.save_dir, exist_ok=True)
    save_path = os.path.join(args.save_dir, args.save_name)

    best_epoch = 0    # epoch index of the last saved (best) model
    start = time.time()

    for ep in range(args.epochs):
        loss_avg_train = 0     # sum of the batch losses seen in this epoch
        num_train_graph = 0    # number of graphs seen in this epoch
        model.train()
        for n, batch in enumerate(train_loader):
            batch = batch.to(device)
            optimizer.zero_grad()

            x = model(batch.x, batch.edge_index)
            loss = criterion(x, batch.y.long())

            loss_avg_train += float(loss)
            num_train_graph += batch.num_graphs

            loss.backward()
            optimizer.step()

            # ---- periodic training log (metrics of the current batch only) ----
            if num_train_graph % args.log_interval == 0 or n == len(train_loader) - 1:
                with torch.no_grad():
                    # probability assigned to class 1
                    pred_prob = torch.softmax(x, dim=1)[:, 1]
                    y = batch.y.long()
                    result_dict = eval_metrics(y, pred_prob)
                logging(f'epoch: {ep + 1}, passed number of graphs: {num_train_graph}, '
                        f'train running loss: {loss_avg_train / num_train_graph:.5f} (passed time: {_time_since(start)})')
                logging(_format_metrics(result_dict))

        # ---- validation after every epoch ----
        result_dict_avg, loss_avg = eval_predictor(val_dataset, predictor)
        logging(f'Validation --- epoch: {ep + 1}, loss: {loss_avg:.5f}')
        logging(_format_metrics(result_dict_avg))

        if scheduler is not None:
            scheduler.step(loss_avg)

        if args.early_stop:
            early_stopper(loss_avg)
        else:
            # Without early stopping every epoch counts as an improvement, so the
            # checkpoint always holds the latest model.
            early_stopper.improved = True

        if early_stopper.improved:
            torch.save(model, save_path)
            logging(f'model saved at {save_path}.')
            best_epoch = ep
        elif early_stopper.early_stop:
            logging('Early stopping here.')
            break

    # ---- evaluate the best model on the test set ----
    if early_stopper.improved:
        # The last completed epoch was the best one: the in-memory model is the best model.
        best_model = model
    else:
        # torch.load unpickles the whole model object; only load checkpoints written above.
        best_model = torch.load(save_path)
    logging('*' * 12 + f' best model obtained after epoch {best_epoch + 1}, '
                       f'saved at {save_path} ' + '*' * 12)
    predictor = PygModelPredictor(best_model)
    result_dict_avg, loss_avg = eval_predictor(test_dataset, predictor)
    logging(f'Testing --- loss: {loss_avg:.5f}')
    logging(_format_metrics(result_dict_avg))
    
    
    


In [14]:
pwd

'/home/yhk/jupyter-notebook/botnet_detect/botnet-detection'

In [None]:
# The original statement was left incomplete (a SyntaxError); import the
# plateau scheduler that the optimisation cell of this notebook uses.
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [7]:
# Take the first training batch explicitly so this cell no longer depends on a
# `batch` variable left over from a previously executed loop.
batch = next(iter(train_loader)).to(device)

# Forward pass of the GCN on that batch's node features and edge list.
x = model(batch.x, batch.edge_index)

In [None]:

def _time_since(start):
    """Return the wall-clock time elapsed since ``start`` (a ``time.time()`` stamp) as 'Hh Mm Ss'."""
    elapsed = int(time.time() - start)
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours}h {minutes}m {seconds}s'


def _format_metrics(metrics):
    """Render a metric dict as one indented 'name: value' log line."""
    return ' ' * 10 + ', '.join(['{}: {:.5f}'.format(k, v) for k, v in metrics.items()])


def train(model, args, train_loader, val_dataset, test_dataset, optimizer, criterion,
          scheduler=None, logger=None):
    """
    Train ``model``, validate after every epoch, then evaluate the best model on the test set.

    NOTE(review): this cell re-defines ``train``; when the notebook is run top to
    bottom it shadows the earlier definition of the same name.

    Args:
        model: network to train; it is called as ``model(batch.x, batch.edge_index)``.
        args: namespace providing ``epochs``, ``log_interval``, ``early_stop``,
            ``save_dir`` and ``save_name``.
        train_loader: loader yielding the training batches.
        val_dataset: validation data handed to ``eval_predictor``.
        test_dataset: test data handed to ``eval_predictor``.
        optimizer: optimizer updating the model parameters.
        criterion: loss, called as ``criterion(output, batch.y.long())``.
        scheduler: optional LR scheduler, stepped with the validation loss after each epoch.
        logger: optional logger; ``logger.info`` is used when given, ``print`` otherwise.
    """
    logging = print if logger is None else logger.info

    device = next(model.parameters()).device
    predictor = PygModelPredictor(model)

    early_stopper = EarlyStopping(patience=5, mode='min', verbose=True, logger=logger)

    # Make sure the checkpoint directory exists, otherwise the first torch.save fails.
    if args.save_dir:
        os.makedirs(args.save_dir, exist_ok=True)
    save_path = os.path.join(args.save_dir, args.save_name)

    best_epoch = 0
    start = time.time()
    for ep in range(args.epochs):
        loss_avg_train = 0     # sum of the batch losses seen in this epoch
        num_train_graph = 0    # number of graphs seen in this epoch
        model.train()
        for n, batch in enumerate(train_loader):
            batch = batch.to(device)

            optimizer.zero_grad()

            x = model(batch.x, batch.edge_index)
            loss = criterion(x, batch.y.long())

            loss_avg_train += float(loss)
            num_train_graph += batch.num_graphs

            loss.backward()
            optimizer.step()

            # Periodic training log; the metrics cover the current batch only.
            if num_train_graph % args.log_interval == 0 or n == len(train_loader) - 1:
                with torch.no_grad():
                    # probability assigned to class 1
                    pred_prob = torch.softmax(x, dim=1)[:, 1]
                    y = batch.y.long()
                    result_dict = eval_metrics(y, pred_prob)
                logging(f'epoch: {ep + 1}, passed number of graphs: {num_train_graph}, '
                        f'train running loss: {loss_avg_train / num_train_graph:.5f} (passed time: {_time_since(start)})')
                logging(_format_metrics(result_dict))

        result_dict_avg, loss_avg = eval_predictor(val_dataset, predictor)
        logging(f'Validation --- epoch: {ep + 1}, loss: {loss_avg:.5f}')
        logging(_format_metrics(result_dict_avg))

        if scheduler is not None:
            scheduler.step(loss_avg)

        if args.early_stop:
            early_stopper(loss_avg)
        else:
            # Without early stopping every epoch counts as an improvement.
            early_stopper.improved = True

        if early_stopper.improved:
            torch.save(model, save_path)
            logging(f'model saved at {save_path}.')
            best_epoch = ep
        elif early_stopper.early_stop:
            logging('Early stopping here.')
            break

    if early_stopper.improved:
        # The last completed epoch was the best one: keep the in-memory model.
        best_model = model
    else:
        # torch.load unpickles the whole model object; only load checkpoints written above.
        best_model = torch.load(save_path)
    logging('*' * 12 + f' best model obtained after epoch {best_epoch + 1}, '
                       f'saved at {save_path} ' + '*' * 12)
    predictor = PygModelPredictor(best_model)
    result_dict_avg, loss_avg = eval_predictor(test_dataset, predictor)
    logging(f'Testing --- loss: {loss_avg:.5f}')
    logging(_format_metrics(result_dict_avg))

In [11]:
# Split the two-row edge list into its first and second row.
row, col = batch.edge_index[0], batch.edge_index[1]

In [19]:
# Index the first dimension of edge_index with a list: rows 0, 0 and 1 are
# gathered (row 0 twice), giving a tensor with three rows.
tmp = [0,0,1]
batch.edge_index[tmp]

tensor([[     0,      0,      0,  ..., 289496, 289497, 289498],
        [     0,      0,      0,  ..., 289496, 289497, 289498],
        [   282,    430,    799,  ..., 289496, 289497, 289498]],
       device='cuda:0')

In [15]:
# Tensor holding the integers 0..23 arranged with shape (2, 3, 4); displayed below.
t = torch.arange(2 * 3 * 4).reshape((2, 3, 4))
t

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [10]:
import torch

In [14]:
# index_select demo on a tensor holding 0..23 with shape (2, 3, 4).
t = torch.arange(2 * 3 * 4).reshape((2, 3, 4))
print("t--->", t)

# Positions to pick; a position may be repeated.
index = torch.tensor([1, 1, 2])
print("index--->", index)

# Pick positions 1, 1, 2 along dimension 1 -> result has shape (2, 3, 4).
data1 = torch.index_select(t, 1, index)
print("data1--->", data1)

# Pick positions 1, 1, 2 along dimension 2 -> result has shape (2, 3, 3).
data2 = torch.index_select(t, 2, index)
print("data2--->", data2)

t---> tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
index---> tensor([1, 1, 2])
data1---> tensor([[[ 4,  5,  6,  7],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[16, 17, 18, 19],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])
data2---> tensor([[[ 1,  1,  2],
         [ 5,  5,  6],
         [ 9,  9, 10]],

        [[13, 13, 14],
         [17, 17, 18],
         [21, 21, 22]]])


In [9]:
import torch_scatter

In [10]:
# Show which torch_scatter release is installed in this kernel.
torch_scatter.__version__

'2.0.4'

In [None]:
from torch_geometric.nn import GCNConv

In [85]:
# Instantiate the small network and move it to `device`.
# NOTE(review): `Net` is not defined in any visible cell of this notebook — the
# cell defining it must be restored before this line can run on a fresh kernel.
net = Net(in_channels,num_classes).to(device)

In [86]:
# Cross-entropy loss over the class scores produced by `net`.
criterion = torch.nn.CrossEntropyLoss()

# Adam over the parameters of `net`, using the learning rate and weight decay
# from the configuration cell.
optimizer = torch.optim.Adam(
    net.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# Multiply the learning rate by 0.25 when the monitored (minimised) quantity
# stops improving, with a patience of 1.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.25,
    patience=1,
)

In [87]:
# One manual pass over the training set for `net`.
loss_avg_train = 0      # sum of the batch losses seen so far
num_train_graph = 0     # number of graphs seen so far
net.train()             # the network optimised below is `net`, not `model`


for n, batch in enumerate(train_loader):
    batch = batch.to(device)
    print(batch.x.shape, batch.edge_index.shape)

    # Reset the gradients of the previous step; without this they accumulate
    # across batches and every update uses a stale sum of gradients.
    optimizer.zero_grad()

    x = net(batch.x, batch.edge_index)

    loss = criterion(x, batch.y.long())
    loss_avg_train += float(loss)
    num_train_graph += batch.num_graphs

    loss.backward()
    optimizer.step()

torch.Size([289499, 1]) torch.Size([2, 3035520])


TypeError: can only concatenate str (not "Tensor") to str

In [69]:
#测试
model.eval()
_, pred = net(batch.x,batch.edge_index).max(dim=1)
correct = float(pred.eq(batch.y).sum().item())
acc = correct / batch.x.shape[0]
print('Accuracy: {:.4f}'.format(acc))

Accuracy: 0.9181


In [27]:
# Toy inputs for the scatter demo: `index` gives, for every element of `src`,
# the position it is scattered to.
index = torch.tensor([
    [2, 1],
    [1, 3],
    [0, 2],
    [3, 0],
    [3, 1],
    [3, 2],
])
src = torch.tensor(
    [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]],
    dtype=torch.float32,
)

In [38]:
# 4 x 4 tensor filled with ones.
output = torch.ones(4, 4)

In [39]:
# Display the all-ones tensor.
output

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [40]:
# Display the scatter positions.
index

tensor([[2, 1],
        [1, 3],
        [0, 2],
        [3, 0],
        [3, 1],
        [3, 2]])

In [41]:
# Display the values that will be scattered.
src

tensor([[ 1.,  2.],
        [ 3.,  4.],
        [ 5.,  6.],
        [ 7.,  8.],
        [ 9., 10.],
        [11., 12.]])

In [42]:
# Display `output` again; it still holds all ones.
output

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

In [43]:
import torch_scatter

In [45]:
# Sum the entries of `src` into the rows named by `index` along dimension 0,
# column by column (e.g. column 0 sends 7, 9 and 11 to row 3, giving 27).
torch_scatter.scatter_add(src, index, dim=0)

tensor([[ 5.,  8.],
        [ 3., 12.],
        [ 1., 18.],
        [27.,  4.]])