In [1]:
#============
# 导入所需的包
#============
import string
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from Data_Mining_Toolbox.dl_helper import *
from Data_Mining_Toolbox.plot_helper import *
from Data_Mining_Toolbox.common import string_to_index

torch.cuda.set_device(1)

In [2]:
def second_domain(input_data):
    second_domain = input_data.split(".")[0]
    return second_domain

In [3]:
#============
# 读取实验数据
#============
df_train = pd.read_csv("./data/train_set.csv")
df_val = pd.read_csv("./data/val_set.csv")
df_test = pd.read_csv("./data/test_set.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_train['second_domain'] = df_train.apply(lambda x:second_domain(x.domain),axis=1)
df_val['second_domain'] = df_val.apply(lambda x:second_domain(x.domain),axis=1)
df_test['second_domain'] = df_test.apply(lambda x:second_domain(x.domain),axis=1)

In [5]:
#=======================
# 定义生成模型输入向量的函数
#=======================
def get_input(df, max_len, with_label=True):
    x = string_to_index(df['second_domain'].values,max_len)
    y = df['label'].values
    x = torch.autograd.Variable(torch.LongTensor(x)).cuda()
    if with_label:
        y = torch.autograd.Variable(torch.LongTensor(y)).cuda()
        return x, y
    else:
        return x

# 生成输入向量
train_x, train_y = get_input(df_train, 50)
val_x, val_y = get_input(df_val, 50)
test_x, test_y = get_input(df_test, 50)

In [6]:
#================
# 定义RNN的网络结构
#================
class GRUNet(nn.Module):
    
    def __init__(self, chars_num, encode_dim, hidden_size):
        super(GRUNet, self).__init__()
        self.embedding = nn.Embedding(chars_num, encode_dim)
        self.features = nn.GRU(input_size=encode_dim, hidden_size=hidden_size, num_layers=1)
        self.classifier = nn.Linear(hidden_size, 2)
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)
        x, _ = self.features(x)
        x = x.contiguous().float()
        x = x[-1].view(x[-1].size(0), -1)
        x = F.dropout(x, p=0.5)
        x = self.classifier(x)
        return x

In [7]:
#========
# 训练模型
#========

# 重要: char_num=len(chars) + 1 不然有越界错误，GPU异常报错xid error
char_num=len(string.printable) + 1
model = GRUNet(chars_num=char_num, encode_dim=128, hidden_size=128).cuda(1)
optimizer = optim.Adam(params=model.parameters(), lr=0.0001, weight_decay=0.00001)

In [None]:
train(model,train_x,train_y,val_x,val_y, epochs=100, batch_size=256,\
      save_prefix="{}-second-domain".format(model.__class__.__name__))

In [8]:
# 比较不同轮数模型效果
compare_diff_epoch(model,test_x,test_y,100,"GRUNet-second-domain")

Unnamed: 0,model,epoch,tp,fp,tn,fn,precision,recall,auc
0,GRUNet,20,228973,9746,489907,3849,0.983468,0.959174,0.975689
1,GRUNet,40,228981,9738,490150,3606,0.984496,0.959207,0.975952
2,GRUNet,60,228941,9778,489612,4144,0.982221,0.95904,0.975323
3,GRUNet,80,226917,11802,490968,2788,0.987863,0.950561,0.972457
4,GRUNet,100,229225,9494,489160,4596,0.980344,0.960229,0.975461


In [10]:
# 加载最佳模型参数进行测试集预测
model.load_state_dict(torch.load("./model/GRUNet-second-domain-model-epoch-40.state"))
start_time = time.time()
test_x, test_y = get_input(df_test, 50)
test(model, test_x, test_y, batch_size=256)
print("用时:", time.time() - start_time)



test loss: 0.067835 	 测试准确率: 0.981816
              precision    recall  f1-score   support

           0   0.980523  0.992745  0.986596    493756
           1   0.984598  0.959211  0.971739    238719

    accuracy                       0.981816    732475
   macro avg   0.982560  0.975978  0.979167    732475
weighted avg   0.981851  0.981816  0.981754    732475


用时: 9.565704822540283


### 查看各个dga家族的预测情况

In [20]:
def print_family_recall(df,dga_family):
    correct_nums = df_test[(df_test['label']==df_test['pred'])&(df_test['dga_family']==dga_family)].shape[0]
    family_nums = df_test[df_test['dga_family']==dga_family].shape[0]
    error_nums = family_nums - correct_nums
    return correct_nums/(family_nums+1),correct_nums,error_nums

In [21]:
pred = predict(model,test_x,256)
df_test['pred'] = pred

for family in df_test['dga_family'].drop_duplicates().values:
    recall,correct_nums,error_nums = print_family_recall(df_test,family)
    if recall<0.8:
        print("{}:{}\t{}\t{}".format(family,round(recall,4),correct_nums,error_nums))

nymaim:0.7973	354	89
symmi:0.6885	2931	1325
conficker:0.7537	352	114
madmax:0.0	0	1
suppobox:0.0812	175	1979
virut:0.6637	6471	3278
matsnu:0.0	0	27
blackhole:0.6667	2	0
xshellghost:0.6667	2	0
ccleaner:0.0	0	2
mydoom:0.7447	35	11
simda:0.5	1	0
banjori:0.525	42	37
vawtrak:0.6947	355	155
enviserv:0.0423	21	474
nan:0.0	0	0
