In [52]:
#============
# 导入所需的包
#============
import string
import warnings
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from Data_Mining_Toolbox.plot_helper import *
from Data_Mining_Toolbox.common import string_to_index

warnings.filterwarnings("ignore")


torch.cuda.set_device(1)

In [53]:
#============
# 读取实验数据
#============
df_train = pd.read_csv("./data/train_set.csv")
df_val = pd.read_csv("./data/val_set.csv")
df_test = pd.read_csv("./data/test_set.csv")

In [54]:
print("df_train.shape:%s\ndf_val.shape:%s\ndf_test.shape:%s"%(df_train.shape,df_val.shape,df_test.shape))

df_train.shape:(1575606, 2)
df_val.shape:(675261, 2)
df_test.shape:(732475, 5)


In [55]:
#=======================
# 定义生成模型输入向量的函数
#=======================
def get_input(df, max_len, with_label=True):
    x = string_to_index(df['domain'].values,max_len)
    y = df['label'].values
    x = torch.autograd.Variable(torch.LongTensor(x)).cuda()
    if with_label:
        y = torch.autograd.Variable(torch.LongTensor(y)).cuda()
        return x, y
    else:
        return x

# 生成输入向量
train_x, train_y = get_input(df_train, 50)
val_x, val_y = get_input(df_val, 50)
test_x, test_y = get_input(df_test, 50)

In [62]:
#================
# 定义RNN的网络结构
#================
class GRUNet(nn.Module):
    def __init__(self, chars_num, encode_dim, hidden_size):
        super(GRUNet, self).__init__()
        self.embedding = nn.Embedding(chars_num, encode_dim)
        self.features = nn.GRU(input_size=encode_dim, hidden_size=hidden_size, num_layers=1)
        self.classifier = nn.Linear(hidden_size, 2)
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(1, 0, 2)
        x, _ = self.features(x)
        x = x.contiguous().float()
        x = x[-1].view(x[-1].size(0), -1)
        x = F.dropout(x, p=0.5)
        x = self.classifier(x)
        x = nn.functional.softmax(x)
        return x

In [63]:
#========
# 训练模型
#========

# 重要: char_num=len(chars) + 1 不然有越界错误，GPU异常报错xid error
char_num=len(string.printable) + 1
model = GRUNet(chars_num=char_num, encode_dim=128, hidden_size=128).cuda(1)
optimizer = optim.Adam(params=model.parameters(), lr=0.0001, weight_decay=0.00001)

In [None]:
train(model,train_x,train_y,val_x,val_y, epochs=100, batch_size=256,save_prefix=model.__class__.__name__)

### 测试集上不同轮数效果对比

In [64]:
compare_diff_epoch(model,test_x,test_y,100)

Unnamed: 0,model,epoch,tp,fp,tn,fn,precision,recall,auc
0,GRUNet,20,229706,9013,491877,1879,0.991886,0.962244,0.979219
1,GRUNet,40,229939,8780,492306,1450,0.993733,0.96322,0.980142
2,GRUNet,60,231465,7254,492042,1714,0.992649,0.969613,0.983071
3,GRUNet,80,232392,6327,491510,2246,0.990428,0.973496,0.984474
4,GRUNet,100,233253,5466,490958,2798,0.988147,0.977103,0.985718


### 加载最佳表现的模型参数

In [65]:
model.load_state_dict(torch.load("./model/GRUNet-model-epoch-100.state"))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [66]:
#=====================================
# 加载最佳模型并在测试集上预测，报告模型性能
#====================================
start_time = time.time()

test_x, test_y = get_input(df_test, 50)
test(model, test_x, test_y, batch_size=256)
print("用时:", time.time() - start_time)

test loss: 0.324379 	 测试准确率: 0.988696
              precision    recall  f1-score   support

           0   0.988975  0.994315  0.991638    493756
           1   0.988109  0.977073  0.982560    238719

    accuracy                       0.988696    732475
   macro avg   0.988542  0.985694  0.987099    732475
weighted avg   0.988693  0.988696  0.988679    732475


用时: 10.295165777206421


### 查看各个家族的识别情况

In [16]:
def print_family_recall(df,dga_family):
    correct_nums = df_test[(df_test['label']==df_test['pred'])&(df_test['dga_family']==dga_family)].shape[0]
    family_nums = df_test[df_test['dga_family']==dga_family].shape[0]
    error_nums = family_nums - correct_nums
    return correct_nums/(family_nums+1),correct_nums,error_nums

In [10]:
pred = predict(model,test_x,256)
df_test['pred'] = pred



In [24]:
for family in df_test['dga_family'].drop_duplicates().values:
    recall,correct_nums,error_nums = print_family_recall(df_test,family)
    if recall<0.8:
        print("{}:{}\t{}\t{}".format(family,round(recall,4),correct_nums,error_nums))

madmax:0.0	0	1
suppobox:0.0148	32	2122
tofsee:0.7143	15	5
virut:0.7936	7738	2011
matsnu:0.0	0	27
blackhole:0.6667	2	0
xshellghost:0.6667	2	0
ccleaner:0.6667	2	0
simda:0.5	1	0
banjori:0.5	40	39
vawtrak:0.775	396	114
enviserv:0.498	247	248
nan:0.0	0	0


### 无论从测试集二分类效果还是各个子类的识别上，与采用二级域名进行识别的相比具有明显的优势

### 查看识别较差的家族

#### suppobox
该家族由单词表中两个正常的单词组成，因此从域名随机性上确实很难进行识别

In [22]:
df_test[df_test['dga_family']=='suppobox'].head(10)

Unnamed: 0,dga_family,domain,e_time,label,s_time,pred
35146,suppobox,strangeelectricity.net,2020-03-21 00:08:31,1,2020-03-17 23:19:28,0
35147,suppobox,historydelight.ru,2020-03-21 00:34:07,1,2020-03-17 23:45:04,0
35148,suppobox,historydelight.net,2020-03-21 00:34:07,1,2020-03-17 23:53:36,0
35149,suppobox,amountborrow.net,2020-03-21 00:42:39,1,2020-03-18 00:02:08,0
35150,suppobox,weatherborrow.net,2020-03-21 00:51:11,1,2020-03-18 00:02:08,0
35151,suppobox,amounttrain.net,2020-03-21 00:59:43,1,2020-03-18 00:10:40,0
35152,suppobox,amountelectricity.ru,2020-03-21 01:16:47,1,2020-03-18 00:27:44,0
35153,suppobox,amountelectricity.net,2020-03-21 01:16:47,1,2020-03-18 00:36:16,0
35154,suppobox,weatherelectricity.net,2020-03-21 01:25:19,1,2020-03-18 00:44:48,0
35155,suppobox,amountdelight.net,2020-03-21 01:33:51,1,2020-03-18 00:44:48,0


#### virut
长度固定位6的随机字母生成，长度较短随机度衡量较难

In [23]:
df_test[df_test['dga_family']=='virut'].head(10)

Unnamed: 0,dga_family,domain,e_time,label,s_time,pred
41252,virut,eogaex.com,2020-03-21 23:59:59,1,2020-03-20 00:00:00,1
41253,virut,vebrho.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1
41254,virut,leuxig.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,0
41255,virut,eoiutw.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1
41256,virut,ncodrh.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1
41257,virut,xeyewi.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1
41258,virut,iuhpfp.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1
41259,virut,lecfoy.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1
41260,virut,lvymnb.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1
41261,virut,uaqbuk.com,2020-03-21 23:59:59,1,2020-03-21 00:00:00,1


#### enviserv
长度为10，16进制表示，训练集中类似数据较少

In [26]:
df_test[df_test['dga_family']=='enviserv'].head(10)

Unnamed: 0,dga_family,domain,e_time,label,s_time,pred
236753,enviserv,02261e64b3.org,2030-01-01 00:00:00,1,1970-01-01 00:00:00,0
236754,enviserv,20c97d8c3d.info,2030-01-01 00:00:00,1,1970-01-01 00:00:00,0
236755,enviserv,5ae4d66001.biz,2030-01-01 00:00:00,1,1970-01-01 00:00:00,1
236756,enviserv,e3bea872ae.in,2030-01-01 00:00:00,1,1970-01-01 00:00:00,0
236757,enviserv,150d064880.com,2030-01-01 00:00:00,1,1970-01-01 00:00:00,0
236758,enviserv,34636b0b94.net,2030-01-01 00:00:00,1,1970-01-01 00:00:00,0
236759,enviserv,4e8414394d.org,2030-01-01 00:00:00,1,1970-01-01 00:00:00,0
236760,enviserv,d84a6a7a28.info,2030-01-01 00:00:00,1,1970-01-01 00:00:00,1
236761,enviserv,d84f49a200.biz,2030-01-01 00:00:00,1,1970-01-01 00:00:00,1
236762,enviserv,f49524f1f1.in,2030-01-01 00:00:00,1,1970-01-01 00:00:00,0
