## 冠军解决方案复现

In [240]:
import pandas as pd
# Load the candidate domains into a single 'domain' column; because `names` is
# given, the file is read as headerless.
df1 = pd.read_csv("../DataCon2020/dga/dns_2_question_a1805e67f3a33814e3eb5d5ce609996299b3835b/domains_1.txt",names=['domain'])

In [241]:
# Report how many domains were loaded.
print("域名总数:{}".format(len(df1)))

域名总数:14110


### 获取域名的DNS记录个数，过滤记录数大于等于3的域名

In [242]:
import socket
import whois
import  dns.resolver


def get_DNS_Record_Nums(domain):
    """
    Count the DNS records returned when resolving ``domain``.

    The domain is resolved with ``dns.resolver.resolve`` (no record type is
    passed, so the resolver's default is used) and the sizes of all RRsets in
    the answer section of the response are summed.

    Parameters
    ----------
    domain : str
        Domain name to look up.

    Returns
    -------
    int
        Total number of records in the answer section, or 0 when the lookup
        fails for any reason (the lookup is best effort).
    """
    try:
        answer = dns.resolver.resolve(domain).response.answer
        return sum(len(rrset) for rrset in answer)
    except Exception:
        # Any resolution failure counts as "no records". Catching Exception
        # rather than using a bare `except:` lets KeyboardInterrupt and
        # SystemExit propagate, so a long batch run can still be stopped.
        return 0

In [243]:
from joblib import Parallel,delayed

def tmp_func(df1):
    """
    Add a ``dns_record_nums`` column holding the DNS record count per domain.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame (or group of a grouped frame) with a ``domain`` column. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with the ``dns_record_nums`` column filled in by
        ``get_DNS_Record_Nums``.
    """
    # Imported locally so the function does not depend on `tqdm` having been
    # brought into the kernel namespace by some other cell.
    from tqdm import tqdm

    # Registers `progress_apply` on pandas objects.
    tqdm.pandas(ncols=50)
    df1["dns_record_nums"] = df1.progress_apply(lambda x:get_DNS_Record_Nums(x.domain),axis=1)
    return df1
                  
def apply_parallel(df_grouped,func,n_jobs=30):
    """
    Apply ``func`` to every group of a grouped frame in parallel with joblib.

    Parameters
    ----------
    df_grouped : iterable of (name, group) pairs
        Typically the result of ``DataFrame.groupby``.
    func : callable
        Called once per group; must return a DataFrame.
    n_jobs : int, optional
        Number of joblib workers. Defaults to 30, the value previously
        hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-group results.
    """
    results = Parallel(n_jobs=n_jobs)(delayed(func)(group) for _, group in df_grouped)
    return pd.concat(results)
    
# Group on the row index so that every distinct index value becomes a separate
# joblib task (one row per task, assuming the index is unique — TODO confirm).
df_grouped = df1.groupby(df1.index)
# `%time` is an IPython magic that prints the CPU and wall time of the statement.
%time df1 = apply_parallel(df_grouped,tmp_func)

CPU times: user 1min 16s, sys: 2.11 s, total: 1min 18s
Wall time: 4min 14s


In [244]:
# Keep only the domains with fewer than 3 DNS records (0, 1 or 2).
# NOTE(review): the message below says "more than 3", but domains with exactly
# 3 records are dropped as well — confirm the intended threshold.
df1 = df1[df1['dns_record_nums']<3]
print("过滤DNS记录数大于3的域名后剩余域名数量:{}".format(df1.shape[0]))

过滤DNS记录数大于3的域名后剩余域名数量:12955


### 获取域名对应的子域名个数，过滤子域名个数大于等于3的域名

In [245]:
import requests
import re
import sys

def get_subdomain(url):
    """
    Look up the sub-domains that site.ip138.com lists for a domain.

    Parameters
    ----------
    url : str
        Domain to query; it is interpolated into the ip138 lookup URL.

    Returns
    -------
    list of str
        Sub-domain strings scraped from the result page. The list is empty
        when the request fails or the page contains no match.
    """
    # NOTE(review): the Cookie value is a hard-coded browser session and is
    # presumably stale — consider loading it from configuration or dropping it.
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'Cookie': 'PHPSESSID=8d3k4g8sub7s34mn73c8r12aq7; Hm_lvt_d39191a0b09bb1eb023933edaa468cd5=1599205303; PHPSESSID=skmtbkigevkdbvaj2jq311gtne; Hm_lpvt_d39191a0b09bb1eb023933edaa468cd5=1599206267'
    }

    api = "http://site.ip138.com/%s/domain.htm" % url

    try:
        html = requests.get(api,headers=headers,timeout=10).text
    except requests.RequestException as e:
        # Connection errors, timeouts and similar failures are non-fatal:
        # report them and continue with an empty page so that the caller
        # sees zero sub-domains for this domain.
        html = ''
        print(e)

    # Each sub-domain on the result page is rendered as a target="_blank" link
    # closing with </a></p>; capture the link text.
    return re.findall(r"\"_blank\">(.*?)</a></p>",html)


In [246]:
def tmp_func(df1):
    """
    Add a ``subdomain`` column holding the number of sub-domains per domain.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame (or group of a grouped frame) with a ``domain`` column. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``subdomain`` set to the length of the list
        returned by ``get_subdomain`` for each row.
    """
    # Imported locally so the function does not depend on `tqdm` having been
    # brought into the kernel namespace by some other cell.
    from tqdm import tqdm

    # Registers `progress_apply` on pandas objects.
    tqdm.pandas(ncols=50)
    df1["subdomain"] = df1.progress_apply(lambda x:len(get_subdomain(x.domain)),axis=1)
    return df1
                  
def apply_parallel(df_grouped,func,n_jobs=30):
    """
    Apply ``func`` to every group of a grouped frame in parallel with joblib.

    Parameters
    ----------
    df_grouped : iterable of (name, group) pairs
        Typically the result of ``DataFrame.groupby``.
    func : callable
        Called once per group; must return a DataFrame.
    n_jobs : int, optional
        Number of joblib workers. Defaults to 30, the value previously
        hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-group results.
    """
    results = Parallel(n_jobs=n_jobs)(delayed(func)(group) for _, group in df_grouped)
    return pd.concat(results)
    
# Group on the row index so that every distinct index value becomes a separate
# joblib task (one row per task, assuming the index is unique — TODO confirm).
df_grouped = df1.groupby(df1.index)
# `%time` is an IPython magic that prints the CPU and wall time of the statement.
%time df1 = apply_parallel(df_grouped,tmp_func)

CPU times: user 1min 27s, sys: 1.56 s, total: 1min 29s
Wall time: 1min 30s


In [247]:
# Keep only the domains for which fewer than 3 sub-domains were found.
# NOTE(review): the message below says "more than 3", but domains with exactly
# 3 sub-domains are dropped as well — confirm the intended threshold.
df1 = df1[df1['subdomain']<3]
print("过滤子域名大于3的域名后剩余可疑域名个数{}".format(df1.shape[0]))

过滤子域名大于3的域名后剩余可疑域名个数11173


### 过滤alexa 1m中的域名

In [248]:
# Load the Alexa top-1M list as two columns, 'rank' and 'domain'; because
# `names` is given, the file is read as headerless.
df_alexa = pd.read_csv("./../../../data/alexa-top-1m.csv",names=['rank','domain'])

In [249]:
# Drop every candidate whose full 'domain' string appears verbatim in the
# Alexa list (exact string match, no normalisation is applied here).
df1 = df1[~df1['domain'].isin(df_alexa['domain'])]

In [250]:
# Report how many candidates survive the Alexa whitelist filter.
print("过滤alexa域名1m后剩余域名数量为{}".format(len(df1)))

过滤alexa域名1m后剩余域名数量为9973


### 使用英文进行分词

In [251]:
import tldextract

def get_contain_english_words(input_str, dictionary_file='SCOWL-wordlist.txt'):
    """
    Find the English words contained in the registered-name part of a domain.

    The domain is lower-cased and reduced by ``tldextract`` to its ``domain``
    component (sub-domains and public suffix removed). Every substring of
    that label is then looked up in the word list; only words longer than two
    characters are reported.

    Parameters
    ----------
    input_str : str
        Domain name to analyse.
    dictionary_file : str, optional
        Path of a whitespace-separated word list. Defaults to
        'SCOWL-wordlist.txt'.

    Returns
    -------
    list of str
        Sorted list of the distinct dictionary words found in the label.
    """
    # Use a context manager so the file handle is closed after reading.
    with open(dictionary_file,'r') as fh:
        dictionary = set(fh.read().split())
    # No candidate longer than the longest dictionary word can match;
    # default=0 keeps an empty word list from raising.
    max_len = max(map(len, dictionary), default=0)

    input_str = input_str.lower().rstrip()

    # Strip sub-domains and the public suffix, then remove spaces.
    extracted = tldextract.extract(input_str)
    justfound = extracted.domain.replace(" ","")

    words_found = set()
    for i in range(len(justfound)):
        # Slice the stripped label, not the raw input: the loop index refers
        # to positions in `justfound`, so slicing `input_str` would scan the
        # wrong text whenever the input has a sub-domain prefix.
        chunk = justfound[i:i+max_len]
        # Words of one or two characters are ignored, so start at length 3.
        for j in range(3,len(chunk)+1):
            word = chunk[:j]
            if word in dictionary:
                words_found.add(word)

    return sorted(words_found)

def get_max_english_words_len(domain):
    """
    Length of the longest English word that can be matched inside a domain.

    Parameters
    ----------
    domain : str
        Domain name to analyse.

    Returns
    -------
    int
        Length of the longest word returned by ``get_contain_english_words``,
        or 0 when nothing matched.
    """
    matched = get_contain_english_words(domain)
    return max(map(len, matched), default=0)
    

    
    
from joblib import Parallel,delayed


def tmp(df):
    """
    Add a ``max_english_word_len`` column to ``df`` (modified in place).

    Each row receives the length of the longest English word found in its
    ``domain`` value; the same frame is returned.
    """
    def _longest_word(row):
        return get_max_english_words_len(row.domain)

    df['max_english_word_len'] = df.apply(_longest_word,axis=1)
    return df

def apply_Parallel(df_grouped,func,n_jobs=30):
    """
    Apply ``func`` to every group of a grouped frame in parallel with joblib.

    Parameters
    ----------
    df_grouped : iterable of (name, group) pairs
        Typically the result of ``DataFrame.groupby``.
    func : callable
        Called once per group; must return a DataFrame.
    n_jobs : int, optional
        Number of joblib workers. Defaults to 30, the value previously
        hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-group results.
    """
    results = Parallel(n_jobs=n_jobs)(delayed(func)(group) for _, group in df_grouped)
    return pd.concat(results)


# Group on the row index so that every distinct index value becomes a separate
# joblib task (one row per task, assuming the index is unique — TODO confirm).
df1_grouped = df1.groupby(df1.index)
# Compute the longest-English-word feature for all remaining domains.
df1 = apply_Parallel(df1_grouped,tmp)

In [252]:
# Keep domains whose longest matched English word is shorter than 4 letters.
# Matches are at least 3 letters long, so this keeps domains with no match (0)
# or with three-letter matches only.
df1 = df1[df1['max_english_word_len']<4]
print("去除包含英文单词的域名后剩余域名个数:{}".format(df1.shape[0]))

去除包含英文单词的域名后剩余域名个数:1795


### 使用拼音进行分词

In [269]:
def _get_pinyin_dicionary(dictionary_file="pinyin.txt"):
    """
    Build the set of pinyin syllables used for matching from a raw pinyin file.

    The file is split on whitespace. Tokens that bundle alternatives with "/"
    (e.g. "hao/fei") are expanded into their parts ("hao", "fei"), and every
    syllable of two characters or fewer is discarded because such short
    matches are too likely to occur by chance.

    Parameters
    ----------
    dictionary_file : str, optional
        Path of the whitespace-separated pinyin list. Defaults to "pinyin.txt".

    Returns
    -------
    set of str
        Distinct syllables that are longer than two characters.
    """
    # Use a context manager so the file handle is closed after reading.
    with open(dictionary_file,'r') as fh:
        tokens = fh.read().split()

    # str.split("/") returns the token itself when it contains no slash, so a
    # single code path covers both plain and slash-separated entries.
    return {
        syllable
        for token in tokens
        for syllable in token.split("/")
        if len(syllable) > 2
    }
    
#     # 存储为csv格式            
#     pd.DataFrame(pinyin_dictionary).to_csv("pinyin.csv",index=False,header=False)
    
    
def get_contain_pinyin_words(input_str, dictionary_file="pinyin.txt"):
    """
    Find the pinyin syllables contained in the registered-name part of a domain.

    The domain is lower-cased and reduced by ``tldextract`` to its ``domain``
    component (sub-domains and public suffix removed). Every substring of
    that label is then looked up in the pinyin dictionary built by
    ``_get_pinyin_dicionary``; only syllables longer than two characters are
    reported.

    Parameters
    ----------
    input_str : str
        Domain name to analyse.
    dictionary_file : str, optional
        Path of the raw pinyin list. Defaults to "pinyin.txt".

    Returns
    -------
    list of str
        Sorted list of the distinct pinyin syllables found in the label.
    """
    dictionary = _get_pinyin_dicionary(dictionary_file=dictionary_file)
    # No candidate longer than the longest syllable can match; default=0
    # keeps an empty dictionary from raising.
    max_len = max(map(len, dictionary), default=0)

    input_str = input_str.lower().rstrip()

    # Strip sub-domains and the public suffix, then remove spaces.
    extracted = tldextract.extract(input_str)
    justfound = extracted.domain.replace(" ","")

    words_found = set()
    for i in range(len(justfound)):
        # Slice the stripped label, not the raw input: the loop index refers
        # to positions in `justfound`, so slicing `input_str` would scan the
        # wrong text whenever the input has a sub-domain prefix.
        chunk = justfound[i:i+max_len]
        # Syllables of one or two characters are ignored, so start at length 3.
        for j in range(3,len(chunk)+1):
            word = chunk[:j]
            if word in dictionary:
                words_found.add(word)

    return sorted(words_found)

def get_pinyin_nums(domain):
    """
    Count the distinct pinyin syllables that can be matched inside a domain.

    Parameters
    ----------
    domain : str
        Domain name to analyse.

    Returns
    -------
    int
        Number of entries returned by ``get_contain_pinyin_words``.
    """
    return len(get_contain_pinyin_words(domain))


def tmp(df):
    """
    Add a ``pinyin_nums`` column to ``df`` (modified in place).

    Each row receives the number of pinyin syllables matched in its
    ``domain`` value; the same frame is returned.
    """
    def _count_pinyin(row):
        return get_pinyin_nums(row.domain)

    df['pinyin_nums'] = df.apply(_count_pinyin,axis=1)
    return df

def apply_Parallel(df_grouped,func,n_jobs=30):
    """
    Apply ``func`` to every group of a grouped frame in parallel with joblib.

    Parameters
    ----------
    df_grouped : iterable of (name, group) pairs
        Typically the result of ``DataFrame.groupby``.
    func : callable
        Called once per group; must return a DataFrame.
    n_jobs : int, optional
        Number of joblib workers. Defaults to 30, the value previously
        hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-group results.
    """
    results = Parallel(n_jobs=n_jobs)(delayed(func)(group) for _, group in df_grouped)
    return pd.concat(results)

In [270]:
# Group on the row index so that every distinct index value becomes a separate
# joblib task (one row per task, assuming the index is unique — TODO confirm).
df1_grouped = df1.groupby(df1.index)
# Compute the pinyin-match count for all remaining domains.
df1 = apply_Parallel(df1_grouped,tmp)

# Keep only the domains in which no pinyin syllable was matched.
df1 = df1[df1['pinyin_nums']==0]

In [271]:
# Report how many suspicious domains remain after the pinyin filter.
# The message reads "可疑域名" (suspicious domains), matching the wording of
# the sub-domain filter message; "可以域名" was a typo.
print("使用拼音进行过滤后剩余可疑域名数量:{}".format(df1.shape[0]))

使用拼音进行过滤后剩余可以域名数量:238


### 根据熵值对域名进行排序

In [258]:
def cal_entropy(text):
    """
    Shannon entropy (base 2) of the letter distribution of ``text``.

    Only the ASCII letters a-z are counted, case-insensitively; digits,
    hyphens and any other characters are ignored.

    Parameters
    ----------
    text : str
        String to analyse.

    Returns
    -------
    str
        The entropy formatted with four decimals ("%.4f"). "0.0000" is
        returned when ``text`` contains no ASCII letter.
    """
    # Imported locally so the function also works on a fresh kernel where no
    # other cell has imported `math`.
    import math

    counts = [0] * 26
    for ch in text.lower():
        # An explicit a-z range (rather than str.isalpha) keeps non-ASCII
        # letters from producing an index outside the 26 counters.
        if 'a' <= ch <= 'z':
            counts[ord(ch) - ord('a')] += 1

    total = sum(counts)
    entropy = 0.0
    # With no letters at all there is no distribution; report zero entropy
    # instead of dividing by zero.
    if total:
        for count in counts:
            if count:
                p = count / total
                entropy -= p * math.log(p, 2)
    return "%.4f"%(entropy)


# Attach the entropy of each domain's registered-name label.
# `.assign` builds a new frame instead of writing a column into a frame that
# was produced by boolean filtering, which avoids pandas' chained-assignment
# warning. The column name keeps its existing spelling ('entorpy') because the
# later sort refers to it by that name.
df1 = df1.assign(entorpy=df1.apply(lambda x:cal_entropy(tldextract.extract(x.domain).domain),axis=1))

In [264]:
# Highest entropy first. The column holds the "%.4f" strings produced by
# cal_entropy, so this is a string sort; it agrees with numeric order only
# while every value has a single integer digit.
df1 = df1.sort_values("entorpy",ascending=False)

In [266]:
# Display the first 100 rows of the entropy-sorted frame.
df1.head(100)

Unnamed: 0,domain,dns_record_nums,subdomain,max_english_word_len,pin_nums,pinyin_nums,entorpy
11571,tvznmabcdefg.com,0,1,3,0,0,3.5850
7810,mycupofteahk.com,1,2,3,0,0,3.5850
12198,wlcbsjnqrmfy.com,1,1,3,0,0,3.5850
177,akwsryilodjt.com,0,1,0,0,0,3.5850
4054,gjdferqmiiuz.com,0,1,3,0,0,3.4183
...,...,...,...,...,...,...,...
13389,ytslsqshflzx.com,0,1,0,0,0,3.0221
12291,wtjcqntwdtgs.com,0,1,0,0,0,3.0221
12825,xogkpytfgyzy.com,0,1,0,0,0,3.0221
14053,znfwfqoeogfi.com,0,1,0,0,0,3.0221


In [268]:
# Write the first 100 domain names to the result file, one per line, without
# index or header.
df1['domain'].head(100).to_csv("dga2_1_result.csv",index=False,header=False)