# DGA (Data Collection & Features Extraction)

## 1. Data Collection

In [1]:
import numpy as np 
import pandas as pd
import math
from collections import Counter
import tldextract

In [2]:
# Reading dga dataset
columns_dga = ["source","domain_name","date","second","first", "time"]

dga_domains = pd.read_csv("../data/DGA/dga_project_dga_domain_list_clean.txt", sep="\s+", names=columns_dga)
dga_domains.head()

Unnamed: 0,source,domain_name,date,second,first,time
0,nymaim,vvqbhhwma.org,2021-05-11,00:00:00,2021-05-11,23:59:59
1,nymaim,cfhauqbaz.com,2021-05-11,00:00:00,2021-05-11,23:59:59
2,nymaim,bvxjsbkqu.biz,2021-05-11,00:00:00,2021-05-11,23:59:59
3,nymaim,hggazskvkdy.com,2021-05-11,00:00:00,2021-05-11,23:59:59
4,nymaim,xgoqyxgfgm.com,2021-05-11,00:00:00,2021-05-11,23:59:59


In [3]:
# Removing none useful columns 
dga_domains.drop(["source", "date", "second", "first", "time"], axis=1, inplace=True)
dga_domains["label"] = "dga"

print("DGA Dataframe has {} rows".format(dga_domains.shape[0]))
dga_domains.head()

DGA Dataframe has 1404792 rows


Unnamed: 0,domain_name,label
0,vvqbhhwma.org,dga
1,cfhauqbaz.com,dga
2,bvxjsbkqu.biz,dga
3,hggazskvkdy.com,dga
4,xgoqyxgfgm.com,dga


In [4]:
# Readin Alexa data
legit_columns = ["domain_name"]

# legit_domains = pd.read_csv("/kaggle/input/domain-generation-algorithm/dga_project_top-1m.csv", names=legit_columns)
legit_domains = pd.read_csv("../data/DGA/top-1m.csv", names=legit_columns)
legit_domains["label"] = "legit"

print("Legit Dataframe has {} rows".format(legit_domains.shape[0]))
legit_domains.head()

Legit Dataframe has 694787 rows


Unnamed: 0,domain_name,label
1,google.com,legit
2,youtube.com,legit
3,tmall.com,legit
4,qq.com,legit
5,baidu.com,legit


In [5]:
# Adjusting datasets sizes and asserting that
dga_domains = dga_domains.loc[:legit_domains.shape[0]-1, :]

assert dga_domains.shape[0] == legit_domains.shape[0]

In [6]:
# Concatenating both datasets
data = pd.concat([dga_domains, legit_domains])

print("Whole Dataset has {} rows".format(data.shape[0]))
assert data.shape[0] == legit_domains.shape[0] * 2
data.head()

Whole Dataset has 1389574 rows


Unnamed: 0,domain_name,label
0,vvqbhhwma.org,dga
1,cfhauqbaz.com,dga
2,bvxjsbkqu.biz,dga
3,hggazskvkdy.com,dga
4,xgoqyxgfgm.com,dga


## 2. DATA PREPROCESSING
The following features will be extracted from the dga to classification
<ol>
    <li>Structural Features
        <ul>
            <li>Domain Name Length</li>
            <li>Number of Subdomains</li>
            <li>Subdomain Length Mean</li>
            <li>Has www Prefix</li>
            <li>Has valid TLD</li>
            <li>Contains Single-Character Subdomain</li>
            <li>Contains TLD as Subdomain</li>
            <li>Underscore Ratio</li>
            <li>Contains IP Address</li>
        </ul>
    </li>
    <br>
    <li>Linguistic Features
        <ul>
            <li>Contains Digits</li>
            <li>Vowel Ratio</li>
            <li>Digit Ratio</li>
            <li>Ratio of Repeated Characters</li>
            <li>Ratio of Consecutive Consonants</li>
            <li>Ratio of Consecutive Digits</li>  
        </ul>
    </li>
    <br>
    <li>Statistical Features
        <ul>
            <li>Entropy</li>
            <li>words gram</li>
            <li>alexia gram</li>
        </ul>
    </li>
</ol>

In [7]:
def labelTo_Binary(type):
  # Convert Type to Binary variable DGA = 1, Normal = 0
  if type == 'dga':
    return 1
  else:
    return 0

data['label'] = data['label'].apply(lambda i: labelTo_Binary(i))

In [8]:
import re
import collections
from publicsuffixlist import PublicSuffixList

In [9]:
psl = PublicSuffixList()

In [10]:
# Load Valid Top Level Domains data
def load_topLevelDomain():
  topLevelDomain = []
  with open('../data/DGA/tlds-alpha-by-domain.txt', 'r') as content:
      for line in content:
          topLevelDomain.append((line.strip('\n')))
  return topLevelDomain

In [11]:
def ignoreVPS(domain):
    # Return the rest of domain after ignoring the Valid Public Suffixes:
    validPublicSuffix = '.' + psl.publicsuffix(domain)
    if len(validPublicSuffix) < len(domain):
         # If it has VPS
        subString = domain[0: domain.index(validPublicSuffix)]  
    elif len(validPublicSuffix) == len(domain):
        return 0
    else:
        # If not
        subString = domain
    
    return subString

### 2.1 Structural Features

In [12]:
def domain_length(domain):
  # Generate Domain Name Length (DNL)
  return len(domain)

In [13]:
def subdomains_number(domain):
  # Generate Number of Subdomains (NoS)
  subdomain = ignoreVPS(domain)
  return (subdomain.count('.') + 1)

In [14]:
def subdomain_length_mean(domain):
  # enerate Subdomain Length Mean (SLM) 
  subdomain = ignoreVPS(domain)
  result = (len(subdomain) - subdomain.count('.')) / (subdomain.count('.') + 1)
  return result

In [15]:
def has_www_prefix(domain):
  # Generate Has www Prefix (HwP)
  if domain.split('.')[0] == 'www':
    return 1
  else:
    return 0

In [16]:
def has_hvltd(domain):
  topLevelDomain = load_topLevelDomain()
  # Generate Has a Valid Top Level Domain (HVTLD)
  if domain.split('.')[len(domain.split('.')) - 1].upper() in topLevelDomain:
    return 1
  else:
    return 0

In [17]:
def contains_single_character_subdomain(domain):
  # Generate Contains Single-Character Subdomain (CSCS) 
  domain = ignoreVPS(domain)
  str_split = domain.split('.')
  minLength = len(str_split[0])
  for i in range(0, len(str_split) - 1):
      minLength = len(str_split[i]) if len(str_split[i]) < minLength else minLength
  if minLength == 1:
    return 1
  else:
    return 0

In [18]:
def contains_TLD_subdomain(domain):
  # Generate Contains TLD as Subdomain (CTS)
  subdomain = ignoreVPS(domain)
  str_split = subdomain.split('.')
  topLevelDomain = load_topLevelDomain()
  for i in range(0, len(str_split) - 1):
        if str_split[i].upper() in topLevelDomain:
            return 1
  return 0

In [19]:
def underscore_ratio(domain):
  # Generate Underscore Ratio (UR) on dataset
  subString = ignoreVPS(domain)
  result = subString.count('_') / (len(subString) - subString.count('.'))
  return result

In [20]:
def contains_IP_address(domain):
  # Generate Contains IP Address (CIPA) on datasetx
    splitSet = domain.split('.')
    for element in splitSet:
        if(re.match("\d+", element)) == None:
            return 0
    return 1 

### 2.2 Linguistic Features

In [21]:
def contains_digit(domain):
  """
   Contains Digits 
  """
  subdomain = ignoreVPS(domain)
  for item in subdomain:
    if item.isdigit():
      return 1
  return 0

In [22]:
def vowel_ratio(domain):
  """
  calculate Vowel Ratio 
  """
  VOWELS = set('aeiou')
  v_counter = 0
  a_counter = 0
  subdomain = ignoreVPS(domain)
  for item in subdomain:
    if item.isalpha():
      a_counter+=1
      if item in VOWELS:
        v_counter+=1
  if a_counter>1:
    ratio = v_counter/a_counter
    return ratio

In [23]:
def digit_ratio(domain):
  """
  calculate digit ratio
  """
  d_counter = 0
  counter = 0
  subdomain = ignoreVPS(domain)
  for item in subdomain:
    if item.isalpha() or item.isdigit():
      counter+=1
      if item.isdigit():
        d_counter+=1
  if counter>1:
    ratio = d_counter/counter
    return ratio

In [24]:
def prc_rrc(domain):
  """
  calculate the Ratio of Repeated Characters in a subdomain
  """
  subdomain = ignoreVPS(domain)
#   subdomain =''.join(re.findall('[a-zA-Z]+', subdomain)) 
  subdomain = re.sub("[.]", "", subdomain)
  char_num=0
  repeated_char_num=0
  d = collections.defaultdict(int)
  for c in list(subdomain):
      d[c] += 1
  for item in d:
    char_num +=1
    if d[item]>1:
      repeated_char_num +=1
  ratio = repeated_char_num/char_num
  return ratio

In [25]:
def prc_rcc(domain):
  """
  calculate the Ratio of Consecutive Consonants
  """
  VOWELS = set('aeiou')
  counter = 0
  cons_counter=0
  subdomain = ignoreVPS(domain)
#   subdomain =''.join(re.findall('[a-zA-Z]+', subdomain)) 
  for item in subdomain:
    i = 0
    if item.isalpha() and item not in VOWELS:
      counter+=1
    else:
      if counter>1:
        cons_counter+=counter
      counter=0
    i+=1
  if i==len(subdomain) and counter>1:
    cons_counter+=counter
  ratio = cons_counter/len(subdomain)
  return ratio

In [26]:
def prc_rcd(domain):
  """
  calculate the ratio of consecutive digits
  """
  counter = 0
  digit_counter=0
  subdomain = ignoreVPS(domain)
#   subdomain =''.join(re.findall('[a-zA-Z]+', subdomain)) 
  for item in subdomain:
    i = 0
    if item.isdigit():
      counter+=1
    else:
      if counter>1:
        digit_counter+=counter
      counter=0
    i+=1
  if i==len(subdomain) and counter>1:
    digit_counter+=counter
  ratio = digit_counter/len(subdomain)
  return ratio

### 2.3 Statistical Features

In [27]:
def prc_entropy(domain):
    """
    calculate the entropy of subdomain
    :param domain_str: subdomain
    :return: the value of entropy
    """
    subdomain = ignoreVPS(domain)
    # get probability of chars in string
    prob = [float(subdomain.count(c)) / len(subdomain) for c in dict.fromkeys(list(subdomain))]

    # calculate the entropy
    entropy = - sum([p * math.log(p) / math.log(2.0) for p in prob])
    return entropy

In [28]:
def extract_features():
  data['DNL'] = data['domain_name'].apply(lambda x: domain_length(x))
  data['NoS'] = data['domain_name'].apply(lambda x: subdomains_number(x))
  data['SLM'] = data['domain_name'].apply(lambda x: subdomain_length_mean(x))
  data['HwP'] = data['domain_name'].apply(lambda x: has_www_prefix(x))
  data['HVTLD'] = data['domain_name'].apply(lambda x: has_hvltd(x))
  data['CSCS'] = data['domain_name'].apply(lambda x: contains_single_character_subdomain(x))
  data['CTS'] = data['domain_name'].apply(lambda x: contains_TLD_subdomain(x))
  data['UR'] = data['domain_name'].apply(lambda x: underscore_ratio(x))
  data['CIPA'] = data['domain_name'].apply(lambda x: contains_IP_address(x))
  data['contains_digit']= data['domain_name'].apply(lambda x:contains_digit(x))
  data['vowel_ratio']= data['domain_name'].apply(lambda x:vowel_ratio(x))
  data['digit_ratio']= data['domain_name'].apply(lambda x:digit_ratio(x))
  data['RRC']= data['domain_name'].apply(lambda x:prc_rrc(x))
  data['RCC']= data['domain_name'].apply(lambda x:prc_rcc(x))
  data['RCD']= data['domain_name'].apply(lambda x:prc_rcd(x))
  data['Entropy']= data['domain_name'].apply(lambda x:prc_entropy(x))

In [29]:
extract_features()
data.head()

Unnamed: 0,domain_name,label,DNL,NoS,SLM,HwP,HVTLD,CSCS,CTS,UR,CIPA,contains_digit,vowel_ratio,digit_ratio,RRC,RCC,RCD,Entropy
0,vvqbhhwma.org,1,13,1,9.0,0,1,0,0,0.0,0,0,0.111111,0.0,0.285714,0.888889,0.0,2.725481
1,cfhauqbaz.com,1,13,1,9.0,0,1,0,0,0.0,0,0,0.333333,0.0,0.125,0.555556,0.0,2.947703
2,bvxjsbkqu.biz,1,13,1,9.0,0,1,0,0,0.0,0,0,0.111111,0.0,0.125,0.888889,0.0,2.947703
3,hggazskvkdy.com,1,15,1,11.0,0,1,0,0,0.0,0,0,0.090909,0.0,0.222222,0.272727,0.0,3.095795
4,xgoqyxgfgm.com,1,14,1,10.0,0,1,0,0,0.0,0,0,0.1,0.0,0.285714,0.2,0.0,2.646439


In [32]:
def entropy(domain_name):
    """ Function which computes the entropy of a given domain name based on it's chars """
    elements, length = Counter(domain_name), len(domain_name)    
    
    return -sum(element/length * math.log(element/length, 2) for element in elements.values())

def get_domain_name(domain):
    """ Function which extracts domain name from subdomain name """
    res = tldextract.extract(domain)
    return res.domain if len(res.domain) > len(res.subdomain) or entropy(res.domain) > entropy(res.subdomain) else res.subdomain


data["domain"] = data["domain_name"].apply(lambda domain: get_domain_name(domain))

In [34]:
data.head()

Unnamed: 0,domain_name,label,DNL,NoS,SLM,HwP,HVTLD,CSCS,CTS,UR,CIPA,contains_digit,vowel_ratio,digit_ratio,RRC,RCC,RCD,Entropy,domain
0,vvqbhhwma.org,1,13,1,9.0,0,1,0,0,0.0,0,0,0.111111,0.0,0.285714,0.888889,0.0,2.725481,vvqbhhwma
1,cfhauqbaz.com,1,13,1,9.0,0,1,0,0,0.0,0,0,0.333333,0.0,0.125,0.555556,0.0,2.947703,cfhauqbaz
2,bvxjsbkqu.biz,1,13,1,9.0,0,1,0,0,0.0,0,0,0.111111,0.0,0.125,0.888889,0.0,2.947703,bvxjsbkqu
3,hggazskvkdy.com,1,15,1,11.0,0,1,0,0,0.0,0,0,0.090909,0.0,0.222222,0.272727,0.0,3.095795,hggazskvkdy
4,xgoqyxgfgm.com,1,14,1,10.0,0,1,0,0,0.0,0,0,0.1,0.0,0.285714,0.2,0.0,2.646439,xgoqyxgfgm


In [35]:
# A choice of 3 to 5 grams seems okay

from sklearn.feature_extraction.text import CountVectorizer


split_condition = data["label"] == 0
legit = data[split_condition]
dga = data[~split_condition]

alexa_vc = CountVectorizer(analyzer="char", ngram_range=(3,5), min_df=0.00001, max_df=1.0)
counts_matrix = alexa_vc.fit_transform(legit["domain"])

counts_matrix

<694787x203095 sparse matrix of type '<class 'numpy.int64'>'
	with 13635372 stored elements in Compressed Sparse Row format>

In [36]:
alexa_counts = np.log10(np.asarray(counts_matrix.sum(axis=0)).flatten())
ngrams_list = alexa_vc.get_feature_names_out()
print(ngrams_list[100:200])

['--btb' '--c' '--c1' '--c1a' '--ct' '--ctb' '--d' '--d1' '--d1a' '--dt'
 '--dtb' '--e' '--e1' '--e1a' '--f' '--g' '--g1' '--g1a' '--gt' '--gtb'
 '--h' '--h1' '--h1a' '--ht' '--htb' '--i' '--it' '--itb' '--j' '--j1'
 '--j1a' '--jt' '--jtb' '--k' '--l' '--l3' '--l3c' '--m' '--mg' '--mgb'
 '--n' '--o' '--p' '--q' '--r' '--s' '--t' '--u' '--v' '--w' '--y' '--ym'
 '--ymc' '--z' '-01' '-1-' '-10' '-100' '-101' '-11' '-12' '-123' '-12c'
 '-12ca' '-12cl' '-13' '-14' '-15' '-16' '-168' '-17' '-18' '-19' '-1x'
 '-1x2' '-2-' '-20' '-200' '-201' '-202' '-2020' '-2021' '-21' '-22'
 '-22c' '-23' '-24' '-24-' '-24-6' '-25' '-26' '-28' '-29' '-3-' '-30'
 '-31' '-32' '-33' '-33-' '-34']


In [37]:
sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=lambda alexa_count: alexa_count[1], reverse=True)
print("Alexa NGrams {}".format(len(sorted_ngrams)))
for ngram, count in sorted_ngrams[:10]:
    print(ngram, count)

Alexa NGrams 203095
ing 4.3956233943558365
ion 4.244821194593283
ent 4.227475343482371
ine 4.2259034449261605
ter 4.213623993416087
the 4.206771923055583
lin 4.1798389280231865
and 4.177103432436536
ers 4.123099955255574
tor 4.11747016362012


In [43]:
data["alexa_grams"] = alexa_counts * alexa_vc.transform(data["domain"]).T
data[data["label"] == 0].head()

Unnamed: 0,domain_name,label,DNL,NoS,SLM,HwP,HVTLD,CSCS,CTS,UR,...,contains_digit,vowel_ratio,digit_ratio,RRC,RCC,RCD,Entropy,domain,alexa_grams,words_grams
1,google.com,0,10,1,6.0,0,1,0,0,0.0,...,0,0.5,0.0,0.5,0.333333,0.0,1.918296,google,25.014933,34.23263
2,youtube.com,0,11,1,7.0,0,1,0,0,0.0,...,0,0.571429,0.0,0.166667,0.0,0.0,2.521641,youtube,32.872987,42.128381
3,tmall.com,0,9,1,5.0,0,1,0,0,0.0,...,0,0.2,0.0,0.25,0.4,0.0,1.921928,tmall,16.879412,30.07428
4,qq.com,0,6,1,2.0,0,1,0,0,0.0,...,0,0.0,0.0,1.0,0.0,0.0,-0.0,qq,0.0,0.0
5,baidu.com,0,9,1,5.0,0,1,0,0,0.0,...,0,0.6,0.0,0.0,0.0,0.0,2.321928,baidu,12.181409,20.700835


In [44]:
words_df = pd.read_csv("../data/DGA/words.txt", names=["word"],
                             encoding="utf-8", header=None, dtype={"word": np.str_}, sep='\t')
words_df.head()

Unnamed: 0,word
0,2
1,1080
2,&c
3,10-point
4,10th


In [45]:
def clean_words_df(word):
    return str(word).strip().lower()

def keep_alphanumeric(word):
    return str(word).isalpha()



words_df = words_df[words_df["word"].map(lambda word: str(word).isalpha())]
words_df = words_df.applymap(lambda word: str(word).strip().lower())

words_df = words_df.dropna()
words_df = words_df.drop_duplicates()

In [46]:
dict_cv = CountVectorizer(analyzer="char", ngram_range=(3,5), min_df=0.00001, max_df=1.0)
words_counts_matrix = dict_cv.fit_transform(words_df["word"])

dict_counts = np.log(np.asarray(words_counts_matrix.sum(axis=0)).flatten())
words_ngrams_list = dict_cv.get_feature_names_out()

print(words_ngrams_list[100:200])

['abbed' 'abber' 'abbes' 'abbet' 'abbey' 'abbi' 'abbie' 'abbil' 'abbin'
 'abbis' 'abbit' 'abbl' 'abble' 'abbli' 'abbo' 'abbot' 'abbr' 'abbre'
 'abbro' 'abby' 'abc' 'abd' 'abda' 'abde' 'abdi' 'abdic' 'abdit' 'abdo'
 'abdoc' 'abdom' 'abdop' 'abdos' 'abdu' 'abduc' 'abe' 'abea' 'abeat'
 'abec' 'abece' 'abecu' 'abed' 'abee' 'abef' 'abei' 'abel' 'abela' 'abele'
 'abeli' 'abell' 'abels' 'abem' 'abema' 'aben' 'abena' 'abend' 'aber'
 'aberd' 'aberg' 'abern' 'aberr' 'abers' 'abes' 'abet' 'abete' 'abeth'
 'abeti' 'abeto' 'abett' 'abey' 'abeya' 'abez' 'abf' 'abh' 'abha' 'abho'
 'abhor' 'abi' 'abia' 'abial' 'abian' 'abiat' 'abib' 'abic' 'abica'
 'abich' 'abici' 'abid' 'abida' 'abide' 'abidi' 'abie' 'abies' 'abiet'
 'abif' 'abifi' 'abify' 'abig' 'abiga' 'abil' 'abila']


In [47]:
data["words_grams"] = dict_counts * dict_cv.transform(data["domain"]).T
data.head()

Unnamed: 0,domain_name,label,DNL,NoS,SLM,HwP,HVTLD,CSCS,CTS,UR,...,contains_digit,vowel_ratio,digit_ratio,RRC,RCC,RCD,Entropy,domain,alexa_grams,words_grams
0,vvqbhhwma.org,1,13,1,9.0,0,1,0,0,0.0,...,0,0.111111,0.0,0.285714,0.888889,0.0,2.725481,vvqbhhwma,3.480582,4.430817
1,cfhauqbaz.com,1,13,1,9.0,0,1,0,0,0.0,...,0,0.333333,0.0,0.125,0.555556,0.0,2.947703,cfhauqbaz,11.153124,14.986492
2,bvxjsbkqu.biz,1,13,1,9.0,0,1,0,0,0.0,...,0,0.111111,0.0,0.125,0.888889,0.0,2.947703,bvxjsbkqu,4.006466,0.0
3,hggazskvkdy.com,1,15,1,11.0,0,1,0,0,0.0,...,0,0.090909,0.0,0.222222,0.272727,0.0,3.095795,hggazskvkdy,11.862345,10.112126
4,xgoqyxgfgm.com,1,14,1,10.0,0,1,0,0,0.0,...,0,0.1,0.0,0.285714,0.2,0.0,2.646439,xgoqyxgfgm,3.497759,0.0


In [48]:
data.to_csv("DGA_Processed.csv")

In [None]:
vec = open("../models/alexa_vc.pkl","wb")
joblib.dump(alexa_vc, vec)
vec.close()

In [None]:
vec = open("../models/dict_cv.pkl","wb")
joblib.dump(dict_cv, vec)
vec.close()