In [1]:
import re
import datetime
import enchant
import numpy as np
import pandas as pd
from tld import parse_tld

# Defining Famous Domains

In [2]:
# Load the headerless Alexa ranking CSV, naming its two columns at read time.
alexa_rank = pd.read_csv(
    '../Dataset/Alexa Rank/top-1m.csv',
    header=None,
    names=['rank', 'domain'],
)

In [3]:
def get_sld(s):
    """Return the element at index 1 of ``parse_tld(s, fix_protocol=True)``.

    NOTE(review): index 1 is presumably the second-level domain (the result
    is stored in the 'SLD' column) -- confirm against the installed ``tld``
    package version.
    """
    return parse_tld(s, fix_protocol=True)[1]

# Tag every Alexa row with its SLD, then keep the first 1000 distinct SLD
# values (first occurrence wins, row order preserved) as the reference list.
alexa_rank['SLD'] = alexa_rank['domain'].map(get_sld)
famous_domains = list(alexa_rank['SLD'].drop_duplicates().head(1000))

# Generating Features for the Train Dataset

In [4]:
# Load the raw training frame from its pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
train = pd.read_pickle('../Dataset/Train/raw_train.pickle')

In [5]:
################################################################
def is_private(s):
    """Return 1 if ``s`` equals a known privacy-redaction placeholder, else 0.

    The comparison is an exact, case-sensitive match against the fixed set of
    placeholder strings below; any other value (including non-strings)
    yields 0.
    """
    # Kept as a tuple (not a set) so unhashable inputs such as lists can still
    # be tested for membership without raising.
    redaction_markers = (
        'REDACTED FOR PRIVACY',
        'REDACTED FOR PRIVACY REDACTED FOR PRIVACY',
        'REDACTED_FOR_PRIVACY, REDACTED_FOR_PRIVACY, REDACTED_FOR_PRIVACY',
        'REDACTED_FOR_PRIVACY',
        'Redacted for privacy',
        'Redacted for GDPR privacy',
    )
    return 1 if s in redaction_markers else 0

def privacy_check_lst(items):
    """Return 1 if any element of ``items`` is, or contains, a privacy placeholder.

    Nested lists are searched recursively, nested dicts are delegated to
    ``privacy_check_dic`` and strings are tested with ``is_private``.
    Elements of any other type are ignored.

    Parameters:
        items: iterable of values to inspect.

    Returns:
        1 as soon as a placeholder is found at any depth, otherwise 0.
    """
    for item in items:
        if isinstance(item, list):
            # Propagate a hit from the nested list; the result of this
            # recursive call used to be discarded.
            if privacy_check_lst(item):
                return 1
        elif isinstance(item, dict):
            # Same for nested dicts: a hit inside must surface to the caller.
            if privacy_check_dic(item):
                return 1
        elif isinstance(item, str) and is_private(item):
            return 1
    return 0

def privacy_check_dic(dic):
    """Return 1 if any value in ``dic`` is, or contains, a privacy placeholder.

    Dict values are searched recursively; list values are passed to
    ``privacy_check_lst`` as they are, and any other value is wrapped in a
    one-element list first. Returns 0 when nothing is found.
    """
    for value in dic.values():
        if isinstance(value, dict):
            hit = privacy_check_dic(value)
        elif isinstance(value, list):
            hit = privacy_check_lst(value)
        else:
            hit = privacy_check_lst([value])

        if hit:
            return 1
    return 0

def privacy_check(var):
    """Entry point of the privacy check: inspect dict records, report 0 otherwise.

    Returns the result of ``privacy_check_dic(var)`` when ``var`` is a dict
    and 0 for every other input.
    """
    return privacy_check_dic(var) if isinstance(var, dict) else 0

################################################################
def item_is_valid(item):
    """Return 1 for a usable value, 0 for ``None`` or a privacy placeholder.

    Every other value counts as valid, including empty strings and zero.
    """
    rejected = is_private(item) or item is None
    return 0 if rejected else 1
    
def get_field_lst(items, key, field):
    """Count the valid ``field`` values among ``items`` found under ``key``.

    Parameters:
        items: list of values that were stored under ``key`` in the record.
        key: the dict key the items came from.
        field: field name to look for; a key matches when ``field`` is a
            substring of it (``field in key``).

    Returns:
        The number of matching values. Dict items are always searched
        recursively through ``get_field_dic`` regardless of ``key``; other
        items are counted via ``item_is_valid`` only when ``key`` matches.
    """
    count = 0
    for item in items:
        if isinstance(item, dict):
            count += get_field_dic(item, field)
        elif field in key:
            count += item_is_valid(item)
        # A non-dict item under a non-matching key contributes nothing. It is
        # skipped rather than aborting with 0, which used to discard counts
        # already collected from nested dicts and ignore any later items.
    return count

def get_field_dic(dic, field):
    """Sum the ``field`` counts over every entry of ``dic``.

    Dict values are walked recursively. Any other value is handed to
    ``get_field_lst`` together with its key -- lists as they are, scalars
    wrapped in a one-element list.
    """
    total = 0
    for key, value in dic.items():
        if isinstance(value, dict):
            total += get_field_dic(value, field)
            continue

        entries = value if isinstance(value, list) else [value]
        total += get_field_lst(entries, key, field)
    return total
                
def get_field(var, field):
    """Count ``field`` values in ``var``; anything that is not a dict yields 0."""
    if not isinstance(var, dict):
        return 0
    return get_field_dic(var, field)

################################################################

In [6]:
################################################################

def get_connection(var):
    """Return 0 when ``var`` compares equal to 0, otherwise 1.

    NOTE(review): 0 is presumably the placeholder stored when no third-party
    record could be retrieved -- confirm against the code that builds the
    'Third Party' column.
    """
    return 0 if var == 0 else 1

def get_name(var):
    """Return ``get_field``'s count for the field name 'name'."""
    return get_field(var, 'name')
def get_address(var):
    """Return ``get_field``'s count for the field name 'address'.

    Returns the computed count; the previous ``return address`` referenced an
    undefined name and raised NameError on a fresh kernel.
    """
    field = 'address'
    count = get_field(var, field)
    return count

def get_state(var):
    """Return ``get_field``'s count for the field name 'state'."""
    return get_field(var, 'state')

def get_country(var):
    """Return ``get_field``'s count for the field name 'country'."""
    return get_field(var, 'country')

def get_server(var):
    """Return ``get_field``'s count for the field name 'server'."""
    return get_field(var, 'server')

def get_email(var):
    """Return ``get_field``'s count for the field name 'email'."""
    return get_field(var, 'email')

def get_registrar(var):
    """Return ``get_field``'s count for the field name 'registrar'."""
    return get_field(var, 'registrar')

def get_org(var):
    """Return ``get_field``'s count for the field name 'org'."""
    return get_field(var, 'org')

In [7]:
# ################################################################
def append(lst, new):
    """Return a new list holding ``lst`` plus ``new``, with every ``None`` removed.

    Parameters:
        lst: the list to extend; it is not modified.
        new: a list (its elements are appended) or a single value.

    Returns:
        A new list. Only ``None`` is dropped; other falsy values such as 0 or
        '' are kept.
    """
    extra = new if isinstance(new, list) else [new]
    # Explicit identity test instead of filter(None.__ne__, ...): that idiom
    # returns NotImplemented for non-None values and relies on its truthiness,
    # which is deprecated since Python 3.9.
    return [entry for entry in lst + extra if entry is not None]

# def most_common(lst):
#     return max(set(lst), key=lst.count)
# ################################################################

In [8]:
################################################################
def extract_year(date, max_year=None):
    """Extract a four-digit year from a datetime or a date-like string.

    Parameters:
        date: a ``datetime.datetime`` (its ``year`` is returned unchecked) or
            a string. For strings all non-digit characters are removed and
            the first four digits are read as the year; when the string
            starts with '0', that leading digit is skipped first.
        max_year: largest year accepted for string input. Defaults to the
            current year, in line with ``get_age``, instead of a hard-coded
            2021.

    Returns:
        The year as an int, or 0 when no plausible year can be extracted: the
        string has no usable digits, the year lies outside 1900..max_year, or
        the input type is unsupported.
    """
    if isinstance(date, datetime.datetime):
        return date.year

    if not isinstance(date, str):
        # Keep the diagnostic output, but return the "unknown" sentinel
        # explicitly so callers never receive None.
        print(date, type(date))
        print("BIG ERROR")
        return 0

    if max_year is None:
        max_year = datetime.datetime.today().year

    digits = re.sub("[^0-9]", "", date)
    # startswith() is safe on an empty string, unlike indexing date[0].
    digits = digits[1:5] if date.startswith('0') else digits[:4]
    if not digits:
        # Nothing numeric to parse (e.g. '' or free text); int('') would raise.
        return 0

    year = int(digits)
    if year < 1900 or year > max_year:
        return 0
    return year

################################################################  

def get_creation(dic):
    """Return the earliest creation year found in a third-party record.

    Creation values are read from ``dic['creation_date']`` when that key is
    present, otherwise from the ``'created'`` entry of every item in
    ``dic['nets']``.

    Returns:
        * 0  -- ``dic`` is not a dict, no creation value is present, or none
                of the values yields a usable year;
        * -1 -- creation values exist but all are 'REDACTED FOR PRIVACY';
        * otherwise the smallest year extracted by ``extract_year``.
    """
    # Non-dict inputs (including the 0 placeholder) carry no creation data;
    # this mirrors the isinstance guard used by privacy_check and get_field.
    if not isinstance(dic, dict):
        return 0

    creations = []
    if 'creation_date' in dic:
        creations = append(creations, dic['creation_date'])
    elif 'nets' in dic:
        for net in dic['nets']:
            # .get(): a net without a 'created' entry is skipped (append
            # drops None) instead of raising KeyError.
            creations = append(creations, net.get('created'))

    if not creations:
        return 0

    # Plain != instead of filter(str.__ne__, ...): the dunder returns
    # NotImplemented for datetime entries and relied on its truthiness.
    creations = [entry for entry in creations if entry != 'REDACTED FOR PRIVACY']
    if not creations:
        return -1

    years = [extract_year(entry) for entry in creations]
    # Ignore entries with no usable year (0 or None) so one unparseable value
    # cannot mask valid years in the minimum.
    known_years = [year for year in years if year]
    return min(known_years) if known_years else 0

################################################################

def get_age(creation_year):
    """Return the number of years between ``creation_year`` and the current year.

    Non-positive inputs (the 0 and -1 sentinels produced by ``get_creation``)
    give an age of 0.
    """
    return 0 if creation_year <= 0 else datetime.datetime.today().year - creation_year

################################################################

In [9]:
# Each (column, extractor) pair adds one feature column computed from the raw
# 'Third Party' record; the list order fixes the resulting column order.
train_feature_extractors = [
    ('connection', get_connection),
    ('privacy', privacy_check),
    ('name', get_name),
    ('address', get_address),
    ('state', get_state),
    ('country', get_country),
    ('email', get_email),
    ('servers', get_server),
    ('creation', get_creation),
]
for column_name, extractor in train_feature_extractors:
    train[column_name] = train['Third Party'].apply(extractor)

# 'age' is derived from the 'creation' column, so it has to come after it.
train['age'] = train['creation'].apply(get_age)

In [10]:
################################################################

def get_no_slds(s):
    """Count the dot-separated labels of ``s`` that appear in ``famous_domains``.

    Every occurrence counts, so a label repeated in ``s`` is counted each time.
    """
    labels = s.split('.')
    return sum(1 for label in labels if label in famous_domains)

################################################################

def get_sld_dst(s):
    """Return the smallest Levenshtein distance between ``s`` and any famous domain.

    Distances come from ``enchant.utils.levenshtein``; ``np.inf`` is returned
    when ``famous_domains`` is empty.
    """
    distances = (
        enchant.utils.levenshtein(s, candidate) for candidate in famous_domains
    )
    return min(distances, default=np.inf)

################################################################

In [11]:
# Display the frame to inspect the feature columns added so far.
train

Unnamed: 0,domain,class,sub_class,ip_format,SSD,SUB,SLD,TLD,Third Party,connection,privacy,name,address,state,country,email,servers,creation,age
43956,water.gov.il,1,benign,0,water,,water,gov.il,"{  ""domain_name"": ""water.gov.il"",  ""expirati...",1,0,1,1,0,0,2,2,0,0
30599,diocesilucca.it,1,benign,0,diocesilucca,,diocesilucca,it,"{  ""domain_name"": ""diocesilucca.it"",  ""creat...",1,0,1,1,0,0,0,1,2006,15
73775,wan4399.com,0,malware,0,wan4399,,wan4399,com,"{  ""domain_name"": [  ""WAN4399.COM"",  ""w...",1,0,1,0,1,1,1,5,2012,9
66845,cardgamesolitaire.com,1,benign,0,cardgamesolitaire,,cardgamesolitaire,com,"{  ""domain_name"": ""CARDGAMESOLITAIRE.COM"",  ...",1,0,1,1,1,1,2,5,2007,14
5466,fuego.digital,1,benign,0,fuego,,fuego,digital,"{  ""domain_name"": ""fuego.digital"",  ""registr...",1,1,1,0,1,1,1,3,2021,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54886,vpdes.com,1,benign,0,vpdes,,vpdes,com,"{  ""domain_name"": [  ""VPDES.COM"",  ""vpd...",1,0,1,1,1,1,2,9,2021,0
76820,cybermarine.in,0,malware,0,cybermarine,,cybermarine,in,"{  ""domain_name"": ""cybermarine.in"",  ""regist...",1,0,1,0,1,1,0,4,2019,2
103694,bancobci.trial.inicioseguro.com,0,phishing,0,bancobci.trial.inicioseguro,bancobci.trial,inicioseguro,com,"{  ""domain_name"": [  ""INICIOSEGURO.COM"",  ...",1,0,1,0,1,1,1,5,2018,3
860,none.com,1,benign,0,none,,none,com,"{  ""domain_name"": [  ""NONE.COM"",  ""none...",1,1,1,1,1,1,2,4,1995,26


In [12]:
# Lexical features: how many famous SLDs occur among the subdomain labels, and
# the edit distance from the row's SLD to the closest famous SLD.
train['SLDs in SUB'] = train['SUB'].map(get_no_slds)
train['SLD Distance'] = train['SLD'].map(get_sld_dst)

In [13]:
# Drop the raw 'Third Party' record and the intermediate 'creation' year; the
# columns derived from them (including 'age') are kept.
train_subset = train.drop(columns=['Third Party', 'creation'])

In [14]:
# Write the training feature table to CSV; index=False leaves the DataFrame
# index out of the file.
train_subset.to_csv('../Dataset/Train/Temp/tp_train.csv', index=False)

# Generating Features for the Test Dataset

In [15]:
# Load the raw test frame from its pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
test = pd.read_pickle('../Dataset/Test/raw_test.pickle')

In [16]:
# Each (column, extractor) pair adds one feature column computed from the raw
# 'Third Party' record; the list order fixes the resulting column order.
test_feature_extractors = [
    ('connection', get_connection),
    ('privacy', privacy_check),
    ('name', get_name),
    ('address', get_address),
    ('state', get_state),
    ('country', get_country),
    ('email', get_email),
    ('servers', get_server),
    ('creation', get_creation),
]
for column_name, extractor in test_feature_extractors:
    test[column_name] = test['Third Party'].apply(extractor)

# 'age' is derived from the 'creation' column, so it has to come after it.
test['age'] = test['creation'].apply(get_age)

# Lexical features computed from the subdomain and SLD columns.
test['SLDs in SUB'] = test['SUB'].apply(get_no_slds)
test['SLD Distance'] = test['SLD'].apply(get_sld_dst)

In [19]:
# Drop the raw 'Third Party' record and the intermediate 'creation' year; the
# columns derived from them (including 'age') are kept.
test_subset = test.drop(columns=['Third Party', 'creation'])

In [20]:
# Write the test feature table to CSV; index=False leaves the DataFrame index
# out of the file.
test_subset.to_csv('../Dataset/Test/Temp/tp_test.csv', index=False)