In [2]:
import pandas as pd

In [3]:
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle('Dataset/brute_tp_characteristics.pickle')

In [4]:
def get_connection(v):
    """Flag whether a third-party record is present.

    Returns 0 when ``v`` compares equal to 0, and 1 for any other value.
    """
    return 0 if v == 0 else 1

# Derive the 'connection' column by applying get_connection to every 'Third Party' value.
df['connection'] = df['Third Party'].apply(get_connection)

In [5]:
def append(lst, new):
    """Return a new list made of ``lst`` extended with ``new``, minus ``None`` entries.

    Args:
        lst: Existing list of values; it is not modified.
        new: A single value, or a list of values to concatenate.

    Returns:
        A new list holding the items of ``lst`` followed by ``new`` (or by its
        items when ``new`` is a list), with every ``None`` removed.
    """
    extra = new if isinstance(new, list) else [new]
    # Use an identity test instead of ``filter(None.__ne__, ...)``: the bound
    # dunder returns NotImplemented for non-None values, and relying on its
    # truthiness is deprecated since Python 3.9 and a TypeError on 3.14+.
    return [item for item in lst + extra if item is not None]

def most_common(lst):
    """Return the most frequent element of ``lst``.

    Ties are resolved in favour of the element that appears first in ``lst``,
    so the result does not depend on set iteration order or hash
    randomisation.

    Args:
        lst: Non-empty sequence of hashable values.

    Returns:
        The element with the highest number of occurrences.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    from collections import Counter  # local import keeps this cell self-contained

    if not lst:
        raise ValueError("most_common() arg is an empty sequence")
    # Counter counts in a single pass (O(n)) instead of calling lst.count for
    # every distinct value, and orders equal counts by first appearance.
    return Counter(lst).most_common(1)[0][0]

In [6]:
def get_name(dic):
    """Report whether a third-party record carries any name information.

    ``dic`` is either the sentinel ``0`` or a mapping. The first of the keys
    ``'domain_name'``, ``'name'`` or ``'nets'`` that is present is consulted;
    for ``'nets'`` the ``'name'`` of every entry is collected.

    Returns:
        1 if at least one name survives ``append``'s ``None`` filtering,
        otherwise 0.
    """
    if dic == 0:
        return 0

    if 'domain_name' in dic:
        candidates = [dic['domain_name']]
    elif 'name' in dic:
        candidates = [dic['name']]
    elif 'nets' in dic:
        candidates = [net['name'] for net in dic['nets']]
    else:
        candidates = []

    collected = []
    for candidate in candidates:
        # append flattens list values and discards None entries.
        collected = append(collected, candidate)

    return 1 if collected else 0

# 'name' column: 1 when get_name finds a name in the 'Third Party' record, else 0.
df['name'] = df['Third Party'].apply(get_name)

In [7]:
def get_address(dic):
    """Classify the address information of a third-party record.

    Args:
        dic: The sentinel ``0`` or a mapping. A top-level ``'address'`` key
            takes precedence; otherwise the ``'address'`` of each entry under
            ``'nets'`` is used.

    Returns:
        0 if ``dic`` is 0 or no address was collected, -1 if every collected
        address is the ``'REDACTED FOR PRIVACY'`` placeholder, and 1 if at
        least one other address remains.
    """
    addresses = []
    if dic == 0:
        return 0
    elif 'address' in dic:
        addresses = append(addresses, dic['address'])
    elif 'nets' in dic:
        for net in dic['nets']:
            addresses = append(addresses, net['address'])

    if not addresses:
        return 0

    # Compare with ``!=`` instead of filtering through the bound
    # ``str.__ne__``: the dunder returns NotImplemented for non-string
    # values, and truth-testing NotImplemented is deprecated (TypeError on
    # Python 3.14+).
    addresses = [a for a in addresses if a != 'REDACTED FOR PRIVACY']
    if not addresses:
        return -1

    return 1


# 'address' column: get_address yields 1 (address present), -1 (only redacted) or 0 (none).
df['address'] = df['Third Party'].apply(get_address)

In [8]:
def get_emails(dic):
    """Count the contact e-mail entries of a third-party record.

    Args:
        dic: The sentinel ``0`` or a mapping. A top-level ``'emails'`` key
            takes precedence; otherwise the ``'emails'`` of each entry under
            ``'nets'`` is used.

    Returns:
        0 if ``dic`` is 0 or no e-mail was collected, -1 if every collected
        value is the ``'REDACTED FOR PRIVACY'`` placeholder, otherwise the
        number of remaining e-mail entries.
    """
    emails = []
    if dic == 0:
        return 0
    elif 'emails' in dic:
        emails = append(emails, dic['emails'])
    elif 'nets' in dic:
        for net in dic['nets']:
            emails = append(emails, net['emails'])

    if not emails:
        return 0

    # Compare with ``!=`` instead of filtering through the bound
    # ``str.__ne__``: the dunder returns NotImplemented for non-string
    # values, and truth-testing NotImplemented is deprecated (TypeError on
    # Python 3.14+).
    emails = [e for e in emails if e != 'REDACTED FOR PRIVACY']
    if not emails:
        return -1

    return len(emails)
                
# 'email' column: number of e-mail entries from get_emails (-1 = only redacted, 0 = none).
df['email'] = df['Third Party'].apply(get_emails)

In [9]:
def get_country(dic):
    """Return the most frequent country value of a third-party record.

    Args:
        dic: The sentinel ``0`` or a mapping. A top-level ``'country'`` key
            takes precedence; otherwise the ``'country'`` of each entry under
            ``'nets'`` is used.

    Returns:
        ``''`` if ``dic`` is 0 or no country was collected, ``'-1'`` if every
        collected value is the ``'REDACTED FOR PRIVACY'`` placeholder,
        otherwise the value chosen by ``most_common``.
    """
    countries = []
    if dic == 0:
        return ''
    elif 'country' in dic:
        countries = append(countries, dic['country'])
    elif 'nets' in dic:
        for net in dic['nets']:
            countries = append(countries, net['country'])

    if not countries:
        return ''

    # Compare with ``!=`` instead of filtering through the bound
    # ``str.__ne__``: the dunder returns NotImplemented for non-string
    # values, and truth-testing NotImplemented is deprecated (TypeError on
    # Python 3.14+).
    countries = [c for c in countries if c != 'REDACTED FOR PRIVACY']
    if not countries:
        return '-1'

    return most_common(countries)
                
# 'country' column: value picked by get_country ('' = none, '-1' = only redacted).
df['country'] = df['Third Party'].apply(get_country)

In [10]:
def get_state(dic):
    """Return the most frequent state value of a third-party record.

    Args:
        dic: The sentinel ``0`` or a mapping. A top-level ``'state'`` key
            takes precedence; otherwise the ``'state'`` of each entry under
            ``'nets'`` is used.

    Returns:
        ``''`` if ``dic`` is 0 or no state was collected, ``'-1'`` if every
        collected value is the ``'REDACTED FOR PRIVACY'`` placeholder,
        otherwise the value chosen by ``most_common``.
    """
    states = []
    if dic == 0:
        return ''
    elif 'state' in dic:
        states = append(states, dic['state'])
    elif 'nets' in dic:
        for net in dic['nets']:
            states = append(states, net['state'])

    if not states:
        return ''

    # Compare with ``!=`` instead of filtering through the bound
    # ``str.__ne__``: the dunder returns NotImplemented for non-string
    # values, and truth-testing NotImplemented is deprecated (TypeError on
    # Python 3.14+).
    states = [s for s in states if s != 'REDACTED FOR PRIVACY']
    if not states:
        return '-1'

    return most_common(states)
                
# 'state' column: value picked by get_state ('' = none, '-1' = only redacted).
df['state'] = df['Third Party'].apply(get_state)

In [63]:
import datetime
import re
def extract_year(date, min_year=1900, max_year=2021):
    """Extract a four-digit year from a date-like value.

    Args:
        date: A ``datetime.date``/``datetime.datetime`` instance or a string
            containing a date.
        min_year: Smallest year accepted from a string (inclusive).
        max_year: Largest year accepted from a string (inclusive).

    Returns:
        ``date.year`` for date/datetime objects. For strings, the year parsed
        from the leading digits, or 0 when the string holds no digits or the
        parsed year falls outside ``[min_year, max_year]``. For any other
        type a diagnostic is printed and 0 is returned.
    """
    # datetime.datetime is a subclass of datetime.date, so both are covered.
    if isinstance(date, datetime.date):
        return date.year

    if isinstance(date, str):
        digits = re.sub("[^0-9]", "", date)
        # When the raw string starts with '0', that leading digit is skipped
        # and the following four digits are taken as the year.
        # startswith() is safe on an empty string, unlike date[0].
        if date.startswith('0'):
            digits = digits[1:5]
        else:
            digits = digits[:4]

        # No digits at all (e.g. '' or 'unknown'): int('') would raise.
        if not digits:
            return 0

        year = int(digits)
        if year < min_year or year > max_year:
            return 0
        return year

    print(date, type(date))
    print("BIG ERROR")
    # Return the same "invalid" sentinel as the string branch instead of an
    # implicit None, so numeric consumers keep working.
    return 0


def get_creation(dic):
    """Return the earliest creation year found in a third-party record.

    Args:
        dic: The sentinel ``0`` or a mapping. A top-level ``'creation_date'``
            key takes precedence; otherwise the ``'created'`` value of each
            entry under ``'nets'`` is used.

    Returns:
        0 if ``dic`` is 0, no creation value was collected, or none of the
        collected values yields a valid year; -1 if every collected value is
        the ``'REDACTED FOR PRIVACY'`` placeholder; otherwise the smallest
        valid year.
    """
    creations = []
    if dic == 0:
        return 0
    elif 'creation_date' in dic:
        creations = append(creations, dic['creation_date'])
    elif 'nets' in dic:
        for net in dic['nets']:
            creations = append(creations, net['created'])

    if not creations:
        return 0

    # Compare with ``!=`` instead of filtering through the bound
    # ``str.__ne__``: creation values may be datetime objects, for which the
    # dunder returns NotImplemented (deprecated truthiness, TypeError on
    # Python 3.14+).
    creations = [c for c in creations if c != 'REDACTED FOR PRIVACY']
    if not creations:
        return -1

    years = [extract_year(creation) for creation in creations]
    # extract_year signals "no usable year" with 0 (or None for unsupported
    # types). Leave those out so a sentinel cannot win the min() over a real
    # year and None cannot break the comparison.
    valid_years = [year for year in years if year]
    return min(valid_years) if valid_years else 0

# 'creation' column: year returned by get_creation for each 'Third Party' record.
df['creation'] = df['Third Party'].apply(get_creation)

In [83]:
def get_age(creation_year):
    """Convert a creation year into an age in whole years.

    Returns -1 when ``creation_year`` is zero or negative; otherwise the
    difference between today's calendar year and ``creation_year``.
    """
    return -1 if creation_year <= 0 else datetime.datetime.today().year - creation_year

# 'age' column: years since the 'creation' year (-1 when the year is zero or negative).
df['age'] = df['creation'].apply(get_age)

In [97]:
from tld import parse_tld

# Read the ranking list with header=None (no header row is expected in the
# file) and name its two columns explicitly.
alexa_rank = pd.read_csv('top-1m.csv', header=None)
alexa_rank.columns = ['rank', 'domain']

def get_sld(s):
    """Return the second element of ``parse_tld``'s result for ``s``.

    ``fix_protocol=True`` is passed so that ``s`` may be given without a
    protocol prefix.
    """
    _tld, sld, _subdomain = parse_tld(s, fix_protocol=True)
    return sld

# Compute the SLD for every listed domain.
alexa_rank['SLD'] = alexa_rank['domain'].apply(get_sld)
# Keep the first 1000 distinct SLDs in file order (presumably rank order;
# confirm the CSV is sorted by rank).
# NOTE(review): get_sld may yield None for unparseable domains; verify
# against parse_tld's behaviour before relying on every entry being a string.
famous_domains = list(alexa_rank['SLD'].drop_duplicates())[:1000]

In [106]:
def get_no_slds(s):
    """Count the dot-separated labels of ``s`` that appear in ``famous_domains``."""
    return sum(1 for label in s.split('.') if label in famous_domains)

# 'SLDs in SUB' column: how many labels of the 'SUB' value match an entry of famous_domains.
df['SLDs in SUB'] = df['SUB'].apply(get_no_slds)

In [122]:
import enchant
import numpy as np

def get_sld_dst(s):
    """Return the smallest Levenshtein distance between ``s`` and any famous domain.

    Yields ``np.inf`` when ``famous_domains`` is empty.
    """
    distances = (
        enchant.utils.levenshtein(s, candidate) for candidate in famous_domains
    )
    return min(distances, default=np.inf)

# 'SLD Distance' column: minimum edit distance from each 'SLD' value to the famous_domains list.
df['SLD Distance'] = df['SLD'].apply(get_sld_dst)

In [136]:
# Drop the raw 'Third Party' column; the result is bound to a new name, so df keeps it.
df_subset = df.drop(columns=['Third Party'])

In [137]:
# Write the derived feature table to CSV, omitting the index column.
df_subset.to_csv('Dataset/tp_characteristics.csv', index=False)