In [1]:
import requests
import pandas as pd
import os
import whois
import dns.resolver
import six
from google.cloud import translate_v2 as translate

In [23]:
# Google Cloud Translation service credentials: name of the credentials JSON file
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'ENTER_YOUR_JSON_FILENAME_HERE'

In [3]:
# wrappers that simplify applying punycode-related lambda expressions to dataframes
def puny(s):
    """Return the punycode (IDNA-encoded, ASCII) form of the string s."""
    idna_bytes = s.encode('idna')
    return idna_bytes.decode()
 
def unpuny(s):
    """Return the lower-cased, IDNA-decoded (Unicode) form of s.

    s is converted with str() first, so non-string values are accepted.
    """
    lowered = str(s).lower()
    idna_bytes = lowered.encode('idna')
    return idna_bytes.decode('idna')

In [4]:
def get_all_tlds():
    """ Gets all of the IDN TLDs

    Retrieves a dynamic list from IANA's website and keeps only the
    punycode entries, i.e. the lines that start with 'XN--'.

    Returns:
        list of str: the punycode TLD labels, exactly as written in the
        IANA file but with surrounding whitespace removed.

    Raises:
        requests.exceptions.RequestException: on a connection problem, a
        timeout, or a non-success HTTP status.
    """
    # The timeout keeps the notebook from hanging forever on a dead connection.
    res = requests.get('https://data.iana.org/TLD/tlds-alpha-by-domain.txt', timeout=30)
    # Fail loudly rather than parsing an HTTP error page as a TLD list.
    res.raise_for_status()

    # splitlines() + strip() copes with '\r\n' endings and trailing blanks.
    labels = (line.strip() for line in res.text.splitlines())
    # Filter only punycode TLDs; a prefix test avoids matching comment lines
    # that merely mention 'XN--'.
    return [label for label in labels if label.upper().startswith('XN--')]

In [5]:
# Fetch the current IDN TLDs and seed the dataframe with them (one punycode label per row)
idn_tld_puny_list = get_all_tlds()
df = pd.DataFrame(data=idn_tld_puny_list, columns=['punycode'])
# add the decoded, non-ASCII representation of every TLD
df['unpunycode'] = df['punycode'].map(unpuny)

In [6]:
def translate_text(target, text):
    """Translate text into the target language via Google Cloud Translation.

    target must be an ISO 639-1 language code, see
    https://g.co/cloud/translate/v2/translate-reference#supported_languages

    text may be str or bytes (bytes are decoded as UTF-8 first). It can also
    be a sequence of strings, in which case a sequence of results is
    returned, one per input string.

    The client's result is returned unchanged; callers in this notebook read
    its 'translatedText' and 'detectedSourceLanguage' entries.
    """
    client = translate.Client()

    # bytes must become str before they are handed to the client
    payload = text.decode("utf-8") if isinstance(text, six.binary_type) else text

    return client.translate(payload, target_language=target)

In [7]:
# Ask the translation service for the English rendering of every TLD
df['full_res'] = df['unpunycode'].map(lambda tld: translate_text('en-us', tld))

In [8]:
# Unpack the raw translation responses into dedicated columns
df['src_lang_code'] = df['full_res']
df['detectedSourceLanguage'] = df['full_res'].map(lambda response: response['detectedSourceLanguage'])
df['translatedText'] = df['full_res'].map(lambda response: response['translatedText'])

In [9]:
# remove the intermediate columns that only held raw translation state
df = df.drop(columns=['full_res', 'src_lang_code'])

In [10]:
def get_word_in_all_langs(word, lang_codes):
    """
    Translate word into every language listed in lang_codes.

    Returns a dictionary that maps each language code to the
    'translatedText' entry of the translate_text() result for that language.
    """
    return {
        code: translate_text(code, word)['translatedText']
        for code in lang_codes
    }

In [11]:
# Collect every distinct language code detected among the TLDs
lang_codes = df['detectedSourceLanguage'].unique()

In [12]:
# Target brand name (here: Microsoft) and its translation into each detected language
idn_squatting_target = 'Microsoft'
word_in_other_langs = get_word_in_all_langs(word=idn_squatting_target, lang_codes=lang_codes)

In [13]:
# Attach to each row the target translated into that row's language, with spaces removed
df['translated_target'] = df['detectedSourceLanguage'].map(
    lambda lang: word_in_other_langs[lang].replace(' ', '')
)

In [14]:
# Build the candidate domain: <translated target>.<unicode TLD>
df['domain_to_register'] = df['translated_target'] + '.' + df['unpunycode']

# if the original (untranslated) target is still in the name, the translation failed: mark it
df['domain_to_register'] = df['domain_to_register'].map(
    lambda domain: domain if idn_squatting_target not in domain else 'translation failure'
)

# also keep the punycode representation of the candidate domain
df['domain_to_register_punycode'] = df['domain_to_register'].map(puny)

In [22]:
def is_whois_exists(domain_name):
    """
    Return True iff domain_name has associated WHOIS records

    Any exception raised by the WHOIS lookup itself is treated as
    "no record" and yields False.
    """
    try:
        record = whois.whois(domain_name)
    except Exception:
        return False
    # the lookup succeeded; a record counts only if it carries a domain name
    return bool(record.domain_name)
    
def is_dns_exists(domain_name):
    """
    Return True iff domain_name has associated DNS records

    Queries the NS records of domain_name through Google's public DNS
    server (8.8.8.8).

    Returns:
        True  - the NS query returned a non-empty answer, or the query
                failed with NoNameservers.
        False - the query failed with NXDOMAIN or NoAnswer, or with any
                other error (e.g. a timeout). Other errors are printed
                together with the domain so the row can be re-checked.
    """
    try:
        resolver = dns.resolver.Resolver()
        # 8.8.8.8 is Google's public DNS server
        resolver.nameservers = ['8.8.8.8']

        result = resolver.resolve(domain_name, 'NS')
    except (dns.resolver.NXDOMAIN, dns.resolver.NoAnswer):
        return False
    except dns.resolver.NoNameservers:
        return True
    except Exception as e:
        print(f'Unhandled exception for {domain_name}')
        # print the exception itself (previously the literal text 'str(e)')
        print(f'{e}\n')
        # explicit falsy result instead of an implicit None, so the function
        # always returns a bool
        return False
    else:
        return bool(result)

def check_domain_avail(domain_to_check):
    """
    Combine the DNS test and the WHOIS test into one availability verdict.

    Return True iff the domain is available, i.e. *neither* test finds
    records. The WHOIS test runs only when the DNS test finds nothing.
    """
    has_records = is_dns_exists(domain_to_check) or is_whois_exists(domain_to_check)
    return not has_records

In [17]:
# True = no DNS or WHOIS records were found for the punycode domain
df['domainAvailability'] = df['domain_to_register_punycode'].map(check_domain_avail)

Unhandled for xn--mgbaj0ak9fxa0a58f8j.xn--mgbab2bd
The resolution lifetime expired after 5.405 seconds: Server 8.8.8.8 UDP port 53 answered The DNS operation timed out.; Server 8.8.8.8 UDP port 53 answered The DNS operation timed out.; Server 8.8.8.8 UDP port 53 answered The DNS operation timed out.
Unhandled for xn--lgbbb3bk9fwa1a85bjpmm.xn--mgbai9azgqp6j
The resolution lifetime expired after 5.405 seconds: Server 8.8.8.8 UDP port 53 answered The DNS operation timed out.; Server 8.8.8.8 UDP port 53 answered The DNS operation timed out.; Server 8.8.8.8 UDP port 53 answered The DNS operation timed out.


In [18]:
def check_tld_registerable(tld):
    """Stub, not implemented yet.

    Intended to verify whether a TLD is actually open for registration.
    For now the argument is ignored and the marker string '//TODO' is returned.
    """
    todo_marker = '//TODO'
    return todo_marker

# df['tld_registerable'] = df.punycode.map(check_tld_registerable)

In [19]:
# save the dataframe (without its index) to an Excel file that can be loaded back later
df.to_excel('nekuda.xlsx', index=False)

In [20]:
# preview the first ten rows of the result
df.head(n=10)

Unnamed: 0,punycode,unpunycode,detectedSourceLanguage,translatedText,translated_target,domain_to_register,domain_to_register_punycode,domainAvailability
0,XN--11B4C3D,कॉम,hi,com,माइक्रोसॉफ्ट,माइक्रोसॉफ्ट.कॉम,xn--n1b2a4a9bmo0d2bzgsauf.xn--11b4c3d,True
1,XN--1CK2E1B,セール,ja,Sale,マイクロソフト,マイクロソフト.セール,xn--eckwa6b3bwd6b6f.xn--1ck2e1b,True
2,XN--1QQW23A,佛山,zh-CN,Foshan,微软,微软.佛山,xn--g5ty67h.xn--1qqw23a,True
3,XN--2SCRJ9C,ಭಾರತ,kn,India,ಮೈಕ್ರೋಸಾಫ್ಟ್,ಮೈಕ್ರೋಸಾಫ್ಟ್.ಭಾರತ,xn--nscu9akl2c1bye3asfb.xn--2scrj9c,True
4,XN--30RR7Y,慈善,zh-CN,charitable,微软,微软.慈善,xn--g5ty67h.xn--30rr7y,True
5,XN--3BST00M,集团,zh-CN,group,微软,微软.集团,xn--g5ty67h.xn--3bst00m,True
6,XN--3DS443G,在线,zh-CN,online,微软,微软.在线,xn--g5ty67h.xn--3ds443g,False
7,XN--3E0B707E,한국,ko,korea,마이크로소프트,마이크로소프트.한국,xn--2o2b1zp4o0qg9pm4kdu7e.xn--3e0b707e,False
8,XN--3HCRJ9C,ଭାରତ,or,India,ମାଇକ୍ରୋସଫ୍ଟ|,ମାଇକ୍ରୋସଫ୍ଟ|.ଭାରତ,xn--|-9ne6bzb5cpq8d7b9huae.xn--3hcrj9c,True
9,XN--3PXU8K,点看,zh-CN,Click to see,微软,微软.点看,xn--g5ty67h.xn--3pxu8k,True


In [21]:
# count the domains that are free (True) versus already taken (False)
df['domainAvailability'].value_counts()

domainAvailability
True     132
False     20
Name: count, dtype: int64