In [26]:
import requests
import pandas as pd
import os
import whois
import dns.resolver
import six
from google.cloud import translate_v2 as translate

In [35]:
# Google Cloud Trnslation service API keys
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]='ENTER_YOUR_JSON_FILENAME_HERE'

In [3]:
# wrappers for simplfying the application of punycode-related lambda experessions to dataframes
def puny(s):
    return s.encode('idna').decode()
 
def unpuny(s):
    return str(s).lower().encode('idna').decode('idna')

In [4]:
def get_all_tlds():
    """ Gets all of the IDN TLDs

    Retrieves a dynamic list from IANA's website
    """
    res = requests.get('https://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    all_tlds = res.text.split('\n')
    # Filter only punycode TLDs
    return (list(filter(lambda tld: 'XN--' in tld, all_tlds)))

In [6]:
# get all the current TLDs then use it to initialize the dataframe
idn_tld_puny_list = get_all_tlds()
df = pd.DataFrame(idn_tld_puny_list, columns=['punycode'])
# add non-ascii representation
df['unpunycode'] = df.punycode.map(unpuny)

In [7]:
def translate_text(target, text):
    """Translates text into the target language.

    Target must be an ISO 639-1 language code.
    See https://g.co/cloud/translate/v2/translate-reference#supported_languages
    """

    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode("utf-8")

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(text, target_language=target)
    
    return(result)

In [8]:
# Translate the TLD to English
df['full_res'] = df.unpunycode.map(lambda x : translate_text('en-us', x))

In [9]:
# do some pandas tricks to split the raw content to separate cols
df['src_lang_code'] = df['full_res']
df['detectedSourceLanguage'] = df['full_res'].map(lambda x : x['detectedSourceLanguage'])
df['translatedText'] = df['full_res'].map(lambda x : x['translatedText'])

In [10]:
def get_word_in_all_langs(word, lang_codes):
    """
    Construct a dictionary - translate the keyword to any of the lang_codes
    """
    word_lang_dic = {}
    for lang in lang_codes:
        res = translate_text(lang, word)
        word_lang_dic[lang] = res['translatedText']
    return word_lang_dic  

In [11]:
# Aggregate a list of all the distinct lang codes
lang_codes = df.detectedSourceLanguage.unique()

In [12]:
# Get the translations for the target, in this case, Microsoft
idn_squatting_target = 'Microsoft'
word_in_other_langs = get_word_in_all_langs(idn_squatting_target, lang_codes)

In [13]:
# Put the translated target in the appropriate row
df['translated_target'] = df['detectedSourceLanguage'].map(lambda x : word_in_other_langs[x].replace(' ', ''))

In [14]:
# Construct the targeted domain
df['domain_to_register'] = df.translated_target + '.' + df.unpunycode

# check and mark failed translation attempts
df['domain_to_register'] = df['domain_to_register'].map(lambda x : 'translation failure' if idn_squatting_target in x else x)

# display also the punycode representation of the website 
df['domain_to_register_punycode'] = df.domain_to_register.map(puny)

In [29]:
def is_whois_exists(domain_name):
    """
    Return True iff domain_name has associated WHOIS records
    """
    try:
        w = whois.whois(domain_name)
    except Exception:
        return False
    else:
        return bool(w.domain_name)
    
def is_dns_exists(domain_name):
    """
    Return True iff domain_name has associated DNS records
    """
    try:
        r = dns.resolver.Resolver()
        # 8.8.8.8 is Google's public DNS server
        r.nameservers = ['8.8.8.8']
        
        result = r.resolve(domain_name, 'NS')
    except dns.resolver.NXDOMAIN:
        return False
    except dns.resolver.NoAnswer:
        return False
    except dns.resolver.NoNameservers:
        return True
    except Exception as e:
        print(f'Unhandled for {domain_name}')
        print(str(e))
    else: 
        return bool(result)

def check_domain_avail(domain_to_check):
    """ 
    Wrapping function for two different domain availability tests
    Return true iff the domain is available, i.e. there are *no* records
    """
    
    if is_dns_exists(domain_to_check) or is_whois_exists(domain_to_check):
        return False
    else:
        return True

In [None]:
df['domainAvailability'] = df.domain_to_register_punycode.map(check_domain_avail)

In [None]:
def check_tld_registerable(tld):
    """Placeholder for future implementation, should verify whether a TLD is actually open for registration
    """
    return '//TODO'

df['tld_registerable'] = df.punycode.map(check_tld_registerable)

In [None]:
# cleanup of internal state column leftovers
df = df.drop(['full_res', 'src_lang_code'], axis=1)

In [34]:
df[df.domainAvailability==True]

Unnamed: 0,punycode,unpunycode,full_res,src_lang_code,detectedSourceLanguage,translatedText,translated_target,domain_to_register,domain_to_register_punycode,tld_registerable,domainAvailability
0,XN--11B4C3D,कॉम,"{'translatedText': 'com', 'detectedSourceLangu...","{'translatedText': 'com', 'detectedSourceLangu...",hi,com,माइक्रोसॉफ्ट,माइक्रोसॉफ्ट.कॉम,xn--n1b2a4a9bmo0d2bzgsauf.xn--11b4c3d,//TODO,True
1,XN--1CK2E1B,セール,"{'translatedText': 'Sale', 'detectedSourceLang...","{'translatedText': 'Sale', 'detectedSourceLang...",ja,Sale,マイクロソフト,マイクロソフト.セール,xn--eckwa6b3bwd6b6f.xn--1ck2e1b,//TODO,True
2,XN--1QQW23A,佛山,"{'translatedText': 'Foshan', 'detectedSourceLa...","{'translatedText': 'Foshan', 'detectedSourceLa...",zh-CN,Foshan,微软,微软.佛山,xn--g5ty67h.xn--1qqw23a,//TODO,True
3,XN--2SCRJ9C,ಭಾರತ,"{'translatedText': 'India', 'detectedSourceLan...","{'translatedText': 'India', 'detectedSourceLan...",kn,India,ಮೈಕ್ರೋಸಾಫ್ಟ್,ಮೈಕ್ರೋಸಾಫ್ಟ್.ಭಾರತ,xn--nscu9akl2c1bye3asfb.xn--2scrj9c,//TODO,True
4,XN--30RR7Y,慈善,"{'translatedText': 'charitable', 'detectedSour...","{'translatedText': 'charitable', 'detectedSour...",zh-CN,charitable,微软,微软.慈善,xn--g5ty67h.xn--30rr7y,//TODO,True
...,...,...,...,...,...,...,...,...,...,...,...
146,XN--XKC2AL3HYE2A,இலங்கை,"{'translatedText': 'Sri Lanka', 'detectedSourc...","{'translatedText': 'Sri Lanka', 'detectedSourc...",ta,Sri Lanka,மைக்ரோசாப்ட்,மைக்ரோசாப்ட்.இலங்கை,xn--clckp7brn3h8c3asfb.xn--xkc2al3hye2a,//TODO,True
147,XN--XKC2DL3A5EE0H,இந்தியா,"{'translatedText': 'India', 'detectedSourceLan...","{'translatedText': 'India', 'detectedSourceLan...",ta,India,மைக்ரோசாப்ட்,மைக்ரோசாப்ட்.இந்தியா,xn--clckp7brn3h8c3asfb.xn--xkc2dl3a5ee0h,//TODO,True
148,XN--Y9A3AQ,հայ,"{'translatedText': 'Armenian:', 'detectedSourc...","{'translatedText': 'Armenian:', 'detectedSourc...",hy,Armenian:,Microsoft-ը,translation failure,translation failure,//TODO,True
149,XN--YFRO4I67O,新加坡,"{'translatedText': 'Singapore', 'detectedSourc...","{'translatedText': 'Singapore', 'detectedSourc...",zh-CN,Singapore,微软,微软.新加坡,xn--g5ty67h.xn--yfro4i67o,//TODO,True


In [None]:
# export a loadable copy of the dataframe
df.to_excel(r'nekuda.xlsx', index=False)