In [73]:
import pandas as pd
import requests
import io
import numpy as np

In [19]:
# Location of the domain list to download
url = "https://www.namejet.com/download/10-08-2020.txt"

# Download the file. Fail loudly on HTTP errors so that an error page is never
# parsed as if it were domain data, and bound the wait with a timeout so the
# cell cannot hang forever on an unresponsive server.
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content

# The file has no header row; name its single column `root_domain`.
df = pd.read_csv(io.StringIO(s.decode('utf-8')), header=None)
df.columns = ['root_domain']

In [20]:
# Peek at the first five rows of the downloaded list
df.head(n=5)

Unnamed: 0,root_domain
0,0-search.net
1,00000dd.com
2,0000359.com
3,0000592.com
4,0000598.com


In [21]:
# Extract the top-level domain: the label after the LAST dot. Splitting from
# the right means a name with more than one dot still yields its real TLD
# instead of its second label.
# NOTE: a name containing no dot at all ends up as its own `tld` value.
df['tld'] = df['root_domain'].str.rsplit(".", n=1).str[-1]

# Count how many domains fall under each top-level domain
df['tld'].value_counts()

com     83874
net      9747
info     4348
org      4327
cc       1400
biz       744
tv        169
Name: tld, dtype: int64

In [29]:
# The domain name is everything before the LAST dot, so names with more than
# one dot keep all of their leading labels (and their full length) instead of
# being cut at the first dot.
df['domain'] = df['root_domain'].str.rsplit(".", n=1).str[0]

# Number of characters in the domain name
df['domain_length'] = df['domain'].str.len()

In [54]:
# Flag domains that contain at least one punctuation character
punctuation = ["-", "*", "."]


def _has_punctuation(name):
    """Return True when any entry of `punctuation` occurs in `name`."""
    for mark in punctuation:
        if mark in name:
            return True
    return False


df['puct'] = df['domain'].apply(_has_punctuation)

In [56]:
# Flag domains that contain at least one digit
number_list = [str(digit) for digit in range(10)]


def _has_number(name):
    """Return True when any entry of `number_list` occurs in `name`."""
    for digit in number_list:
        if digit in name:
            return True
    return False


df['num'] = df['domain'].apply(_has_number)

In [98]:
# Create domain score
# tld = .com is good
# domain length, shorter is better
# No punctuation
# No numbers

# Amount subtracted from the score for each known top-level domain
tld_dict_penalty = {
    'com'   :  .00,
    'net'   :  .15,
    'info'  :  .25,
    'org'   :  .06,
    'cc'    :  .38,
    'biz'   :  .46,
    'tv'    :  .29,
}

def score(row):
    """Score one domain between 0 (worst) and 1 (best).

    Starts from 1 and subtracts a penalty for the TLD, for the domain
    length, for punctuation and for digits, then clamps the result.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the keys 'tld', 'domain_length', 'puct' and 'num'.

    Returns
    -------
    numpy.float64
        The score, clamped to the range [0, 1].

    Raises
    ------
    KeyError
        If row['tld'] has no entry in `tld_dict_penalty`.
    """
    total = 1

    # TLDs
    total -= tld_dict_penalty[row['tld']]

    # Lengths: penalty is (length / 20) ** 1.5, floored at zero.
    # The built-in max() is used on purpose: np.max(x, 0) would interpret the
    # 0 as an *axis*, not as a lower bound, so no floor would be applied.
    total -= max(row['domain_length'] / 20, 0) ** 1.5

    # Punctuation
    if row['puct']:
        total -= .3

    # Numbers
    if row['num']:
        total -= .2

    # Keep the final score inside [0, 1]
    return np.clip(total, 0, 1)

In [99]:
# Score every row and store the result in a new `score` column
df['score'] = df.apply(score, axis='columns')

In [102]:
# Show the ten highest-scoring domains
(
    df
    .sort_values(by='score', ascending=False)
    .head(n=10)
)

Unnamed: 0,root_domain,tld,domain,domain_length,puct,num,score
48300,jeiv.com,com,jeiv,4,False,False,0.910557
103426,zffs.com,com,zffs,4,False,False,0.910557
69946,pfkn.com,com,pfkn,4,False,False,0.910557
69945,pfkd.com,com,pfkd,4,False,False,0.910557
73615,qriz.com,com,qriz,4,False,False,0.910557
17714,bzrq.com,com,bzrq,4,False,False,0.910557
22181,cnot.com,com,cnot,4,False,False,0.910557
93296,urfvg.com,com,urfvg,5,False,False,0.875
45098,icpyl.com,com,icpyl,5,False,False,0.875
45108,icuhp.com,com,icuhp,5,False,False,0.875
