# In [1]:
import tldextract
import math

def shannon_entropy(text):
    """Return the Shannon entropy of *text*, in bits per symbol.

    The entropy is computed from the relative frequency of each distinct
    element of ``text`` (each character, when ``text`` is a string), using
    base-2 logarithms.

    Args:
        text: Sequence of hashable symbols, typically a string.

    Returns:
        The entropy as a float. Empty (falsy) input yields ``0.0``.
    """
    from collections import Counter

    if not text:
        return 0.0

    total = len(text)
    # Relative frequency of every distinct symbol, in first-seen order.
    probabilities = [occurrences / total for occurrences in Counter(text).values()]
    return -sum(p * math.log2(p) for p in probabilities)

def length_penalty(n):
    """Return the multiplicative penalty factor for a length of *n*.

    The factor decreases in steps as ``n`` grows:

    * ``n <= 12`` -> ``1.0`` (no penalty)
    * ``n <= 18`` -> ``0.9``
    * ``n <= 25`` -> ``0.8``
    * otherwise   -> ``0.7``

    Args:
        n: Length to evaluate.

    Returns:
        The penalty factor as a float.
    """
    # (inclusive upper bound, factor) pairs, checked in ascending order.
    tiers = ((12, 1.0), (18, 0.9), (25, 0.8))
    for upper_bound, factor in tiers:
        if n <= upper_bound:
            return factor
    return 0.7

def domain_complexity_v23_formula(url, whitelist):
    """Score the complexity of the domain in *url* on a 0.0-1.0 scale.

    The score blends the normalized Shannon entropy of the domain label
    (``domain`` from tldextract) with the normalized length of the
    registered domain (``registered_domain`` from tldextract), applies a
    stepwise length penalty, and finally rescales with a power curve.

    Args:
        url: URL or host name handed to ``tldextract.extract``.
        whitelist: Container of registered domains that always score 0.0.
            It is compared against the lowercased registered domain.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when no registered
        domain can be extracted or when the registered domain is
        whitelisted.
    """
    parts = tldextract.extract(url)
    registered = (parts.registered_domain or "").lower()

    # Nothing to score when no registered domain could be extracted.
    if not registered:
        return 0.0

    # Hard whitelist: trusted domains always score zero.
    if registered in whitelist:
        return 0.0

    label = (parts.domain or "").lower()
    registered_len = len(registered)

    # Both components are normalized and capped at 1.0
    # (length saturates at 18 characters, entropy at 3.8 bits).
    length_component = min(registered_len / 18.0, 1.0)
    entropy_component = min(shannon_entropy(label) / 3.8, 1.0)

    # Weighted blend (entropy dominates), then the stepwise length penalty.
    blended = 0.78 * entropy_component + 0.22 * length_component
    penalized = blended * length_penalty(registered_len)

    # Final rescale: an exponent below 1 lifts low and mid-range values.
    rescaled = penalized ** 0.55
    return max(0.0, min(rescaled, 1.0))
