In [10]:
import sys
import tldextract
import os
from math import exp

# Ruta absoluta al proyecto (sube un nivel desde /notebooks)
ROOT = os.path.abspath("..")
sys.path.append(ROOT)

import features.features_v2 as fv2
import importlib
importlib.reload(fv2)

fv2


<module 'features.features_v2' from '/Users/test/Desktop/phishing-detector/features/features_v2.py'>

In [7]:
import pandas as pd

es = pd.read_csv("../docs/dominios_espanyoles.csv")["domain"].str.lower().tolist()
globales = pd.read_csv("../docs/global_neutral_domains.csv")["domain"].str.lower().tolist()

whitelist_full = set(es) | set(globales)


In [3]:
df["domain_complexity"] = df["url"].apply(
    lambda u: fv2.extract_features_v2(u, spanish_whitelist)[0]
)


In [4]:
df.groupby("label")["domain_complexity"].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,244.0,0.548587,0.202859,0.0,0.432556,0.602267,0.706502,0.904847
1,248.0,0.530251,0.280479,0.0,0.218656,0.605302,0.795112,0.971063


In [5]:
def compute_domain_complexity_fixed(url, whitelist):
    parsed = urlparse(url)
    extract_res = tldextract.extract(url)

    domain_length, domain_entropy = fv2._compute_domain_features(extract_res)
    registered = (extract_res.registered_domain or "").lower()
    wl = fv2._compute_domain_whitelist_score(registered, whitelist)

    # Normalizaciones buenas
    normalized_length = min(domain_length / 20.0, 1.0)
    normalized_entropy = min(domain_entropy / 4.0, 1.0)

    # Base cruda
    raw = normalized_length * normalized_entropy

    # Penalización suave a dominios cortos
    if domain_length < 10:
        raw *= (0.85 + 0.015 * domain_length)

    # Whitelist no anula la feature, la reduce
    if wl == 1:
        raw *= 0.25

    # Logistic squashing
    dc = 1 / (1 + exp(-6 * (raw - 0.15)))

    return float(max(0.0, min(dc, 1.0)))


In [8]:
def compute_domain_complexity_v2_2(url, whitelist):
    parsed = urlparse(url)
    extract_res = tldextract.extract(url)

    registered = (extract_res.registered_domain or "").lower()
    domain = (extract_res.domain or "").lower()

    # 1) Features internas
    domain_length = len(registered)
    domain_entropy = fv2._shannon_entropy(domain)
    dws = 1 if registered in whitelist else 0

    # 2) Normalizaciones
    norm_len = min(domain_length / 20, 1.0)
    norm_ent = min(domain_entropy / 4, 1.0)

    raw = 0.65 * norm_ent + 0.35 * norm_len

    # 3) Penalización no lineal para dominios cortos
    if domain_length < 12:
        raw *= (domain_length / 12) ** 2

    # 4) Dominio whitelisted → siempre legítimo
    if dws == 1:
        raw = 0.0

    # 5) Sigmoide suave
    dc = 1 / (1 + exp(-3 * (raw - 0.20)))

    return float(max(0.0, min(dc, 1.0)))


In [11]:
df["domain_complexity_v2_2"] = df["url"].apply(
    lambda u: compute_domain_complexity_v2_2(u, whitelist_full)
)

df.groupby("label")["domain_complexity_v2_2"].describe()


  registered = (extract_res.registered_domain or "").lower()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,244.0,0.545743,0.20182,0.354344,0.354344,0.489351,0.733364,0.892308
1,248.0,0.677155,0.182861,0.354344,0.482526,0.737933,0.856355,0.909964


In [13]:
import numpy as np

def resumen_umbral(df, col, T):
    phish = df[df["label"] == 1][col]
    legit = df[df["label"] == 0][col]
    tpr = (phish >= T).mean()
    fpr = (legit >= T).mean()
    return {"T": T, "TPR_phishing": tpr, "FPR_legit": fpr}

for T in [0.5, 0.6, 0.7]:
    print(resumen_umbral(df, "domain_complexity_v2_2", T))


{'T': 0.5, 'TPR_phishing': np.float64(0.6129032258064516), 'FPR_legit': np.float64(0.5)}
{'T': 0.6, 'TPR_phishing': np.float64(0.5685483870967742), 'FPR_legit': np.float64(0.3975409836065574)}
{'T': 0.7, 'TPR_phishing': np.float64(0.5241935483870968), 'FPR_legit': np.float64(0.3319672131147541)}


In [14]:
from math import log2  # ya lo tienes por fv2, pero por si acaso
import tldextract
from urllib.parse import urlparse  # sólo si no está ya importado

def compute_domain_complexity_v22(url: str, spanish_whitelist) -> float:
    """
    Domain complexity v2.2:
    - Usa length + entropy del dominio registrado
    - Penaliza fuerte dominios cortos
    - Añade pequeño bonus si el TLD es de riesgo
    - Si está en whitelist -> 0.0
    """
    try:
        url = (url or "").strip()
        if not url:
            return 0.0

        extract_res = tldextract.extract(url)
        registered = (extract_res.registered_domain or "").lower()

        # Internas del propio módulo
        domain_length, domain_entropy = fv2._compute_domain_features(extract_res)
        domain_whitelist_score = fv2._compute_domain_whitelist_score(
            registered, spanish_whitelist
        )
        _, tld_risk_weight, _ = fv2._compute_infra_signals(url, extract_res)

        # Normalizaciones
        normalized_length = max(0.0, min(domain_length / 20.0, 1.0))
        normalized_entropy = max(0.0, min(domain_entropy / 4.0, 1.0))

        # Peso más fuerte a la entropía
        base = 0.7 * normalized_entropy + 0.3 * normalized_length

        # Penalización fuerte a dominios cortos
        if domain_length <= 8:
            base *= 0.20
        elif domain_length <= 11:
            base *= 0.40
        elif domain_length <= 14:
            base *= 0.70
        # >14 sin penalización adicional

        # Bonus por TLD de riesgo (TLD_RISK ya viene ponderado)
        risk_bonus = 0.15 * min(max(tld_risk_weight, 0.0) / 3.0, 1.0)
        raw = base + risk_bonus

        # Whitelist manda
        if domain_whitelist_score == 1:
            raw = 0.0

        # Clamping final
        return float(max(0.0, min(raw, 1.0)))

    except Exception:
        return 0.0


In [15]:
df["domain_complexity_v22"] = df["url"].apply(
    lambda u: compute_domain_complexity_v22(u, spanish_whitelist)
)
df.groupby("label")["domain_complexity_v22"].describe()


  registered = (extract_res.registered_domain or "").lower()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,244.0,0.359484,0.242223,0.073474,0.208966,0.242515,0.497594,0.897527
1,248.0,0.474227,0.28399,0.05,0.229474,0.362064,0.798582,1.0


In [16]:
def resumen_umbral(df, col, T):
    phish = df[df["label"] == 1][col]
    legit = df[df["label"] == 0][col]
    tpr = (phish >= T).mean()
    fpr = (legit >= T).mean()
    return {"T": T, "TPR_phishing": tpr, "FPR_legit": fpr}

for T in [0.5, 0.6, 0.7]:
    print(resumen_umbral(df, "domain_complexity_v22", T))


{'T': 0.5, 'TPR_phishing': np.float64(0.3911290322580645), 'FPR_legit': np.float64(0.20901639344262296)}
{'T': 0.6, 'TPR_phishing': np.float64(0.3387096774193548), 'FPR_legit': np.float64(0.1885245901639344)}
{'T': 0.7, 'TPR_phishing': np.float64(0.3346774193548387), 'FPR_legit': np.float64(0.1885245901639344)}


In [18]:
def compute_domain_complexity_v23(url, whitelist):
    parsed = urlparse(url)
    ext = tldextract.extract(url)
    
    registered = (ext.registered_domain or "").lower()
    core = (ext.domain or "").lower()

    # 1) Señales base
    domain_length = len(registered)
    domain_entropy = _shannon_entropy(core)

    # 2) Normalizaciones revisadas
    norm_len = min(domain_length / 18.0, 1.0)      # más sensible, rango más corto
    norm_ent = min(domain_entropy / 3.8, 1.0)      # entropía más discriminativa

    # 3) Combinación principal (más peso a entropía)
    raw = 0.78 * norm_ent + 0.22 * norm_len

    # 4) Dominio corto = aún más penalización
    if domain_length < 10:
        raw *= 0.35

    # 5) Whitelist = dominio real, no puede ser phishing
    if registered in whitelist:
        raw = 0.0

    # 6) Reescalado final: potencia < 1 amplifica diferencias
    score = raw ** 0.55

    return float(max(0.0, min(score, 1.0)))


  registered = (ext.registered_domain or "").lower()


NameError: name '_shannon_entropy' is not defined