# Scoring heurístico – Detección de URLs orientadas a España

**Objetivo:** asignar una puntuación (`score_total`) a cada URL del dataset
para estimar su probabilidad de estar dirigida al contexto español
(dominio `.es`, idioma castellano, marcas nacionales, etc.).

Este módulo aplica un sistema de reglas heurísticas,
no un modelo ML, con el fin de curar datasets y priorizar muestras.


In [7]:
import pandas as pd
import re
from urllib.parse import urlparse
from rapidfuzz import fuzz
from collections import Counter

# 1️⃣ Cargar el dataset principal
df = pd.read_csv("../../../dataset/dataset_prototipo.csv")

# 2️⃣ Cargar la whitelist española
whitelist_path = "../../../docs/spanish_domains.csv"
spanish_whitelist = pd.read_csv(whitelist_path)["domain"].str.lower().tolist()

print(f"✅ Cargado dataset con {len(df)} URLs")
print(f"✅ Cargada whitelist española con {len(spanish_whitelist)} dominios")


✅ Cargado dataset con 200 URLs
✅ Cargada whitelist española con 200 dominios


In [8]:
def score_url_v2_4(url, spanish_whitelist):
    url_low = str(url).lower()
    parsed = urlparse(url_low)
    netloc = parsed.netloc
    domain_core = netloc.split('.')[-2] if '.' in netloc else netloc

    score = 0
    signals = []

    # --- Listas de tokens y referencias ---
    POSITIVE_KEYWORDS = ['multa','pago','verificación','cliente','acceso','seguridad','confirmación','factura','tarjeta']
    SPANISH_MARKERS = ['.es', '+34', '€']
    SPANISH_BRANDS = ['santander','bbva','caixabank','ing','bankia','openbank','ionos','orange','movistar','correos','dgt']
    SPANISH_HOSTINGS = ['webcindario','rf.gd']

    GENERIC_SP_TOKENS = [
        'servicio','soporte','atencion','cliente','usuarios','ayuda','asistencia',
        'cuenta','acceso','inicio','login','sesion','datos','perfil','portal',
        'seguridad','verificacion','confirmacion','actualizacion','validacion','auth','clave','codigo',
        'envio','entrega','paquete','pedido','multa','factura','notificacion','aviso',
        'gob','oficial','tramite','tramites','agencia','impuestos','certificado'
    ]

    BANKING_TOKENS = [
        'banco','banca','bank','banking','transferencia','tarjeta','pin','clave','codigo','validacion',
        'firma','token','sms','autenticacion','movimientos','saldo','oficinavirtual','bancamovil',
        'appbanco','bancadigital','acceso','usuarios','verificacion','serviciocliente','soportecliente'
    ]

    INSTITUTIONAL_TOKENS = [
        'ayuntamiento','gob','gobierno','agencia','tramite','tramites','oficial','certificado',
        'seg-social','catastro','impuestos','tributos','dgt','hacienda','dni','salud','sanidad'
    ]

    PROFESSIONAL_TOKENS = [
        'asesoria','gestoria','abogado','despacho','consultoria','contable','laboral','fiscal','bufete','notaria'
    ]

    ECOMMERCE_TOKENS = [
        'pedido','pedidos','compra','compras','factura','facturas','recibo','recibos',
        'abonado','tarifa','tarifas','servicios','renovar','renovacion','contrato',
        'suscripcion','envio','entrega','paquete','envios','devolucion'
    ]

    LATAM_TLDS = ['.co', '.mx', '.cl', '.ar', '.br', '.pe', '.ec', '.uy', '.py', '.bo', '.sv', '.hn', '.cr', '.gt', '.do']

    # --- Reglas base ---
    for kw in POSITIVE_KEYWORDS:
        if kw in url_low:
            score += 1
            signals.append(f'has_kw:{kw}')

    # Indicadores de país (.es ahora +2)
    for m in SPANISH_MARKERS:
        if m in url_low:
            if m == '.es':
                score += 2
            else:
                score += 1
            signals.append(f'spanish_marker:{m}')

    # Marcas españolas
    for b in SPANISH_BRANDS:
        if b in url_low:
            score += 1
            signals.append(f'spanish_brand:{b}')

    # Hostings españoles frecuentes
    for h in SPANISH_HOSTINGS:
        if h in url_low:
            score += 2
            signals.append(f'spanish_hosting:{h}')

    # .com.es
    if '.com.es' in url_low:
        score += 2
        signals.append('tld_combo_com_es')

    # Acción + entrega
    verbs = ['modifica','modificar','actualiza','actualizar','cambia','cambiar','reprograma','reprogramar','ajusta','ajustar','corrige','corregir']
    delivery = ['entrega','envio','envío','pedido','paquete']
    hyphen_pattern = re.compile(r'(' + '|'.join(verbs) + r')[-_%2d]*(tu|mi|su)?[-_%2d]*(' + '|'.join(delivery) + r')', flags=re.IGNORECASE)
    if hyphen_pattern.search(url_low):
        m = hyphen_pattern.search(url_low)
        score += 2
        signals.append(f'action_delivery:{m.group(0)}')

    # Hoster sospechoso + tokens de entrega
    if any(h in url_low for h in ['github.io','forms.app','pages.dev','pages.github','netlify.app','webflow.io']):
        if any(tok in url_low for tok in delivery):
            score += 2
            signals.append('susp_hoster_plus_delivery')

    # --- Reglas semánticas compuestas ---
    if '.es' in url_low:
        if sum(tok in url_low for tok in GENERIC_SP_TOKENS) >= 2:
            score += 3
            signals.append('generic_service_combo_es')
        if any(tok in url_low for tok in BANKING_TOKENS):
            score += 3
            signals.append('banking_combo_es')
        if any(tok in url_low for tok in INSTITUTIONAL_TOKENS + PROFESSIONAL_TOKENS):
            score += 3
            signals.append('institutional_professional_es')
        if any(tok in url_low for tok in ECOMMERCE_TOKENS):
            score += 2
            signals.append('ecommerce_combo_es')

    # --- Whitelist española ---
    for legit in spanish_whitelist:
        if legit in url_low:
            score += 2
            signals.append(f'spanish_whitelist_match:{legit}')
            break
        else:
            legit_base = legit.split('.')[0]
            sim = fuzz.ratio(domain_core, legit_base)
            if sim >= 80:
                score += 2
                signals.append(f'fuzzy_whitelist_match:{legit}:{sim:.0f}')
                break

    # --- Penalizaciones LATAM ---
    for tld in LATAM_TLDS:
        if url_low.endswith(tld) or f"{tld}/" in url_low:
            score -= 2
            signals.append(f'latam_tld:{tld}')

    for pk in ['pagamento','fatura','acesso','faturas']:
        if pk in url_low:
            score -= 2
            signals.append(f'pt_kw:{pk}')

    for lb in ['banrural','pichincha','itau','bradesco','yape','daviplata']:
        if lb in url_low:
            score -= 1
            signals.append(f'latam_brand:{lb}')

    # --- Fuzzy de marca española ---
    for brand in SPANISH_BRANDS:
        sim = fuzz.ratio(domain_core, brand)
        if sim >= 80:
            score += 2
            signals.append(f'fuzzy_brand_match:{brand}:{sim:.0f}')
            break

    return score, ';'.join(signals)


In [9]:
# Aplicar la función al dataset cargado
df["score_total"], df["signals_detected"] = zip(*df["url"].apply(lambda x: score_url_v2_4(x, spanish_whitelist)))
df["label_pred"] = (df["score_total"] >= 2).astype(int)

print("✅ Scoring aplicado correctamente.")


✅ Scoring aplicado correctamente.


In [10]:
# --- Evaluación del scoring v2.4 ---

# 1️⃣ Métricas básicas
num_total = len(df)
num_phishing = int(df['label'].sum())
num_detectadas = int(df[(df['label']==1) & (df['label_pred']==1)].shape[0])
num_falsos_neg = int(df[(df['label']==1) & (df['label_pred']==0)].shape[0])
num_no_phishing = int((df['label']==0).sum())

recall = num_detectadas / num_phishing if num_phishing > 0 else 0
precision = num_detectadas / df[df['label_pred']==1].shape[0] if df[df['label_pred']==1].shape[0] > 0 else 0

print(f"Total URLs: {num_total}")
print(f"Phishing: {num_phishing}")
print(f"Detectadas como españolas: {num_detectadas}")
print(f"Falsos negativos: {num_falsos_neg}")
print(f"📈 Recall: {recall:.2%}")
print(f"🎯 Precisión (aprox.): {precision:.2%}")


Total URLs: 200
Phishing: 100
Detectadas como españolas: 47
Falsos negativos: 53
📈 Recall: 47.00%
🎯 Precisión (aprox.): 41.96%


In [11]:
from collections import Counter

def explode_signals(s):
    if pd.isna(s) or s == '':
        return []
    return s.split(';')

# Contar señales más frecuentes en phishing detectado
signals_detected = Counter([sig for sig_list in df[df['label']==1]['signals_detected'].apply(explode_signals) for sig in sig_list])
signals_detected.most_common(20)


[('spanish_marker:.es', 21),
 ('spanish_brand:ing', 15),
 ('spanish_hosting:webcindario', 11),
 ('has_kw:cliente', 9),
 ('tld_combo_com_es', 5),
 ('spanish_brand:bbva', 5),
 ('spanish_brand:dgt', 4),
 ('fuzzy_whitelist_match:google.es:100', 4),
 ('has_kw:pago', 4),
 ('has_kw:seguridad', 4),
 ('spanish_brand:ionos', 3),
 ('has_kw:factura', 3),
 ('spanish_brand:orange', 3),
 ('spanish_brand:santander', 3),
 ('spanish_brand:correos', 2),
 ('has_kw:multa', 2),
 ('has_kw:acceso', 2),
 ('fuzzy_whitelist_match:blogspot.com.es:100', 2),
 ('has_kw:tarjeta', 2),
 ('spanish_whitelist_match:madrid.es', 1)]

In [12]:
df_detectadas = df[(df['label']==1) & (df['label_pred']==1)]
df_detectadas[['url','score_total','signals_detected']].sort_values(by='score_total', ascending=False).head(10)


Unnamed: 0,url,score_total,signals_detected
193,http://bancosantander.es.hotelparadis.es/aviso...,11,spanish_marker:.es;spanish_brand:santander;gen...
195,https://bbva.es.personal-cuenta.info/personal/...,5,spanish_marker:.es;spanish_brand:bbva;spanish_...
111,https://asesoriabarrachina.es/wp-includes/stra...,5,spanish_marker:.es;institutional_professional_es
169,http://movistarui.vip,5,spanish_brand:movistar;fuzzy_whitelist_match:m...
110,http://serv-authveriline.com.es,4,spanish_marker:.es;tld_combo_com_es
102,https://robllox.com.es,4,spanish_marker:.es;tld_combo_com_es
104,https://robiox.com.es,4,spanish_marker:.es;tld_combo_com_es
105,http://unicismadrid.es/wp-content/com/index/ch...,4,spanish_marker:.es;spanish_whitelist_match:mad...
106,https://authline-checkappr0v.com.es/7aIT03j82s...,4,spanish_marker:.es;tld_combo_com_es
109,http://authline-checkappr0v.com.es,4,spanish_marker:.es;tld_combo_com_es


In [13]:
df_missed = df[(df['label']==1) & (df['label_pred']==0)]
df_missed[['url','score_total','signals_detected']].head(10)


Unnamed: 0,url,score_total,signals_detected
114,https://app-ing.direct-ayuda.com,1,spanish_brand:ing
115,https://app-ing.direct-ayuda.com/esapp/,1,spanish_brand:ing
116,https://correos-paqueteria.com/asset.php,1,spanish_brand:correos
117,https://dgtt48.lat/yyjxzpjg/e2Walj/7,1,spanish_brand:dgt
118,https://extremas.com.ar/styles/ccDGT/DGT/cc.php,-1,spanish_brand:dgt;latam_tld:.ar
119,https://ingbanksecure.com/,1,spanish_brand:ing
120,https://l.ead.me/DGTspain,1,spanish_brand:dgt
122,https://www.ing-es-movil.com,1,spanish_brand:ing
125,https://admin.multa.net/,1,has_kw:multa
127,https://ingsecusecuring.web.app/,1,spanish_brand:ing


In [14]:
print("Score medio phishing detectado:", df_detectadas['score_total'].mean())
print("Score medio phishing no detectado:", df_missed['score_total'].mean())


Score medio phishing detectado: 3.021276595744681
Score medio phishing no detectado: 0.5849056603773585


In [16]:
def score_url_v2_5(url, spanish_whitelist):
    """
    Scoring v2.5: v2.4 + correcciones para recuperar marca+token, marcas en subdominio,
    y acortadores con token 'spain'/'es'.
    spanish_whitelist: lista de dominios (strings) en minúscula cargada desde data/spanish_domains.csv
    """
    url_low = str(url).lower()
    parsed = urlparse(url_low)
    netloc = parsed.netloc or ''
    path = parsed.path or ''
    query = parsed.query or ''
    domain_core = netloc.split('.')[-2] if '.' in netloc else netloc

    score = 0
    signals = []

    # --- Listas base (idénticas a v2.4) ---
    POSITIVE_KEYWORDS = ['multa','pago','verificación','cliente','acceso','seguridad','confirmación','factura','tarjeta']
    SPANISH_MARKERS = ['.es', '+34', '€']
    SPANISH_BRANDS = ['santander','bbva','caixabank','ing','bankia','openbank','ionos','orange','movistar','correos','dgt']
    SPANISH_HOSTINGS = ['webcindario','rf.gd']

    GENERIC_SP_TOKENS = [
        'servicio','soporte','atencion','cliente','usuarios','ayuda','asistencia',
        'cuenta','acceso','inicio','login','sesion','datos','perfil','portal',
        'seguridad','verificacion','confirmacion','actualizacion','validacion','auth','clave','codigo',
        'envio','entrega','paquete','pedido','multa','factura','notificacion','aviso',
        'gob','oficial','tramite','tramites','agencia','impuestos','certificado'
    ]

    BANKING_TOKENS = [
        'banco','banca','bank','banking','transferencia','tarjeta','pin','clave','codigo','validacion',
        'firma','token','sms','autenticacion','movimientos','saldo','oficinavirtual','bancamovil',
        'appbanco','bancadigital','acceso','usuarios','verificacion','serviciocliente','soportecliente'
    ]

    INSTITUTIONAL_TOKENS = [
        'ayuntamiento','gob','gobierno','agencia','tramite','tramites','oficial','certificado',
        'seg-social','catastro','impuestos','tributos','dgt','hacienda','dni','salud','sanidad'
    ]

    PROFESSIONAL_TOKENS = [
        'asesoria','gestoria','abogado','despacho','consultoria','contable','laboral','fiscal','bufete','notaria'
    ]

    ECOMMERCE_TOKENS = [
        'pedido','pedidos','compra','compras','factura','facturas','recibo','recibos',
        'abonado','tarifa','tarifas','servicios','renovar','renovacion','contrato',
        'suscripcion','envio','entrega','paquete','envios','devolucion'
    ]

    LATAM_TLDS = ['.co', '.mx', '.cl', '.ar', '.br', '.pe', '.ec', '.uy', '.py', '.bo', '.sv', '.hn', '.cr', '.gt', '.do']

    # --- Reglas base (igual que v2.4) ---
    for kw in POSITIVE_KEYWORDS:
        if kw in url_low:
            score += 1
            signals.append(f'has_kw:{kw}')

    for m in SPANISH_MARKERS:
        if m in url_low:
            if m == '.es':
                score += 2
            else:
                score += 1
            signals.append(f'spanish_marker:{m}')

    for b in SPANISH_BRANDS:
        if b in url_low:
            score += 1
            signals.append(f'spanish_brand:{b}')

    for h in SPANISH_HOSTINGS:
        if h in url_low:
            score += 2
            signals.append(f'spanish_hosting:{h}')

    if '.com.es' in url_low:
        score += 2
        signals.append('tld_combo_com_es')

    # action + delivery patterns
    verbs = ['modifica','modificar','actualiza','actualizar','cambia','cambiar','reprograma','reprogramar','ajusta','ajustar','corrige','corregir']
    delivery = ['entrega','envio','envío','pedido','paquete']
    hyphen_pattern = re.compile(r'(' + '|'.join(verbs) + r')[-_%2d]*(tu|mi|su)?[-_%2d]*(' + '|'.join(delivery) + r')', flags=re.IGNORECASE)
    if hyphen_pattern.search(url_low):
        m = hyphen_pattern.search(url_low)
        score += 2
        signals.append(f'action_delivery:{m.group(0)}')

    if any(h in url_low for h in ['github.io','forms.app','pages.dev','pages.github','netlify.app','webflow.io']):
        if any(tok in url_low for tok in delivery):
            score += 2
            signals.append('susp_hoster_plus_delivery')

    # semánticas compuestas
    if '.es' in url_low:
        if sum(tok in url_low for tok in GENERIC_SP_TOKENS) >= 2:
            score += 3
            signals.append('generic_service_combo_es')
        if any(tok in url_low for tok in BANKING_TOKENS):
            score += 3
            signals.append('banking_combo_es')
        if any(tok in url_low for tok in INSTITUTIONAL_TOKENS + PROFESSIONAL_TOKENS):
            score += 3
            signals.append('institutional_professional_es')
        if any(tok in url_low for tok in ECOMMERCE_TOKENS):
            score += 2
            signals.append('ecommerce_combo_es')

    # whitelist (exacta + fuzzy)
    for legit in spanish_whitelist:
        if legit in url_low:
            score += 2
            signals.append(f'spanish_whitelist_match:{legit}')
            break
        else:
            legit_base = legit.split('.')[0]
            sim = fuzz.ratio(domain_core, legit_base)
            if sim >= 80:
                score += 2
                signals.append(f'fuzzy_whitelist_match:{legit}:{sim:.0f}')
                break

    # --- NUEVAS REGLAS para recuperar buenos casos ---

    # 1) brand + token español (marca + palabra española relevante) -> +2
    spanish_tokens_for_brand = ['ayuda','cliente','esapp','es','spain','movil','ayuntamiento','paqueteria','paquete','envio','entrega']
    if any(b in url_low for b in SPANISH_BRANDS) and any(tok in url_low for tok in spanish_tokens_for_brand):
        score += 2
        signals.append('brand_plus_spanish_token')

    # 2) marca en subdominio (antes del domain_core) -> +2
    # ejemplo: ing.es-movile.com  or ing.something.example.com
    try:
        # subdominio string = everything except last two labels
        parts = netloc.split('.')
        if len(parts) > 2:
            subparts = parts[:-2]  # la parte de subdominio
            subdomain_str = '.'.join(subparts)
            if any(b in subdomain_str for b in SPANISH_BRANDS):
                score += 2
                signals.append('brand_in_subdomain')
    except Exception:
        pass

    # 3) acortadores que indiquen targeting Spain (shortener + token 'spain'|'es'|'dgt'|'bbva' etc.) -> +2
    shorteners = ['l.ead.me','bit.ly','t.co','tinyurl.com','ow.ly','is.gd']
    if any(s in netloc for s in shorteners):
        # check path/query for clear tokens
        short_tokens = ['spain','es','dgt','bbva','correos','ing','santander','caixabank']
        if any(tok in path or tok in query for tok in short_tokens):
            score += 2
            signals.append('shortener_spain')

    # --- Penalizaciones LATAM ---
    for tld in LATAM_TLDS:
        if url_low.endswith(tld) or f"{tld}/" in url_low:
            score -= 2
            signals.append(f'latam_tld:{tld}')

    for pk in ['pagamento','fatura','acesso','faturas']:
        if pk in url_low:
            score -= 2
            signals.append(f'pt_kw:{pk}')

    for lb in ['banrural','pichincha','itau','bradesco','yape','daviplata']:
        if lb in url_low:
            score -= 1
            signals.append(f'latam_brand:{lb}')

    # fuzzy brand match final (mantenerlo)
    for brand in SPANISH_BRANDS:
        sim = fuzz.ratio(domain_core, brand)
        if sim >= 80:
            score += 2
            signals.append(f'fuzzy_brand_match:{brand}:{sim:.0f}')
            break

    return score, ';'.join(signals)

In [17]:
# aplicar scoring v2.5 (sin guardar todavía)
df['score_total'], df['signals_detected'] = zip(*df['url'].apply(lambda u: score_url_v2_5(u, spanish_whitelist)))
df['label_pred'] = (df['score_total'] >= 2).astype(int)

# métricas básicas
num_phishing = int(df['label'].sum())
num_detectadas = int(df[(df['label']==1) & (df['label_pred']==1)].shape[0])
recall = num_detectadas / num_phishing if num_phishing>0 else 0
precision = num_detectadas / df[df['label_pred']==1].shape[0] if df[df['label_pred']==1].shape[0]>0 else 0

print(f"Recall: {recall:.2%}, Precision (aprox): {precision:.2%}")


Recall: 64.00%, Precision (aprox): 49.23%


In [18]:
from collections import Counter
def explode_signals(s):
    return [] if pd.isna(s) else s.split(';')
signals_detected = Counter(
    sig for sig_list in df[df['label']==1]['signals_detected'].apply(explode_signals) for sig in sig_list
)
Counter({k:v for k,v in signals_detected.most_common(15)})


Counter({'brand_plus_spanish_token': 25,
         'spanish_marker:.es': 21,
         '': 20,
         'spanish_brand:ing': 15,
         'brand_in_subdomain': 12,
         'spanish_hosting:webcindario': 11,
         'has_kw:cliente': 9,
         'tld_combo_com_es': 5,
         'spanish_brand:bbva': 5,
         'spanish_brand:dgt': 4,
         'shortener_spain': 4,
         'fuzzy_whitelist_match:google.es:100': 4,
         'has_kw:pago': 4,
         'has_kw:seguridad': 4,
         'spanish_brand:ionos': 3})

In [19]:
# Qué URLs pasaron de no detectadas a detectadas (0 → 1)
df_diff = df[(df['label_pred']==1) & (df['score_total']>=2)]
df_diff[['url','score_total','signals_detected']].sort_values(by='score_total', ascending=False).head(15)


Unnamed: 0,url,score_total,signals_detected
26,https://www.ionos.es/ayuda/mi-cuenta/gestionar...,16,has_kw:acceso;spanish_marker:.es;spanish_brand...
24,https://www.ionos.es/ayuda/mi-cuenta/factura-1...,15,has_kw:factura;spanish_marker:.es;spanish_bran...
193,http://bancosantander.es.hotelparadis.es/aviso...,15,spanish_marker:.es;spanish_brand:santander;gen...
17,https://sede.dgt.gob.es/es/multas/pago-de-multas/,15,has_kw:multa;has_kw:pago;spanish_marker:.es;sp...
18,https://sede.dgt.gob.es/es/otros-tramites/cita...,13,spanish_marker:.es;spanish_brand:dgt;generic_s...
58,https://www.orange.es/pago-facturas,13,has_kw:pago;has_kw:factura;spanish_marker:.es;...
69,https://www.movistar.es/area-cliente/mi-cuenta/,13,has_kw:cliente;spanish_marker:.es;spanish_bran...
68,https://www.caixabank.es/empresa/home/empresas...,12,spanish_marker:.es;spanish_brand:caixabank;ban...
23,https://www.ionos.es/ayuda/sitios-web-tiendas/...,12,has_kw:pago;spanish_marker:.es;spanish_brand:i...
20,https://www.dgt.es/inicio/,12,spanish_marker:.es;spanish_brand:dgt;instituti...


In [20]:
# 🔍 URLs de phishing NO detectadas (falsos negativos)
df_missed = df[(df['label'] == 1) & (df['label_pred'] == 0)]

print(f"Total de phishing no detectados: {len(df_missed)}")
df_missed[['url','score_total','signals_detected']].sort_values(by='score_total', ascending=False).head(20)


Total de phishing no detectados: 36


Unnamed: 0,url,score_total,signals_detected
117,https://dgtt48.lat/yyjxzpjg/e2Walj/7,1,spanish_brand:dgt
139,https://w.aviso-cliente.net,1,has_kw:cliente
191,https://clientepresenteado.digital/inicio,1,has_kw:cliente
179,https://galiciaseguridad.myportfolio.com/,1,has_kw:seguridad
171,https://pago-master.digital/Confirmacio%CC%81n...,1,has_kw:pago
164,https://www.ing-web-inicios.com/,1,spanish_brand:ing
118,https://extremas.com.ar/styles/ccDGT/DGT/cc.php,1,spanish_brand:dgt;brand_plus_spanish_token;lat...
159,https://netflix-pagos.blog/,1,has_kw:pago
151,https://recibo-de-pago.weebly.com/,1,has_kw:pago
144,https://app.biaclientefonefacil.digital,1,has_kw:cliente


In [21]:
def score_url_v2_6(url, spanish_whitelist):
    """
    Scoring v2.6: v2.5 + boost controlado para marcas españolas en TLDs globales.
    spanish_whitelist: lista de dominios (strings) en minúscula.
    """
    url_low = str(url).lower()
    parsed = urlparse(url_low)
    netloc = parsed.netloc or ''
    path = parsed.path or ''
    query = parsed.query or ''
    domain_core = netloc.split('.')[-2] if '.' in netloc else netloc

    score = 0
    signals = []

    # --- Listas base (idénticas a v2.5) ---
    POSITIVE_KEYWORDS = ['multa','pago','verificación','cliente','acceso','seguridad','confirmación','factura','tarjeta']
    SPANISH_MARKERS = ['.es', '+34', '€']
SPANISH_BRANDS = [
    # 🏦 Banca
    "bbva", "santander", "caixabank", "bankia", "bankinter", "openbank", "evo", "abanca",
    "unicaja", "kutxabank", "cajarural", "ing", "imaginbank",

    # 📡 Telecomunicaciones
    "movistar", "orange", "jazztel", "yoigo", "masmovil", "lowi", "pepephone",

    # ✉️ Envíos y logística
    "correos", "mrw", "seur", "gls", "nacex", "dhl", "envialia",

    # ⚡ Energía
    "iberdrola", "endesa", "naturgy", "repsol", "totalenergies",

    # 🛒 E-commerce y consumo
    "elcorteingles", "zara", "aliexpress", "amazon", "carrefour", "mediamarkt", "pccomponentes",

    # 🧾 Administración y servicios públicos
    "dgt", "aeat", "seg-social", "sede", "mapfre", "fnmt", "redsara", "catastro", "interior",

    # 💻 Hosting / servicios web conocidos en ES
    "ionos", "dinahosting", "hostalia", "cdmon"
]

    SPANISH_HOSTINGS = ['webcindario','rf.gd']

    GENERIC_SP_TOKENS = [
        'servicio','soporte','atencion','cliente','usuarios','ayuda','asistencia',
        'cuenta','acceso','inicio','login','sesion','datos','perfil','portal',
        'seguridad','verificacion','confirmacion','actualizacion','validacion','auth','clave','codigo',
        'envio','entrega','paquete','pedido','multa','factura','notificacion','aviso',
        'gob','oficial','tramite','tramites','agencia','impuestos','certificado'
    ]

    BANKING_TOKENS = [
        'banco','banca','bank','banking','transferencia','tarjeta','pin','clave','codigo','validacion',
        'firma','token','sms','autenticacion','movimientos','saldo','oficinavirtual','bancamovil',
        'appbanco','bancadigital','acceso','usuarios','verificacion','serviciocliente','soportecliente'
    ]

    INSTITUTIONAL_TOKENS = [
        'ayuntamiento','gob','gobierno','agencia','tramite','tramites','oficial','certificado',
        'seg-social','catastro','impuestos','tributos','dgt','hacienda','dni','salud','sanidad'
    ]

    PROFESSIONAL_TOKENS = [
        'asesoria','gestoria','abogado','despacho','consultoria','contable','laboral','fiscal','bufete','notaria'
    ]

    ECOMMERCE_TOKENS = [
        'pedido','pedidos','compra','compras','factura','facturas','recibo','recibos',
        'abonado','tarifa','tarifas','servicios','renovar','renovacion','contrato',
        'suscripcion','envio','entrega','paquete','envios','devolucion'
    ]

    LATAM_TLDS = ['.co', '.mx', '.cl', '.ar', '.br', '.pe', '.ec', '.uy', '.py', '.bo', '.sv', '.hn', '.cr', '.gt', '.do']

    GLOBAL_TLDS = ['.com', '.app', '.net', '.org', '.io', '.web.app', '.dev']

    # --- Reglas base (igual que v2.5) ---
    for kw in POSITIVE_KEYWORDS:
        if kw in url_low:
            score += 1
            signals.append(f'has_kw:{kw}')

    for m in SPANISH_MARKERS:
        if m in url_low:
            if m == '.es':
                score += 2
            else:
                score += 1
            signals.append(f'spanish_marker:{m}')

    for b in SPANISH_BRANDS:
        if b in url_low:
            score += 1
            signals.append(f'spanish_brand:{b}')

    for h in SPANISH_HOSTINGS:
        if h in url_low:
            score += 2
            signals.append(f'spanish_hosting:{h}')

    if '.com.es' in url_low:
        score += 2
        signals.append('tld_combo_com_es')

    # action + delivery patterns
    verbs = ['modifica','modificar','actualiza','actualizar','cambia','cambiar','reprograma','reprogramar','ajusta','ajustar','corrige','corregir']
    delivery = ['entrega','envio','envío','pedido','paquete']
    hyphen_pattern = re.compile(r'(' + '|'.join(verbs) + r')[-_%2d]*(tu|mi|su)?[-_%2d]*(' + '|'.join(delivery) + r')', flags=re.IGNORECASE)
    if hyphen_pattern.search(url_low):
        m = hyphen_pattern.search(url_low)
        score += 2
        signals.append(f'action_delivery:{m.group(0)}')

    if any(h in url_low for h in ['github.io','forms.app','pages.dev','pages.github','netlify.app','webflow.io']):
        if any(tok in url_low for tok in delivery):
            score += 2
            signals.append('susp_hoster_plus_delivery')

    # semánticas compuestas
    if '.es' in url_low:
        if sum(tok in url_low for tok in GENERIC_SP_TOKENS) >= 2:
            score += 3
            signals.append('generic_service_combo_es')
        if any(tok in url_low for tok in BANKING_TOKENS):
            score += 3
            signals.append('banking_combo_es')
        if any(tok in url_low for tok in INSTITUTIONAL_TOKENS + PROFESSIONAL_TOKENS):
            score += 3
            signals.append('institutional_professional_es')
        if any(tok in url_low for tok in ECOMMERCE_TOKENS):
            score += 2
            signals.append('ecommerce_combo_es')

    # whitelist (exacta + fuzzy)
    for legit in spanish_whitelist:
        if legit in url_low:
            score += 2
            signals.append(f'spanish_whitelist_match:{legit}')
            break
        else:
            legit_base = legit.split('.')[0]
            sim = fuzz.ratio(domain_core, legit_base)
            if sim >= 80:
                score += 2
                signals.append(f'fuzzy_whitelist_match:{legit}:{sim:.0f}')
                break

    # --- REGLAS DE RECUPERACIÓN (v2.5) ---

    # 1) brand + token español (marca + palabra española relevante) -> +2
    spanish_tokens_for_brand = ['ayuda','cliente','esapp','es','spain','movil','ayuntamiento','paqueteria','paquete','envio','entrega']
    if any(b in url_low for b in SPANISH_BRANDS) and any(tok in url_low for tok in spanish_tokens_for_brand):
        score += 2
        signals.append('brand_plus_spanish_token')

    # 2) marca en subdominio (antes del domain_core) -> +2
    try:
        parts = netloc.split('.')
        if len(parts) > 2:
            subparts = parts[:-2]
            subdomain_str = '.'.join(subparts)
            if any(b in subdomain_str for b in SPANISH_BRANDS):
                score += 2
                signals.append('brand_in_subdomain')
    except Exception:
        pass

    # 3) acortadores que indiquen targeting Spain -> +2
    shorteners = ['l.ead.me','bit.ly','t.co','tinyurl.com','ow.ly','is.gd']
    if any(s in netloc for s in shorteners):
        short_tokens = ['spain','es','dgt','bbva','correos','ing','santander','caixabank']
        if any(tok in path or tok in query for tok in short_tokens):
            score += 2
            signals.append('shortener_spain')

    # --- NUEVA REGLA: BOOST marcas en TLDs globales (+1) ---
    # Si detectamos marca española en la URL y el host contiene un TLD global (ej .com/.app/.net/.org)
    # y NO contiene '.es', aplicamos un boost pequeño (+1).
    # Esto recupera casos tipo ingbanksecure.com, www.ing-es-movil.com, seguridadsabadell.com
    try:
        has_spanish_brand = any(b in url_low for b in SPANISH_BRANDS)
        host_has_global_tld = any(tld in netloc for tld in GLOBAL_TLDS)
        host_has_es = '.es' in netloc
        if has_spanish_brand and host_has_global_tld and not host_has_es:
            score += 1
            signals.append('brand_global_tld_boost')
    except Exception:
        pass

    # --- Penalizaciones LATAM ---
    for tld in LATAM_TLDS:
        if url_low.endswith(tld) or f"{tld}/" in url_low:
            score -= 2
            signals.append(f'latam_tld:{tld}')

    for pk in ['pagamento','fatura','acesso','faturas']:
        if pk in url_low:
            score -= 2
            signals.append(f'pt_kw:{pk}')

    for lb in ['banrural','pichincha','itau','bradesco','yape','daviplata']:
        if lb in url_low:
            score -= 1
            signals.append(f'latam_brand:{lb}')

    # fuzzy brand match final
    for brand in SPANISH_BRANDS:
        sim = fuzz.ratio(domain_core, brand)
        if sim >= 80:
            score += 2
            signals.append(f'fuzzy_brand_match:{brand}:{sim:.0f}')
            break

    return score, ';'.join(signals)

In [22]:
# Aplicar scoring v2.6
df['score_total'], df['signals_detected'] = zip(*df['url'].apply(lambda u: score_url_v2_6(u, spanish_whitelist)))
df['label_pred'] = (df['score_total'] >= 2).astype(int)

# Métricas
num_phishing = int(df['label'].sum())
num_detectadas = int(df[(df['label']==1) & (df['label_pred']==1)].shape[0])
recall = num_detectadas / num_phishing if num_phishing>0 else 0
precision = num_detectadas / df[df['label_pred']==1].shape[0] if df[df['label_pred']==1].shape[0]>0 else 0

print(f"Recall: {recall:.2%}, Precision (aprox): {precision:.2%}")

# URLs que pasaron 0->1 (comparativa)
# necesitas tener una copia anterior de label_pred_pre (por ejemplo antes de aplicar v2.6)
# si no la guardaste, puedes comparar con v2.5 outputs; aquí asumo df['label_pred_pre'] existe
if 'label_pred_pre' in df.columns:
    newly_detected = df[(df['label_pred_pre']==0) & (df['label_pred']==1)]
    print("Nuevas detectadas (muestra):")
    print(newly_detected[['url','score_total','signals_detected']].head(30))


Recall: 68.00%, Precision (aprox): 50.75%


In [23]:
# --- Falsos negativos: phishing (label=1) que no se detectaron (label_pred=0) ---
df_missed = df[(df["label"] == 1) & (df["label_pred"] == 0)].copy()

print(f"Total de phishing no detectados: {len(df_missed)}")

# Ordenar por score descendente (los que casi entran al umbral)
df_missed = df_missed.sort_values(by="score_total", ascending=False)

# Mostrar muestra
df_missed[["url", "score_total", "signals_detected"]].head(25)


Total de phishing no detectados: 32


Unnamed: 0,url,score_total,signals_detected
117,https://dgtt48.lat/yyjxzpjg/e2Walj/7,1,spanish_brand:dgt
144,https://app.biaclientefonefacil.digital,1,has_kw:cliente
191,https://clientepresenteado.digital/inicio,1,has_kw:cliente
179,https://galiciaseguridad.myportfolio.com/,1,has_kw:seguridad
171,https://pago-master.digital/Confirmacio%CC%81n...,1,has_kw:pago
125,https://admin.multa.net/,1,has_kw:multa
159,https://netflix-pagos.blog/,1,has_kw:pago
151,https://recibo-de-pago.weebly.com/,1,has_kw:pago
199,https://www.seguridadsabadell.com,1,has_kw:seguridad
129,https://online.multa.net/,1,has_kw:multa
