In [1]:
import pandas as pd, re
from datetime import datetime
from pathlib import Path

# Ubicación relativa
NB_DIR = Path.cwd()
ROOT = NB_DIR.parents[3]  # phishing-detector/
RAW_PHISHTANK = ROOT / "data/raw/phishing/phishtank/phishtank_01.csv"
OUT_CANDIDATAS = ROOT / "data/processed/phishing/phishtank_candidatas_segunda.csv"

# Cargar PhishTank
pt = pd.read_csv(RAW_PHISHTANK)
pt["url"] = pt["url"].astype(str)


In [2]:
# Echar un vistazo al dataset crudo de PhishTank
print("Filas:", len(pt))
print("Columnas:", list(pt.columns))

# Primeras filas
pt.head(10)

# Duplicados
dups = pt["url"].duplicated().sum()
print("Duplicados por URL:", dups)

# Valores nulos
nulls = pt.isna().sum()
print("Valores nulos por columna:\n", nulls)


Filas: 51716
Columnas: ['phish_id', 'url', 'phish_detail_url', 'submission_time', 'verified', 'verification_time', 'online', 'target', 'fecha_hora_recoleccion', 'fuente']
Duplicados por URL: 14
Valores nulos por columna:
 phish_id                  0
url                       0
phish_detail_url          0
submission_time           0
verified                  0
verification_time         0
online                    0
target                    0
fecha_hora_recoleccion    0
fuente                    0
dtype: int64


In [3]:
# Eliminar duplicados por URL y resetear índice
pt_clean = pt.drop_duplicates(subset=["url"]).reset_index(drop=True)

print("Antes:", len(pt))
print("Después de eliminar duplicados:", len(pt_clean))

# Comprobar que sigue teniendo las columnas esperadas
print("Columnas:", list(pt_clean.columns))

# Vista rápida de algunas filas
pt_clean.head(10)


Antes: 51716
Después de eliminar duplicados: 51702
Columnas: ['phish_id', 'url', 'phish_detail_url', 'submission_time', 'verified', 'verification_time', 'online', 'target', 'fecha_hora_recoleccion', 'fuente']


Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target,fecha_hora_recoleccion,fuente
0,9167131,https://teamvoice.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:55:52+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
1,9167130,https://merrimsg.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:55:29+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
2,9167129,https://tinkabee.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:55:01+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
3,9167128,https://btvoice01.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:54:38+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
4,9167127,https://comfirm.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:54:26+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
5,9167126,https://birejip47.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:53:47+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
6,9167125,https://eu.jotform.com/app/251843061781356,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:53:42+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
7,9167124,https://eu.jotform.com/app/251882591814364,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:53:01+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
8,9167123,https://eu.jotform.com/app/251882925426566,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:52:29+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
9,9167122,https://eu.jotform.com/app/251884095401357,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:46:34+00:00,yes,2025-07-28T08:52:32+00:00,yes,Other,2025-07-28 12:05:48,PhishTank


In [4]:
from urllib.parse import urlparse, unquote
import unicodedata

# Función para normalizar texto
def norm(text):
    if not isinstance(text, str):
        return ""
    text = unquote(text)
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("utf-8", "ignore")
    return text.lower()

# Aplicar segmentación
pt_clean["domain"] = pt_clean["url"].apply(lambda x: urlparse(x).netloc)
pt_clean["path"] = pt_clean["url"].apply(lambda x: urlparse(x).path)
pt_clean["route"] = pt_clean["url"].apply(lambda x: urlparse(x).netloc + urlparse(x).path)

# Aplicar normalización
pt_clean["domain_norm"] = pt_clean["domain"].apply(norm)
pt_clean["path_norm"] = pt_clean["path"].apply(norm)
pt_clean["route_norm"] = pt_clean["route"].apply(norm)


In [32]:
# -------------------------------------
# Detección de marcas españolas en la URL (optimizado con regex)
# -------------------------------------
import unicodedata
import re

# Lista de marcas españolas relevantes
marcas_es = [
    "santander", "bbva", "caixabank", "sabadell", "ing", "openbank", "bankinter", "kutxabank",
    "evo banco", "unicaja", "cajamar", "abanca", "ibercaja", "targobank",
    "docusign", "teamviewer", "el corte ingles", "carrefour", "mediamarkt", "pccomponentes",
    "mercadona", "lidl", "aldi", "dia", "alcampo", "eroski", "zara", "pullandbear", "bershka",
    "stradivarius", "oysho", "mango", "desigual", "tous", "loewe", "telepizza", "vips", "ginos",
    "fridays", "100 montaditos", "domino", "mcdonalds", "burger king", "seat", "renault",
    "citroen", "mapfre", "mutua", "linea directa", "endesa", "iberdrola", "naturgy", "repsol",
    "cepsa", "holaluz", "totalenergies", "movistar", "vodafone", "orange", "yoigo", "masmovil",
    "pepephone", "digi", "euskaltel", "dgt", "sepe", "catastro", "agencia tributaria",
    "seguridad social", "inem", "suma", "iberia", "renfe", "melia", "nh hoteles", "iberostar",
    "grupo planeta", "once", "fnac", "casa del libro"
]

# Función para normalizar texto (quitar acentos, pasar a minúsculas, etc.)
def normalizar(texto):
    return unicodedata.normalize('NFKD', texto.lower()).encode('ascii', errors='ignore').decode('utf-8')

# Normalizar lista de marcas
marcas_es_norm = [normalizar(m) for m in marcas_es]

# Compilar patrón regex con todas las marcas escapadas
PATRON_MARCAS_ES = re.compile("|".join(map(re.escape, marcas_es_norm)), re.I)

# Aplicar detección a la columna normalizada
pt_clean["url_norm"] = pt_clean["url"].astype(str).apply(normalizar)
pt_clean["marca_es"] = pt_clean["url_norm"].str.contains(PATRON_MARCAS_ES)

# Visualizar resultados
print("Coincidencias con marcas españolas:", pt_clean["marca_es"].sum())
display(pt_clean[pt_clean["marca_es"]].head(10))


Coincidencias con marcas españolas: 2041


Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target,fecha_hora_recoleccion,fuente,...,f_spanish_hints_ext,f_kw_fraude_urg,f_es_tld,f_ru_tld,f_ru_es_kw,f_tld_ruso,f_marca_fraude_urg,f_kw_admin_publica,f_tld_es_kw_sospechosa,f_kw_banca_extra
60,9167010,https://halsbenning.de/ionos-rechnung,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T05:04:17+00:00,yes,2025-07-28T05:12:05+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
100,9166951,https://resilient-dialects-238242.framer.app,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T02:51:13+00:00,yes,2025-07-28T03:03:04+00:00,yes,Optus,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
154,9166844,https://snjsdiowmeks.salesbread.com/?maintenan...,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T19:26:04+00:00,yes,2025-07-27T19:41:43+00:00,yes,Comcast,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,False,False,False,False
196,9166766,https://swisspass.no-betaling.click/ch/2FA/ind...,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T15:30:10+00:00,yes,2025-07-27T15:51:39+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
215,9166746,https://docs.google.com/drawings/d/1a6ZGyT2TzR...,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T15:28:58+00:00,yes,2025-07-27T15:51:39+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
216,9166747,https://docs.google.com/drawings/d/1a6ZGyT2TzR...,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T15:28:58+00:00,yes,2025-07-27T15:51:39+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
229,9166731,https://uwbezorging.wasmer.app/wp-content/ddh/...,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T15:28:08+00:00,yes,2025-07-27T15:51:40+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
231,9166729,http://emergingtechnosoft.com/daimlerobi/dainm...,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T15:27:59+00:00,yes,2025-07-27T15:51:40+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
238,9166721,https://haunting-quintessential-script-hyvarna...,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T15:27:05+00:00,yes,2025-07-27T15:41:55+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False
268,9166690,https://digitzenphluxoria-neurovendotex.shop/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-27T15:24:55+00:00,yes,2025-07-27T15:41:55+00:00,yes,Other,2025-07-28 12:05:48,PhishTank,...,False,False,False,False,False,False,True,False,False,False


In [33]:
from collections import Counter

# --- Extraer todas las coincidencias por URL ---
def extraer_marcas(texto):
    return PATRON_MARCAS_ES.findall(texto)

# Lista de todas las marcas encontradas en URLs etiquetadas como marca_es = True
coincidencias = pt_clean.loc[pt_clean["marca_es"], "url_norm"].apply(extraer_marcas)

# Aplanar la lista de listas y contar frecuencias
todas_las_marcas = [marca.lower() for sublista in coincidencias for marca in sublista]
conteo_marcas = Counter(todas_las_marcas)

# Convertir a DataFrame para mostrarlo ordenado
df_marcas = pd.DataFrame(conteo_marcas.items(), columns=["marca_detectada", "frecuencia"]).sort_values(by="frecuencia", ascending=False)

# Mostrar tabla
print(f"Total de URLs con marcas detectadas: {pt_clean['marca_es'].sum()}")
display(df_marcas.head(30))


Total de URLs con marcas detectadas: 2041


Unnamed: 0,marca_detectada,frecuencia
0,ing,1347
1,dia,340
6,orange,175
3,digi,107
7,dgt,72
4,once,29
2,inem,12
12,aldi,9
10,docusign,6
13,seat,6


In [6]:
pt_clean[["url", "domain", "path", "route", "domain_norm", "route_norm"]].head(5)


Unnamed: 0,url,domain,path,route,domain_norm,route_norm
0,https://teamvoice.m-pages.com/,teamvoice.m-pages.com,/,teamvoice.m-pages.com/,teamvoice.m-pages.com,teamvoice.m-pages.com/
1,https://merrimsg.m-pages.com/,merrimsg.m-pages.com,/,merrimsg.m-pages.com/,merrimsg.m-pages.com,merrimsg.m-pages.com/
2,https://tinkabee.m-pages.com/,tinkabee.m-pages.com,/,tinkabee.m-pages.com/,tinkabee.m-pages.com,tinkabee.m-pages.com/
3,https://btvoice01.m-pages.com/,btvoice01.m-pages.com,/,btvoice01.m-pages.com/,btvoice01.m-pages.com,btvoice01.m-pages.com/
4,https://comfirm.m-pages.com/,comfirm.m-pages.com,/,comfirm.m-pages.com/,comfirm.m-pages.com,comfirm.m-pages.com/


In [15]:
# === Palabras clave extendidas (fraude + urgencia) ===
KW_FRAUDE_URG_RE = re.compile(
    r"(factura|multa|notificaci[oó]n|aviso|env[ií]o|paquete|entrega|"
    r"declaraci[oó]n|impuestos?|pago|recibo|clientes?|acceso|cuenta|"
    r"seguridad|verificar|iniciar[-_]?sesi[oó]n|bloqueo|restringido|"
    r"confirmaci[oó]n|urgente|intento|actividad|clave|contrase[nñ]a|"
    r"reembolso|suspendid[ao]|mensaje|sms|movimientos?|transacci[oó]n|"
    r"saldo|banca|appmovil|descarga|pin|actualizaci[oó]n|"
    r"confirmar|reanudar|soporte|servicio|bloquead[ao]|restricci[oó]n|"
    r"pagina|bloqueada|suspendida|urgencia|inmediato|"
    r"restringida|problema|incidencia|fallo|inseguro|robo|"
    r"an[oó]malo|fraude|detenido|detenci[oó]n|revisi[oó]n|sospechoso|"
    r"actividad sospechosa|alerta|riesgo|intru|cierre|reclamaci[oó]n|"
    r"atenci[oó]n inmediata|acci[oó]n requerida|"
    r"ha sido suspendid[ao]|cuenta cerrada|cuenta bloqueada)",
    re.I
)

# Aplicar al dataset
pt_clean["f_kw_fraude_urg"] = pt_clean["route_norm"].fillna("").str.contains(KW_FRAUDE_URG_RE)

# Diagnóstico (opcional)
print("Coincidencias con KW fraude/urgencia:", pt_clean["f_kw_fraude_urg"].sum())


  pt_clean["f_kw_fraude_urg"] = pt_clean["route_norm"].fillna("").str.contains(KW_FRAUDE_URG_RE)


Coincidencias con KW fraude/urgencia: 456


In [34]:
from collections import Counter

# --- Extraer todas las coincidencias por ruta ---
def extraer_keywords(texto):
    return KW_FRAUDE_URG_RE.findall(texto)

# Aplicar sobre las URLs que fueron marcadas como True por el patrón
coincidencias_kw = pt_clean.loc[pt_clean["f_kw_fraude_urg"], "route_norm"].apply(extraer_keywords)

# Aplanar la lista de listas
todas_kw = [kw.lower() for sublista in coincidencias_kw for kw in sublista]

# Contar frecuencia
conteo_kw = Counter(todas_kw)

# Convertir a DataFrame para ordenar y visualizar
df_kw = pd.DataFrame(conteo_kw.items(), columns=["keyword_detectada", "frecuencia"]).sort_values(by="frecuencia", ascending=False)

# Mostrar resultados
print(f"Total de URLs con alguna palabra clave de fraude/urgencia: {pt_clean['f_kw_fraude_urg'].sum()}")
display(df_kw.head(30))


Total de URLs con alguna palabra clave de fraude/urgencia: 456


Unnamed: 0,keyword_detectada,frecuencia
0,pin,92
1,sms,88
14,pagina,63
8,cliente,33
2,banca,32
7,aviso,26
4,servicio,23
3,pago,18
10,factura,16
9,cuenta,15


In [18]:
# TLDs típicos de España
TLD_ES = (".es", ".gob.es", ".com.es", ".org.es")

pt_clean["f_es_tld"] = pt_clean["domain_norm"].apply(lambda x: any(x.endswith(tld) for tld in TLD_ES))
print("Con TLD español:", pt_clean["f_es_tld"].sum())
# Ver algunas coincidencias para revisar calidad
display(pt_clean[pt_clean["f_es_tld"] == True][["url", "domain_norm"]].head(10))



Con TLD español: 50


Unnamed: 0,url,domain_norm
1024,http://soporte_ionos.viajoencoche.es/?rid=48Y9hLX,soporte_ionos.viajoencoche.es
1900,https://muuimh.poqjfrm.es/J0bnwVyqR7!7e9O/,muuimh.poqjfrm.es
2575,https://vyy.rqfjspfe.es/N4XPv42MsB@3JqvUh/?=$r...,vyy.rqfjspfe.es
3088,https://u58o.yurgltyju.es/u1TFlAprT4GEiE@/$,u58o.yurgltyju.es
3703,https://asesoriabarrachina.es/wp-includes/stra...,asesoriabarrachina.es
3993,https://todoenxenon.es/docusign/document/docus...,todoenxenon.es
4170,https://directedmein.com.es/v,directedmein.com.es
4581,https://walletconnect.com-secure.es/,walletconnect.com-secure.es
4909,https://ver-sec-auth05a.com.es/3/index.php,ver-sec-auth05a.com.es
5463,https://machintios.com.es/ssa/,machintios.com.es


In [36]:
KW_ADMIN_PUBLICA_RE = re.compile(
    r"(dgt|aeat|catastro|sepe|certificado|dni|sede|tramites|"
    r"administracion|gobierno|agenciatributaria|inss|cita previa|justicia|mpt|"
    r"padr[oó]n|ayuntamiento|empadronamiento|"
    r"vida laboral|prestaci[oó]n|jubilaci[oó]n|paro|"
    r"permiso|carn[eé]t|matr[ií]cula|"
    r"reservar|turno|solicitud|"
    r"documentaci[oó]n|formulario|registro|expediente)",
    re.I
)

# Aplicar flag al dataset
pt_clean["f_kw_admin_publica"] = pt_clean["route_norm"].fillna("").str.contains(KW_ADMIN_PUBLICA_RE)

# Diagnóstico general
print("URLs con indicios de administración pública española:", pt_clean["f_kw_admin_publica"].sum())

# ---- Distribución individual de términos ----
from collections import Counter
import unicodedata

# Lista de términos normalizados manualmente (sin regex)
palabras_admin_publica = [
    "dgt", "aeat", "catastro", "sepe", "certificado", "dni", "sede", "tramites",
    "administracion", "gobierno", "agenciatributaria", "inss", "cita previa", "justicia", "mpt",
    "padron", "ayuntamiento", "empadronamiento",
    "vida laboral", "prestacion", "jubilacion", "paro",
    "permiso", "carnet", "matricula",
    "reservar", "turno", "solicitud",
    "documentacion", "formulario", "registro", "expediente"
]

# Función para normalizar textos (igual que en norm)
def normalizar(texto):
    return unicodedata.normalize('NFKD', texto.lower()).encode('ascii', errors='ignore').decode('utf-8')

conteo_admin = Counter()

# Contar cada término individual en route_norm
for palabra in palabras_admin_publica:
    coincidencias = pt_clean["route_norm"].str.contains(palabra, case=False, na=False)
    conteo_admin[palabra] = coincidencias.sum()

# Mostrar resultados
print("\nDistribución de términos de administración pública:")
for palabra, total in conteo_admin.most_common():
    print(f"{palabra}: {total}")


  pt_clean["f_kw_admin_publica"] = pt_clean["route_norm"].fillna("").str.contains(KW_ADMIN_PUBLICA_RE)


URLs con indicios de administración pública española: 146

Distribución de términos de administración pública:
dgt: 66
mpt: 39
dni: 17
registro: 6
solicitud: 5
sede: 4
sepe: 3
inss: 3
certificado: 1
gobierno: 1
formulario: 1
aeat: 0
catastro: 0
tramites: 0
administracion: 0
agenciatributaria: 0
cita previa: 0
justicia: 0
padron: 0
ayuntamiento: 0
empadronamiento: 0
vida laboral: 0
prestacion: 0
jubilacion: 0
paro: 0
permiso: 0
carnet: 0
matricula: 0
reservar: 0
turno: 0
documentacion: 0
expediente: 0


In [23]:
# Muestra de URLs marcadas como administración pública
display(
    pt_clean[pt_clean["f_kw_admin_publica"] == True][["url", "route_norm"]].head(25)
)


Unnamed: 0,url,route_norm
230,https://lufi-dekoracio.hu/wp-adminss/serveletr...,lufi-dekoracio.hu/wp-adminss/serveletrsservere...
897,https://ceresdigital.co/R/1HvxMCWQsmsWrKkRDnSN...,ceresdigital.co/r/1hvxmcwqsmswrkkrdnsnlcgldwfb...
952,https://metascoachin.work.gd/ludnibmameremavie...,metascoachin.work.gd/ludnibmameremaviemonamour...
1489,https://docto-info-compte.com/,docto-info-compte.com/
1490,https://docto-info-compte.com/captcha.php,docto-info-compte.com/captcha.php
2107,https://dgtt48.lat/yyjxzpjg/e2Walj/7,dgtt48.lat/yyjxzpjg/e2walj/7
2433,https://regularisation-compte.info/pages/billi...,regularisation-compte.info/pages/billing.php
2434,https://regularisation-compte.info/pages/card.php,regularisation-compte.info/pages/card.php
4504,https://www.verfication-compte.com/captcha.php,www.verfication-compte.com/captcha.php
5046,https://ulys-mon-compte.com/,ulys-mon-compte.com/


In [37]:
KW_ENERGIA_RE = re.compile(
    r"(endesa|iberdrola|naturgy|repsol|cepsa|holaluz|totalenergies|"
    r"energia|electricidad|gas|luz|suministro|corte de luz|factura electrica|"
    r"tarifa|comercializadora|contador|consumo|lectura|distribuidora)",
    re.I
)

# Aplicar al dataset
pt_clean["f_kw_energia"] = pt_clean["route_norm"].fillna("").str.contains(KW_ENERGIA_RE)

# Diagnóstico global
print("URLs con indicios de energía/suministros:", pt_clean["f_kw_energia"].sum())

# --- Análisis de distribución ---
from collections import Counter

palabras_energia = [
    "endesa", "iberdrola", "naturgy", "repsol", "cepsa", "holaluz", "totalenergies",
    "energia", "electricidad", "gas", "luz", "suministro", "corte de luz",
    "factura electrica", "tarifa", "comercializadora", "contador", "consumo", "lectura", "distribuidora"
]

conteo_energia = Counter()

for palabra in palabras_energia:
    coincidencias = pt_clean["route_norm"].str.contains(palabra, case=False, na=False)
    conteo_energia[palabra] = coincidencias.sum()

# Mostrar distribución
print("\nDistribución de términos energéticos detectados:")
for palabra, total in conteo_energia.most_common():
    print(f"{palabra}: {total}")


  pt_clean["f_kw_energia"] = pt_clean["route_norm"].fillna("").str.contains(KW_ENERGIA_RE)


URLs con indicios de energía/suministros: 55

Distribución de términos energéticos detectados:
gas: 32
luz: 16
energia: 3
tarifa: 2
distribuidora: 2
endesa: 0
iberdrola: 0
naturgy: 0
repsol: 0
cepsa: 0
holaluz: 0
totalenergies: 0
electricidad: 0
suministro: 0
corte de luz: 0
factura electrica: 0
comercializadora: 0
contador: 0
consumo: 0
lectura: 0


In [38]:
# Lista de columnas de heurística aplicadas (ajústala si has añadido más)
cols_flags = [
    "marca_es", 
    "f_kw_fraude_urg", 
    "f_kw_admin_publica", 
    "f_kw_banca_extra", 
    "f_kw_energia", 
    "f_es_tld"
]

# Asegúrate de que todas las columnas existen
for col in cols_flags:
    if col not in pt_clean.columns:
        pt_clean[col] = False

# Calcular score total
pt_clean["score_total"] = pt_clean[cols_flags].sum(axis=1)


In [39]:
# Filtrar fuera URLs ya descartadas manualmente o que tienen score 0
candidatas = pt_clean[
    (~pt_clean.get("descartado_manual", False)) & 
    (pt_clean["score_total"] >= 1)
].copy()

print(f"Total candidatas con al menos 1 heurística positiva: {len(candidatas)}")


Total candidatas con al menos 1 heurística positiva: 2723


In [40]:
# Ordenar para priorizar las más sospechosas
candidatas_sorted = candidatas.sort_values(by="score_total", ascending=False)
candidatas_sorted[["url", "route_norm", "score_total"]].head(25)


Unnamed: 0,url,route_norm,score_total
11253,https://q-r.to/Multa-DGT,q-r.to/multa-dgt,3
897,https://ceresdigital.co/R/1HvxMCWQsmsWrKkRDnSN...,ceresdigital.co/r/1hvxmcwqsmswrkkrdnsnlcgldwfb...,3
11251,https://qrco.de/Multa-DGT,qrco.de/multa-dgt,3
11254,https://l.ead.me/Multa-DGT,l.ead.me/multa-dgt,3
36996,https://www.gasthof-kolpinghaus.de/wordpress/j...,www.gasthof-kolpinghaus.de/wordpress/js/jatman...,3
6067,https://distribuidoramaguis.com/js/webmail.aru...,distribuidoramaguis.com/js/webmail.aruba.it/us...,2
13603,https://th80p45y4gdhrtjtyjrtrteygtgretgffdgtr6...,th80p45y4gdhrtjtyjrtrteygtgretgffdgtr67grttggt...,2
20425,https://supercaracol.ar/wp-content/plugins/aki...,supercaracol.ar/wp-content/plugins/akismet/dgt...,2
46626,https://docs.google.com/presentation/d/e/2PACX...,docs.google.com/presentation/d/e/2pacx-1vtn0gf...,2
12018,https://share-eu1.hsforms.com/21h34fiDGT_qB56m...,share-eu1.hsforms.com/21h34fidgt_qb56m4xowj_w2...,2


In [41]:
# Guardar el CSV de revisión manual (en tu ruta de processed)
candidatas_sorted.to_csv(OUT_CANDIDATAS, index=False)
print(f"Guardado CSV con {len(candidatas_sorted)} URLs candidatas en: {OUT_CANDIDATAS}")


Guardado CSV con 2723 URLs candidatas en: /Users/test/Desktop/phishing-detector/data/processed/phishing/phishtank_candidatas_segunda.csv
