In [7]:
import pandas as pd
from pathlib import Path

# Ruta al CSV dentro de la carpeta phishtank
path_pt = Path("data/raw/phishing/phishtank/phishtank_01.csv")

assert path_pt.exists(), f"No existe: {path_pt.resolve()}"

pt = pd.read_csv(path_pt)
print("Filas:", len(pt))
print("Columnas:", pt.columns.tolist())
pt.head()


Filas: 51716
Columnas: ['phish_id', 'url', 'phish_detail_url', 'submission_time', 'verified', 'verification_time', 'online', 'target', 'fecha_hora_recoleccion', 'fuente']


Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target,fecha_hora_recoleccion,fuente
0,9167131,https://teamvoice.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:55:52+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
1,9167130,https://merrimsg.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:55:29+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
2,9167129,https://tinkabee.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:55:01+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
3,9167128,https://btvoice01.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:54:38+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank
4,9167127,https://comfirm.m-pages.com/,http://www.phishtank.com/phish_detail.php?phis...,2025-07-28T08:54:26+00:00,yes,2025-07-28T09:02:54+00:00,yes,Other,2025-07-28 12:05:48,PhishTank


In [10]:
# 2) Seleccionar solo columnas relevantes
cols = ["url", "target", "fecha_hora_recoleccion", "fuente"]
df = pt[cols].copy()

print("Filas:", len(df))
df.head()

Filas: 51716


Unnamed: 0,url,target,fecha_hora_recoleccion,fuente
0,https://teamvoice.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
1,https://merrimsg.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
2,https://tinkabee.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
3,https://btvoice01.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
4,https://comfirm.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank


In [14]:
from validators import url as validate_url

def safe_is_url(x):
    try:
        return bool(validate_url(x))
    except Exception:
        return False

# 3) Validar URLs sin romper
mask = df["url"].map(lambda x: safe_is_url(x) if pd.notna(x) else False)
df = df.loc[mask].copy()

print("Tras filtrar inválidas:", len(df))
df.head()


Tras filtrar inválidas: 51243


Unnamed: 0,url,target,fecha_hora_recoleccion,fuente
0,https://teamvoice.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
1,https://merrimsg.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
2,https://tinkabee.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
3,https://btvoice01.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank
4,https://comfirm.m-pages.com/,Other,2025-07-28 12:05:48,PhishTank


In [15]:
# 4) Distribución de targets
target_counts = df["target"].value_counts().head(20)

print("Top 20 targets:")
print(target_counts)



Top 20 targets:
target
Other                                  47409
Allegro                                 1009
Internal Revenue Service                 810
Facebook                                 372
AT&amp;T                                 301
Optus                                    200
Microsoft                                170
Netflix                                   95
Coinbase                                  70
British Telecom                           57
Orange                                    52
Amazon.com                                51
HSBC Group                                43
eBay, Inc.                                42
Bank of America Corporation               41
Steam                                     39
Adobe                                     38
Sumitomo Mitsui Banking Corporation       37
PayPal                                    35
JPMorgan Chase and Co.                    31
Name: count, dtype: int64


In [16]:
import unicodedata

def normalize(text):
    """Minúsculas y sin tildes"""
    return ''.join(
        c for c in unicodedata.normalize('NFD', text.lower())
        if unicodedata.category(c) != 'Mn'
    )

# Diccionario de patrones inicial, generado a partir de tu tabla
patrones = {
    "santander": ["santander", "openbank"],
    "bbva": ["bbva", "banco bilbao"],
    "caixabank": ["caixabank", "la caixa"],
    "sabadell": ["sabadell", "bancsabadell"],
    "bankinter": ["bankinter"],
    "unicaja": ["unicaja"],
    "iberdrola": ["iberdrola"],
    "endesa": ["endesa"],
    "naturgy": ["naturgy", "gas natural"],
    "repsol": ["repsol"],
    "telefonica": ["telefonica", "movistar"],
    "vodafone": ["vodafone"],
    "orange": ["orange", "jazztel", "amenas"],
    "masmovil": ["masmovil", "yoigo", "pepephone"],
    "netflix": ["netflix"],
    "hbo": ["hbo", "max"],
    "amazon": ["amazon", "primevideo"],
    "ebay": ["ebay"],
    "paypal": ["paypal"],
    "microsoft": ["microsoft", "outlook", "office365"],
    "facebook": ["facebook", "instagram", "meta"],
    "google": ["google", "gmail", "youtube"],
    "apple": ["apple", "icloud"],
}


In [17]:
def match_empresa(url, target):
    text = normalize(url + " " + str(target))
    for empresa, keys in patrones.items():
        for k in keys:
            if k in text:
                return empresa
    return None

df["matched_target"] = df.apply(lambda r: match_empresa(r["url"], r["target"]), axis=1)


In [18]:
df_matched = df[df["matched_target"].notna()]
print("URLs asociadas a empresas de la tabla:", len(df_matched))
df_matched.head()


URLs asociadas a empresas de la tabla: 9030


Unnamed: 0,url,target,fecha_hora_recoleccion,fuente,matched_target
31,https://translators-exec.vercel.app/facebook-m...,Facebook,2025-07-28 12:05:48,PhishTank,facebook
32,https://translators-exec.vercel.app/wrapper-float,Facebook,2025-07-28 12:05:48,PhishTank,facebook
33,https://rebrand.ly/wc6756,Facebook,2025-07-28 12:05:48,PhishTank,facebook
65,https://metamasklginn0.webflow.io/,Other,2025-07-28 12:05:48,PhishTank,facebook
74,https://docs.google.com/presentation/d/e/2PACX...,Optus,2025-07-28 12:05:48,PhishTank,google


In [20]:
df_matched.to_csv

<bound method NDFrame.to_csv of                                                      url     target  \
31     https://translators-exec.vercel.app/facebook-m...   Facebook   
32     https://translators-exec.vercel.app/wrapper-float   Facebook   
33                             https://rebrand.ly/wc6756   Facebook   
65                    https://metamasklginn0.webflow.io/      Other   
74     https://docs.google.com/presentation/d/e/2PACX...      Optus   
...                                                  ...        ...   
51681                 https://orange69.godaddysites.com/      Other   
51682                 https://orange14.godaddysites.com/      Other   
51685            https://accueilorange.godaddysites.com/      Other   
51689     http://www.imxprs.com/free/emailupdatee/owaweb  Microsoft   
51702  http://creativeingredient.com/wp-includes/imag...     PayPal   

      fecha_hora_recoleccion     fuente matched_target  
31       2025-07-28 12:05:48  PhishTank       facebook  
3

In [32]:
import re
assert 'matched_target' in df_matched.columns, "Falta matched_target en df_matched"

# 1) listas y patrones
cc_tlds = (".es", ".cat", ".gal", ".eus", ".madrid", ".barcelona")
tokens_es = [
    "es", "es-es", "iniciar-sesion", "acceso", "identificacion", "dni", "nif",
    "factura", "pago", "banca-online", "area-cliente", "contraseña", "contrasena",
    "clave", "transferencia", "recibos", "agenciatributaria", "aeat", "correos",
    "seg-social", "seguridadsocial", "dgt", "sepe", "boe", "catastro"
]

marcas_es = [
    # banca
    "bbva","santander","caixabank","sabadell","bankinter","unicaja","ibercaja","kutxabank","abanca","evo","openbank","cajamar",
    # telco/servicios
    "movistar","telefonica","vodafone","orange","yoigo","masmovil","jazztel","simyo","lowi","pepephone",
    # energía/seguros
    "endesa","naturgy","iberdrola","repsol","mapfre",
    # ecommerce/otros relevantes en ES
    "elcorteingles","corteingles","pccomponentes","mediamarkt","carrefour","correos","renfe","iberia"
]

def score_es_from_url_target(url: str, target: str) -> int:
    u = (url or "").lower()
    t = (str(target) or "").lower()
    s = 0

    # ccTLDs fuertes
    if any(u.endswith(suf) or f"{suf}/" in u for suf in cc_tlds):
        s += 3

    # idioma/señales en ruta
    if any(re.search(rf"(^|/){tok}(/|$)", u) for tok in tokens_es):
        s += 2

    # marca española (en URL o target)
    if any(m in u or m in t for m in marcas_es):
        s += 3

    return s

df_matched = df_matched.copy()
df_matched["score_es"] = df_matched.apply(lambda r: score_es_from_url_target(r["url"], r["matched_target"]), axis=1)

print("Distribución score_es en matched:")
print(df_matched["score_es"].value_counts().sort_index())
df_matched.head(8)

Distribución score_es en matched:
score_es
0    8811
2       2
3     217
Name: count, dtype: int64


Unnamed: 0,url,target,fecha_hora_recoleccion,fuente,matched_target,score_es
31,https://translators-exec.vercel.app/facebook-m...,Facebook,2025-07-28 12:05:48,PhishTank,facebook,0
32,https://translators-exec.vercel.app/wrapper-float,Facebook,2025-07-28 12:05:48,PhishTank,facebook,0
33,https://rebrand.ly/wc6756,Facebook,2025-07-28 12:05:48,PhishTank,facebook,0
65,https://metamasklginn0.webflow.io/,Other,2025-07-28 12:05:48,PhishTank,facebook,0
74,https://docs.google.com/presentation/d/e/2PACX...,Optus,2025-07-28 12:05:48,PhishTank,google,0
168,https://sbisec3-co-jp-wekkgdkhvhbopwwndlmytvtx...,Other,2025-07-28 12:05:48,PhishTank,hbo,0
169,https://sbisec3-co-jp-wekkgdkhvhbopwwndlmytvtx...,Other,2025-07-28 12:05:48,PhishTank,hbo,0
204,http://srv237601.hoster-test.ru/Netflix_Telegr...,Other,2025-07-28 12:05:48,PhishTank,netflix,0


In [36]:
# 1) Diccionario de empresas españolas por sector
empresas_es = {
    "banca": [
        "santander", "bbva", "caixabank", "sabadell", "bankinter", 
        "kutxabank", "unicaja", "ibercaja", "cajamar", "openbank"
    ],
    "pagos": [
        "bizum", "paypal", "redsys"
    ],
    "telecom": [
        "movistar", "vodafone", "orange", "yoigo"
    ],
    "energia": [
        "endesa", "iberdrola", "naturgy"
    ],
    "seguros": [
        "mapfre", "linea directa", "mutua madrilena"
    ]
}

# 2) Aplanar a lista para búsqueda rápida
empresas_es_planas = [e.lower() for lista in empresas_es.values() for e in lista]

# 3) Función de scoring
def score_especifico(url, target):
    url_l = str(url).lower()
    target_l = str(target).lower()
    score = 0
    
    # si el target coincide con alguno de la lista
    if any(emp in target_l for emp in empresas_es_planas):
        score += 1
    # si la URL contiene el nombre de la empresa
    if any(emp in url_l for emp in empresas_es_planas):
        score += 1
    # si el dominio termina en .es
    if url_l.endswith(".es") or ".es/" in url_l:
        score += 1
    return score

# 4) Calcular columna
df["score_es"] = df.apply(lambda row: score_especifico(row["url"], row["target"]), axis=1)

# 5) Ver resultados
print("Distribución de score_es:")
print(df["score_es"].value_counts())
df[df["score_es"] > 0].head(10)


Distribución de score_es:
score_es
0    50964
1      253
2       26
Name: count, dtype: int64


Unnamed: 0,url,target,fecha_hora_recoleccion,fuente,matched_target,score_es
1758,https://paypal1l04.wixsite.com/my-site,Other,2025-07-28 12:05:48,PhishTank,paypal,1
1900,https://muuimh.poqjfrm.es/J0bnwVyqR7!7e9O/,Other,2025-07-28 12:05:48,PhishTank,,1
2033,http://f1150151.xsph.ru/Orange/login.php,Other,2025-07-28 12:05:48,PhishTank,orange,1
2034,http://f1150151.xsph.ru/Orange,Other,2025-07-28 12:05:48,PhishTank,orange,1
2199,https://orange-re-regularisation.com/,Orange,2025-07-28 12:05:48,PhishTank,orange,2
2200,https://orange-re-regularisation.com/pages/ind...,Orange,2025-07-28 12:05:48,PhishTank,orange,2
2575,https://vyy.rqfjspfe.es/N4XPv42MsB@3JqvUh/?=$r...,Other,2025-07-28 12:05:48,PhishTank,,1
2626,https://pp-sec-identcom.com/,PayPal,2025-07-28 12:05:48,PhishTank,paypal,1
2627,https://pp-sec-identcom.com/pages/index.php,PayPal,2025-07-28 12:05:48,PhishTank,paypal,1
2895,https://helpaypal.com/,PayPal,2025-07-28 12:05:48,PhishTank,paypal,2


In [37]:
top_empresas_es = (
    df[df["score_es"] > 0]
    .assign(empresa_es=df["url"].str.lower().apply(
        lambda u: next((e for e in empresas_es_planas if e in u), None)
    ))
    .groupby("empresa_es")["url"]
    .count()
    .sort_values(ascending=False)
)
print(top_empresas_es.head(10))


empresa_es
orange       147
paypal        10
bbva           6
santander      2
caixabank      1
redsys         1
vodafone       1
Name: url, dtype: int64


In [42]:
# Filtramos URLs con score_es >= 1
urls_es = df[df["score_es"] > 1].copy()

# Nos quedamos con unas columnas útiles
urls_es = urls_es[["url", "target", "matched_target", "score_es"]]

# Mostrar primeras 20 para inspección
print(urls_es.head(50).to_string())


                                                           url  target matched_target  score_es
2199                     https://orange-re-regularisation.com/  Orange         orange         2
2200      https://orange-re-regularisation.com/pages/index.php  Orange         orange         2
2895                                    https://helpaypal.com/  PayPal         paypal         2
5119                 https://paypal.sevalhamzic.com/index.html  PayPal         paypal         2
9182                        https://facture-orange.vercel.app/  Orange         orange         2
9213                 https://service-orange1.godaddysites.com/  Orange         orange         2
9657                         https://recoverybypaypal.com/code  PayPal         paypal         2
10259              https://vocalorangeau0707.godaddysites.com/  Orange         orange         2
10272             https://orange-connexion38.godaddysites.com/  Orange         orange         2
10335              https://orange-connex

In [44]:
import re
import pandas as pd

df = df.copy()

# --- Señales positivas ES (suman) ---
cc_tlds_es = (".es", ".cat", ".gal", ".eus", ".madrid", ".barcelona")
tokens_es_fuerte = [
    # gobierno y trámites
    "agenciatributaria","aeat","seg-social","seguridadsocial","dgt","sepe","boe","catastro",
    # banca/servicios comunes
    "banca-online","area-cliente","iniciar-sesion","acceso","clave","contrasena","contraseña",
    "dni","nif","iban","bizum","recibos","factura","facturacion","pago","tarifa","renfe","correos"
]
marcas_es = [
    "santander","bbva","caixabank","sabadell","bankinter","unicaja","ibercaja","kutxabank","abanca","cajamar","openbank",
    "movistar","telefonica","vodafone","orange","yoigo","masmovil","jazztel","simyo","lowi","pepephone",
    "endesa","iberdrola","naturgy","repsol",
    "mapfre","linea directa","mutua madrilena",
    "elcorteingles","corteingles","pccomponentes","mediamarkt","carrefour","renfe","correos","bizum","redsys"
]

# --- Señales negativas (restan) ---
# Francia / México / UK (para separar Orange FR, PayPal MX, etc.)
neg_tlds = (".fr", ".mx", ".co.uk", ".uk", ".com.mx")
tokens_fr = ["orangefr", "facture", "messagerie", "connexion", "vocale", "fixe", "serviceorange", "votreligne"]
tokens_mx = ["mx-","banamex","bbva.mx","santander.com.mx"]
tokens_uk = ["co.uk","hmrc","royalmail"]

def score_esplus(url, target, matched_target=None):
    u = (url or "").lower()
    t = (str(target) or "").lower()
    mt = (str(matched_target) or "").lower()
    s = 0

    # Base: si ya tenías score_es (0/1/2), úsalo como arranque
    base = 0
    if "score_es" in df.columns:
        # OJO: no leas de df dentro de apply; aquí solo usamos la firma de la función.
        pass

    # ++ Señales positivas
    if any(u.endswith(suf) or f"{suf}/" in u for suf in cc_tlds_es):
        s += 3
    if any(re.search(rf"(^|/){tok}(/|$)", u) for tok in tokens_es_fuerte):
        s += 2
    if any(m in u or m in t or m in (mt or "") for m in marcas_es):
        s += 2

    # -- Señales negativas (indicios de no-ES)
    if any(u.endswith(suf) or f"{suf}/" in u for suf in neg_tlds):
        s -= 2
    if any(tok in u for tok in tokens_fr):
        s -= 2
    if any(tok in u for tok in tokens_mx):
        s -= 2
    if any(tok in u for tok in tokens_uk):
        s -= 2

    # Ajuste: hostings genéricos (ruido) → exigir una señal ES fuerte
    generic_hosts = ("godaddysites.com","webflow.io","vercel.app","wixsite.com","sites.google.com","blogspot.com")
    if any(h in u for h in generic_hosts) and s < 2:
        s = 0  # sin pistas fuertes de ES, descártalo

    return s

df["score_esplus"] = df.apply(lambda r: score_esplus(r["url"], r["target"], r.get("matched_target")), axis=1)

print("Distribución score_esplus:")
print(df["score_esplus"].value_counts().sort_index())

# Subset recomendado para España (umbral conservador)
df_es_fuerte = df[df["score_esplus"] >= 3][["url","target","matched_target","score_es","score_esplus"]]
print("Candidatas ES (score_esplus>=2):", len(df_es_fuerte))
df_es_fuerte.head(20)


Distribución score_esplus:
score_esplus
-4       33
-2      240
 0    50742
 2      178
 3       50
Name: count, dtype: int64
Candidatas ES (score_esplus>=2): 50


Unnamed: 0,url,target,matched_target,score_es,score_esplus
1900,https://muuimh.poqjfrm.es/J0bnwVyqR7!7e9O/,Other,,1,3
2575,https://vyy.rqfjspfe.es/N4XPv42MsB@3JqvUh/?=$r...,Other,,1,3
3088,https://u58o.yurgltyju.es/u1TFlAprT4GEiE@/$,Other,,1,3
3703,https://asesoriabarrachina.es/wp-includes/stra...,Other,,1,3
3993,https://todoenxenon.es/docusign/document/docus...,Other,,1,3
4170,https://directedmein.com.es/v,Other,,1,3
4581,https://walletconnect.com-secure.es/,Other,,1,3
4909,https://ver-sec-auth05a.com.es/3/index.php,Internal Revenue Service,,1,3
5463,https://machintios.com.es/ssa/,Internal Revenue Service,,1,3
5762,https://machintios.com.es/irs/,Internal Revenue Service,,1,3


In [45]:
 # --- 1) Filtro fuerte: candidatos españoles confiables ---
candidatos_es_fuerte = df[df["score_esplus"] >= 3].copy()
print("Candidatos fuertes (score_esplus=3):", len(candidatos_es_fuerte))

# Exportar a CSV
candidatos_es_fuerte.to_csv("candidatos_es_fuerte.csv", index=False)

# --- 2) Filtro intermedio: posibles candidatos ---
candidatos_es_medio = df[df["score_esplus"] == 2].copy()
print("Candidatos medios (score_esplus=2):", len(candidatos_es_medio))

# Exportar a CSV
candidatos_es_medio.to_csv("candidatos_es_medio.csv", index=False)

# --- 3) Muestra rápida de ambos ---
print("\nEjemplos fuertes:")
print(candidatos_es_fuerte[["url","target","matched_target","score_es","score_esplus"]].head(10))

print("\nEjemplos medios:")
print(candidatos_es_medio[["url","target","matched_target","score_es","score_esplus"]].head(10))


Candidatos fuertes (score_esplus=3): 50
Candidatos medios (score_esplus=2): 178

Ejemplos fuertes:
                                                    url  \
1900         https://muuimh.poqjfrm.es/J0bnwVyqR7!7e9O/   
2575  https://vyy.rqfjspfe.es/N4XPv42MsB@3JqvUh/?=$r...   
3088        https://u58o.yurgltyju.es/u1TFlAprT4GEiE@/$   
3703  https://asesoriabarrachina.es/wp-includes/stra...   
3993  https://todoenxenon.es/docusign/document/docus...   
4170                      https://directedmein.com.es/v   
4581               https://walletconnect.com-secure.es/   
4909         https://ver-sec-auth05a.com.es/3/index.php   
5463                     https://machintios.com.es/ssa/   
5762                     https://machintios.com.es/irs/   

                        target matched_target  score_es  score_esplus  
1900                     Other           None         1             3  
2575                     Other           None         1             3  
3088                     Other     