In [112]:
import pandas as pd

# Ruta del CSV base; ajústala si trabajas desde otro directorio
PATH = "../data/clean/dataset_base_v21.csv"

# 1. Cargar el CSV en df
df = pd.read_csv(PATH)

# 2. Mostrar estructura solicitada
print("df.shape:", df.shape)
print("\ndf.columns:\n", df.columns.tolist())
print("\ndf['label'].value_counts():")
print(df["label"].value_counts(dropna=False))
print("\ndf.head(5):")
print(df.head(5))



df.shape: (492, 31)

df.columns:
 ['campaign', 'categoria', 'confianza', 'confidence', 'dataset_split', 'domain', 'entidad', 'free_hosting', 'inclusion', 'is_https', 'label', 'matched_target', 'notas', 'route_type', 'ruido', 'score_total_v2', 'sector', 'sector_norm', 'source', 'timestamp', 'url', 'url_norm', 'domain_complexity', 'host_entropy', 'domain_whitelist_score', 'suspicious_path_token', 'token_density', 'trusted_token_context', 'infra_risk', 'fake_tld_in_subdomain_or_path', 'param_count_boost']

df['label'].value_counts():
label
1    248
0    244
Name: count, dtype: int64

df.head(5):
  campaign categoria  confianza  confidence dataset_split domain   entidad  \
0      NaN     banca        NaN         NaN           NaN    NaN       NaN   
1      NaN       NaN        NaN        95.0           NaN    NaN  Ibercaja   
2      NaN       NaN        NaN        90.0           NaN    NaN    WiZink   
3      NaN       NaN        NaN        95.0           NaN    NaN    WiZink   
4      NaN

In [113]:
campos_core = ["url", "sector", "entidad", "label"]

# Detectar NaN
mask_nan = df[campos_core].isna()

# Detectar strings vacíos o solo espacios
mask_empty = df[campos_core].apply(lambda col: col.astype(str).str.strip() == "")

missing_mask = mask_nan | mask_empty
missing_core_fields = df[missing_mask.any(axis=1)]

print("Filas con metadatos críticos faltantes:", len(missing_core_fields))
missing_core_fields.head(10)



Filas con metadatos críticos faltantes: 192


Unnamed: 0,campaign,categoria,confianza,confidence,dataset_split,domain,entidad,free_hosting,inclusion,is_https,...,url_norm,domain_complexity,host_entropy,domain_whitelist_score,suspicious_path_token,token_density,trusted_token_context,infra_risk,fake_tld_in_subdomain_or_path,param_count_boost
0,,banca,,,,,,,,,...,https://www.caixabank.es/particular/banca-digi...,0.508495,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0
151,,público,,,,,,,,,...,https://pasarela.clave.gob.es/proxy2/servicepr...,0.489317,0.40094,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152,,banca,,,,,,,,,...,https://www.bancomediolanum.es/es-es/la-banca-...,0.648962,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0
153,,banca,,,,,,,,,...,https://www.bancosantander.es/particulares,0.613856,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154,,telecomunicaciones,,,,,,,,,...,https://www.movistar.es/area-cliente/mi-cuenta,0.514996,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
155,,telecomunicaciones,,,,,,,,,...,https://m.vodafone.es/mves/login,0.50125,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
156,,banca,,,,,,,,,...,https://www.ing.es/seguridad-internet,0.398959,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0
157,,banca,,,,,,,,,...,https://www.openbank.es/app-openbank,0.50125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158,,banca,,,,,,,,,...,https://www.ing.es,0.398959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
159,,banca,,,,,,,,,...,https://www.openbank.es/?togglelogin&go-to-app=0,0.50125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667


In [114]:
from urllib.parse import urlparse
try:
    import tldextract
except ImportError:
    tldextract = None

# Descomponer URL en columnas base sin sobrescribir valores existentes
if tldextract is None:
    print("tldextract no disponible: no se crean columnas domain/subdomain/tld/path")
else:
    def _decompose_url(url):
        if pd.isna(url):
            return {"domain": "", "subdomain": "", "tld": "", "path": ""}
        texto = str(url)
        ext = tldextract.extract(texto)
        parsed = urlparse(texto)
        return {
            "domain": (ext.registered_domain or "").lower(),
            "subdomain": (ext.subdomain or "").lower(),
            "tld": (ext.suffix or "").lower(),
            "path": (parsed.path or "").lower(),
        }

    url_parts = df["url"].apply(_decompose_url).apply(pd.Series)

    target_map = {
        "domain": "domain",
        "subdomain": "subdomain",
        "tld": "tld",
        "path": "path",
    }

    for col, source_col in target_map.items():
        if col not in df.columns:
            df[col] = url_parts[source_col]
        else:
            mask_missing = df[col].isna() | (df[col].astype(str).str.strip() == "")
            df.loc[mask_missing, col] = url_parts.loc[mask_missing, source_col]

    print("Columnas domain/subdomain/tld/path actualizadas sin sobrescribir existentes.")



Columnas domain/subdomain/tld/path actualizadas sin sobrescribir existentes.


  "domain": (ext.registered_domain or "").lower(),


In [115]:
df[["url", "domain", "subdomain", "tld", "path"]].head(20)


Unnamed: 0,url,domain,subdomain,tld,path
0,https://www.caixabank.es/particular/banca-digi...,caixabank.es,www,es,/particular/banca-digital.html
1,https://www.ibercaja.es/particulares/,ibercaja.es,www,es,/particulares/
2,https://www.wizink.es/tarjetas,wizink.es,www,es,/tarjetas
3,https://www.wizink.es/,wizink.es,www,es,/
4,https://www.cetelem.es/credito-y-prestamos/,cetelem.es,www,es,/credito-y-prestamos/
5,https://www.cetelem.es/,cetelem.es,www,es,/
6,https://www.cajamar.es/es/particulares/product...,cajamar.es,www,es,/es/particulares/productos-y-servicios/banca-a...
7,https://www.ibercaja.es/empresas/,ibercaja.es,www,es,/empresas/
8,https://www.cajamar.es/es/particulares/,cajamar.es,www,es,/es/particulares/
9,https://portal.kutxabank.es/cs/Satellite/kb/es...,kutxabank.es,portal,es,/cs/satellite/kb/es/particulares/banca-online


In [116]:
# Cargar whitelist y obtener dominios registrados en minúsculas
WL_PATH = "../docs/whitelist.csv"
wl = pd.read_csv(WL_PATH)

try:
    import tldextract
except ImportError:
    tldextract = None

if tldextract is None:
    wl_domains = set()
    print("tldextract no disponible: wl_domains vacío")
else:
    def _reg_domain(url):
        if pd.isna(url):
            return ""
        ext = tldextract.extract(str(url))
        return (ext.registered_domain or "").lower()

    url_col = next((c for c in ["url", "domain", "host"] if c in wl.columns), None)
    if url_col is None:
        wl_domains = set()
        print("No se encontró columna URL en whitelist; wl_domains vacío")
    else:
        wl_domains = set(filter(None, wl[url_col].map(_reg_domain)))
        print(f"Dominios en whitelist: {len(wl_domains)}")



Dominios en whitelist: 297


  return (ext.registered_domain or "").lower()


In [117]:
# Completar entidad usando whitelist para filas cuyo domain está en wl_domains
if 'domain' not in df.columns:
    print("No existe columna 'domain'; no se rellenó 'entidad'")
elif 'entidad' not in df.columns:
    print("No existe columna 'entidad'; no se rellenó")
else:
    mask_empty_entidad = df['entidad'].isna() | (df['entidad'].astype(str).str.strip() == '')
    mask_whitelist = df['domain'].astype(str).str.lower().isin(wl_domains)
    target_rows = mask_empty_entidad & mask_whitelist

    def _entity_from_domain(dom):
        dom = str(dom).lower()
        parts = dom.split('.')
        return parts[0] if parts else ''

    df.loc[target_rows, 'entidad'] = df.loc[target_rows, 'domain'].map(_entity_from_domain)
    print(f"Entidades rellenadas desde whitelist: {target_rows.sum()}")



Entidades rellenadas desde whitelist: 74


In [118]:
# Rellenar entidad vacía usando tokens seguros en domain/subdomain/primer path
if 'entidad' not in df.columns:
    print("No existe columna 'entidad'; no se rellenó")
else:
    tokens = [
        "bbva", "santander", "ing", "caixabank", "openbank", "bankinter",
        "unicaja", "abanca", "cetelem", "wizink", "mediolanum",
        "correos", "seur", "dhl", "gls", "mrw", "ups",
        "movistar", "vodafone", "orange", "yoigo", "digi",
        "dgt", "clave", "agenciatributaria", "sede", "sepe",
    ]

    # Prep columnas necesarias; si faltan, se usan strings vacíos
    dom_col = df.get('domain', '').astype(str).str.lower()
    sub_col = df.get('subdomain', '').astype(str).str.lower()
    path_col = df.get('path', '').astype(str)

    mask_empty_entidad = df['entidad'].isna() | (df['entidad'].astype(str).str.strip() == '')

    def _infer_entidad(idx):
        if not mask_empty_entidad.at[idx]:
            return df.at[idx, 'entidad']
        domain = dom_col.at[idx]
        subdomain = sub_col.at[idx]
        path = path_col.at[idx]
        first_path = path.lstrip('/').split('/')[0].lower()
        for tok in tokens:
            if domain.startswith(tok + '.') or domain.startswith(tok + '-') or subdomain == tok or first_path == tok or first_path.startswith(tok + '-'):
                return tok
        return df.at[idx, 'entidad']

    df.loc[mask_empty_entidad, 'entidad'] = df.loc[mask_empty_entidad].index.to_series().map(_infer_entidad)
    print("Entidades rellenadas por tokens en URLs (solo vacías)")



Entidades rellenadas por tokens en URLs (solo vacías)


In [119]:
df["entidad"].isna().sum(), (df["entidad"].astype(str).str.strip() == "").sum()


(np.int64(101), np.int64(0))

In [120]:
df[df["domain"].isin(wl_domains)][["url", "domain", "entidad"]].sort_values("entidad")


Unnamed: 0,url,domain,entidad
92,https://sede.agenciatributaria.gob.es/Sede/otr...,agenciatributaria.gob.es,AEAT
90,https://sede.agenciatributaria.gob.es/Sede/not...,agenciatributaria.gob.es,AEAT
93,https://www.amazon.es/gp/css/order-details?ord...,amazon.es,Amazon España
91,https://www.amazon.es/gp/your-account/order-hi...,amazon.es,Amazon España
76,https://www.amazon.es/gp/help/customer/display...,amazon.es,Amazon España
...,...,...,...
155,https://m.vodafone.es/mves/login,vodafone.es,vodafone
167,https://walletconnect.network,walletconnect.network,walletconnect
194,https://wetransfer.com,wetransfer.com,wetransfer
178,https://web.whatsapp.com,whatsapp.com,whatsapp


In [121]:
df["entidad"].isna().sum()


np.int64(101)

In [122]:
(df["entidad"].astype(str).str.strip() == "").sum()


np.int64(0)

In [123]:
df[df["entidad"].isna() | (df["entidad"].astype(str).str.strip() == "")][["url", "domain", "subdomain", "path"]].head(20)


Unnamed: 0,url,domain,subdomain,path
162,https://www.hbomax.com/es/es/sign-in,hbomax.com,www,/es/es/sign-in
170,https://accounts.binance.com/es/login,binance.com,accounts,/es/login
171,https://ver.flixole.com/log-in,flixole.com,ver,/log-in
180,https://venta.renfe.com/vol/loginParticular.do,renfe.com,venta,/vol/loginparticular.do
181,https://www.iberia.com/es/iberiaplus/recuperar...,iberia.com,www,/es/iberiaplus/recuperar-password/
183,https://login.yahoo.com/,yahoo.com,login,/
187,https://www.mutua.es/acceso-area-personal/#/login,mutua.es,www,/acceso-area-personal/
188,https://areadeclientes.mapfre.es,mapfre.es,areadeclientes,
190,https://mibp.es/es/welcome,mibp.es,,/es/welcome
191,https://www.habbo.es,habbo.es,www,


In [137]:
# Tokens ampliados de marcas/servicios
BRAND_TOKENS_EXTENDED = [
    "bbva","santander","ing","caixabank","openbank","bankinter",
    "unicaja","abanca","cetelem","wizink","mediolanum","kutxabank",
    "cajamar","ibercaja",
    "correos","seur","dhl","gls","mrw","ups",
    "movistar","vodafone","orange","yoigo","digi",
    "dgt","agenciatributaria","sede","clave","sepe",
    "binance","coinbase","kraken","walletconnect",
    "hbomax","netflix","disneyplus","flixole",
    "renfe","iberia","ryanair",
    "zoom","wetransfer","dropbox","microsoft","google","yahoo",
    "amazon","aliexpress",
    "mapfre","mutua","lineadirecta",
    "roblox","habbo","steam",
    "bit2me","masmovil","mercadona","zara","sabadell","orangebank",
    "caixa", "dhl", "bp"
]



In [None]:
# Helper para inferir entidad a partir de tokens conocidos

def infer_entity_from_tokens(row):
    domain = str(row.get("domain", "") or "").lower()
    subdomain = str(row.get("subdomain", "") or "").lower()
    path = str(row.get("path", "") or "").lower()
    first_path = path.lstrip("/").split("/", 1)[0] if path else ""
    segments = [s.lower() for s in path.lstrip("/").split("/") if s]

    for token in BRAND_TOKENS_EXTENDED:
        tok = token.lower()
        if (
            domain.startswith(tok + ".")
            or domain.startswith(tok + "-")
            or subdomain == tok
            or subdomain.startswith(tok)
            or first_path == tok
            or first_path.startswith(tok + "-")
            or any(segment == tok or segment.startswith(tok + "-") for segment in segments)
        ):
            return tok
    return None



In [126]:
# Rellenar entidad vacía usando infer_entity_from_tokens
if 'entidad' not in df.columns:
    print("No existe columna 'entidad'; no se aplicó inferencia")
else:
    mask_empty_entidad = df['entidad'].isna() | (df['entidad'].astype(str).str.strip() == '')
    if mask_empty_entidad.any():
        inferred = df.loc[mask_empty_entidad].apply(infer_entity_from_tokens, axis=1)
        matches = inferred.dropna()
        df.loc[matches.index, 'entidad'] = matches
        print(f"Entidades inferidas y rellenadas: {len(matches)}")
    else:
        print("No hay filas con entidad vacía para inferir")



Entidades inferidas y rellenadas: 32


In [127]:
# Filas que tenían entidad vacía antes y ahora tienen valor
# Necesitamos comparar estado antes vs después -> pero si no guardaste copia, usamos heurística:

df_new = df[df["entidad"].notna() & (~df["domain"].isin(wl_domains))]
df_new[["url", "domain", "subdomain", "path", "entidad"]].head(18)


Unnamed: 0,url,domain,subdomain,path,entidad
2,https://www.wizink.es/tarjetas,wizink.es,www,/tarjetas,WiZink
3,https://www.wizink.es/,wizink.es,www,/,WiZink
4,https://www.cetelem.es/credito-y-prestamos/,cetelem.es,www,/credito-y-prestamos/,Cetelem
5,https://www.cetelem.es/,cetelem.es,www,/,Cetelem
6,https://www.cajamar.es/es/particulares/product...,cajamar.es,www,/es/particulares/productos-y-servicios/banca-a...,Cajamar
8,https://www.cajamar.es/es/particulares/,cajamar.es,www,/es/particulares/,Cajamar
14,https://www.abanca.com/es/banca-personal/,abanca.com,www,/es/banca-personal/,ABANCA
34,https://www.abanca.com/es/banca-a-distancia/ba...,abanca.com,www,/es/banca-a-distancia/banca-electronica/,ABANCA
35,https://www.abanca.com/es/empresas/,abanca.com,www,/es/empresas/,ABANCA
41,https://www.bbva.com/es/,bbva.com,www,/es/,BBVA


In [128]:
df["entidad"].isna().sum() + (df["entidad"].astype(str).str.strip() == "").sum()


np.int64(69)

In [129]:
mask_empty = df["entidad"].isna() | (df["entidad"].astype(str).str.strip() == "")
df[mask_empty][["url","domain","subdomain","path"]].head(30)


Unnamed: 0,url,domain,subdomain,path
190,https://mibp.es/es/welcome,mibp.es,,/es/welcome
368,https://recibo-de-pago.weebly.com/,weebly.com,recibo-de-pago,/
370,https://my.forms.app/dpdservicioexpress/modulo...,forms.app,my,/dpdservicioexpress/modulodiaperturacontodpd
375,https://ampliaciones.zonasolicitudeampliacion....,zonasolicitudeampliacion.click,ampliaciones,/
376,https://appcanaldigital.com/,appcanaldigital.com,,/
377,https://clientepresenteado.digital/inicio,clientepresenteado.digital,,/inicio
378,http://bancosantaderonline.com,bancosantaderonline.com,,
379,http://bancosantander.es.hotelparadis.es/aviso...,hotelparadis.es,bancosantander.es,/aviso/db5359497aecd83/login.php
380,https://banco-galicia-online.com/galicia/confirm,banco-galicia-online.com,,/galicia/confirm
383,https://corroes-ep.top/es,corroes-ep.top,,/es


In [130]:
mask_empty = df["entidad"].isna() | (df["entidad"].astype(str).str.strip() == "")
df.loc[mask_empty, "entidad"] = df.loc[mask_empty].apply(infer_entity_from_tokens, axis=1)


In [131]:
df["entidad"].isna().sum() + (df["entidad"].astype(str).str.strip() == "").sum()


np.int64(69)

In [132]:
mask_empty = df["entidad"].isna() | (df["entidad"].astype(str).str.strip() == "")
df[mask_empty][["url","domain","subdomain","path"]].iloc[20:40]


Unnamed: 0,url,domain,subdomain,path
396,https://w.aviso-cliente.net,aviso-cliente.net,w,
397,https://misaki4552.github.io/Pagina-instagram/,github.io,misaki4552,/pagina-instagram/
398,https://l.ead.me/modifica-tu-entrega,ead.me,l,/modifica-tu-entrega
399,https://kmctayurvedacollege.org/ingresoseguro/,kmctayurvedacollege.org,,/ingresoseguro/
400,https://hdseguridad.webcindario.com/,webcindario.com,hdseguridad,/
402,https://validar-tarjeta.webcindario.com/,webcindario.com,validar-tarjeta,/
405,https://l.ead.me/BBVA24,ead.me,l,/bbva24
406,https://particulares-netbancosantander.web.app/,web.app,particulares-netbancosantander,/
407,https://soportetuexperto-orange-es.webflow.io/,webflow.io,soportetuexperto-orange-es,/
410,https://www.binance.com.loginsn.cc/,loginsn.cc,www.binance.com,/


In [133]:
mask_empty = df["entidad"].isna() | (df["entidad"].astype(str).str.strip() == "")
df[mask_empty][["url","domain","subdomain","path"]].head(10)


Unnamed: 0,url,domain,subdomain,path
190,https://mibp.es/es/welcome,mibp.es,,/es/welcome
368,https://recibo-de-pago.weebly.com/,weebly.com,recibo-de-pago,/
370,https://my.forms.app/dpdservicioexpress/modulo...,forms.app,my,/dpdservicioexpress/modulodiaperturacontodpd
375,https://ampliaciones.zonasolicitudeampliacion....,zonasolicitudeampliacion.click,ampliaciones,/
376,https://appcanaldigital.com/,appcanaldigital.com,,/
377,https://clientepresenteado.digital/inicio,clientepresenteado.digital,,/inicio
378,http://bancosantaderonline.com,bancosantaderonline.com,,
379,http://bancosantander.es.hotelparadis.es/aviso...,hotelparadis.es,bancosantander.es,/aviso/db5359497aecd83/login.php
380,https://banco-galicia-online.com/galicia/confirm,banco-galicia-online.com,,/galicia/confirm
383,https://corroes-ep.top/es,corroes-ep.top,,/es


In [134]:
mask_empty = df["entidad"].isna() | (df["entidad"].astype(str).str.strip() == "")
df_empty = df[mask_empty][["url","domain","subdomain","path"]]

df_empty.shape


(69, 4)

In [135]:
df_empty


Unnamed: 0,url,domain,subdomain,path
190,https://mibp.es/es/welcome,mibp.es,,/es/welcome
368,https://recibo-de-pago.weebly.com/,weebly.com,recibo-de-pago,/
370,https://my.forms.app/dpdservicioexpress/modulo...,forms.app,my,/dpdservicioexpress/modulodiaperturacontodpd
375,https://ampliaciones.zonasolicitudeampliacion....,zonasolicitudeampliacion.click,ampliaciones,/
376,https://appcanaldigital.com/,appcanaldigital.com,,/
...,...,...,...,...
485,https://asesoriabarrachina.es/wp-includes/stra...,asesoriabarrachina.es,,/wp-includes/strato/
486,http://serv-authveriline.com.es,serv-authveriline.com.es,,
487,http://authline-checkappr0v.com.es,authline-checkappr0v.com.es,,
489,https://koinbay-login.es,koinbay-login.es,,


In [138]:
df.to_csv("dataset_v2_snapshot_before_manual_fix.csv", index=False)


In [139]:
df.to_csv("dataset_v2_working.csv", index=False)


In [140]:
import os

[x for x in os.listdir() if "dataset_v2" in x]


['limpieza_dataset_v2.ipynb',
 'dataset_v2_snapshot_before_manual_fix.csv',
 'dataset_v2_working.csv']

In [143]:
df["entidad_final"] = df["entidad"]

mask_empty = df["entidad"].isna() | (df["entidad"].astype(str).str.strip() == "")
df.loc[mask_empty, "entidad_final"] = df.loc[mask_empty, "matched_target"]

df["entidad_final"].isna().sum() + (df["entidad_final"].astype(str).str.strip() == "").sum()

df[df["entidad_final"].isna() | (df["entidad_final"].astype(str).str.strip() == "")][["url","domain","subdomain","path"]].head(20)

df["entidad_final"].isna().sum() + (df["entidad_final"].astype(str).str.strip() == "").sum()

df[df["entidad_final"].isna() | (df["entidad_final"].astype(str).str.strip() == "")][["url","domain","subdomain","path"]].head(20)


Unnamed: 0,url,domain,subdomain,path


In [145]:
import unicodedata

# Normalizar entidad (minúsculas, sin acentos, variantes comunes)

def normalize_entity(x):
    if pd.isna(x):
        return ""
    s = str(x).strip().lower()
    s = "".join(
        c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn"
    )
    # Mapas de variantes -> forma canónica (extensible según necesidad)
    variants = {
        "wizink": "wizink",
        "cetelem": "cetelem",
        "bbva": "bbva",
    }
    return variants.get(s, s)

if "entidad_final" in df.columns:
    df["entidad_norm"] = df["entidad_final"].apply(normalize_entity)
else:
    print("No existe columna 'entidad_final'; no se creó 'entidad_norm'")



In [146]:
df = df.drop(columns=["entidad", "matched_target"])


In [147]:
df.columns


Index(['campaign', 'categoria', 'confianza', 'confidence', 'dataset_split',
       'domain', 'free_hosting', 'inclusion', 'is_https', 'label', 'notas',
       'route_type', 'ruido', 'score_total_v2', 'sector', 'sector_norm',
       'source', 'timestamp', 'url', 'url_norm', 'domain_complexity',
       'host_entropy', 'domain_whitelist_score', 'suspicious_path_token',
       'token_density', 'trusted_token_context', 'infra_risk',
       'fake_tld_in_subdomain_or_path', 'param_count_boost', 'subdomain',
       'tld', 'path', 'entidad_final', 'entidad_norm'],
      dtype='object')

In [148]:
df["entidad_norm"].value_counts().head(50)


entidad_norm
correos             74
generico            58
santander           43
caixabank           26
bbva                26
ing                 26
dgt                 11
bankinter            9
binance              8
netflix              7
coinbase             7
ionos                7
openbank             7
abanca               6
iberdrola            5
google               5
ibercaja             5
kutxabank            5
microsoft            5
banco sabadell       5
roblox               4
orange               4
seur                 4
unicajabanco         4
bancosantander       3
amazon espana        3
cajamar              3
bancsabadell         3
mapfre               3
nacex                3
unicaja banco        3
vodafone             2
cetelem              2
wizink               2
linea directa        2
movistar             2
correosexpress       2
myinvestor           2
amazon               2
walletconnect        2
zoom                 2
instagram            2
unicismadrid         

In [149]:
df["entidad_norm"].nunique()


111

In [150]:
(df["entidad_norm"].astype(str).str.strip() == "").sum() + df["entidad_norm"].isna().sum()


np.int64(0)

In [151]:
df["entidad"] = df["entidad_norm"]
df = df.drop(columns=["entidad_final", "entidad_norm"], errors="ignore")


In [152]:
df["entidad"].value_counts().head(20)


entidad
correos           74
generico          58
santander         43
caixabank         26
bbva              26
ing               26
dgt               11
bankinter          9
binance            8
netflix            7
coinbase           7
ionos              7
openbank           7
abanca             6
iberdrola          5
google             5
ibercaja           5
kutxabank          5
microsoft          5
banco sabadell     5
Name: count, dtype: int64

In [153]:
df.loc[df["entidad"] == "banco sabadell", "entidad"] = "sabadell"


In [154]:
# Derivar sector a partir de la entidad

def derive_sector(ent):
    if pd.isna(ent):
        return "generico"
    e = str(ent).strip().lower()

    sector_map = {
        "banca": [
            "bbva","santander","ing","caixabank","openbank","bankinter",
            "unicaja","abanca","cetelem","wizink","mediolanum","cajamar",
            "ibercaja","sabadell",
        ],
        "crypto": ["binance","coinbase","kraken","bit2me"],
        "logistica": ["correos","seur","dhl","gls","mrw","ups"],
        "telecom": ["movistar","vodafone","orange","yoigo","digi","masmovil"],
        "energia": ["iberdrola","endesa"],
        "retail": ["amazon","aliexpress","mercadona","zara"],
        "tecnologia": ["google","microsoft","yahoo","zoom","dropbox","wetransfer"],
        "seguros": ["mapfre","mutua","lineadirecta"],
        "entretenimiento": ["netflix","disneyplus","hbomax","flixole","roblox","habbo","steam"],
        "gobierno": ["dgt","agenciatributaria","sede","clave","sepe"],
    }

    for sector, tokens in sector_map.items():
        if e in tokens:
            return sector
    return "generico"

if "entidad" in df.columns:
    df["sector_final"] = df["entidad"].apply(derive_sector)
else:
    print("No existe columna 'entidad'; no se creó 'sector_final'")



In [157]:
df["sector_final"].value_counts()


sector_final
banca              161
generico           160
logistica           82
crypto              19
tecnologia          16
entretenimiento     15
gobierno            13
telecom             11
energia              7
seguros              4
retail               4
Name: count, dtype: int64

In [159]:
df_train = df[
    ["url", "label", "sector", "entidad", "notas", "campaign"]
].copy()


In [162]:
df_train["url"] = df_train["url"].str.strip()
df_train["sector"] = df_train["sector"].str.strip().str.lower()
df_train["entidad"] = df_train["entidad"].str.strip().str.lower()


In [163]:
assert df_train["url"].isna().sum() == 0


In [164]:
df_nulls = df_train[df_train.isna().any(axis=1)]
df_nulls.head(20)
len(df_nulls)


492

In [165]:
df_train.isna().sum()


url           0
label         0
sector      192
entidad       0
notas       150
campaign    394
dtype: int64

In [166]:
df_missing_sector = df_train[df_train["sector"].isna()][["url","entidad"]]
df_missing_sector.head(20)


Unnamed: 0,url,entidad
0,https://www.caixabank.es/particular/banca-digi...,caixabank
151,https://pasarela.clave.gob.es/Proxy2/ServicePr...,clave
152,https://www.bancomediolanum.es/es-ES/la-banca-...,bancomediolanum
153,https://www.bancosantander.es/particulares,bancosantander
154,https://www.movistar.es/area-cliente/mi-cuenta/,movistar
155,https://m.vodafone.es/mves/login,vodafone
156,https://www.ing.es/seguridad-internet,ing
157,https://www.openbank.es/app-openbank,openbank
158,https://www.ing.es/,ing
159,https://www.openbank.es/?toggleLogin&go-to-app=0,openbank


In [167]:
sorted(df_missing_sector["entidad"].unique())


['abanca',
 'agenciatributaria',
 'amazon',
 'banco galicia',
 'banco santander',
 'bancomediolanum',
 'bancosantander',
 'bancsabadell',
 'bankinter',
 'bbva',
 'binance',
 'bit2me',
 'bmedonline',
 'bp',
 'caixa',
 'caixabank',
 'caja rural',
 'cajaruraldeasturias',
 'carrefour',
 'clave',
 'coinbase',
 'correos',
 'correosexpress',
 'decathlon',
 'dgt',
 'dhl',
 'digi',
 'digimobil',
 'dpd',
 'dropbox',
 'financieraelcorteingles',
 'flixole',
 'generico',
 'google',
 'grupocajarural',
 'habbo',
 'hbomax',
 'ibercaja',
 'iberdrola',
 'iberia',
 'ing',
 'instagram',
 'ionos',
 'kutxabank',
 'mapfre',
 'masmovil',
 'mediamarkt',
 'mercadona',
 'microsoft',
 'movistar',
 'mutua',
 'netflix',
 'openbank',
 'orange',
 'orangebank',
 'outlook',
 'pccomponentes',
 'renfe',
 'roblox',
 'ruralcentral',
 'sabadell',
 'santander',
 'seg-social',
 'serviciodecorreo',
 'unicajabanco',
 'unicismadrid',
 'vodafone',
 'walletconnect',
 'wetransfer',
 'whatsapp',
 'yahoo',
 'yoigo',
 'zara',
 'zoom']

In [168]:
sector_map = {
    # 🏦 Banca / Fintech
    "abanca": "banca",
    "banco galicia": "banca",
    "banco santander": "banca",
    "bancosantander": "banca",
    "bancomediolanum": "banca",
    "bancsabadell": "banca",
    "bankinter": "banca",
    "bbva": "banca",
    "bmedonline": "banca",
    "caixa": "banca",
    "caixabank": "banca",
    "caja rural": "banca",
    "cajaruraldeasturias": "banca",
    "grupocajarural": "banca",
    "ibercaja": "banca",
    "ing": "banca",
    "kutxabank": "banca",
    "openbank": "banca",
    "ruralcentral": "banca",
    "sabadell": "banca",
    "santander": "banca",
    "unicajabanco": "banca",

    # 💳 Cripto
    "binance": "cripto",
    "bit2me": "cripto",
    "coinbase": "cripto",
    "walletconnect": "cripto",

    # 🚚 Logística
    "correos": "logistica",
    "correosexpress": "logistica",
    "dhl": "logistica",
    "dpd": "logistica",
    "serviciodecorreo": "logistica",

    # 🛒 Retail / e-commerce
    "amazon": "ecommerce",
    "carrefour": "ecommerce",
    "decathlon": "ecommerce",
    "mediamarkt": "ecommerce",
    "pccomponentes": "ecommerce",
    "mercadona": "ecommerce",
    "zara": "ecommerce",

    # ☁️ SaaS / Cloud / Storage
    "google": "saas",
    "dropbox": "saas",
    "microsoft": "saas",
    "outlook": "saas",
    "wetransfer": "saas",
    "zoom": "saas",

    # 🎮 Gaming / plataformas digitales
    "habbo": "gaming",
    "roblox": "gaming",

    # 🎬 Streaming
    "flixole": "streaming",
    "hbomax": "streaming",
    "netflix": "streaming",

    # 📱 Telecom / Móviles
    "digi": "teleco",
    "digimobil": "teleco",
    "masmovil": "teleco",
    "movistar": "teleco",
    "orange": "teleco",
    "orangebank": "teleco",
    "vodafone": "teleco",
    "yoigo": "teleco",

    # 🛫 Turismo / transporte
    "iberia": "viajes",
    "renfe": "viajes",

    # 🔌 Energía / seguros
    "bp": "energia",
    "iberdrola": "energia",
    "mapfre": "seguros",
    "mutua": "seguros",

    # 🏛️ Público / administración
    "agenciatributaria": "publico",
    "clave": "publico",
    "dgt": "publico",
    "seg-social": "publico",

    # 🔧 Genérico / no clasificable
    "financieraelcorteingles": "generico",
    "generico": "generico",
    "instagram": "generico",
    "whatsapp": "generico",
    "unicismadrid": "generico",
    "yahoo": "generico",
}


In [171]:
df_train.loc[df_train["sector"].isna(), "sector"] = \
    df_train.loc[df_train["sector"].isna(), "entidad"].map(sector_map)


In [172]:
import unicodedata

# Normalizar df_train['sector'] a un conjunto cerrado

def _strip_accents(s: str) -> str:
    return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn")


def normalize_sector_value(val):
    if pd.isna(val):
        return "generico"
    v = str(val).strip().lower()
    v_noacc = _strip_accents(v)

    if v in ("banca", "teleco"):
        return v

    if v.startswith("logística") or v.startswith("logistica") or v_noacc.startswith("logistica"):
        return "logistica"
    if v in {"saas / cloud / plataformas", "saas"}:
        return "saas"
    if v in {"retail / e-commerce / streaming", "ecommerce"}:
        return "ecommerce"
    if v in {"público / administración", "administración pública", "publico"} or v_noacc in {"publico / administracion", "administracion publica", "publico"}:
        return "publico"
    if v in {"cripto / fintech", "cripto"} or v_noacc == "cripto / fintech":
        return "cripto"
    if v in {"energía / seguros", "energia"} or v_noacc == "energia / seguros":
        return "energia"
    if v == "seguros":
        return "seguros"
    if v == "streaming":
        return "streaming"
    if v == "gaming":
        return "gaming"
    if v == "viajes":
        return "viajes"
    if v in {"generico", "genérico / otros"} or v_noacc == "generico / otros":
        return "generico"

    return "generico"

if "sector" in df_train.columns:
    df_train["sector"] = df_train["sector"].apply(normalize_sector_value)
else:
    print("No existe columna 'sector' en df_train; no se normalizó")



In [173]:
# Normalización de sectores

df_train["sector"] = df_train["sector"].str.lower().str.strip()

reemplazos = {
    # logística
    "logística": "logistica",
    "logistica": "logistica",

    # saas / cloud
    "saas": "saas",
    "saas / cloud / plataformas": "saas",

    # ecommerce / retail
    "ecommerce": "ecommerce",
    "retail / e-commerce / streaming": "ecommerce",

    # publico
    "público / administración": "publico",
    "administración pública": "publico",
    "publico": "publico",

    # cripto
    "cripto / fintech": "cripto",
    "cripto": "cripto",

    # energía / seguros
    "energía / seguros": "energia",
    "energia": "energia",
    "seguros": "seguros",

    # streaming
    "streaming": "streaming",

    # gaming
    "gaming": "gaming",

    # viajes
    "viajes": "viajes",

    # generico
    "generico": "generico",
    "genérico / otros": "generico",
}

df_train["sector"] = df_train["sector"].replace(reemplazos)

# Cualquier etiqueta fuera del set permitido → "generico"
validos = {
    "banca", "logistica", "saas", "teleco", "cripto",
    "publico", "ecommerce", "streaming", "gaming",
    "energia", "seguros", "viajes", "generico"
}

df_train.loc[~df_train["sector"].isin(validos), "sector"] = "generico"

df_train["sector"].value_counts()



sector
banca        197
logistica    101
generico      70
saas          23
cripto        21
publico       18
ecommerce     17
energia       13
teleco        13
streaming      9
gaming         6
viajes         2
seguros        2
Name: count, dtype: int64

In [174]:
df_generico = df_train[df_train["sector"] == "generico"][["url", "entidad", "notas"]]
df_generico.head(30)
len(df_generico)



70

In [177]:
sorted(df_generico["entidad"].unique())


['amazon web services',
 'auth0',
 'cloudflare',
 'financieraelcorteingles',
 'generico',
 'instagram',
 'ionos',
 'linkedin',
 'microsoft azure',
 'okta',
 'paypal espana',
 'redsys espana',
 'stripe espana',
 'twitter',
 'unicismadrid',
 'whatsapp',
 'yahoo']

In [176]:
# Añadir categoría redessociales y corregir mapeos

df_train.loc[df_train["entidad"].isin([
    "instagram", "twitter", "linkedin", "whatsapp", "yahoo"
]), "sector"] = "redessociales"

df_train.loc[df_train["entidad"].isin([
    "amazon web services", "auth0", "cloudflare", "ionos",
    "microsoft azure", "okta", "stripe espana", "paypal espana"
]), "sector"] = "saas"

df_train.loc[df_train["entidad"].isin([
    "redsys espana"
]), "sector"] = "cripto"   # fintech/pagos

df_train.loc[df_train["entidad"].isin([
    "financieraelcorteingles", "generico", "unicismadrid"
]), "sector"] = "generico"

df_train["sector"].value_counts()


sector
banca            193
logistica         94
generico          61
saas              36
cripto            22
publico           18
ecommerce         17
energia           13
teleco            13
streaming          9
redessociales      6
gaming             6
viajes             2
seguros            2
Name: count, dtype: int64

In [178]:
sorted(df_train[df_train["sector"] == "generico"]["entidad"].unique())


['financieraelcorteingles', 'generico', 'unicismadrid']

In [179]:
df_train.loc[df_train["entidad"] == "financieraelcorteingles", "sector"] = "ecommerce"
df_train["sector"].value_counts()


sector
banca            193
logistica         94
generico          60
saas              36
cripto            22
ecommerce         18
publico           18
energia           13
teleco            13
streaming          9
redessociales      6
gaming             6
viajes             2
seguros            2
Name: count, dtype: int64

In [180]:
df_train.loc[df_train["entidad"] == "unicismadrid", "sector"] = "publico"
df_train["sector"].value_counts()


sector
banca            193
logistica         94
generico          58
saas              36
cripto            22
publico           20
ecommerce         18
energia           13
teleco            13
streaming          9
redessociales      6
gaming             6
viajes             2
seguros            2
Name: count, dtype: int64

In [181]:
df_generico = df_train[df_train["sector"] == "generico"][["url", "entidad", "notas"]]
df_generico.head(50)


Unnamed: 0,url,entidad,notas
247,http://webseguridadcuenta-9e626b.ingress-bonde...,generico,generico | generico | Verificación | free_host...
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,generico,generico | generico | Login | free_hosting;mar...
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,generico,generico | generico | Login | free_hosting;mar...
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,generico,generico | generico | Verificación | free_host...
254,https://supanort22-baa4c7.ingress-erytho.ewp.l...,generico,generico | generico | Login | free_hosting;mar...
256,https://sistemaactivada-bb172f.ingress-earth.e...,generico,banca | generico | Login | free_hosting;marca_...
261,https://particulares-es-b965d0.ingress-compore...,generico,generico | generico | Login | free_hosting;mar...
262,https://particulares-es-1-bd5e31.ingress-darib...,generico,generico | generico | Login | free_hosting;mar...
268,https://informacion-cliente-spainespain3412230...,generico,generico | generico | Login | marca_en_path;sp...
272,http://dinamicaenlineatusucursal.com/verificac...,generico,banca | generico | Verificación | marca_en_pat...


In [182]:
pd.set_option('display.max_rows', None)
df_generico


Unnamed: 0,url,entidad,notas
247,http://webseguridadcuenta-9e626b.ingress-bonde...,generico,generico | generico | Verificación | free_host...
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,generico,generico | generico | Login | free_hosting;mar...
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,generico,generico | generico | Login | free_hosting;mar...
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,generico,generico | generico | Verificación | free_host...
254,https://supanort22-baa4c7.ingress-erytho.ewp.l...,generico,generico | generico | Login | free_hosting;mar...
256,https://sistemaactivada-bb172f.ingress-earth.e...,generico,banca | generico | Login | free_hosting;marca_...
261,https://particulares-es-b965d0.ingress-compore...,generico,generico | generico | Login | free_hosting;mar...
262,https://particulares-es-1-bd5e31.ingress-darib...,generico,generico | generico | Login | free_hosting;mar...
268,https://informacion-cliente-spainespain3412230...,generico,generico | generico | Login | marca_en_path;sp...
272,http://dinamicaenlineatusucursal.com/verificac...,generico,banca | generico | Verificación | marca_en_pat...


In [183]:
df_train.loc[df_train["url"].str.contains("hdseguridad", case=False, na=False), "sector"] = "energia"
df_train["sector"].value_counts()


sector
banca            193
logistica         94
generico          57
saas              36
cripto            22
publico           20
ecommerce         18
energia           14
teleco            13
streaming          9
redessociales      6
gaming             6
viajes             2
seguros            2
Name: count, dtype: int64

In [184]:
df_train[df_train["url"].duplicated(keep=False)]


Unnamed: 0,url,label,sector,entidad,notas,campaign


In [185]:
df_train["url"].duplicated().sum()


np.int64(0)

In [188]:
# Construir df_generico si no existe aún
df_generico = df_train[df_train["sector"] == "generico"].copy()

# Inspección básica
print("Shape:", df_generico.shape)
display(df_generico.head(10))

# Entidades dentro de generico
print("\nEntidades únicas:")
print(sorted(df_generico["entidad"].unique()))

# Comprobación de nulos
print("\nNulos por columna:")
print(df_generico.isna().sum())


Shape: (57, 6)


Unnamed: 0,url,label,sector,entidad,notas,campaign
247,http://webseguridadcuenta-9e626b.ingress-bonde...,1,generico,generico,generico | generico | Verificación | free_host...,
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,1,generico,generico,generico | generico | Login | free_hosting;mar...,
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,1,generico,generico,generico | generico | Login | free_hosting;mar...,
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,1,generico,generico,generico | generico | Verificación | free_host...,
254,https://supanort22-baa4c7.ingress-erytho.ewp.l...,1,generico,generico,generico | generico | Login | free_hosting;mar...,
256,https://sistemaactivada-bb172f.ingress-earth.e...,1,generico,generico,banca | generico | Login | free_hosting;marca_...,
261,https://particulares-es-b965d0.ingress-compore...,1,generico,generico,generico | generico | Login | free_hosting;mar...,
262,https://particulares-es-1-bd5e31.ingress-darib...,1,generico,generico,generico | generico | Login | free_hosting;mar...,
268,https://informacion-cliente-spainespain3412230...,1,generico,generico,generico | generico | Login | marca_en_path;sp...,
272,http://dinamicaenlineatusucursal.com/verificac...,1,generico,generico,banca | generico | Verificación | marca_en_pat...,



Entidades únicas:
['generico']

Nulos por columna:
url          0
label        0
sector       0
entidad      0
notas        0
campaign    29
dtype: int64


In [189]:
print("Shape:", df_train.shape)
print("\nColumnas:", df_train.columns.tolist())

print("\nTipos de datos:")
display(df_train.dtypes)

print("\nNulos por columna:")
display(df_train.isna().sum())

print("\nPrimeras 10 filas:")
display(df_train.head(10))

print("\nÚltimas 10 filas:")
display(df_train.tail(10))

print("\nSectores:")
display(df_train["sector"].value_counts())

print("\nEntidades (top 30):")
display(df_train["entidad"].value_counts().head(30))

print("\nCampañas únicas (solo phishing):")
display(sorted(df_train[df_train["label"]==1]["campaign"].unique()))


Shape: (492, 6)

Columnas: ['url', 'label', 'sector', 'entidad', 'notas', 'campaign']

Tipos de datos:


url         object
label        int64
sector      object
entidad     object
notas       object
campaign    object
dtype: object


Nulos por columna:


url           0
label         0
sector        0
entidad       0
notas       150
campaign    150
dtype: int64


Primeras 10 filas:


Unnamed: 0,url,label,sector,entidad,notas,campaign
0,https://www.caixabank.es/particular/banca-digi...,0,banca,caixabank,home bancaria oficial,
1,https://www.ibercaja.es/particulares/,0,banca,ibercaja,,
2,https://www.wizink.es/tarjetas,0,banca,wizink,,
3,https://www.wizink.es/,0,banca,wizink,,
4,https://www.cetelem.es/credito-y-prestamos/,0,banca,cetelem,,
5,https://www.cetelem.es/,0,banca,cetelem,,
6,https://www.cajamar.es/es/particulares/product...,0,banca,cajamar,,
7,https://www.ibercaja.es/empresas/,0,banca,ibercaja,,
8,https://www.cajamar.es/es/particulares/,0,banca,cajamar,,
9,https://portal.kutxabank.es/cs/Satellite/kb/es...,0,banca,kutxabank,,



Últimas 10 filas:


Unnamed: 0,url,label,sector,entidad,notas,campaign
482,https://app-ing.direct-ayuda.com,1,banca,ing,subdominio “app-ing”+ español en dominio ⇒ phi...,direct-ayuda.com
483,http://secretosbp.es/DROPBOX/DROPBOX/vinc/,1,saas,dropbox,.es + ruta de marca (Dropbox) → impersonación ...,secretosbp.es/dropbox
484,https://coinbasewalletsupport.es,1,cripto,coinbase,abuso de marca Coinbase + .es,coinbasewalletsupport.es
485,https://asesoriabarrachina.es/wp-includes/stra...,1,generico,generico,web .es comprometida(WordPress),asesoriabarrachina.es/wp-includes
486,http://serv-authveriline.com.es,1,generico,generico,.com.es + mezcla auth/verify → fake login,serv-authveriline.com.es
487,http://authline-checkappr0v.com.es,1,generico,generico,.com.es + lexemas auth/approve con “0” → fake ...,authline-checkappr0v.com.es
488,https://boxperience.es/cuenta/es-ing/ing,1,banca,ing,.es + posible web comprometida + marca ING en ...,boxperience.es/cuenta
489,https://koinbay-login.es,1,cripto,coinbase,typosquatting de Coinbase + .es,koinbay-login.es
490,https://l.ead.me/DGTspain,1,publico,dgt,acortador + DGT; dejamos 1representativo,l.ead.me/dgtspain
491,https://accesociaxatarieta-b31c49.ingress-eryt...,1,generico,generico,banca | generico | Login | free_hosting;marca_...,



Sectores:


sector
banca            193
logistica         94
generico          57
saas              36
cripto            22
publico           20
ecommerce         18
energia           14
teleco            13
streaming          9
redessociales      6
gaming             6
viajes             2
seguros            2
Name: count, dtype: int64


Entidades (top 30):


entidad
correos           74
generico          58
santander         43
caixabank         26
bbva              26
ing               26
dgt               11
bankinter          9
binance            8
netflix            7
coinbase           7
ionos              7
openbank           7
abanca             6
sabadell           6
microsoft          5
google             5
iberdrola          5
ibercaja           5
kutxabank          5
roblox             4
unicajabanco       4
orange             4
seur               4
cajamar            3
nacex              3
bancsabadell       3
unicaja banco      3
bancosantander     3
amazon espana      3
Name: count, dtype: int64


Campañas únicas (solo phishing):


TypeError: '<' not supported between instances of 'str' and 'float'

In [192]:
df_train.to_csv(
    "../data/interim/dataset_entrenamiento_v2.csv",
    index=False,
    encoding="utf-8"
)
