In [1]:
import pandas as pd

# --- Point this path at your already-featurized dataset ---
PATH = "../data/clean/dataset_base_v21.csv"

# Columns required by the checks in this notebook.
cols = [
    "url", "label",
    "subdomain", "host_entropy", "domain_complexity",
    "infra_risk", "suspicious_path_token", "trusted_token_context",
    "tld",
]

df = pd.read_csv(PATH)

# Minimal sanity check: list the required columns the CSV does not provide,
# in the same order as `cols`.
missing = [name for name in cols if name not in df.columns]
if missing:
    print("Columnas faltantes:", missing)
else:
    print("Columnas faltantes:", "OK – todas presentes")

df.head(10)


Columnas faltantes: ['subdomain', 'tld']


Unnamed: 0,campaign,categoria,confianza,confidence,dataset_split,domain,entidad,free_hosting,inclusion,is_https,...,url_norm,domain_complexity,host_entropy,domain_whitelist_score,suspicious_path_token,token_density,trusted_token_context,infra_risk,fake_tld_in_subdomain_or_path,param_count_boost
0,,banca,,,,,,,,,...,https://www.caixabank.es/particular/banca-digi...,0.508495,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0
1,,,,95.0,,,Ibercaja,0.0,,1.0,...,https://www.ibercaja.es/particulares,0.50125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,,90.0,,,WiZink,0.0,,1.0,...,https://www.wizink.es/tarjetas,0.451477,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0
3,,,,95.0,,,WiZink,0.0,,1.0,...,https://www.wizink.es,0.451477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,,,90.0,,,Cetelem,0.0,,1.0,...,https://www.cetelem.es/credito-y-prestamos,0.456514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,,,,95.0,,,Cetelem,0.0,,1.0,...,https://www.cetelem.es,0.456514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,,,,90.0,,,Cajamar,0.0,,1.0,...,https://www.cajamar.es/es/particulares/product...,0.456514,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0
7,,,,90.0,,,Ibercaja,0.0,,1.0,...,https://www.ibercaja.es/empresas,0.50125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,,,,95.0,,,Cajamar,0.0,,1.0,...,https://www.cajamar.es/es/particulares,0.456514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,,,,90.0,,,Kutxabank,0.0,,1.0,...,https://portal.kutxabank.es/cs/satellite/kb/es...,0.513526,0.32312,0.0,0.0,0.107143,0.0,0.0,1.0,0.0


In [2]:
import pandas as pd
import tldextract

# Si df ya está cargado correctamente con df = pd.read_csv(...)
# Generamos subdomain y tld desde cero

def extract_parts(url):
    """Split a URL into its subdomain and public suffix, both lower-cased.

    Parameters
    ----------
    url : str
        URL (or bare host) handed to ``tldextract.extract``.

    Returns
    -------
    pandas.Series
        Two entries: ``"subdomain"`` and ``"tld"`` (tldextract's ``suffix``).
        Both are empty strings when ``url`` is not a string (for example a
        NaN produced by an empty CSV cell), so applying this function over a
        column does not abort on a single missing value.
    """
    if not isinstance(url, str):
        # A missing URL cannot be parsed; report empty parts instead of raising.
        return pd.Series({"subdomain": "", "tld": ""})
    ext = tldextract.extract(url)
    return pd.Series({
        "subdomain": ext.subdomain.lower(),
        "tld": ext.suffix.lower()
    })

# Names of the two columns derived from the URL, in the order extract_parts emits them.
PART_COLUMNS = ["subdomain", "tld"]
df[PART_COLUMNS] = df["url"].apply(extract_parts)

# Preview the derived columns next to the pre-computed host_entropy.
df[["url", *PART_COLUMNS, "host_entropy"]].head(10)


Unnamed: 0,url,subdomain,tld,host_entropy
0,https://www.caixabank.es/particular/banca-digi...,www,es,0.0
1,https://www.ibercaja.es/particulares/,www,es,0.0
2,https://www.wizink.es/tarjetas,www,es,0.0
3,https://www.wizink.es/,www,es,0.0
4,https://www.cetelem.es/credito-y-prestamos/,www,es,0.0
5,https://www.cetelem.es/,www,es,0.0
6,https://www.cajamar.es/es/particulares/product...,www,es,0.0
7,https://www.ibercaja.es/empresas/,www,es,0.0
8,https://www.cajamar.es/es/particulares/,www,es,0.0
9,https://portal.kutxabank.es/cs/Satellite/kb/es...,portal,es,0.32312


In [3]:
# Independent copy of the rows whose label equals 1.
df_phish = df.loc[df["label"].eq(1)].copy()

df_phish.loc[:, ["url", "subdomain", "tld", "host_entropy"]].head(20)


Unnamed: 0,url,subdomain,tld,host_entropy
244,https://caixabank-es-883f1e.ingress-erytho.eas...,caixabank-es-883f1e.ingress-erytho,com,1.0
245,http://122.114.173.242:30/bancosantander.es/pa...,,,0.0
246,http://wingerdgastehuis.co.za/Bienvenido%20a%2...,,co.za,0.0
247,http://webseguridadcuenta-9e626b.ingress-bonde...,webseguridadcuenta-9e626b.ingress-bonde,com,0.99271
248,https://www.mobile.kinman.com/.well-known/acme...,www.mobile,com,0.495301
249,https://wififpt.com.vn/es/bankia.es/es/acceso-...,,com.vn,0.0
250,https://web-5.builderallwppro.com/necorreos/vv...,web-5,com,0.241868
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,suponsoro22-ba9799.ingress-daribow,live,0.976934
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,suponsoo22-ba6aa2.ingress-florina,live,0.94133
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,supoertas22-bb468f.ingress-bonde,live,1.0


In [5]:
# First 30 rows with label == 1, restricted to the host-related columns.
df.loc[df["label"] == 1, ["url", "subdomain", "domain", "host_entropy"]].head(30)


Unnamed: 0,url,subdomain,domain,host_entropy
244,https://caixabank-es-883f1e.ingress-erytho.eas...,caixabank-es-883f1e.ingress-erytho,caixabank-es-883f1e.ingress-erytho.easywp.com,1.0
245,http://122.114.173.242:30/bancosantander.es/pa...,,122.114.173.242:30,0.0
246,http://wingerdgastehuis.co.za/Bienvenido%20a%2...,,wingerdgastehuis.co.za,0.0
247,http://webseguridadcuenta-9e626b.ingress-bonde...,webseguridadcuenta-9e626b.ingress-bonde,webseguridadcuenta-9e626b.ingress-bonde.easywp...,0.99271
248,https://www.mobile.kinman.com/.well-known/acme...,www.mobile,www.mobile.kinman.com,0.495301
249,https://wififpt.com.vn/es/bankia.es/es/acceso-...,,wififpt.com.vn,0.0
250,https://web-5.builderallwppro.com/necorreos/vv...,web-5,web-5.builderallwppro.com,0.241868
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,suponsoro22-ba9799.ingress-daribow,suponsoro22-ba9799.ingress-daribow.ewp.live,0.976934
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,suponsoo22-ba6aa2.ingress-florina,suponsoo22-ba6aa2.ingress-florina.ewp.live,0.94133
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,supoertas22-bb468f.ingress-bonde,supoertas22-bb468f.ingress-bonde.ewp.live,1.0


In [6]:
def extract_host_base(url):
    """Return the host without its public suffix, lower-cased.

    The result is ``"<subdomain>.<domain>"`` when tldextract reports a
    subdomain and just ``"<domain>"`` otherwise.

    Parameters
    ----------
    url : str
        URL (or bare host) handed to ``tldextract.extract``.

    Returns
    -------
    str
        The suffix-less host, or ``""`` when ``url`` is not a string (for
        example a NaN from an empty CSV cell).
    """
    if not isinstance(url, str):
        # A missing URL has no host; avoid raising inside Series.apply.
        return ""
    ext = tldextract.extract(url)
    host = f"{ext.subdomain}.{ext.domain}" if ext.subdomain else ext.domain
    # Lower-case so the value agrees with the lower-cased "subdomain" column
    # built by extract_parts; host names are case-insensitive.
    return host.lower()

# Store the suffix-less host of every URL as a new column.
df["host_base"] = df["url"].map(extract_host_base)

df.loc[:, ["url", "host_base", "host_entropy"]].head(30)


Unnamed: 0,url,host_base,host_entropy
0,https://www.caixabank.es/particular/banca-digi...,www.caixabank,0.0
1,https://www.ibercaja.es/particulares/,www.ibercaja,0.0
2,https://www.wizink.es/tarjetas,www.wizink,0.0
3,https://www.wizink.es/,www.wizink,0.0
4,https://www.cetelem.es/credito-y-prestamos/,www.cetelem,0.0
5,https://www.cetelem.es/,www.cetelem,0.0
6,https://www.cajamar.es/es/particulares/product...,www.cajamar,0.0
7,https://www.ibercaja.es/empresas/,www.ibercaja,0.0
8,https://www.cajamar.es/es/particulares/,www.cajamar,0.0
9,https://portal.kutxabank.es/cs/Satellite/kb/es...,portal.kutxabank,0.32312


In [7]:
def inspect_url(pattern, n=10, data=None, regex=True):
    """Print the rows whose ``url`` matches ``pattern``, with the main diagnostic columns.

    Parameters
    ----------
    pattern : str
        Text searched for in the ``url`` column via ``Series.str.contains``.
    n : int, default 10
        Maximum number of matching rows printed.
    data : pandas.DataFrame, optional
        Frame to search. When omitted, the notebook-level ``df`` is used,
        which is the original behaviour.
    regex : bool, default True
        Forwarded to ``str.contains``. With the default, ``pattern`` is a
        regular expression, so ``"."`` matches any character; pass
        ``regex=False`` to search for the literal text (e.g. a host name).

    Returns
    -------
    None
        The match count and the selected rows are printed, not returned.
    """
    frame = df if data is None else data
    # na=False: rows with a missing URL never count as a match.
    mask = frame["url"].str.contains(pattern, na=False, regex=regex)
    sel = frame[mask]
    wanted = [
        "url",
        "label",
        "subdomain",
        "domain",
        "host_base",
        "host_entropy",
        "domain_complexity",
        "infra_risk",
        "suspicious_path_token",
        "trusted_token_context",
    ]
    # Only show the columns that exist, so the helper also works before the
    # derived columns have been created.
    cols = [c for c in wanted if c in sel.columns]
    print(f"Coincidencias para patrón '{pattern}': {sel.shape[0]}")
    print(sel[cols].head(n).to_string(index=False))


In [8]:
# NOTE(review): inspect_url passes the pattern to str.contains without regex=False,
# so the "." here presumably acts as a regex wildcard rather than a literal dot.
inspect_url("sncrly-bbva.xyz")


Coincidencias para patrón 'sncrly-bbva.xyz': 1
                                                                 url  label subdomain          domain   host_base  host_entropy  domain_complexity  infra_risk  suspicious_path_token  trusted_token_context
https://sncrly-bbva.xyz/bbva/area_cliente/smsconfirmacionerreur.html      1           sncrly-bbva.xyz sncrly-bbva           0.0           0.865088         2.0                    1.0                   -1.0


In [9]:
# Show the rows whose URL contains "ingress" (first 10 by default).
inspect_url("ingress")


Coincidencias para patrón 'ingress': 84
                                                                                                                    url  label                                            subdomain                                                        domain                                                host_base  host_entropy  domain_complexity  infra_risk  suspicious_path_token  trusted_token_context
     https://caixabank-es-883f1e.ingress-erytho.easywp.com/particular-tarjetas/auth/access/clients/sms.php?verification      1                   caixabank-es-883f1e.ingress-erytho                 caixabank-es-883f1e.ingress-erytho.easywp.com                caixabank-es-883f1e.ingress-erytho.easywp      1.000000           0.479260         0.0                    1.0                   -1.0
http://webseguridadcuenta-9e626b.ingress-bonde.easywp.com/cuenta/eses/auth/access/clients/sms.php?error=1&verification=      1              webseguridadcuenta-9e626b.ingress-bonde 

In [10]:
# Show the rows whose URL contains "builderall" (first 10 by default).
inspect_url("builderall")


Coincidencias para patrón 'builderall': 3
                                                                                   url  label      subdomain                             domain                      host_base  host_entropy  domain_complexity  infra_risk  suspicious_path_token  trusted_token_context
     https://web-5.builderallwppro.com/necorreos/vv/cors/pagomente/recibir_paquete.php      1          web-5          web-5.builderallwppro.com          web-5.builderallwppro      0.241868           0.659895         0.0                    1.0                    0.0
https://loikolaka-1.builderallwppro.com/correosi/vv/cors/pagomente/recibir_paquete.php      1    loikolaka-1    loikolaka-1.builderallwppro.com    loikolaka-1.builderallwppro      0.626120           0.659895         0.0                    1.0                    0.0
                 http://mooneygroup-it.builderallwppro.com/correos/recibir_paquete.php      1 mooneygroup-it mooneygroup-it.builderallwppro.com mooneygroup-it.b

In [11]:
# Show the rows whose URL contains "cprapid" (first 10 by default).
inspect_url("cprapid")


Coincidencias para patrón 'cprapid': 3
                                                                       url  label                      subdomain                                     domain                              host_base  host_entropy  domain_complexity  infra_risk  suspicious_path_token  trusted_token_context
                  https://cuentas.170-64-148-61.cprapid.com/app/bbva_light      1          cuentas.170-64-148-61          cuentas.170-64-148-61.cprapid.com          cuentas.170-64-148-61.cprapid      0.911610           0.488692         0.0                    0.0                    0.0
                     https://cuentas.15-237-142-160.cprapid.com/bbva_light      1         cuentas.15-237-142-160         cuentas.15-237-142-160.cprapid.com         cuentas.15-237-142-160.cprapid      0.961058           0.488692         0.0                    0.0                    0.0
https://banco.online.bbva.146-70-81-49.cprapid.com/bbva-movil/es/index.php      1 banco.online.bbva.146

In [12]:
import pandas as pd
import tldextract
import numpy as np

df = df.copy()  # safety: rebind df to a fresh copy before the columns below are (re)computed

# -----------------------------
# 1) Subdominio + TLD
# -----------------------------
def extract_parts(url):
    """Split a URL into its subdomain and public suffix, both lower-cased.

    NOTE: this redefines the ``extract_parts`` helper declared in an earlier
    cell of this notebook; the later definition is the one in effect from
    here on.

    Parameters
    ----------
    url : str
        URL (or bare host) handed to ``tldextract.extract``.

    Returns
    -------
    pandas.Series
        Two entries: ``"subdomain"`` and ``"tld"`` (tldextract's ``suffix``).
        Both are empty strings when ``url`` is not a string (for example a
        NaN produced by an empty CSV cell), so applying this function over a
        column does not abort on a single missing value.
    """
    if not isinstance(url, str):
        # A missing URL cannot be parsed; report empty parts instead of raising.
        return pd.Series({"subdomain": "", "tld": ""})
    ext = tldextract.extract(url)
    return pd.Series({
        "subdomain": ext.subdomain.lower(),
        "tld": ext.suffix.lower()
    })

# Same assignment as in the earlier cell: rebuilds "subdomain" and "tld" from "url".
df[["subdomain", "tld"]] = df["url"].apply(extract_parts)

# -----------------------------
# 2) host_entropy (Shannon)
# -----------------------------
def shannon_entropy(s):
    """Shannon entropy, in bits per symbol, of the characters of ``s``.

    Parameters
    ----------
    s : str
        Text whose character distribution is measured. Any non-empty iterable
        of symbols accepted by ``list()`` also works.

    Returns
    -------
    float
        ``sum(p * log2(1 / p))`` over the relative frequency ``p`` of each
        distinct symbol. ``0.0`` for an empty or missing (``None`` / NaN)
        input and for inputs made of a single repeated symbol.
    """
    # Empty, None or a float (NaN from a missing cell): nothing to measure.
    if not s or isinstance(s, float):
        return 0.0
    _, counts = np.unique(list(s), return_counts=True)
    probs = counts / counts.sum()
    # Summing p * log2(1/p) instead of negating sum(p * log2(p)) keeps the
    # single-symbol case at +0.0; the negated form yields -0.0.
    return float((probs * np.log2(1.0 / probs)).sum())

def compute_host_entropy(sub):
    """Entropy of a subdomain string, ignoring the dots between labels.

    Returns ``0.0`` when nothing is left after the dots are removed;
    otherwise delegates to ``shannon_entropy``.
    """
    letters = "".join(sub.split("."))
    if not letters:
        return 0.0
    return shannon_entropy(letters)

df["host_entropy_calc"] = df["subdomain"].map(compute_host_entropy)

# -----------------------------
# 3) subdomain_missing_flag
#    1 when the subdomain is empty and the suffix is not "es", else 0
# -----------------------------
df["subdomain_missing_flag"] = (
    df["subdomain"].eq("") & df["tld"].ne("es")
).astype(int)

# -----------------------------
# 4) Global summary
# -----------------------------
# NOTE(review): in the rows shown, host_entropy stays within [0, 1] while
# host_entropy_calc is in bits; the two columns look like they are on
# different scales — confirm before comparing them directly.
print("Comparación host_entropy original vs calculada:")
display(df.loc[:, ["host_entropy", "host_entropy_calc"]].head())

# Per-label descriptive statistics of the recomputed entropy.
print("\nDistribución host_entropy_calc por clase:")
display(df.groupby("label")["host_entropy_calc"].describe())

# Per-label share of rows carrying the flag.
print("\nDistribución subdomain_missing_flag por clase:")
display(df.groupby("label")["subdomain_missing_flag"].mean())

df.head()


Comparación host_entropy original vs calculada:


Unnamed: 0,host_entropy,host_entropy_calc
0,0.0,-0.0
1,0.0,-0.0
2,0.0,-0.0
3,0.0,-0.0
4,0.0,-0.0



Distribución host_entropy_calc por clase:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,244.0,0.541622,1.01457,-0.0,-0.0,-0.0,0.0,3.32782
1,248.0,2.536607,1.624116,-0.0,0.0,3.264621,3.873337,4.351837



Distribución subdomain_missing_flag por clase:


label
0    0.032787
1    0.169355
Name: subdomain_missing_flag, dtype: float64

Unnamed: 0,campaign,categoria,confianza,confidence,dataset_split,domain,entidad,free_hosting,inclusion,is_https,...,token_density,trusted_token_context,infra_risk,fake_tld_in_subdomain_or_path,param_count_boost,subdomain,tld,host_base,host_entropy_calc,subdomain_missing_flag
0,,banca,,,,,,,,,...,0.125,0.0,0.0,0.0,0.0,www,es,www.caixabank,-0.0,0
1,,,,95.0,,,Ibercaja,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,www,es,www.ibercaja,-0.0,0
2,,,,90.0,,,WiZink,0.0,,1.0,...,0.5,0.0,0.0,0.0,0.0,www,es,www.wizink,-0.0,0
3,,,,95.0,,,WiZink,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,www,es,www.wizink,-0.0,0
4,,,,90.0,,,Cetelem,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,www,es,www.cetelem,-0.0,0
