In [1]:
import pandas as pd
import re

df = pd.read_csv("../data/clean/dataset_base_v21.csv")

# Extraemos registered_domain para proteger dominions legítimos
if "registered_domain" not in df.columns:
    import tldextract
    df["registered_domain"] = df["url"].apply(lambda u: tldextract.extract(u).registered_domain)

# 1. Construir lista de marcas desde WHITELIST
# Asumimos que whitelist ya cargada en tu proyecto como set de dominios.
# Si no, rompe en tokens la columna registered_domain de URLs legítimas (label 0)

brands = (
    df[df["label"] == 0]["registered_domain"]
    .dropna()
    .apply(lambda d: d.split(".")[0].lower())
    .unique()
)

brands_set = set(brands)

# 2. Función para detectar marca en PATH (solo tokens)
def brand_in_path(url):
    try:
        path = url.split("/", 3)[-1].lower()
    except:
        return 0

    # Tokenización estricta
    tokens = re.split(r"[\/\-\_\.\=\&\?\%]", path)

    for b in brands_set:
        if b in tokens:
            return 1
    return 0

df["brand_in_path_raw"] = df["url"].apply(brand_in_path)

# 3. Activación SOLO si dominio NO es legítimo
df["brand_in_path_final"] = df.apply(
    lambda row: 1 if (row["brand_in_path_raw"] == 1 and row["registered_domain"] not in df[df["label"]==0]["registered_domain"].values) else 0,
    axis=1
)

print("### DISTRIBUCIÓN por clase (raw) ###")
display(df.groupby("label")["brand_in_path_raw"].describe())

print("\n### DISTRIBUCIÓN por clase (final, con protección dominio legítimo) ###")
display(df.groupby("label")["brand_in_path_final"].describe())

print("\n### FALSOS POSITIVOS (si hay alguno, esta feature se DESCARTA) ###")
display(df[(df["label"] == 0) & (df["brand_in_path_final"] == 1)][["url","registered_domain"]].head(20))

print("\n### Casos phishing donde la feature se activa ###")
display(df[(df["label"] == 1) & (df["brand_in_path_final"] == 1)][["url","registered_domain"]].head(20))


### DISTRIBUCIÓN por clase (raw) ###


  df["registered_domain"] = df["url"].apply(lambda u: tldextract.extract(u).registered_domain)


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,244.0,0.106557,0.309184,0.0,0.0,0.0,0.0,1.0
1,248.0,0.258065,0.438455,0.0,0.0,0.0,1.0,1.0



### DISTRIBUCIÓN por clase (final, con protección dominio legítimo) ###


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,244.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,248.0,0.258065,0.438455,0.0,0.0,0.0,1.0,1.0



### FALSOS POSITIVOS (si hay alguno, esta feature se DESCARTA) ###


Unnamed: 0,url,registered_domain



### Casos phishing donde la feature se activa ###


Unnamed: 0,url,registered_domain
245,http://122.114.173.242:30/bancosantander.es/pa...,
255,https://sncrly-bbva.xyz/bbva/area_cliente/smsc...,sncrly-bbva.xyz
270,https://eniar-codigos-postales-oficinas-user56...,codeanyapp.com
271,https://cuentas.170-64-148-61.cprapid.com/app/...,cprapid.com
273,https://cuentas.15-237-142-160.cprapid.com/bbv...,cprapid.com
275,http://ing-banco.es.swtest.ru/ing/ing/sms_code...,swtest.ru
279,https://bobo-bc1ece.ingress-erytho.ewp.live/sa...,ewp.live
280,https://invetsofia-b96935.ingress-earth.ewp.li...,ewp.live
281,https://oldoone-ba3214.ingress-comporellon.ewp...,ewp.live
285,https://service-2-a0ec7f.ingress-erytho.ewp.li...,ewp.live


In [2]:
df_no_activation = df[
    (df["label"] == 1) &
    (df["brand_in_path_raw"] == 0) &
    (df["brand_in_path_final"] == 0)
][["url", "registered_domain"]]

df_no_activation.head(30)


Unnamed: 0,url,registered_domain
244,https://caixabank-es-883f1e.ingress-erytho.eas...,easywp.com
246,http://wingerdgastehuis.co.za/Bienvenido%20a%2...,wingerdgastehuis.co.za
247,http://webseguridadcuenta-9e626b.ingress-bonde...,easywp.com
248,https://www.mobile.kinman.com/.well-known/acme...,kinman.com
249,https://wififpt.com.vn/es/bankia.es/es/acceso-...,wififpt.com.vn
250,https://web-5.builderallwppro.com/necorreos/vv...,builderallwppro.com
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,ewp.live
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,ewp.live
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,ewp.live
254,https://supanort22-baa4c7.ingress-erytho.ewp.l...,ewp.live


In [3]:
df_borderline = df[
    (df["label"] == 1) &
    (df["brand_in_path_raw"] == 1) &
    (df["brand_in_path_final"] == 0)
][["url", "registered_domain"]]

df_borderline.head(50)


Unnamed: 0,url,registered_domain
