In [4]:
import pandas as pd
from urllib.parse import urlparse

# --------------------------
# 1) Cargar dataset
# --------------------------
PATH = "../data/clean/dataset_base_v21.csv"
df = pd.read_csv(PATH).copy()

# --------------------------
# 2) Trusted tokens (v2.4)
# --------------------------
TRUSTED_TOKENS = [
    "acceso", "login", "clientes", "personal",
    "particulares", "seguridad", "banca",
    "oficina", "cuentas", "ayuda"
]

# --------------------------
# 3) Extract path + whitelist
# (asumimos que ya tienes column: domain_whitelist_score)
# Si no, recalculamos:
# --------------------------
import tldextract
def get_registered_domain(url):
    ext = tldextract.extract(url)
    return f"{ext.domain.lower()}.{ext.suffix.lower()}" if ext.domain and ext.suffix else ""

WL_PATH = "../docs/whitelist.csv"
wl = set(pd.read_csv(WL_PATH, header=None)[0].str.lower().tolist())

df["registered_domain"] = df["url"].apply(get_registered_domain)
df["domain_whitelist_calc"] = df["registered_domain"].isin(wl).astype(int)

# --------------------------
# 4) trusted_token_context_calc
# --------------------------
def compute_ttc(url, whitelist_flag):
    path = urlparse(url).path.lower()
    has_token = any(t in path for t in TRUSTED_TOKENS)
    if not has_token:
        return 0
    return 1 if whitelist_flag == 1 else -1

df["trusted_token_context_calc"] = df.apply(
    lambda row: compute_ttc(row["url"], row["domain_whitelist_calc"]), axis=1
)

# --------------------------
# 5) Métricas por clase
# --------------------------
print("Distribución por clase:")
display(df.groupby("label")["trusted_token_context_calc"].mean())

# --------------------------
# 6) Ejemplos de -1 (sospechosos)
# --------------------------
print("\nEjemplos TTC = -1 (esto debería ser phishing):")
display(df[df["trusted_token_context_calc"] == -1].head(15)[["url","label"]])

# --------------------------
# 7) Ejemplos de +1 (legítimos)
# --------------------------
print("\nEjemplos TTC = +1 (deben ser legítimos .es):")
display(df[df["trusted_token_context_calc"] == 1].head(15)[["url","label"]])


Distribución por clase:


label
0    0.286885
1   -0.193548
Name: trusted_token_context_calc, dtype: float64


Ejemplos TTC = -1 (esto debería ser phishing):


Unnamed: 0,url,label
6,https://www.cajamar.es/es/particulares/product...,0
8,https://www.cajamar.es/es/particulares/,0
14,https://www.abanca.com/es/banca-personal/,0
34,https://www.abanca.com/es/banca-a-distancia/ba...,0
78,https://www.mapfre.es/particulares/contacto/da...,0
79,https://www.endesa.com/es/te-ayudamos/gestione...,0
83,https://www.mapfre.es/particulares/,0
84,https://www.endesa.com/es/te-ayudamos/sobre-tu...,0
170,https://accounts.binance.com/es/login,0
179,https://miyoigo.yoigo.com/login,0



Ejemplos TTC = +1 (deben ser legítimos .es):


Unnamed: 0,url,label
0,https://www.caixabank.es/particular/banca-digi...,0
1,https://www.ibercaja.es/particulares/,0
9,https://portal.kutxabank.es/cs/Satellite/kb/es...,0
10,https://portal.kutxabank.es/cs/Satellite/kb/es...,0
11,https://www.unicajabanco.es/es/particulares/hi...,0
13,https://www.unicajabanco.es/es/banca-digital,0
15,https://portal.kutxabank.es/cs/Satellite/kb/es...,0
17,https://www.myinvestor.es/cuentas/,0
18,https://mioficina.correos.es/es/es/login,0
21,https://www.correos.es/es/es/particulares/envi...,0


In [5]:
df[df["label"] == 0]["registered_domain"].value_counts().head(30)


registered_domain
correos.es                  23
bbva.es                     11
ing.es                       9
bancosantander.es            9
bancsabadell.com             8
caixabank.es                 7
binance.com                  7
bankinter.com                7
unicajabanco.es              7
openbank.es                  7
abanca.com                   6
ibercaja.es                  5
iberdrola.es                 5
kutxabank.es                 5
coinbase.com                 5
ionos.es                     4
amazon.es                    4
dgt.gob.es                   3
google.com                   3
carrefour.es                 3
netflix.com                  3
agenciatributaria.gob.es     3
microsoft.com                3
nacex.es                     3
cajamar.es                   3
mapfre.es                    3
mediamarkt.es                2
myinvestor.es                2
seur.com                     2
mrw.es                       2
Name: count, dtype: int64

In [6]:
import pandas as pd
import tldextract

# Partimos de df ya cargado desde dataset_base_v21
# Si no existe registered_domain, lo recomputamos
if "registered_domain" not in df.columns:
    def get_registered_domain(url):
        ext = tldextract.extract(url)
        return f"{ext.domain.lower()}.{ext.suffix.lower()}" if ext.domain and ext.suffix else ""
    df["registered_domain"] = df["url"].apply(get_registered_domain)

# 1) Lista mínima de marcas legítimas (derivada de tu top)
BRAND_KEYWORDS = [
    "correos", "bbva", "ing", "bancosantander", "santander",
    "bancsabadell", "sabadell", "caixabank", "binance",
    "bankinter", "unicajabanco", "openbank", "abanca",
    "ibercaja", "iberdrola", "kutxabank", "coinbase",
    "ionos", "amazon", "dgt", "carrefour", "netflix",
    "agenciatributaria", "microsoft", "nacex", "cajamar",
    "mapfre", "mediamarkt", "myinvestor", "seur", "mrw"
]

def brand_match(domain: str) -> int:
    d = domain.lower()
    return int(any(brand in d for brand in BRAND_KEYWORDS))

df["brand_match_flag"] = df["registered_domain"].apply(brand_match)

# 2) Distribución por clase
print("brand_match_flag por label:")
display(df.groupby("label")["brand_match_flag"].mean().to_frame("mean_flag"))

print("\nEjemplos brand_match_flag = 1 (muestra):")
display(df[df["brand_match_flag"] == 1].head(15)[["url", "registered_domain", "label"]])


brand_match_flag por label:


Unnamed: 0_level_0,mean_flag
label,Unnamed: 1_level_1
0,0.70082
1,0.08871



Ejemplos brand_match_flag = 1 (muestra):


Unnamed: 0,url,registered_domain,label
0,https://www.caixabank.es/particular/banca-digi...,caixabank.es,0
1,https://www.ibercaja.es/particulares/,ibercaja.es,0
6,https://www.cajamar.es/es/particulares/product...,cajamar.es,0
7,https://www.ibercaja.es/empresas/,ibercaja.es,0
8,https://www.cajamar.es/es/particulares/,cajamar.es,0
9,https://portal.kutxabank.es/cs/Satellite/kb/es...,kutxabank.es,0
10,https://portal.kutxabank.es/cs/Satellite/kb/es...,kutxabank.es,0
11,https://www.unicajabanco.es/es/particulares/hi...,unicajabanco.es,0
12,https://www.unicajabanco.es/es/empresas-y-auto...,unicajabanco.es,0
13,https://www.unicajabanco.es/es/banca-digital,unicajabanco.es,0


Distribución TTC_v26:


label
0   -0.368852
1   -0.209677
Name: trusted_token_context_v26, dtype: float64


TTC = -1 (deben ser phishing):


Unnamed: 0,url,registered_domain,label
0,https://www.caixabank.es/particular/banca-digi...,caixabank.es,0
1,https://www.ibercaja.es/particulares/,ibercaja.es,0
6,https://www.cajamar.es/es/particulares/product...,cajamar.es,0
8,https://www.cajamar.es/es/particulares/,cajamar.es,0
9,https://portal.kutxabank.es/cs/Satellite/kb/es...,kutxabank.es,0
10,https://portal.kutxabank.es/cs/Satellite/kb/es...,kutxabank.es,0
11,https://www.unicajabanco.es/es/particulares/hi...,unicajabanco.es,0
13,https://www.unicajabanco.es/es/banca-digital,unicajabanco.es,0
14,https://www.abanca.com/es/banca-personal/,abanca.com,0
15,https://portal.kutxabank.es/cs/Satellite/kb/es...,kutxabank.es,0



TTC = +1 (legítimos .es):


Unnamed: 0,url,registered_domain,label


In [10]:
# Normalizar whitelist para evitar ints u objetos no string
wl_clean = []

for d in wl:
    if isinstance(d, str):
        wl_clean.append(d.strip().lower())
    else:
        # convertir a string si es convertible (ej: 123 → "123")
        try:
            wl_clean.append(str(d).strip().lower())
        except:
            pass

# reemplazar la lista original
wl = wl_clean


In [12]:
# --------------------------
# brand_match_flag_v1 (solo marcas españolas)
# --------------------------

BRANDS_ES = {
    "caixabank", "bbva", "ing", "santander", "openbank",
    "kutxabank", "bankinter", "abanca", "unicajabanco",
    "ibercaja", "cajamar", "mapfre", "correos",
    "nacex", "mrw", "seur", "masmovil", "yoigo",
    "renfe", "carrefour", "mediamarkt"
}

def brand_match_v1(domain: str) -> int:
    d = domain.lower()
    return int(any(b in d for b in BRANDS_ES))

df["brand_match_flag_v1"] = df["registered_domain"].apply(brand_match_v1)

print("Distribución brand_match_flag_v1 por clase:")
display(df.groupby("label")["brand_match_flag_v1"].mean())


Distribución brand_match_flag_v1 por clase:


label
0    0.512295
1    0.060484
Name: brand_match_flag_v1, dtype: float64

In [13]:
# --------------------------
# 1) Preparar flags previos
# --------------------------
wl = df["domain_whitelist_flag"] if "domain_whitelist_flag" in df.columns else df["domain_whitelist_calc"]
bm = df["brand_match_flag_v1"]
tk = df["token_flag"]

# --------------------------
# 2) Calcular TTC_v27
# --------------------------
def compute_ttc_v27(row):
    token = row["token_flag"]
    wl_flag = row["domain_whitelist_flag"] if "domain_whitelist_flag" in row else row["domain_whitelist_calc"]
    brand = row["brand_match_flag_v1"]
    
    if token == 0:
        return 0
    if wl_flag == 1:
        return +1
    if brand == 1:
        return 0
    return -1

df["trusted_token_context_v27"] = df.apply(compute_ttc_v27, axis=1)

# --------------------------
# 3) Métricas
# --------------------------
print("Distribución TTC_v27 por clase:")
display(df.groupby("label")["trusted_token_context_v27"].mean())

# --------------------------
# 4) Ejemplos TTC_v27 = -1 (deben ser phishing)
# --------------------------
print("\nEjemplos TTC_v27 = -1:")
display(df[df["trusted_token_context_v27"] == -1].head(15)[["url","registered_domain","label"]])

# --------------------------
# 5) Ejemplos TTC_v27 = +1 (dominios oficiales .es)
# --------------------------
print("\nEjemplos TTC_v27 = +1:")
display(df[df["trusted_token_context_v27"] == 1].head(15)[["url","registered_domain","label"]])


Distribución TTC_v27 por clase:


label
0   -0.118852
1   -0.201613
Name: trusted_token_context_v27, dtype: float64


Ejemplos TTC_v27 = -1:


Unnamed: 0,url,registered_domain,label
17,https://www.myinvestor.es/cuentas/,myinvestor.es,0
63,https://www.bancsabadell.com/bsnacional/es/cen...,bancsabadell.com,0
65,https://www.bancsabadell.com/bsnacional/es/par...,bancsabadell.com,0
66,https://www.bancsabadell.com/cs/Satellite/SabA...,bancsabadell.com,0
67,https://www.bancsabadell.com/bsnacional/es/par...,bancsabadell.com,0
79,https://www.endesa.com/es/te-ayudamos/gestione...,endesa.com,0
82,https://www.lineadirecta.com/clientes/login.in...,lineadirecta.com,0
84,https://www.endesa.com/es/te-ayudamos/sobre-tu...,endesa.com,0
107,https://www.pccomponentes.com/login?srsltid=Af...,pccomponentes.com,0
131,https://www.office.com/login?es=UnauthClick,office.com,0



Ejemplos TTC_v27 = +1:


Unnamed: 0,url,registered_domain,label


In [14]:
df[(df["label"] == 0) & (df["trusted_token_context_v27"] == -1)][["registered_domain"]].value_counts().head(30)


registered_domain      
bancsabadell.com           6
ionos.es                   3
endesa.com                 2
pccomponentes.com          2
netflix.com                2
bancomediolanum.es         1
myinvestor.es              1
stripe.com                 1
ruralcentral.es            1
roblox.com                 1
office.com                 1
lineadirecta.com           1
mutua.es                   1
iberdrola.es               1
google.com                 1
dropbox.com                1
cajaruraldeasturias.com    1
binance.com                1
vodafone.es                1
Name: count, dtype: int64

In [15]:
# -----------------------------------
# brand_match_flag_v2 — marcas españolas reales
# -----------------------------------

BRANDS_ES_V2 = {
    "caixabank", "bbva", "ing", "santander", "openbank",
    "kutxabank", "bankinter", "abanca", "unicajabanco",
    "ibercaja", "cajamar", "mapfre", "correos", "nacex",
    "mrw", "seur", "masmovil", "yoigo", "renfe",
    "carrefour", "mediamarkt",

    # añadidas tras ver falsos positivos legítimos
    "bancsabadell", "bancomediolanum",
    "cajaruraldeasturias", "ruralcentral",
    "myinvestor", "endesa", "iberdrola",
    "mutua", "lineadirecta", "vodafone",
    "pccomponentes", "ionos"
}

def brand_match_v2(domain: str) -> int:
    d = domain.lower()
    return int(any(b in d for b in BRANDS_ES_V2))

df["brand_match_flag_v2"] = df["registered_domain"].apply(brand_match_v2)

print("Distribución brand_match_flag_v2 por clase:")
display(df.groupby("label")["brand_match_flag_v2"].mean())


Distribución brand_match_flag_v2 por clase:


label
0    0.635246
1    0.060484
Name: brand_match_flag_v2, dtype: float64

In [16]:
# Recalcular TTC_v27 con brand_match_flag_v2
def compute_ttc_v27(row):
    token = row["token_flag"]
    wl_flag = row["domain_whitelist_flag"] if "domain_whitelist_flag" in row else row["domain_whitelist_calc"]
    brand = row["brand_match_flag_v2"]
    
    if token == 0:
        return 0
    if wl_flag == 1:
        return +1
    if brand == 1:
        return 0
    return -1

df["trusted_token_context_v27"] = df.apply(compute_ttc_v27, axis=1)

print("Distribución TTC_v27 por clase:")
display(df.groupby("label")["trusted_token_context_v27"].mean())

print("\nTTC_v27 = -1 (deben ser phishing):")
display(df[df["trusted_token_context_v27"] == -1].head(10)[["url","registered_domain","label"]])

print("\nTTC_v27 = +1 (solo dominios oficiales .es):")
display(df[df["trusted_token_context_v27"] == 1].head(10)[["url","registered_domain","label"]])


Distribución TTC_v27 por clase:


label
0   -0.032787
1   -0.201613
Name: trusted_token_context_v27, dtype: float64


TTC_v27 = -1 (deben ser phishing):


Unnamed: 0,url,registered_domain,label
131,https://www.office.com/login?es=UnauthClick,office.com,0
142,https://dashboard.stripe.com/login,stripe.com,0
170,https://accounts.binance.com/es/login,binance.com,0
203,https://www.dropbox.com/es_ES/login,dropbox.com,0
212,https://www.roblox.com/es/login,roblox.com,0
215,https://accounts.google.com/ServiceLogin?servi...,google.com,0
235,https://www.netflix.com/es/loginHelp,netflix.com,0
239,https://www.netflix.com/es/login,netflix.com,0
245,http://122.114.173.242:30/bancosantander.es/pa...,,1
249,https://wififpt.com.vn/es/bankia.es/es/acceso-...,wififpt.com.vn,1



TTC_v27 = +1 (solo dominios oficiales .es):


Unnamed: 0,url,registered_domain,label


In [17]:
df[(df["label"] == 0) & (df["token_flag"] == 1)]["registered_domain"].value_counts().head(30)


registered_domain
correos.es                 15
bancosantander.es           9
bbva.es                     7
caixabank.es                6
bancsabadell.com            6
unicajabanco.es             5
bankinter.com               5
kutxabank.es                4
ibercaja.es                 4
ionos.es                    3
ing.es                      2
endesa.com                  2
mapfre.es                   2
pccomponentes.com           2
openbank.es                 2
abanca.com                  2
cajamar.es                  2
netflix.com                 2
elcorteingles.es            1
office.com                  1
cajaruraldeasturias.com     1
ruralcentral.es             1
masmovil.es                 1
lineadirecta.com            1
google.com                  1
roblox.com                  1
dropbox.com                 1
mutua.es                    1
iberdrola.es                1
myinvestor.es               1
Name: count, dtype: int64

In [18]:
# -----------------------------------
# TTC_v28 — versión limpia sin token_flag
# -----------------------------------

def compute_ttc_v28(row):
    wl_flag = row["domain_whitelist_flag"] if "domain_whitelist_flag" in row else row["domain_whitelist_calc"]
    brand = row["brand_match_flag_v2"]

    if wl_flag == 1:
        return +1
    if brand == 1:
        return 0
    return -1

df["trusted_token_context_v28"] = df.apply(compute_ttc_v28, axis=1)

# Métricas
print("Distribución TTC_v28 por clase:")
display(df.groupby("label")["trusted_token_context_v28"].mean())

# Ejemplos
print("\nTTC_v28 = -1 (deben ser phishing):")
display(df[df["trusted_token_context_v28"] == -1].head(10)[["url","registered_domain","label"]])

print("\nTTC_v28 = +1 (deben ser dominios oficiales .es legítimos):")
display(df[df["trusted_token_context_v28"] == 1].head(10)[["url","registered_domain","label"]])


Distribución TTC_v28 por clase:


label
0   -0.364754
1   -0.939516
Name: trusted_token_context_v28, dtype: float64


TTC_v28 = -1 (deben ser phishing):


Unnamed: 0,url,registered_domain,label
2,https://www.wizink.es/tarjetas,wizink.es,0
3,https://www.wizink.es/,wizink.es,0
4,https://www.cetelem.es/credito-y-prestamos/,cetelem.es,0
5,https://www.cetelem.es/,cetelem.es,0
76,https://www.amazon.es/gp/help/customer/display...,amazon.es,0
87,https://sede.dgt.gob.es/es/,dgt.gob.es,0
88,https://sede.seg-social.gob.es/wps/portal/sede...,seg-social.gob.es,0
89,https://sede.seg-social.gob.es/wps/portal/sede...,seg-social.gob.es,0
90,https://sede.agenciatributaria.gob.es/Sede/not...,agenciatributaria.gob.es,0
91,https://www.amazon.es/gp/your-account/order-hi...,amazon.es,0



TTC_v28 = +1 (deben ser dominios oficiales .es legítimos):


Unnamed: 0,url,registered_domain,label
