In [70]:
import pandas as pd

# Location of the cleaned base dataset (adjust if it lives elsewhere on your machine)
PATH = "../data/clean/dataset_base_v21.csv"
df = pd.read_csv(filepath_or_buffer=PATH)
# (rows, columns) of the freshly loaded frame
df.shape


(492, 31)

In [71]:
# Class balance of the target column; missing labels (if any) get their own row
df.loc[:, "label"].value_counts(dropna=False)


label
1    248
0    244
Name: count, dtype: int64

In [72]:
# Provenance of each row, counting rows with no recorded source as well
df.loc[:, "source"].value_counts(dropna=False)


source
NaN       342
manual    150
Name: count, dtype: int64

In [73]:
# Sector distribution before any imputation; the NaN bucket is what later cells fill in
df.loc[:, "sector"].value_counts(dropna=False)


sector
NaN                                192
Banca                              133
Logística                           91
Genérico / Otros                    28
Cripto / Fintech                    12
Retail / e-commerce / Streaming     10
Energía / Seguros                   10
SaaS / Cloud / Plataformas          10
Público / Administración             5
Administración pública               1
Name: count, dtype: int64

In [74]:
# Distinct values present in the target column
pd.unique(df["label"])


array([0, 1])

In [75]:
# Column inventory as a plain Python list
list(df.columns)


['campaign',
 'categoria',
 'confianza',
 'confidence',
 'dataset_split',
 'domain',
 'entidad',
 'free_hosting',
 'inclusion',
 'is_https',
 'label',
 'matched_target',
 'notas',
 'route_type',
 'ruido',
 'score_total_v2',
 'sector',
 'sector_norm',
 'source',
 'timestamp',
 'url',
 'url_norm',
 'domain_complexity',
 'host_entropy',
 'domain_whitelist_score',
 'suspicious_path_token',
 'token_density',
 'trusted_token_context',
 'infra_risk',
 'fake_tld_in_subdomain_or_path',
 'param_count_boost']

In [76]:
# Sector distribution inside each class (NaN sectors included)
(
    df
    .groupby("label")["sector"]
    .value_counts(dropna=False)
)


label  sector                         
0      NaN                                94
       Banca                              58
       Logística                          35
       Cripto / Fintech                   12
       Energía / Seguros                  10
       Genérico / Otros                   10
       Retail / e-commerce / Streaming    10
       SaaS / Cloud / Plataformas         10
       Público / Administración            5
1      NaN                                98
       Banca                              75
       Logística                          56
       Genérico / Otros                   18
       Administración pública              1
Name: count, dtype: int64

In [77]:
# Peek at the rows whose sector is missing, with the columns the fill rules rely on
df.loc[df["sector"].isna(), ["url", "domain", "entidad", "matched_target", "categoria"]].head(20)


Unnamed: 0,url,domain,entidad,matched_target,categoria
0,https://www.caixabank.es/particular/banca-digi...,,,caixabank,banca
151,https://pasarela.clave.gob.es/Proxy2/ServicePr...,,,cl@ve,público
152,https://www.bancomediolanum.es/es-ES/la-banca-...,,,mediolanum,banca
153,https://www.bancosantander.es/particulares,,,santander,banca
154,https://www.movistar.es/area-cliente/mi-cuenta/,,,movistar,telecomunicaciones
155,https://m.vodafone.es/mves/login,,,vodafone,telecomunicaciones
156,https://www.ing.es/seguridad-internet,,,ing,banca
157,https://www.openbank.es/app-openbank,,,openbank,banca
158,https://www.ing.es/,,,ing,banca
159,https://www.openbank.es/?toggleLogin&go-to-app=0,,,openbank,banca


In [78]:
import re

# Fill missing `sector` values with a cascade of four rules. Each rule only
# touches rows whose sector is still NaN when it runs, so earlier rules win:
#   1. exact lookup of `matched_target`
#   2. exact lookup of `categoria`
#   3. brand tokens found in `url` / `domain`
#   4. fallback to "Genérico / Otros"

# Keep a sample of rows with a null sector so the result can be validated afterwards
sample_columns = ["url", "domain", "entidad", "matched_target", "categoria", "sector"]
nan_subset = df.loc[df["sector"].isna(), sample_columns].head(20)
nan_indices = nan_subset.index

# Number of rows filled by each rule (reported at the end of the cell)
filled_by_rule = {
    "regla_1_matched_target": 0,
    "regla_2_categoria": 0,
    "regla_3_tokens": 0,
    "regla_4_default": 0,
}

# Rule 1: matched_target -> sector
matched_sector_map = {
    "caixabank": "Banca",
    "mediolanum": "Banca",
    "santander": "Banca",
    "ing": "Banca",
    "openbank": "Banca",
    "bbva": "Banca",
    "unicajabanco": "Banca",
    "movistar": "Telecomunicaciones",
    "vodafone": "Telecomunicaciones",
    "orange": "Telecomunicaciones",
    "correos": "Logística",
    "correosexpress": "Logística",
    "walletconnect": "Cripto / Fintech",
    "decathlon": "Retail / e-commerce / Streaming",
    "amazon": "Retail / e-commerce / Streaming",
    "hbo": "Retail / e-commerce / Streaming",
    "clave": "Público / Administración",
    "cl@ve": "Público / Administración",
}
# astype(str) turns NaN into the literal "nan", which is not a key of the map
# and therefore stays unmapped.
matched_lower = df["matched_target"].astype(str).str.lower().str.strip()
sector_from_target = matched_lower.map(matched_sector_map)  # computed once, reused below
mask_rule1 = df["sector"].isna() & sector_from_target.notna()
filled_by_rule["regla_1_matched_target"] = int(mask_rule1.sum())
df.loc[mask_rule1, "sector"] = sector_from_target[mask_rule1]

# Rule 2: categoria -> sector (accented and unaccented spellings are both listed)
categoria_sector_map = {
    "banca": "Banca",
    "telecomunicaciones": "Telecomunicaciones",
    "logística": "Logística",
    "logistica": "Logística",
    "e-commerce": "Retail / e-commerce / Streaming",
    "streaming": "Retail / e-commerce / Streaming",
    "público": "Público / Administración",
    "publico": "Público / Administración",
}
categoria_lower = df["categoria"].astype(str).str.lower().str.strip()
sector_from_categoria = categoria_lower.map(categoria_sector_map)
# The NaN check is re-evaluated here so rows filled by rule 1 are left alone
mask_rule2 = df["sector"].isna() & sector_from_categoria.notna()
filled_by_rule["regla_2_categoria"] = int(mask_rule2.sum())
df.loc[mask_rule2, "sector"] = sector_from_categoria[mask_rule2]

# Rule 3: tokens in URL/domain -> sector (list order is the priority order)
sector_tokens = [
    ("Banca", ["bbva", "santander", "caixabank", "ing", "openbank", "unicaja"]),
    ("Logística", ["correos", "seur", "mrw", "correosexpress"]),
    ("Telecomunicaciones", ["movistar", "vodafone", "orange", "o2"]),
    ("Cripto / Fintech", ["crypto", "wallet", "metamask"]),
    ("Retail / e-commerce / Streaming", ["amazon", "ikea", "zalando", "decathlon"]),
    ("Retail / e-commerce / Streaming", ["hbo", "netflix"]),
]
# Very short tokens ("ing", "o2", "mrw", "hbo") are far too likely to appear
# inside unrelated words ("login", "signin", "ingress", "tracking"), which used
# to label arbitrary URLs as e.g. Banca. Tokens up to this length must therefore
# be delimited by non-alphanumeric characters (".ing.", "-mrw/", ...). Longer
# tokens keep plain substring matching so hosts such as "bancosantander.es"
# are still recognised.
SHORT_TOKEN_MAX_LEN = 3
for sector_value, tokens in sector_tokens:
    token_pattern = "|".join(
        rf"(?<![a-z0-9]){re.escape(t)}(?![a-z0-9])" if len(t) <= SHORT_TOKEN_MAX_LEN else re.escape(t)
        for t in tokens
    )
    token_hits = (
        df["url"].str.contains(token_pattern, case=False, na=False)
        | df["domain"].str.contains(token_pattern, case=False, na=False)
    )
    mask_rule3 = df["sector"].isna() & token_hits
    count_rule3 = int(mask_rule3.sum())
    if count_rule3:
        df.loc[mask_rule3, "sector"] = sector_value
        filled_by_rule["regla_3_tokens"] += count_rule3

# Rule 4: nothing matched -> Genérico / Otros
mask_rule4 = df["sector"].isna()
filled_by_rule["regla_4_default"] = int(mask_rule4.sum())
df.loc[mask_rule4, "sector"] = "Genérico / Otros"

print("Filas rellenadas por regla:")
for regla, cantidad in filled_by_rule.items():
    print(f"- {regla}: {cantidad}")

# Same sample rows before and after, to eyeball the assignments
after_subset = df.loc[nan_indices, sample_columns]
print("\nAntes (sector NaN):")
display(nan_subset)
print("Después de aplicar reglas:")
display(after_subset)



Filas rellenadas por regla:
- regla_1_matched_target: 64
- regla_2_categoria: 68
- regla_3_tokens: 2
- regla_4_default: 58

Antes (sector NaN):


Unnamed: 0,url,domain,entidad,matched_target,categoria,sector
0,https://www.caixabank.es/particular/banca-digi...,,,caixabank,banca,
151,https://pasarela.clave.gob.es/Proxy2/ServicePr...,,,cl@ve,público,
152,https://www.bancomediolanum.es/es-ES/la-banca-...,,,mediolanum,banca,
153,https://www.bancosantander.es/particulares,,,santander,banca,
154,https://www.movistar.es/area-cliente/mi-cuenta/,,,movistar,telecomunicaciones,
155,https://m.vodafone.es/mves/login,,,vodafone,telecomunicaciones,
156,https://www.ing.es/seguridad-internet,,,ing,banca,
157,https://www.openbank.es/app-openbank,,,openbank,banca,
158,https://www.ing.es/,,,ing,banca,
159,https://www.openbank.es/?toggleLogin&go-to-app=0,,,openbank,banca,


Después de aplicar reglas:


Unnamed: 0,url,domain,entidad,matched_target,categoria,sector
0,https://www.caixabank.es/particular/banca-digi...,,,caixabank,banca,Banca
151,https://pasarela.clave.gob.es/Proxy2/ServicePr...,,,cl@ve,público,Público / Administración
152,https://www.bancomediolanum.es/es-ES/la-banca-...,,,mediolanum,banca,Banca
153,https://www.bancosantander.es/particulares,,,santander,banca,Banca
154,https://www.movistar.es/area-cliente/mi-cuenta/,,,movistar,telecomunicaciones,Telecomunicaciones
155,https://m.vodafone.es/mves/login,,,vodafone,telecomunicaciones,Telecomunicaciones
156,https://www.ing.es/seguridad-internet,,,ing,banca,Banca
157,https://www.openbank.es/app-openbank,,,openbank,banca,Banca
158,https://www.ing.es/,,,ing,banca,Banca
159,https://www.openbank.es/?toggleLogin&go-to-app=0,,,openbank,banca,Banca


In [79]:
# Isolate the rows that ended up in the catch-all sector.
# df_gen is an independent copy holding only the "Genérico / Otros" rows.
is_generic = df["sector"].eq("Genérico / Otros")
df_gen = df.loc[is_generic].copy()
print(f"Filas en df_gen: {len(df_gen)}")
display(df_gen.head(20))


Filas en df_gen: 86


Unnamed: 0,campaign,categoria,confianza,confidence,dataset_split,domain,entidad,free_hosting,inclusion,is_https,...,url_norm,domain_complexity,host_entropy,domain_whitelist_score,suspicious_path_token,token_density,trusted_token_context,infra_risk,fake_tld_in_subdomain_or_path,param_count_boost
95,,,,93.0,,,Twitter,0.0,,1.0,...,https://help.twitter.com/es,0.467092,0.166667,0.0,0.0,0.0,0.0,0.0,1.0,0.0
96,,,,93.0,,,LinkedIn,0.0,,1.0,...,https://www.linkedin.com/help/linkedin,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,,,,92.0,,,Okta,0.0,,1.0,...,https://developer.okta.com/docs/guides/impleme...,0.430454,0.495301,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
98,,,,92.0,,,Auth0,0.0,,1.0,...,https://auth0.com/docs/get-started/authenticat...,0.454612,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
99,,,,93.0,,,Microsoft Azure,0.0,,1.0,...,https://status.azure.com/es-es/status,0.454612,0.239787,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,,,,93.0,,,Cloudflare,0.0,,1.0,...,https://www.cloudflarestatus.com,0.679179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101,,,,93.0,,,Amazon Web Services,0.0,,1.0,...,https://status.aws.amazon.com,0.462651,0.411967,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,,,,93.0,,,Stripe España,0.0,,1.0,...,https://stripe.com/es-es/docs/payments,0.47926,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,,,,93.0,,,PayPal España,0.0,,1.0,...,https://www.paypal.com/es/signin,0.446125,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0
104,,,,94.0,,,Redsys España,0.0,,1.0,...,https://www.redsys.es,0.451477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
# Second-pass sectorization for rows labelled "Genérico / Otros".
# Only rows whose current sector is "Genérico / Otros" are modified; the rules
# are tried in list order and a row reassigned by one rule no longer qualifies
# for the following ones.

sample_columns = ["url", "domain", "entidad", "matched_target", "categoria", "sector"]
before_generic = df.loc[df["sector"] == "Genérico / Otros", sample_columns].head(20)
before_indices = before_generic.index

# Rows reassigned to each target sector during this pass
rule_counts = {
    "SaaS / Cloud / Plataformas": 0,
    "Fintech / Pagos": 0,
    "Cripto / Fintech": 0,
    "RRSS / Redes Sociales": 0,
    "Viajes / Transporte": 0,
    "Energía / Seguros": 0,
}

sector_token_map = [
    ("SaaS / Cloud / Plataformas", ["twitter", "linkedin", "okta", "auth0", "microsoft azure", "azure", "cloudflare", "aws"]),
    ("Fintech / Pagos", ["paypal", "stripe", "redsys"]),
    ("Cripto / Fintech", ["binance"]),
    ("RRSS / Redes Sociales", ["whatsapp", "instagram"]),
    ("Viajes / Transporte", ["renfe", "iberia"]),
    ("Energía / Seguros", ["iberdrola", "mutua", "mapfre"]),
]

# Tokens this short (here: "aws") also occur inside ordinary words such as
# "laws" or "draws", so they only count when delimited by non-alphanumeric
# characters (e.g. "status.aws.amazon.com"). Longer tokens keep substring matching.
SHORT_TOKEN_MAX_LEN = 3

for sector_value, tokens in sector_token_map:
    pattern = "|".join(
        rf"(?<![a-z0-9]){re.escape(t)}(?![a-z0-9])" if len(t) <= SHORT_TOKEN_MAX_LEN else re.escape(t)
        for t in tokens
    )
    # A hit in any of the three text columns is enough
    hits = (
        df["entidad"].str.contains(pattern, case=False, na=False) |
        df["url"].str.contains(pattern, case=False, na=False) |
        df["domain"].str.contains(pattern, case=False, na=False)
    )
    mask = (df["sector"] == "Genérico / Otros") & hits
    count = int(mask.sum())
    if count:
        df.loc[mask, "sector"] = sector_value
        rule_counts[sector_value] += count

print("Filas sectorizadas por categoría (solo desde 'Genérico / Otros'):")
for categoria, cnt in rule_counts.items():
    print(f"- {categoria}: {cnt}")

# Same sample rows before and after the pass
after_generic = df.loc[before_indices, sample_columns]
print("\nAntes (ejemplo de 20 filas con sector Genérico / Otros):")
display(before_generic)
print("Después de aplicar las reglas adicionales:")
display(after_generic)


Filas sectorizadas por categoría (solo desde 'Genérico / Otros'):
- SaaS / Cloud / Plataformas: 7
- Fintech / Pagos: 3
- Cripto / Fintech: 2
- RRSS / Redes Sociales: 3
- Viajes / Transporte: 2
- Energía / Seguros: 3

Antes (ejemplo de 20 filas con sector Genérico / Otros):


Unnamed: 0,url,domain,entidad,matched_target,categoria,sector
95,https://help.twitter.com/es,,Twitter,,,Genérico / Otros
96,https://www.linkedin.com/help/linkedin,,LinkedIn,,,Genérico / Otros
97,https://developer.okta.com/docs/guides/impleme...,,Okta,,,Genérico / Otros
98,https://auth0.com/docs/get-started/authenticat...,,Auth0,,,Genérico / Otros
99,https://status.azure.com/es-es/status,,Microsoft Azure,,,Genérico / Otros
100,https://www.cloudflarestatus.com/,,Cloudflare,,,Genérico / Otros
101,https://status.aws.amazon.com/,,Amazon Web Services,,,Genérico / Otros
102,https://stripe.com/es-es/docs/payments,,Stripe España,,,Genérico / Otros
103,https://www.paypal.com/es/signin,,PayPal España,,,Genérico / Otros
104,https://www.redsys.es/,,Redsys España,,,Genérico / Otros


Después de aplicar las reglas adicionales:


Unnamed: 0,url,domain,entidad,matched_target,categoria,sector
95,https://help.twitter.com/es,,Twitter,,,SaaS / Cloud / Plataformas
96,https://www.linkedin.com/help/linkedin,,LinkedIn,,,SaaS / Cloud / Plataformas
97,https://developer.okta.com/docs/guides/impleme...,,Okta,,,SaaS / Cloud / Plataformas
98,https://auth0.com/docs/get-started/authenticat...,,Auth0,,,SaaS / Cloud / Plataformas
99,https://status.azure.com/es-es/status,,Microsoft Azure,,,SaaS / Cloud / Plataformas
100,https://www.cloudflarestatus.com/,,Cloudflare,,,SaaS / Cloud / Plataformas
101,https://status.aws.amazon.com/,,Amazon Web Services,,,SaaS / Cloud / Plataformas
102,https://stripe.com/es-es/docs/payments,,Stripe España,,,Fintech / Pagos
103,https://www.paypal.com/es/signin,,PayPal España,,,Fintech / Pagos
104,https://www.redsys.es/,,Redsys España,,,Fintech / Pagos


In [81]:
# Last pass over "Genérico / Otros" rows, driven only by an exact match on `entidad`.
# Rows that already carry a specific sector are never overwritten.

# Lower-case, collapse whitespace runs into a single space, then trim
entidad_normalizada = df["entidad"].astype(str).str.lower()
entidad_normalizada = entidad_normalizada.str.replace(r"\s+", " ", regex=True).str.strip()

# Normalised entity name -> sector
entidad_sector_map = {
    **dict.fromkeys(
        [
            "twitter",
            "linkedin",
            "okta",
            "auth0",
            "microsoft azure",
            "cloudflare",
            "amazon web services",
        ],
        "SaaS / Cloud / Plataformas",
    ),
    **dict.fromkeys(
        ["stripe españa", "paypal españa", "redsys españa"],
        "Fintech / Pagos",
    ),
}

proposed_sector = entidad_normalizada.map(entidad_sector_map)
mask_generic = df["sector"].eq("Genérico / Otros")
mask_assign = proposed_sector.notna() & mask_generic

# Write the proposal only on generic rows whose entity is in the map
if mask_assign.any():
    df.loc[mask_assign, "sector"] = proposed_sector[mask_assign]

# Per-sector tally of what this pass assigned
counts_assigned = proposed_sector[mask_assign].value_counts().to_dict()
print("Filas sectorizadas por categoría (solo desde 'Genérico / Otros' usando entidad):")
for categoria in ("SaaS / Cloud / Plataformas", "Fintech / Pagos"):
    assigned = counts_assigned.get(categoria, 0)
    print(f"- {categoria}: {assigned}")

# Refresh df_gen so it reflects the rows that are still generic
df_gen = df.loc[df["sector"].eq("Genérico / Otros")].copy()
print(f"\nFilas restantes en df_gen (Genérico / Otros): {len(df_gen)}")
display(df_gen.head(20))


Filas sectorizadas por categoría (solo desde 'Genérico / Otros' usando entidad):
- SaaS / Cloud / Plataformas: 0
- Fintech / Pagos: 0

Filas restantes en df_gen (Genérico / Otros): 66


Unnamed: 0,campaign,categoria,confianza,confidence,dataset_split,domain,entidad,free_hosting,inclusion,is_https,...,url_norm,domain_complexity,host_entropy,domain_whitelist_score,suspicious_path_token,token_density,trusted_token_context,infra_risk,fake_tld_in_subdomain_or_path,param_count_boost
183,,saas,,,,,,,,,...,https://login.yahoo.com,0.436826,0.241868,0.0,0.0,0.0,0.0,0.0,0.0,0.0
190,,energía,,,,,,,,,...,https://mibp.es/es/welcome,0.420676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,,gaming,,,,,,,,,...,https://www.habbo.es,0.427394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194,,saas,,,,,,,,,...,https://wetransfer.com,0.55432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,,saas,,,,,,,,,...,https://www.dropbox.com/es_es/login,0.488692,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
204,,cripto,,,,,,,,,...,https://www.coinbase.com/es-es,0.529964,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
209,,cripto,,,,,,,,,...,https://help.coinbase.com/es-es,0.529964,0.166667,0.0,0.0,0.0,0.0,0.0,1.0,0.0
212,,gaming,,,,,,,,,...,https://www.roblox.com/es/login,0.462651,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0
213,,saas,,,,,,,,,...,https://zoom.us/es/join,0.403717,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
214,,gaming,,,,,,,,,...,https://www.roblox.com/es/upgrades/robux?ctx=n...,0.462651,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5


In [82]:
# Extra sectorization of "Genérico / Otros" rows using only the normalised `entidad`.

# 1. Normalised entity names, kept in a local Series instead of a temporary
#    DataFrame column, so `df` is not left with a stray column if the cell is
#    interrupted. Missing values are filled BEFORE the cast: after astype(str)
#    a NaN is already the literal "nan" and fillna would have nothing to fill.
entidad_norm = df["entidad"].fillna("").astype(str).str.lower().str.strip()

# 2. Exact-match rules on the normalised entity
entidad_sector_map_norm = {
    # SaaS / Cloud / Plataformas
    "twitter": "SaaS / Cloud / Plataformas",
    "linkedin": "SaaS / Cloud / Plataformas",
    "okta": "SaaS / Cloud / Plataformas",
    "auth0": "SaaS / Cloud / Plataformas",
    "microsoft azure": "SaaS / Cloud / Plataformas",
    "cloudflare": "SaaS / Cloud / Plataformas",
    "amazon web services": "SaaS / Cloud / Plataformas",
    # Fintech / Pagos
    "stripe españa": "Fintech / Pagos",
    "paypal españa": "Fintech / Pagos",
    "redsys españa": "Fintech / Pagos",
}

proposed_sector_norm = entidad_norm.map(entidad_sector_map_norm)
mask_generic = df["sector"] == "Genérico / Otros"
mask_assign = mask_generic & proposed_sector_norm.notna()

# 3-4. Assign only where the row is still generic and the entity is in the map
if mask_assign.any():
    df.loc[mask_assign, "sector"] = proposed_sector_norm[mask_assign]

# 5. Per-sector tally of this pass
counts_assigned_norm = proposed_sector_norm[mask_assign].value_counts().to_dict()
print("Filas sectorizadas por categoría (usando entidad_norm desde 'Genérico / Otros'):")
for categoria in [
    "SaaS / Cloud / Plataformas",
    "Fintech / Pagos",
]:
    print(f"- {categoria}: {counts_assigned_norm.get(categoria, 0)}")

# 6. Final count of rows still in the generic sector
remaining_generic = int((df["sector"] == "Genérico / Otros").sum())
print(f"\nFilas restantes en 'Genérico / Otros': {remaining_generic}")

# Drop an `entidad_norm` column left behind by an earlier version of this cell
# (no-op when the column is absent), so `df` ends without it as before.
df.drop(columns=["entidad_norm"], inplace=True, errors="ignore")

# Show up to 20 of the rows that remain generic
if remaining_generic:
    display(df[df["sector"] == "Genérico / Otros"].head(20))


Filas sectorizadas por categoría (usando entidad_norm desde 'Genérico / Otros'):
- SaaS / Cloud / Plataformas: 0
- Fintech / Pagos: 0

Filas restantes en 'Genérico / Otros': 66


Unnamed: 0,campaign,categoria,confianza,confidence,dataset_split,domain,entidad,free_hosting,inclusion,is_https,...,url_norm,domain_complexity,host_entropy,domain_whitelist_score,suspicious_path_token,token_density,trusted_token_context,infra_risk,fake_tld_in_subdomain_or_path,param_count_boost
183,,saas,,,,,,,,,...,https://login.yahoo.com,0.436826,0.241868,0.0,0.0,0.0,0.0,0.0,0.0,0.0
190,,energía,,,,,,,,,...,https://mibp.es/es/welcome,0.420676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191,,gaming,,,,,,,,,...,https://www.habbo.es,0.427394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
194,,saas,,,,,,,,,...,https://wetransfer.com,0.55432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
203,,saas,,,,,,,,,...,https://www.dropbox.com/es_es/login,0.488692,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0
204,,cripto,,,,,,,,,...,https://www.coinbase.com/es-es,0.529964,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
209,,cripto,,,,,,,,,...,https://help.coinbase.com/es-es,0.529964,0.166667,0.0,0.0,0.0,0.0,0.0,1.0,0.0
212,,gaming,,,,,,,,,...,https://www.roblox.com/es/login,0.462651,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0
213,,saas,,,,,,,,,...,https://zoom.us/es/join,0.403717,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
214,,gaming,,,,,,,,,...,https://www.roblox.com/es/upgrades/robux?ctx=n...,0.462651,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.5


In [83]:
# Use `categoria` as the primary signal to re-sectorize rows still in "Genérico / Otros"

categoria_norm = df["categoria"].astype(str).str.lower().str.strip()

# Normalised category -> sector (accented and unaccented "energía" both accepted)
categoria_sector_map = {
    "saas": "SaaS / Cloud / Plataformas",
    "viajes": "Viajes / Transporte",
    **dict.fromkeys(["energía", "energia", "seguros"], "Energía / Seguros"),
    "cripto": "Cripto / Fintech",
    "rrss": "RRSS / Redes Sociales",
    "gaming": "Gaming",
    **dict.fromkeys(["e-commerce", "streaming"], "Retail / e-commerce / Streaming"),
}

proposed_sector_cat = categoria_norm.map(categoria_sector_map)
mask_generic = df["sector"].eq("Genérico / Otros")
mask_assign = proposed_sector_cat.notna() & mask_generic

# Only generic rows with a mapped category are rewritten
if mask_assign.any():
    df.loc[mask_assign, "sector"] = proposed_sector_cat[mask_assign]

counts_assigned_cat = proposed_sector_cat[mask_assign].value_counts().to_dict()
print("Filas sectorizadas por categoría (usando categoria_norm desde 'Genérico / Otros'):")
# Distinct target sectors in the order they first appear in the map
for categoria in dict.fromkeys(categoria_sector_map.values()):
    assigned = counts_assigned_cat.get(categoria, 0)
    print(f"- {categoria}: {assigned}")

still_generic = df["sector"].eq("Genérico / Otros")
remaining_generic = int(still_generic.sum())
print(f"\nFilas restantes en 'Genérico / Otros': {remaining_generic}")
if remaining_generic:
    display(df.loc[still_generic].head(20))


Filas sectorizadas por categoría (usando categoria_norm desde 'Genérico / Otros'):
- SaaS / Cloud / Plataformas: 17
- Viajes / Transporte: 0
- Energía / Seguros: 2
- Cripto / Fintech: 5
- RRSS / Redes Sociales: 0
- Gaming: 6
- Retail / e-commerce / Streaming: 0

Filas restantes en 'Genérico / Otros': 36


Unnamed: 0,campaign,categoria,confianza,confidence,dataset_split,domain,entidad,free_hosting,inclusion,is_https,...,url_norm,domain_complexity,host_entropy,domain_whitelist_score,suspicious_path_token,token_density,trusted_token_context,infra_risk,fake_tld_in_subdomain_or_path,param_count_boost
247,,,90.0,,train_val,webseguridadcuenta-9e626b.ingress-bonde.easywp...,Generico,,1.0,,...,http://webseguridadcuenta-9e626b.ingress-bonde...,0.47926,0.99271,0.0,1.0,0.107143,-1.0,0.3,0.0,0.666667
251,,,90.0,,train_val,suponsoro22-ba9799.ingress-daribow.ewp.live,Generico,,1.0,,...,https://suponsoro22-ba9799.ingress-daribow.ewp...,0.508495,0.976934,0.0,0.0,0.0,-1.0,3.0,1.0,0.0
252,,,90.0,,train_val,suponsoo22-ba6aa2.ingress-florina.ewp.live,Generico,,1.0,,...,https://suponsoo22-ba6aa2.ingress-florina.ewp....,0.508495,0.94133,0.0,0.0,0.0,-1.0,3.0,1.0,0.0
253,,,90.0,,train_val,supoertas22-bb468f.ingress-bonde.ewp.live,Generico,,1.0,,...,https://supoertas22-bb468f.ingress-bonde.ewp.l...,0.508495,1.0,0.0,0.0,0.0,-1.0,3.0,1.0,0.5
254,,,90.0,,train_val,supanort22-baa4c7.ingress-erytho.ewp.live,Generico,,1.0,,...,https://supanort22-baa4c7.ingress-erytho.ewp.l...,0.508495,1.0,0.0,0.0,0.0,-1.0,3.0,1.0,0.0
261,,,90.0,,train_val,particulares-es-b965d0.ingress-comporellon.ewp...,Generico,,1.0,,...,https://particulares-es-b965d0.ingress-compore...,0.508495,1.0,0.0,0.0,0.0,-1.0,3.0,1.0,0.0
262,,,90.0,,train_val,particulares-es-1-bd5e31.ingress-daribow.ewp.live,Generico,,1.0,,...,https://particulares-es-1-bd5e31.ingress-darib...,0.508495,1.0,0.0,0.0,0.0,-1.0,3.0,1.0,0.5
268,,,90.0,,train_val,informacion-cliente-spainespain34122306.codean...,Generico,,1.0,,...,https://informacion-cliente-spainespain3412230...,0.55432,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305,,,90.0,,train_val,cuentainformacion-b8eb22.ingress-comporellon.e...,Generico,,1.0,,...,https://cuentainformacion-b8eb22.ingress-compo...,0.508495,1.0,0.0,0.0,0.0,-1.0,3.0,0.0,0.0
338,,,90.0,,train_val,confirmacion-cuenta-1-9cd542.ingress-comporell...,Generico,,1.0,,...,https://confirmacion-cuenta-1-9cd542.ingress-c...,0.47926,1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0


In [84]:
# Marcar URLs de dominios acortadores como excluidas
import numpy as np

# Domains of well-known URL shorteners; rows pointing at them are flagged for exclusion
acortadores = {
    "bit.ly",
    "tinyurl.com",
    "t.co",
    "is.gd",
    "qrco.de",
    "cutt.ly",
    "rebrand.ly",
    "goo.gl",
    "shorturl.at",
}

# Make sure the bookkeeping columns exist before they are read. Without this,
# the first matching row raised KeyError on `exclude_reason`, and `exclude_flag`
# was created on the fly with NaN for every non-matching row.
if "exclude_flag" not in df.columns:
    df["exclude_flag"] = 0
if "exclude_reason" not in df.columns:
    df["exclude_reason"] = ""

mask_acortador = df["domain"].astype(str).str.lower().str.strip().isin(acortadores)
num_excluidas = int(mask_acortador.sum())

if num_excluidas:
    # exclude_flag = 1
    df.loc[mask_acortador, "exclude_flag"] = 1
    # exclude_reason += "acortador", at most once per row so re-running the cell
    # (or running one of the other shortener cells) does not duplicate the tag.
    existing_reason = df.loc[mask_acortador, "exclude_reason"].fillna("").astype(str).str.strip()
    already_tagged = existing_reason.str.contains(
        r"(?:^|[;|])\s*acortador\s*(?:[;|]|$)", regex=True
    )
    # ";" is the separator used by the other shortener cells of this notebook;
    # the previous " | " here produced a mixed-format column.
    df.loc[mask_acortador, "exclude_reason"] = np.where(
        existing_reason == "",
        "acortador",
        np.where(already_tagged, existing_reason, existing_reason + ";acortador"),
    )

print(f"Filas marcadas como excluidas por acortador: {num_excluidas}")


Filas marcadas como excluidas por acortador: 0


In [None]:
# Recalcular columna `domain` usando tldextract sobre `url_norm`
import tldextract

def extract_domain(url):
    """Return the registrable domain of ``url`` (e.g. ``"caixabank.es"``).

    Newer tldextract releases expose this value as
    ``top_domain_under_public_suffix`` and mark ``registered_domain`` as
    deprecated; the new attribute is preferred and the old one is used only
    when the installed version does not provide it.

    Parameters
    ----------
    url : Any
        URL to parse. It is converted with ``str()`` before extraction.

    Returns
    -------
    str
        The registrable domain, or ``""`` when ``url`` is missing (``pd.isna``),
        when tldextract finds no registrable domain, or when extraction raises.
    """
    try:
        if pd.isna(url):
            return ""
        ext = tldextract.extract(str(url))
        domain = getattr(ext, "top_domain_under_public_suffix", None)
        if domain is None:
            # Older tldextract: only the legacy property exists
            domain = ext.registered_domain
        return domain or ""
    except Exception:
        # Deliberately best-effort: a single malformed value must not abort the
        # column-wide recomputation; such rows show up in the empty-domain count.
        return ""

# Overwrite `domain` with the value derived from `url_norm`; the stored value is discarded
recomputed_domain = df["url_norm"].apply(extract_domain)
df["domain"] = recomputed_domain

# Number of rows for which no domain could be derived
domain_empty_count = int(df["domain"].eq("").sum())
print(f"Filas con domain vacío tras recalcular: {domain_empty_count}")

# Preview of the first 20 recomputed domains
display(df["domain"].iloc[:20])


In [92]:
# Marcar como excluidas las URLs cuyo dominio es un acortador (lista ampliada)
import numpy as np

# Extended list of URL-shortener domains; rows pointing at them are flagged for exclusion
acortadores_ext = {
    "bit.ly", "tinyurl.com", "t.co", "is.gd", "qrco.de", "cutt.ly",
    "rebrand.ly", "shorturl.at", "shrtco.de", "rb.gy", "v.gd", "s.id", "goo.gl",
}

# Create the bookkeeping columns when missing. In top-to-bottom order this cell
# runs before any cell that creates them, so reading `exclude_reason` below
# raised KeyError as soon as one row matched.
if "exclude_flag" not in df.columns:
    df["exclude_flag"] = 0
if "exclude_reason" not in df.columns:
    df["exclude_reason"] = ""

mask_acortador_ext = df["domain"].astype(str).str.lower().str.strip().isin(acortadores_ext)
num_excluidas_ext = int(mask_acortador_ext.sum())

if num_excluidas_ext:
    df.loc[mask_acortador_ext, "exclude_flag"] = 1
    existing_reason = df.loc[mask_acortador_ext, "exclude_reason"].fillna("").astype(str).str.strip()
    # Rows that already carry the tag (from a previous run of this or another
    # shortener cell) keep their reason unchanged instead of getting
    # "acortador;acortador".
    already_tagged = existing_reason.str.contains(
        r"(?:^|[;|])\s*acortador\s*(?:[;|]|$)", regex=True
    )
    df.loc[mask_acortador_ext, "exclude_reason"] = np.where(
        existing_reason == "",
        "acortador",
        np.where(already_tagged, existing_reason, existing_reason + ";acortador"),
    )

print(f"Filas marcadas como excluidas por acortador (lista ampliada): {num_excluidas_ext}")


Filas marcadas como excluidas por acortador (lista ampliada): 1


In [91]:
# Asegurar columnas exclude_flag y exclude_reason, y re-ejecutar exclusión por acortadores
import numpy as np

# Create the bookkeeping columns on first use so the assignments below never
# hit a missing column
if "exclude_flag" not in df.columns:
    df["exclude_flag"] = 0
if "exclude_reason" not in df.columns:
    df["exclude_reason"] = ""

# Extended list of URL-shortener domains
acortadores_ext = {
    "bit.ly", "tinyurl.com", "t.co", "is.gd", "qrco.de", "cutt.ly",
    "rebrand.ly", "shorturl.at", "shrtco.de", "rb.gy", "v.gd", "s.id", "goo.gl",
}

mask_acortador_ext = df["domain"].astype(str).str.lower().str.strip().isin(acortadores_ext)
num_excluidas_ext = int(mask_acortador_ext.sum())

if num_excluidas_ext:
    df.loc[mask_acortador_ext, "exclude_flag"] = 1
    existing_reason = df.loc[mask_acortador_ext, "exclude_reason"].fillna("").astype(str).str.strip()
    # Append the tag at most once: re-running this cell, or running another
    # shortener cell on the same frame, previously produced "acortador;acortador".
    already_tagged = existing_reason.str.contains(
        r"(?:^|[;|])\s*acortador\s*(?:[;|]|$)", regex=True
    )
    df.loc[mask_acortador_ext, "exclude_reason"] = np.where(
        existing_reason == "",
        "acortador",
        np.where(already_tagged, existing_reason, existing_reason + ";acortador"),
    )

print(f"Filas marcadas como excluidas por acortador (lista ampliada): {num_excluidas_ext}")


Filas marcadas como excluidas por acortador (lista ampliada): 1


In [None]:
# Recalcular columna `domain` usando tldextract sobre `url_norm` (al final)
import tldextract

def extract_domain(url):
    """Return the registrable domain of ``url`` (e.g. ``"caixabank.es"``).

    Prefers tldextract's ``top_domain_under_public_suffix`` attribute and falls
    back to ``registered_domain`` (deprecated in newer tldextract releases) only
    when the installed version does not offer the newer attribute.

    Parameters
    ----------
    url : Any
        URL to parse. It is converted with ``str()`` before extraction.

    Returns
    -------
    str
        The registrable domain, or ``""`` when ``url`` is missing (``pd.isna``),
        when tldextract finds no registrable domain, or when extraction raises.
    """
    try:
        if pd.isna(url):
            return ""
        ext = tldextract.extract(str(url))
        domain = getattr(ext, "top_domain_under_public_suffix", None)
        if domain is None:
            # Older tldextract: only the legacy property exists
            domain = ext.registered_domain
        return domain or ""
    except Exception:
        # Deliberately best-effort: one malformed value must not abort the
        # recomputation of the whole column.
        return ""

# Replace `domain` with the value derived from `url_norm`, ignoring what was stored
recomputed_domain = df["url_norm"].apply(extract_domain)
df["domain"] = recomputed_domain

# Rows left with an empty domain after the recomputation
domain_empty_count = int(df["domain"].eq("").sum())
print(f"Filas con domain vacío tras recalcular: {domain_empty_count}")

# First 20 recomputed domains
display(df["domain"].iloc[:20])


In [85]:
# Sectorize the rows left as "Genérico / Otros" using tokens found in entidad, URL or domain
sample_columns = ["url", "domain", "entidad", "categoria", "sector"]
before_gen = df.loc[df["sector"] == "Genérico / Otros", sample_columns].head(20)
before_indices = before_gen.index

# Token rules per sector, applied in this priority order
sector_token_rules = [
    ("SaaS / Cloud / Plataformas", ["twitter", "linkedin", "okta", "auth0", "azure", "microsoft azure", "cloudflare", "aws"]),
    ("Fintech / Pagos", ["paypal", "stripe", "redsys"]),
    ("Cripto / Fintech", ["binance"]),
    ("RRSS / Redes Sociales", ["whatsapp", "instagram"]),
    ("Viajes / Transporte", ["renfe", "iberia"]),
    ("Energía / Seguros", ["iberdrola", "mutua", "mapfre"]),
]

# Rows reassigned to each sector during this pass
filled_counts = {label: 0 for label, _ in sector_token_rules}

# Tokens this short (here: "aws") also occur inside ordinary words such as
# "laws" or "draws", so they only count when delimited by non-alphanumeric
# characters. Longer tokens keep plain substring matching.
SHORT_TOKEN_MAX_LEN = 3

# Only rows that are still "Genérico / Otros" may be modified
gen_mask = df["sector"] == "Genérico / Otros"

for sector_label, tokens in sector_token_rules:
    token_pattern = "|".join(
        rf"(?<![a-z0-9]){re.escape(t)}(?![a-z0-9])" if len(t) <= SHORT_TOKEN_MAX_LEN else re.escape(t)
        for t in tokens
    )
    matches = (
        df["entidad"].str.contains(token_pattern, case=False, na=False)
        | df["url"].str.contains(token_pattern, case=False, na=False)
        | df["domain"].str.contains(token_pattern, case=False, na=False)
    )
    apply_mask = gen_mask & matches
    count = int(apply_mask.sum())
    if count:
        df.loc[apply_mask, "sector"] = sector_label
        filled_counts[sector_label] += count
        # Refresh the mask so later rules cannot overwrite what was just assigned
        gen_mask = df["sector"] == "Genérico / Otros"

print("Filas sectorizadas por categoría adicional:")
for label, count in filled_counts.items():
    print(f"- {label}: {count}")

# df_gen refreshed after the additional sectorization
df_gen = df[df["sector"] == "Genérico / Otros"].copy()
print(f"\nFilas restantes en df_gen (Genérico / Otros): {len(df_gen)}")

# Same sample rows before and after the pass
after_gen = df.loc[before_indices, sample_columns]
print("\nMuestra antes de sectorizar (Genérico / Otros):")
display(before_gen)
print("Misma muestra después de sectorizar:")
display(after_gen)


Filas sectorizadas por categoría adicional:
- SaaS / Cloud / Plataformas: 0
- Fintech / Pagos: 0
- Cripto / Fintech: 0
- RRSS / Redes Sociales: 0
- Viajes / Transporte: 0
- Energía / Seguros: 0

Filas restantes en df_gen (Genérico / Otros): 36

Muestra antes de sectorizar (Genérico / Otros):


Unnamed: 0,url,domain,entidad,categoria,sector
247,http://webseguridadcuenta-9e626b.ingress-bonde...,webseguridadcuenta-9e626b.ingress-bonde.easywp...,Generico,,Genérico / Otros
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,suponsoro22-ba9799.ingress-daribow.ewp.live,Generico,,Genérico / Otros
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,suponsoo22-ba6aa2.ingress-florina.ewp.live,Generico,,Genérico / Otros
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,supoertas22-bb468f.ingress-bonde.ewp.live,Generico,,Genérico / Otros
254,https://supanort22-baa4c7.ingress-erytho.ewp.l...,supanort22-baa4c7.ingress-erytho.ewp.live,Generico,,Genérico / Otros
261,https://particulares-es-b965d0.ingress-compore...,particulares-es-b965d0.ingress-comporellon.ewp...,Generico,,Genérico / Otros
262,https://particulares-es-1-bd5e31.ingress-darib...,particulares-es-1-bd5e31.ingress-daribow.ewp.live,Generico,,Genérico / Otros
268,https://informacion-cliente-spainespain3412230...,informacion-cliente-spainespain34122306.codean...,Generico,,Genérico / Otros
305,https://cuentainformacion-b8eb22.ingress-compo...,cuentainformacion-b8eb22.ingress-comporellon.e...,Generico,,Genérico / Otros
338,https://confirmacion-cuenta-1-9cd542.ingress-c...,confirmacion-cuenta-1-9cd542.ingress-comporell...,Generico,,Genérico / Otros


Misma muestra después de sectorizar:


Unnamed: 0,url,domain,entidad,categoria,sector
247,http://webseguridadcuenta-9e626b.ingress-bonde...,webseguridadcuenta-9e626b.ingress-bonde.easywp...,Generico,,Genérico / Otros
251,https://suponsoro22-ba9799.ingress-daribow.ewp...,suponsoro22-ba9799.ingress-daribow.ewp.live,Generico,,Genérico / Otros
252,https://suponsoo22-ba6aa2.ingress-florina.ewp....,suponsoo22-ba6aa2.ingress-florina.ewp.live,Generico,,Genérico / Otros
253,https://supoertas22-bb468f.ingress-bonde.ewp.l...,supoertas22-bb468f.ingress-bonde.ewp.live,Generico,,Genérico / Otros
254,https://supanort22-baa4c7.ingress-erytho.ewp.l...,supanort22-baa4c7.ingress-erytho.ewp.live,Generico,,Genérico / Otros
261,https://particulares-es-b965d0.ingress-compore...,particulares-es-b965d0.ingress-comporellon.ewp...,Generico,,Genérico / Otros
262,https://particulares-es-1-bd5e31.ingress-darib...,particulares-es-1-bd5e31.ingress-daribow.ewp.live,Generico,,Genérico / Otros
268,https://informacion-cliente-spainespain3412230...,informacion-cliente-spainespain34122306.codean...,Generico,,Genérico / Otros
305,https://cuentainformacion-b8eb22.ingress-compo...,cuentainformacion-b8eb22.ingress-comporellon.e...,Generico,,Genérico / Otros
338,https://confirmacion-cuenta-1-9cd542.ingress-c...,confirmacion-cuenta-1-9cd542.ingress-comporell...,Generico,,Genérico / Otros


In [86]:
# Recalcular `domain` desde `url_norm` usando tldextract (nueva celda final)
import tldextract

def _extract_domain_safe(url):
    """Return the registrable domain of ``url`` (e.g. ``"caixabank.es"``), never raising.

    Prefers tldextract's ``top_domain_under_public_suffix`` attribute and falls
    back to ``registered_domain`` only when the installed tldextract does not
    offer it; newer releases mark ``registered_domain`` as deprecated, which
    would explain the stray warning line this cell prints.

    Parameters
    ----------
    url : Any
        URL to parse. It is converted with ``str()`` before extraction.

    Returns
    -------
    str
        The registrable domain, or ``""`` when ``url`` is missing (``pd.isna``),
        when tldextract finds no registrable domain, or when extraction raises.
    """
    try:
        if pd.isna(url):
            return ""
        ext = tldextract.extract(str(url))
        domain = getattr(ext, "top_domain_under_public_suffix", None)
        if domain is None:
            # Older tldextract: only the legacy property exists
            domain = ext.registered_domain
        return domain or ""
    except Exception:
        # Deliberately best-effort: one malformed value must not abort the
        # recomputation of the whole column.
        return ""

# Recompute `domain` from `url_norm`, disregarding the previously stored value
recomputed_domain = df["url_norm"].apply(_extract_domain_safe)
df["domain"] = recomputed_domain

# Count of rows whose recomputed domain is empty
domain_empty_count = int(df["domain"].eq("").sum())
print(f"Filas con domain vacío tras recalcular: {domain_empty_count}")

# First 20 recomputed domains
display(df["domain"].iloc[:20])


Filas con domain vacío tras recalcular: 1


  return ext.registered_domain or ""


0        caixabank.es
1         ibercaja.es
2           wizink.es
3           wizink.es
4          cetelem.es
5          cetelem.es
6          cajamar.es
7         ibercaja.es
8          cajamar.es
9        kutxabank.es
10       kutxabank.es
11    unicajabanco.es
12    unicajabanco.es
13    unicajabanco.es
14         abanca.com
15       kutxabank.es
16      myinvestor.es
17      myinvestor.es
18         correos.es
19         correos.es
Name: domain, dtype: object