In [15]:
import pandas as pd

ruta_csv = "../data/interim/dataset_entrenamiento_v2.csv"
df = pd.read_csv(ruta_csv)

# Normalizar nulos en notas y campaign
for col in ["notas", "campaign"]:
    df[col] = df[col].fillna("").astype(str).str.strip()

print("df.isna().sum():")
print(df.isna().sum())
print("df.head(5):")
print(df.head(5))



df.isna().sum():
url         0
label       0
sector      0
entidad     0
notas       0
campaign    0
dtype: int64
df.head(5):
                                                 url  label sector    entidad  \
0  https://www.caixabank.es/particular/banca-digi...      0  banca  caixabank   
1              https://www.ibercaja.es/particulares/      0  banca   ibercaja   
2                     https://www.wizink.es/tarjetas      0  banca     wizink   
3                             https://www.wizink.es/      0  banca     wizink   
4        https://www.cetelem.es/credito-y-prestamos/      0  banca    cetelem   

                   notas campaign  
0  home bancaria oficial           
1                                  
2                                  
3                                  
4                                  


In [16]:
# Duplicados exactos por URL
dups = df[df.duplicated("url", keep=False)].sort_values("url")

print("Número de URLs duplicadas:", dups["url"].nunique())
display(dups.head(20))


Número de URLs duplicadas: 0


Unnamed: 0,url,label,sector,entidad,notas,campaign


In [17]:
df["sector"].value_counts().sort_index()


sector
banca            193
cripto            22
ecommerce         18
energia           14
gaming             6
generico          57
logistica         94
publico           20
redessociales      6
saas              36
seguros            2
streaming          9
teleco            13
viajes             2
Name: count, dtype: int64

In [18]:
sorted(df["entidad"].unique())


['abanca',
 'aeat',
 'agenciatributaria',
 'amazon',
 'amazon espana',
 'amazon web services',
 'auth0',
 'banco galicia',
 'banco santander',
 'bancomediolanum',
 'bancosantander',
 'bancsabadell',
 'bankia',
 'bankinter',
 'bbva',
 'binance',
 'bit2me',
 'bmedonline',
 'bp',
 'caixa',
 'caixabank',
 'caja rural',
 'cajamar',
 'cajaruraldeasturias',
 'carrefour',
 'carrefour espana',
 'cetelem',
 'clave',
 'cloudflare',
 'coinbase',
 'correos',
 'correosexpress',
 'decathlon',
 'dgt',
 'dhl',
 'dhl espana',
 'digi',
 'digimobil',
 'dpd',
 'dropbox',
 'el corte ingles',
 'endesa',
 'financieraelcorteingles',
 'flixole',
 'generico',
 'github',
 'gitlab',
 'gls espana',
 'google',
 'google workspace',
 'grupocajarural',
 'habbo',
 'hbomax',
 'ibercaja',
 'iberdrola',
 'iberia',
 'ing',
 'instagram',
 'ionos',
 'kraken',
 'kutxabank',
 'linea directa',
 'linkedin',
 'mapfre',
 'masmovil',
 'mediamarkt',
 'mercadona',
 'microsoft',
 'microsoft azure',
 'movistar',
 'mrw',
 'mutua',
 'myin

In [19]:
from urllib.parse import urlparse

SHORTENERS = [
    "bit.ly","t.co","l.ead.me","tinyurl.com","cutt.ly",
    "goo.gl","is.gd","rebrand.ly","s.id"
]

hosts = df["url"].apply(lambda u: urlparse(u if "://" in u else "//"+u).hostname or "")
mask_short = hosts.str.lower().isin([s.lower() for s in SHORTENERS])

print("Acortadores detectados:", mask_short.sum())
display(df[mask_short][["url","entidad","sector"]])

df = df[~mask_short].copy()
print("Nuevo shape:", df.shape)


Acortadores detectados: 4


Unnamed: 0,url,entidad,sector
398,https://l.ead.me/modifica-tu-entrega,generico,generico
404,https://bit.ly/Correos-ESP,correos,logistica
405,https://l.ead.me/BBVA24,bbva,banca
490,https://l.ead.me/DGTspain,dgt,publico


Nuevo shape: (488, 6)


In [20]:
# Patrones típicos de infraestructura comprometida
patterns = [
    "wp-", "wp-content", "wp-includes", "plugins", "themes", "includes",
    "/css/", "/js/", "webmail", "vendor/phpunit", "cpanel", "cpthemes"
]

# Detectar URLs con estos patrones
mask_infra = df["url"].str.contains("|".join(patterns), case=False, na=False)

print("Infra comprometida detectada:", mask_infra.sum())
display(df[mask_infra].head(10))

# Guardar dataset de infra para referencia
df_infra = df[mask_infra].copy()

# Eliminar del dataset principal
df = df[~mask_infra].copy()

print("Nuevo shape df:", df.shape)
print("Shape infra:", df_infra.shape)




Infra comprometida detectada: 4


Unnamed: 0,url,label,sector,entidad,notas,campaign
93,https://www.amazon.es/gp/css/order-details?ord...,0,ecommerce,amazon espana,,
384,https://lastbut-bc9a4f.ingress-florina.ewp.liv...,1,banca,bbva,Phishing BBVA + dominio fraudulento con subdom...,lastbut-bc9a4f.ingress-florina.ewp.live/wp-admin
461,http://unicismadrid.es/wp-content/com/index/ch...,1,publico,unicismadrid,web .es comprometida (WordPress) + ruta /login...,unicismadrid.es/wp-content
485,https://asesoriabarrachina.es/wp-includes/stra...,1,generico,generico,web .es comprometida(WordPress),asesoriabarrachina.es/wp-includes


Nuevo shape df: (484, 6)
Shape infra: (4, 6)


In [22]:
sorted(df["sector"].unique())


['banca',
 'cripto',
 'ecommerce',
 'energia',
 'gaming',
 'generico',
 'logistica',
 'publico',
 'redessociales',
 'saas',
 'seguros',
 'streaming',
 'teleco',
 'viajes']

In [23]:
# Distribución de labels y sectores
print("1) Recuento por label")
print(df["label"].value_counts())

print("\n2) Sectores para label==0")
print(df[df["label"] == 0]["sector"].value_counts())

print("\n3) Sectores para label==1")
print(df[df["label"] == 1]["sector"].value_counts())

print("\n4) Crosstab sector vs label")
print(pd.crosstab(df["sector"], df["label"]))



1) Recuento por label
label
0    243
1    241
Name: count, dtype: int64

2) Sectores para label==0
sector
banca            98
logistica        38
saas             25
cripto           18
ecommerce        17
energia          12
publico          12
teleco            7
redessociales     5
streaming         4
gaming            3
viajes            2
seguros           2
Name: count, dtype: int64

3) Sectores para label==1
sector
banca            93
logistica        55
generico         55
saas             11
publico           6
teleco            6
streaming         5
cripto            4
gaming            3
energia           2
redessociales     1
Name: count, dtype: int64

4) Crosstab sector vs label
label           0   1
sector               
banca          98  93
cripto         18   4
ecommerce      17   0
energia        12   2
gaming          3   3
generico        0  55
logistica      38  55
publico        12   6
redessociales   5   1
saas           25  11
seguros         2   0
streaming    

In [None]:
# Entidades únicas ordenadas
print(sorted(df["entidad"].unique()))



['abanca', 'aeat', 'agenciatributaria', 'amazon', 'amazon espana', 'amazon web services', 'auth0', 'banco galicia', 'banco santander', 'bancomediolanum', 'bancosantander', 'bancsabadell', 'bankia', 'bankinter', 'bbva', 'binance', 'bit2me', 'bmedonline', 'bp', 'caixa', 'caixabank', 'caja rural', 'cajamar', 'cajaruraldeasturias', 'carrefour', 'carrefour espana', 'cetelem', 'clave', 'cloudflare', 'coinbase', 'correos', 'correosexpress', 'decathlon', 'dgt', 'dhl', 'dhl espana', 'digi', 'digimobil', 'dpd', 'dropbox', 'el corte ingles', 'endesa', 'financieraelcorteingles', 'flixole', 'generico', 'github', 'gitlab', 'gls espana', 'google', 'google workspace', 'grupocajarural', 'habbo', 'hbomax', 'ibercaja', 'iberdrola', 'iberia', 'ing', 'instagram', 'ionos', 'kraken', 'kutxabank', 'linea directa', 'linkedin', 'mapfre', 'masmovil', 'mediamarkt', 'mercadona', 'microsoft', 'microsoft azure', 'movistar', 'mrw', 'mutua', 'myinvestor', 'nacex', 'netflix', 'netflix (espana)', 'okta', 'openbank', 'or

In [25]:
# Remove 2 legitimate rows from sector "energia"

energia_legit = df[(df["label"] == 0) & (df["sector"] == "energia")]

# Sample 2 rows to drop
to_drop = energia_legit.sample(2, random_state=42)

# Remove them
df = df.drop(to_drop.index).copy()

# Show updated stats
print(df.shape)
print(df["label"].value_counts())
print(df["sector"].value_counts())


(482, 6)
label
0    241
1    241
Name: count, dtype: int64
sector
banca            191
logistica         93
generico          55
saas              36
cripto            22
publico           18
ecommerce         17
teleco            13
energia           12
streaming          9
redessociales      6
gaming             6
viajes             2
seguros            2
Name: count, dtype: int64


In [26]:
# Save the final cleaned dataframe as dataset_v2.csv
df.to_csv("dataset_v2.csv", index=False, encoding="utf-8")

print("Saved:", "dataset_v2.csv")
print("Shape:", df.shape)


Saved: dataset_v2.csv
Shape: (482, 6)
