In [5]:
from pathlib import Path
import os

# Move the working directory up to the project root.
# The root is resolved only once per kernel session: re-running this cell after
# the chdir below would otherwise climb three more levels from the *new* cwd.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = (
        Path(__file__).resolve().parents[2]
        if "__file__" in globals()
        else Path.cwd().parents[2]
    )
root = _PROJECT_ROOT
os.chdir(root)

print("Directorio actual:", Path.cwd())


Directorio actual: /Users/test/Desktop/phishing-detector


In [6]:
import pandas as pd  # required here: the notebook's pandas import sits in a later cell

# Load the consolidated OpenPhish CSV, failing early (with the resolved path)
# when the file is missing.
path_consol = Path("data/processed/openphish_consolidado.csv")
assert path_consol.exists(), f"No existe: {path_consol.resolve()}"

op = pd.read_csv(path_consol)
print("Filas:", len(op))


Filas: 3863


In [8]:
import pandas as pd
from urllib.parse import urlparse
import validators
from pathlib import Path

# 1) Load the consolidated dataset
path = Path("data/processed/openphish_consolidado.csv")
df = pd.read_csv(path)

# Row count before any cleaning step is applied
print("Antes de limpiar:", df.shape[0])



Antes de limpiar: 3863


In [13]:
# 2) Drop duplicate URLs (first occurrence is kept).
# A single pass on "url" is sufficient: once every URL is unique, every
# ("url", "fecha_hora_recoleccion") pair is unique as well, so a second pass
# on that pair could never remove a row.
df = df.drop_duplicates(subset=["url"])
print("Tras eliminar dupliados:", len(df))

Tras eliminar dupliados: 3779


In [14]:
# 3) Normalizar URLs
def normalize_url(u):
    """Return a canonical form of a URL string for de-duplication.

    Steps, in order:
      1. Strip surrounding whitespace.
      2. Lower-case the scheme and the authority (host) only. Path, query and
         fragment are case-sensitive (e.g. content hashes or tokens in the
         path), so they are left untouched.
      3. Rewrite a leading ``http://`` to ``https://`` so both variants of the
         same URL collapse to one value.
      4. Remove a single trailing ``/``.

    Args:
        u: Raw URL value; may be a non-string (e.g. NaN from pandas).

    Returns:
        The normalised URL, or ``None`` when ``u`` is not a string.
    """
    if not isinstance(u, str):
        return None
    u = u.strip()

    # Locate the scheme separator. "://" only counts as such when nothing
    # before it looks like path/query/fragment (otherwise it belongs to an
    # embedded URL, e.g. "host/redirect?next=http://...").
    head, sep, tail = u.partition("://")
    if sep and not any(c in head for c in "/?#"):
        prefix, rest = head + sep, tail
    else:
        prefix, rest = "", u

    # The authority ends at the first "/", "?" or "#" (or at end of string).
    cut = min(
        (pos for pos in (rest.find(c) for c in "/?#") if pos != -1),
        default=len(rest),
    )
    u = prefix.lower() + rest[:cut].lower() + rest[cut:]

    if u.startswith("http://"):
        u = "https://" + u[len("http://"):]
    if u.endswith("/"):
        u = u[:-1]
    return u
# Replace every raw URL with its normalised form, then preview the result.
df["url"] = df["url"].map(normalize_url)
df.head(5)


Unnamed: 0,url,fuente,fecha_hora_recoleccion,__source_file
0,https://monday.kimberlywoodrich.workers.dev,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv
1,https://mybdoonline.device-personal.workers.de...,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv
2,https://bruker.directsmailsolution.com/qrc-ac/...,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv
3,https://avisou.click/x,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv
4,https://bdo-app.sbdoonline.workers.dev/bdo-for...,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv


In [16]:
# 4) Filtrar inválidas
def es_url_valida(x):
    """Tell whether ``validators.url`` accepts ``x``.

    Returns:
        ``True`` when the validator's result is truthy; ``False`` when it is
        falsy or when the validation call raises any ``Exception``.
    """
    try:
        veredicto = validators.url(x)
        return True if veredicto else False
    except Exception:
        # Best effort: anything the validator cannot process counts as invalid.
        return False

# Boolean mask: missing URLs are invalid outright, the rest go through the validator.
es_valida = df["url"].map(lambda x: es_url_valida(x) if pd.notna(x) else False)
df = df[es_valida]
print("Tras filtrar inválidas:", len(df))


Tras filtrar inválidas: 3703


In [18]:
# 5) Extract the host of every URL
df = df.copy()  # work on an independent copy, not a view of the filtered frame
# Use .hostname instead of .netloc: netloc keeps any "user:pass@" prefix and
# ":port" suffix, which would make the same host count as several domains and
# defeat suffix comparisons on the host. hostname is None when the URL has no
# host, so fall back to "" (what netloc yields in that case).
df["dominio"] = df["url"].map(lambda x: urlparse(x).hostname or "")
print("Columnas:", df.columns.tolist())
df.head(5)


Columnas: ['url', 'fuente', 'fecha_hora_recoleccion', '__source_file', 'dominio']


Unnamed: 0,url,fuente,fecha_hora_recoleccion,__source_file,dominio
0,https://monday.kimberlywoodrich.workers.dev,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv,monday.kimberlywoodrich.workers.dev
1,https://mybdoonline.device-personal.workers.de...,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv,mybdoonline.device-personal.workers.dev
2,https://bruker.directsmailsolution.com/qrc-ac/...,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv,bruker.directsmailsolution.com
3,https://avisou.click/x,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv,avisou.click
4,https://bdo-app.sbdoonline.workers.dev/bdo-for...,OpenPhish,2025-08-18 17:16:36,2025-08-18.csv,bdo-app.sbdoonline.workers.dev


In [19]:
# 6) Number of distinct domains
n_dominios = df["dominio"].nunique()
print("Dominios únicos:", n_dominios)

# Ten most frequent domains (value_counts already sorts by descending count)
df["dominio"].value_counts().iloc[:10]


Dominios únicos: 3210


dominio
keensalmon.org                             38
ipfs.io                                    31
financialalertsystem.com                   23
tts7324.webmo.fr                           21
bet63sss.com                               19
bdoonline.personal-device.workers.dev      17
www.kkinstagram.com                        16
mybdoonline.device-personal.workers.dev    16
bet73091.com                                9
steamcommunllity.com                        9
Name: count, dtype: int64

In [20]:
# 7) Drop exact URL duplicates, keeping the first occurrence of each URL
antes = len(df)
df = df.drop_duplicates(subset=["url"], keep="first")
print(f"Eliminados {antes - len(df)} duplicados de URL. Total ahora: {len(df)}")

# Distinct domains remaining after the de-duplication
print("Dominios únicos:", df["dominio"].nunique())


Eliminados 0 duplicados de URL. Total ahora: 3703
Dominios únicos: 3210


In [21]:
# 8) Domain frequency table (descending by count)
dominios_freq = df["dominio"].value_counts()

print("Top 10 dominios más frecuentes:")
print(dominios_freq.iloc[:10])

# Persist the complete table for later reference
dominios_freq.to_csv("data/processed/openphish_dominios_frecuencia.csv")


Top 10 dominios más frecuentes:
dominio
keensalmon.org                             38
ipfs.io                                    31
financialalertsystem.com                   23
tts7324.webmo.fr                           21
bet63sss.com                               19
bdoonline.personal-device.workers.dev      17
www.kkinstagram.com                        16
mybdoonline.device-personal.workers.dev    16
bet73091.com                                9
steamcommunllity.com                        9
Name: count, dtype: int64


In [22]:
# 9) Normalizar dominios de plataformas conocidas
# Known platforms whose sub-domains are collapsed into the platform's base domain.
plataformas = ["workers.dev", "ipfs.io", "godaddysites.com"]

def normalizar_dominio(d):
    """Collapse a host under a known platform into that platform's base domain.

    Args:
        d: Host name to normalise.

    Returns:
        The matching entry of ``plataformas`` when ``d`` equals that domain or
        is one of its sub-domains; otherwise ``d`` unchanged. Non-string
        values (e.g. ``None``/NaN) are returned as they are.
    """
    if not isinstance(d, str):
        return d
    for base in plataformas:
        # Require a label boundary: a bare endswith(base) would also swallow
        # unrelated hosts such as "notworkers.dev".
        if d == base or d.endswith("." + base):
            return base
    return d

# Add the platform-collapsed domain next to the raw one.
df["dominio_norm"] = df["dominio"].map(normalizar_dominio)

# Side-by-side sample of raw vs. normalised domains
print("Ejemplos tras normalización:")
print(df.loc[:, ["dominio", "dominio_norm"]].head(15))

# Ranking after collapsing platform sub-domains
print("\nTop 10 dominios normalizados:")
print(df["dominio_norm"].value_counts().iloc[:10])


Ejemplos tras normalización:
                                        dominio  \
0           monday.kimberlywoodrich.workers.dev   
1       mybdoonline.device-personal.workers.dev   
2                bruker.directsmailsolution.com   
3                                  avisou.click   
4                bdo-app.sbdoonline.workers.dev   
5         bdoonline.personal-device.workers.dev   
6   bccalumni.buenavistacommunitycollege.edu.ph   
7         bdoonline.personal-device.workers.dev   
8         bdoonline.personal-device.workers.dev   
9         bdoonline.personal-device.workers.dev   
10                                      ipfs.io   
11  onlinebdopersonal.mobile-device.workers.dev   
12              site-345fe5t4l.godaddysites.com   
13  onlinebdopersonal.mobile-device.workers.dev   
14  onlinebdopersonal.mobile-device.workers.dev   

                                   dominio_norm  
0                                   workers.dev  
1                                   workers.dev  
2   

In [23]:
# Keyword lists, grouped by category, that score_es searches for inside each URL.
# Keys are category labels; values are the lower-case keywords for that category.
# NOTE(review): presumably brands/institutions relevant to Spain (per the "_es"
# suffix) — several are multinational, so a hit does not by itself imply a
# Spanish target; confirm the intended scope.
entidades_es = {
    # Banks
    "bancos": [
        "bbva", "santander", "caixabank", "bankia", "cajamar", 
        "bankinter", "unicaja", "kutxabank", "sabadell", "ibercaja",
        "evo", "openbank", "abanca"
    ],
    # Telecom operators
    "telcos": [
        "movistar", "telefonica", "vodafone", "orange", "yoigo", 
        "masmovil", "jazztel", "simyo", "lowi", "pepephone"
    ],
    # Other services
    "servicios": [
        "correos", "renfe", "iberia", "endesa", "naturgy", "repsol",
        "mapfre", "seguro", "dgt", "aeat", "seg-social"
    ],
    # Retail / e-commerce
    "ecommerce": [
        "corteingles", "zara", "mercadona", "ikea", "carrefour", 
        "aliexpress", "amazon.es", "pccomponentes", "mediamarkt"
    ]
}


In [24]:
# 9) Filtrar URLs con contexto español
import re

def score_es(url, entidades):
    """Count how many entity keywords occur as whole words in a URL.

    Args:
        url: URL to inspect. It is lower-cased before matching; keywords are
            used exactly as given.
        entidades: Mapping of category name -> list of keyword strings.

    Returns:
        Number of keywords, across all categories, found at least once in
        ``url`` between regex word boundaries (each keyword counts once).
        Non-string URLs score 0.
    """
    if not isinstance(url, str):
        return 0
    url_lower = url.lower()
    # Keywords are literals, so escape them: otherwise the "." in a keyword
    # such as "amazon.es" acts as a regex wildcard and matches "amazonxes".
    return sum(
        1
        for lista in entidades.values()
        for entidad in lista
        if re.search(rf"\b{re.escape(entidad)}\b", url_lower)
    )

# Score every URL against the entity keyword lists.
df["score_es"] = df["url"].map(lambda u: score_es(u, entidades_es))

# Rows with at least one keyword hit
con_contexto = df["score_es"] >= 1
print("URLs con score_es >= 1:", con_contexto.sum())
df[con_contexto].head(10)


URLs con score_es >= 1: 8


Unnamed: 0,url,fuente,fecha_hora_recoleccion,__source_file,dominio,dominio_norm,score_es
1431,https://www.santander.learn-on-demand.co.uk,OpenPhish,20250728_145936,openphish_20250728_145936.csv,www.santander.learn-on-demand.co.uk,www.santander.learn-on-demand.co.uk,1
2119,https://www.bbva.stockbrokers.com.mx,OpenPhish,2025-07-31 23:00:05,openphish_online_20250731_230005.csv,www.bbva.stockbrokers.com.mx,www.bbva.stockbrokers.com.mx,1
2137,https://bbva.stockbrokers.com.mx,OpenPhish,2025-07-31 23:00:05,openphish_online_20250731_230005.csv,bbva.stockbrokers.com.mx,bbva.stockbrokers.com.mx,1
2726,https://www.headabovetherim.com/orange/orange0...,OpenPhish,2025-08-11 23:00:07,openphish_online_20250811_230007.csv,www.headabovetherim.com,www.headabovetherim.com,1
2729,https://orangefr9.wixsite.com/orange,OpenPhish,2025-08-11 23:00:07,openphish_online_20250811_230007.csv,orangefr9.wixsite.com,orangefr9.wixsite.com,1
2814,https://santander.learn-on-demand.co.uk/favico...,OpenPhish,2025-08-11 23:00:07,openphish_online_20250811_230007.csv,santander.learn-on-demand.co.uk,santander.learn-on-demand.co.uk,1
2818,https://zarejestrujnumer.blue.pl/orange,OpenPhish,2025-08-11 23:00:07,openphish_online_20250811_230007.csv,zarejestrujnumer.blue.pl,zarejestrujnumer.blue.pl,1
3598,https://vodafone-lauf.de,OpenPhish,2025-08-16 23:00:04,openphish_online_20250816_230004.csv,vodafone-lauf.de,vodafone-lauf.de,1


In [25]:
# 10) Persist only the URLs with Spanish context (score_es of at least 1)
out_es = Path("data/processed/openphish_es.csv")
df_es = df.loc[df["score_es"] >= 1].copy()

df_es.to_csv(out_es, index=False)
print(f"Guardadas {len(df_es)} filas en {out_es}")


Guardadas 8 filas en data/processed/openphish_es.csv
