In [1]:
# ============================================================
# 🔍 Análisis de marca ING + detección de ruido internacional
# Dataset: feed_scored_v1_20251010_165910_score_4to7.csv
# ============================================================

import pandas as pd
import re
from urllib.parse import urlparse

# === CONFIG ===
INPUT_PATH = "../../../data/interim/phishing/feed_scored_v1_20251010_165910_score_4to7.csv"
OUTPUT_PATH = "feed_scored_v2_1_ing_analysis.csv"

# === 1️⃣ Función para detectar si "ing" es marca real o falso positivo ===
ING_WHITELIST = [
    r'\bing\.es\b', r'\bingdirect\b', r'\bing-bank\b', r'\bingbank\b'
]

def detect_ing_brand(url):
    """Detecta si 'ing' se usa como marca ING o falso positivo."""
    signals = []
    try:
        parsed = urlparse(url if url.startswith(('http://','https://')) else 'http://' + url)
        host = parsed.hostname or ''
        path = parsed.path or ''
        full = host + path

        labels = host.split('.') if host else []
        is_brand = False

        # Host label exacta == 'ing'
        if any(label.lower() == 'ing' for label in labels):
            is_brand = True
            signals.append('host_label_exact_ing')

        # Whitelist
        for w in ING_WHITELIST:
            if re.search(w, full, re.I):
                signals.append(f'whitelist_ing_match:{w}')
                is_brand = True
                break

        # Internal 'ing' dentro de palabras largas (banking, tracking, signing...)
        internal_ing = False
        for token in re.split(r'[-._/]', full):
            if re.search(r'[A-Za-z]+ing[A-Za-z]+', token):
                internal_ing = True
                signals.append(f'internal_ing_in_token:{token[:60]}')
                break
        if internal_ing and 'host_label_exact_ing' not in signals:
            is_brand = False
            signals.append('treated_as_false_positive_ing')

        # Penaliza si TLD es extranjero
        if is_brand and re.search(r'\.(de|fr|nl|us|uk|ch|au|ca|co\.za|no)\b', host, re.I):
            signals.append('foreign_tld_contradiction')
            is_brand = False

        reason = 'brand' if is_brand else 'not_brand'
        return is_brand, reason, ';'.join(signals)
    except Exception as e:
        return None, 'error', str(e)

# === 2️⃣ Función para detectar hosts comprometidos (.es con rutas técnicas) ===
def detect_compromised_host_es(url):
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ''
        path = parsed.path or ''
        if '.es' in host and re.search(r'(wp-|plugins|themes|includes|css|js|webmail|vendor/phpunit)', path):
            return True
        return False
    except:
        return False

# === 3️⃣ Detección de TLD o tokens extranjeros ===
def detect_foreign(url):
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ''
        path = parsed.path or ''
        # TLD o locales extranjeros
        if re.search(r'\.(de|fr|nl|us|uk|ch|au|ca|co\.za|no)\b', host, re.I):
            return True
        if re.search(r'locale\.x=(de|fr|nl|us|uk|ch|au|ca|no)', path, re.I):
            return True
        if re.search(r'(brooklyn|paris|london|zurich|montreal)', url, re.I):
            return True
        return False
    except:
        return False

# === 4️⃣ Carga del dataset y aplicación ===
df = pd.read_csv(INPUT_PATH)
df['is_ing_brand'], df['reason_ing'], df['signals_ing'] = zip(*df['url'].map(detect_ing_brand))
df['foreign_flag'] = df['url'].map(detect_foreign)
df['compromised_host_es'] = df['url'].map(detect_compromised_host_es)

# === 5️⃣ Guardar resultados ===
df.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Archivo guardado: {OUTPUT_PATH}")

# === 6️⃣ Resumen general ===
summary = {
    "Total URLs": len(df),
    "ING marca real": int(df['is_ing_brand'].sum()),
    "No marca / falsos positivos": int((~df['is_ing_brand']).sum()),
    "Con TLD o tokens extranjeros": int(df['foreign_flag'].sum()),
    "Infraestructura .es comprometida": int(df['compromised_host_es'].sum())
}

print("\n📊 Resumen:")
for k,v in summary.items():
    print(f"{k}: {v}")

print("\n✅ Análisis completo. Revisa las nuevas columnas:")
print("['is_ing_brand', 'reason_ing', 'signals_ing', 'foreign_flag', 'compromised_host_es']")


✅ Archivo guardado: feed_scored_v2_1_ing_analysis.csv

📊 Resumen:
Total URLs: 16098
ING marca real: 34
No marca / falsos positivos: 16064
Con TLD o tokens extranjeros: 1144
Infraestructura .es comprometida: 69

✅ Análisis completo. Revisa las nuevas columnas:
['is_ing_brand', 'reason_ing', 'signals_ing', 'foreign_flag', 'compromised_host_es']


In [3]:

# === 1️⃣ Criterios para dataset español (positivos) ===
cond_es = (
    (df["is_ing_brand"] == True) |                                   # ING real
    (df["url"].str.contains(r"correos|paquete|entrega|recibir", case=False, na=False)) |  # logística ES
    (df["url"].str.contains(r"\.es|/es/|-es/", case=False, na=False)) |                   # dominio .es
    (df["url"].str.contains(r"bbva|santander|caixabank|bankinter|gobierno|aeat|seguridad|tarjeta", case=False, na=False)) |
    ((df["foreign_flag"] == False) & (df["compromised_host_es"] == False))                # no extranjero ni comprometido
)

# === 2️⃣ Criterios para ruido internacional (negativos) ===
cond_ruido = ~cond_es | (df["foreign_flag"] == True) | (df["compromised_host_es"] == True)

# === 3️⃣ Crear subconjuntos ===
df_es = df[cond_es].copy()
df_ruido = df[cond_ruido].copy()

# === 4️⃣ Guardar resultados ===
df_es.to_csv(OUTPUT_ES, index=False)
df_ruido.to_csv(OUTPUT_RUIDO, index=False)

# === 5️⃣ Resumen ===
print("✅ Separación completada:")
print(f" - Dataset español candidato: {len(df_es)} URLs")
print(f" - Dataset ruido internacional: {len(df_ruido)} URLs")

# === 6️⃣ Muestra ejemplos ===
print("\n🔹 Ejemplos España:")
print(df_es['url'].head(5).to_list())

print("\n🔸 Ejemplos Ruido internacional:")
print(df_ruido['url'].head(5).to_list())


✅ Separación completada:
 - Dataset español candidato: 15007 URLs
 - Dataset ruido internacional: 1213 URLs

🔹 Ejemplos España:
['http://005pyn4t.com/bg/directing/www.atbonline.com/atb/question.php', 'http://0c4d4e6.wcomhost.com/banco-santander/particulares/iogin/home/sms_codigo.php', 'http://0c4d4e6.wcomhost.com/Banco-Santander/particulares/Iogin/home/sms_codigo.php', 'http://0efbd9f.wcomhost.com/de/banking-postbank/login/privatkunden/de/4055483f021838a047b2', 'http://0efbd9f.wcomhost.com/de/banking-postbank/login/privatkunden/DE/4055483f021838a047b2']

🔸 Ejemplos Ruido internacional:
['http://1505270148.nzingaadvisors.co.za/ipfs/bafybeihhfqmqfzqrrctrrqgpdcnbym74jt3pdj5ojwslcol35425aqezle/mty5nju0njcymq?mtc0mzc3mtaznq==-sfmaxz2vulxbnec0xmduxotu4mjg0lwlmehjves5icmlja2vylwlzehbhcmfnb25zb2x1dglvbnmuy29tsf-1mc4w', 'http://3opqgzo7ypusuestivdlcb2bev8gqzooj7k6gyfvhm2rsmtvingbjdkrnfwrqx9.xzone.no/a/1/kjfar.php?p=n49gvzita0may6mek2k0sch2jnu0ejczwdwsgi938945scdsfghjsastytersz%20sderwaeetrabe%2

In [4]:
# ============================================================
# 🇪🇸 Análisis de composición del dataset candidato español
# Fuente: dataset_es_candidato.csv
# ============================================================

import pandas as pd
from urllib.parse import urlparse
from collections import Counter
import re

INPUT_PATH = "dataset_es_candidato.csv"
df = pd.read_csv(INPUT_PATH)

# === 1️⃣ Extraer dominio y TLD ===
def extract_tld(url):
    try:
        netloc = urlparse(url).netloc
        parts = netloc.split(".")
        if len(parts) > 1:
            return parts[-1].lower()
        return "unknown"
    except:
        return "error"

df["tld"] = df["url"].apply(extract_tld)

# === 2️⃣ Análisis de TLDs ===
tld_counts = df["tld"].value_counts().head(15)
print("📊 Principales TLDs:")
print(tld_counts)

# === 3️⃣ Presencia de marcas españolas ===
spanish_brands = ["bbva", "santander", "caixabank", "bankinter", "unicaja", "correos", "aeat", "seg-social", "gobierno", "mapfre"]
df["spanish_brand_detected"] = df["url"].apply(lambda x: any(b in x.lower() for b in spanish_brands))
brand_counts = df["spanish_brand_detected"].value_counts()

print("\n🏦 URLs con marcas españolas:")
print(brand_counts)
print(f"Porcentaje con marca española: {(brand_counts.get(True, 0) / len(df)) * 100:.2f}%")

# === 4️⃣ Tokens lingüísticos españoles ===
spanish_tokens = ["tarjeta", "paquete", "entrega", "correo", "mensaj", "factura", "cliente", "banco", "movil", "seguridad", "iniciar", "acceso", "dni"]
def count_tokens(url):
    url_lower = url.lower()
    return sum(1 for token in spanish_tokens if token in url_lower)

df["spanish_token_count"] = df["url"].apply(count_tokens)
token_distribution = df["spanish_token_count"].value_counts().sort_index()

print("\n🔤 Distribución de tokens españoles detectados:")
print(token_distribution)

# === 5️⃣ URLs con fuerte señal española (marca o ≥2 tokens ES) ===
df["es_signal_strong"] = (df["spanish_brand_detected"] == True) | (df["spanish_token_count"] >= 2)
strong_count = df["es_signal_strong"].sum()

print(f"\n✅ URLs con señal española fuerte: {strong_count} / {len(df)} ({(strong_count / len(df)) * 100:.2f}%)")

# === 6️⃣ Muestra ejemplos de cada tipo ===
print("\n🔹 Ejemplos con marcas españolas:")
print(df[df["spanish_brand_detected"] == True]["url"].head(5).to_list())

print("\n🔸 Ejemplos con tokens españoles pero sin marca:")
print(df[(df["spanish_brand_detected"] == False) & (df["spanish_token_count"] >= 2)]["url"].head(5).to_list())


📊 Principales TLDs:
tld
com     5723
live    4593
page    1048
dev      738
org      375
net      282
123      259
app      233
me       227
es       200
ru       119
io        97
149       69
id        69
info      63
Name: count, dtype: int64

🏦 URLs con marcas españolas:
spanish_brand_detected
False    14442
True       565
Name: count, dtype: int64
Porcentaje con marca española: 3.76%

🔤 Distribución de tokens españoles detectados:
spanish_token_count
0    14324
1      568
2      105
3       10
Name: count, dtype: int64

✅ URLs con señal española fuerte: 577 / 15007 (3.84%)

🔹 Ejemplos con marcas españolas:
['http://0c4d4e6.wcomhost.com/banco-santander/particulares/iogin/home/sms_codigo.php', 'http://0c4d4e6.wcomhost.com/Banco-Santander/particulares/Iogin/home/sms_codigo.php', 'http://actividadesinusuales-santander-tarjetasonline.com/particulares', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/espera2.html', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/esp

In [5]:
# =====================================================
# 🇪🇸 Análisis rápido de URLs con señal española fuerte
# (requiere df cargado desde dataset_es_candidato.csv)
# =====================================================

spanish_brands = ["bbva", "santander", "caixabank", "bankinter", "unicaja", "correos", "aeat", "seg-social", "gobierno", "mapfre"]
spanish_tokens = ["tarjeta", "paquete", "entrega", "correo", "mensaj", "factura", "cliente", "banco", "movil", "seguridad", "iniciar", "acceso", "dni"]

df["spanish_brand_detected"] = df["url"].apply(lambda x: any(b in x.lower() for b in spanish_brands))
df["spanish_token_count"] = df["url"].apply(lambda x: sum(1 for token in spanish_tokens if token in x.lower()))
df_strong = df[(df["spanish_brand_detected"]) | (df["spanish_token_count"] >= 2)]

print(f"✅ URLs con señal española fuerte: {len(df_strong)} ({(len(df_strong)/len(df))*100:.2f}%)")

# 🔹 Marcas más comunes
print("\n🏦 Marcas más frecuentes:")
print(df_strong['url'].str.extract(r'(' + '|'.join(spanish_brands) + r')', expand=False).value_counts().head(10))

# 🔸 Tokens más comunes
from collections import Counter
token_counter = Counter()
for url in df_strong["url"]:
    u = url.lower()
    for t in spanish_tokens:
        if t in u:
            token_counter[t] += 1
print("\n🔤 Tokens más frecuentes:")
print(dict(sorted(token_counter.items(), key=lambda x: x[1], reverse=True)))

# 🔍 Ejemplos
print("\n🔹 Ejemplos de URLs representativas:")
print(df_strong["url"].head(10).to_list())



✅ URLs con señal española fuerte: 577 (3.84%)

🏦 Marcas más frecuentes:
url
santander    236
correos      175
bbva          70
caixabank     44
bankinter      7
unicaja        1
Name: count, dtype: int64

🔤 Tokens más frecuentes:
{'correo': 179, 'paquete': 76, 'cliente': 43, 'banco': 32, 'tarjeta': 22, 'acceso': 22, 'movil': 20, 'seguridad': 7, 'iniciar': 3, 'entrega': 3, 'mensaj': 2}

🔹 Ejemplos de URLs representativas:
['http://0c4d4e6.wcomhost.com/banco-santander/particulares/iogin/home/sms_codigo.php', 'http://0c4d4e6.wcomhost.com/Banco-Santander/particulares/Iogin/home/sms_codigo.php', 'http://actividadesinusuales-santander-tarjetasonline.com/particulares', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/espera2.html', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/espera.html', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/tarjeta.html', 'http://alsheharymedical.com.ye/retail.santander.co.uk_LOGSUK_NS_ENS_BtoChannelDriver.ssobto_dse_operationNa

In [6]:
# =====================================================
# 💾 Exportar subconjunto de señal española fuerte
# =====================================================

OUTPUT_PATH = "dataset_es_signal_strong.csv"

df_strong.to_csv(OUTPUT_PATH, index=False)
print(f"✅ Archivo exportado: {OUTPUT_PATH}")
print(f"Total de URLs guardadas: {len(df_strong)}")


✅ Archivo exportado: dataset_es_signal_strong.csv
Total de URLs guardadas: 577
