In [1]:
# ============================================================
# üîç An√°lisis de marca ING + detecci√≥n de ruido internacional
# Dataset: feed_scored_v1_20251010_165910_score_4to7.csv
# ============================================================

import pandas as pd
import re
from urllib.parse import urlparse

# === CONFIG ===
INPUT_PATH = "../../../data/interim/phishing/feed_scored_v1_20251010_165910_score_4to7.csv"
OUTPUT_PATH = "feed_scored_v2_1_ing_analysis.csv"

# === 1Ô∏è‚É£ Funci√≥n para detectar si "ing" es marca real o falso positivo ===
ING_WHITELIST = [
    r'\bing\.es\b', r'\bingdirect\b', r'\bing-bank\b', r'\bingbank\b'
]

def detect_ing_brand(url):
    """Detecta si 'ing' se usa como marca ING o falso positivo."""
    signals = []
    try:
        parsed = urlparse(url if url.startswith(('http://','https://')) else 'http://' + url)
        host = parsed.hostname or ''
        path = parsed.path or ''
        full = host + path

        labels = host.split('.') if host else []
        is_brand = False

        # Host label exacta == 'ing'
        if any(label.lower() == 'ing' for label in labels):
            is_brand = True
            signals.append('host_label_exact_ing')

        # Whitelist
        for w in ING_WHITELIST:
            if re.search(w, full, re.I):
                signals.append(f'whitelist_ing_match:{w}')
                is_brand = True
                break

        # Internal 'ing' dentro de palabras largas (banking, tracking, signing...)
        internal_ing = False
        for token in re.split(r'[-._/]', full):
            if re.search(r'[A-Za-z]+ing[A-Za-z]+', token):
                internal_ing = True
                signals.append(f'internal_ing_in_token:{token[:60]}')
                break
        if internal_ing and 'host_label_exact_ing' not in signals:
            is_brand = False
            signals.append('treated_as_false_positive_ing')

        # Penaliza si TLD es extranjero
        if is_brand and re.search(r'\.(de|fr|nl|us|uk|ch|au|ca|co\.za|no)\b', host, re.I):
            signals.append('foreign_tld_contradiction')
            is_brand = False

        reason = 'brand' if is_brand else 'not_brand'
        return is_brand, reason, ';'.join(signals)
    except Exception as e:
        return None, 'error', str(e)

# === 2Ô∏è‚É£ Funci√≥n para detectar hosts comprometidos (.es con rutas t√©cnicas) ===
def detect_compromised_host_es(url):
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ''
        path = parsed.path or ''
        if '.es' in host and re.search(r'(wp-|plugins|themes|includes|css|js|webmail|vendor/phpunit)', path):
            return True
        return False
    except:
        return False

# === 3Ô∏è‚É£ Detecci√≥n de TLD o tokens extranjeros ===
def detect_foreign(url):
    try:
        parsed = urlparse(url)
        host = parsed.hostname or ''
        path = parsed.path or ''
        # TLD o locales extranjeros
        if re.search(r'\.(de|fr|nl|us|uk|ch|au|ca|co\.za|no)\b', host, re.I):
            return True
        if re.search(r'locale\.x=(de|fr|nl|us|uk|ch|au|ca|no)', path, re.I):
            return True
        if re.search(r'(brooklyn|paris|london|zurich|montreal)', url, re.I):
            return True
        return False
    except:
        return False

# === 4Ô∏è‚É£ Carga del dataset y aplicaci√≥n ===
df = pd.read_csv(INPUT_PATH)
df['is_ing_brand'], df['reason_ing'], df['signals_ing'] = zip(*df['url'].map(detect_ing_brand))
df['foreign_flag'] = df['url'].map(detect_foreign)
df['compromised_host_es'] = df['url'].map(detect_compromised_host_es)

# === 5Ô∏è‚É£ Guardar resultados ===
df.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ Archivo guardado: {OUTPUT_PATH}")

# === 6Ô∏è‚É£ Resumen general ===
summary = {
    "Total URLs": len(df),
    "ING marca real": int(df['is_ing_brand'].sum()),
    "No marca / falsos positivos": int((~df['is_ing_brand']).sum()),
    "Con TLD o tokens extranjeros": int(df['foreign_flag'].sum()),
    "Infraestructura .es comprometida": int(df['compromised_host_es'].sum())
}

print("\nüìä Resumen:")
for k,v in summary.items():
    print(f"{k}: {v}")

print("\n‚úÖ An√°lisis completo. Revisa las nuevas columnas:")
print("['is_ing_brand', 'reason_ing', 'signals_ing', 'foreign_flag', 'compromised_host_es']")


‚úÖ Archivo guardado: feed_scored_v2_1_ing_analysis.csv

üìä Resumen:
Total URLs: 16098
ING marca real: 34
No marca / falsos positivos: 16064
Con TLD o tokens extranjeros: 1144
Infraestructura .es comprometida: 69

‚úÖ An√°lisis completo. Revisa las nuevas columnas:
['is_ing_brand', 'reason_ing', 'signals_ing', 'foreign_flag', 'compromised_host_es']


In [3]:

# === 1Ô∏è‚É£ Criterios para dataset espa√±ol (positivos) ===
cond_es = (
    (df["is_ing_brand"] == True) |                                   # ING real
    (df["url"].str.contains(r"correos|paquete|entrega|recibir", case=False, na=False)) |  # log√≠stica ES
    (df["url"].str.contains(r"\.es|/es/|-es/", case=False, na=False)) |                   # dominio .es
    (df["url"].str.contains(r"bbva|santander|caixabank|bankinter|gobierno|aeat|seguridad|tarjeta", case=False, na=False)) |
    ((df["foreign_flag"] == False) & (df["compromised_host_es"] == False))                # no extranjero ni comprometido
)

# === 2Ô∏è‚É£ Criterios para ruido internacional (negativos) ===
cond_ruido = ~cond_es | (df["foreign_flag"] == True) | (df["compromised_host_es"] == True)

# === 3Ô∏è‚É£ Crear subconjuntos ===
df_es = df[cond_es].copy()
df_ruido = df[cond_ruido].copy()

# === 4Ô∏è‚É£ Guardar resultados ===
df_es.to_csv(OUTPUT_ES, index=False)
df_ruido.to_csv(OUTPUT_RUIDO, index=False)

# === 5Ô∏è‚É£ Resumen ===
print("‚úÖ Separaci√≥n completada:")
print(f" - Dataset espa√±ol candidato: {len(df_es)} URLs")
print(f" - Dataset ruido internacional: {len(df_ruido)} URLs")

# === 6Ô∏è‚É£ Muestra ejemplos ===
print("\nüîπ Ejemplos Espa√±a:")
print(df_es['url'].head(5).to_list())

print("\nüî∏ Ejemplos Ruido internacional:")
print(df_ruido['url'].head(5).to_list())


‚úÖ Separaci√≥n completada:
 - Dataset espa√±ol candidato: 15007 URLs
 - Dataset ruido internacional: 1213 URLs

üîπ Ejemplos Espa√±a:
['http://005pyn4t.com/bg/directing/www.atbonline.com/atb/question.php', 'http://0c4d4e6.wcomhost.com/banco-santander/particulares/iogin/home/sms_codigo.php', 'http://0c4d4e6.wcomhost.com/Banco-Santander/particulares/Iogin/home/sms_codigo.php', 'http://0efbd9f.wcomhost.com/de/banking-postbank/login/privatkunden/de/4055483f021838a047b2', 'http://0efbd9f.wcomhost.com/de/banking-postbank/login/privatkunden/DE/4055483f021838a047b2']

üî∏ Ejemplos Ruido internacional:
['http://1505270148.nzingaadvisors.co.za/ipfs/bafybeihhfqmqfzqrrctrrqgpdcnbym74jt3pdj5ojwslcol35425aqezle/mty5nju0njcymq?mtc0mzc3mtaznq==-sfmaxz2vulxbnec0xmduxotu4mjg0lwlmehjves5icmlja2vylwlzehbhcmfnb25zb2x1dglvbnmuy29tsf-1mc4w', 'http://3opqgzo7ypusuestivdlcb2bev8gqzooj7k6gyfvhm2rsmtvingbjdkrnfwrqx9.xzone.no/a/1/kjfar.php?p=n49gvzita0may6mek2k0sch2jnu0ejczwdwsgi938945scdsfghjsastytersz%20sder

In [4]:
# ============================================================
# üá™üá∏ An√°lisis de composici√≥n del dataset candidato espa√±ol
# Fuente: dataset_es_candidato.csv
# ============================================================

import pandas as pd
from urllib.parse import urlparse
from collections import Counter
import re

INPUT_PATH = "dataset_es_candidato.csv"
df = pd.read_csv(INPUT_PATH)

# === 1Ô∏è‚É£ Extraer dominio y TLD ===
def extract_tld(url):
    try:
        netloc = urlparse(url).netloc
        parts = netloc.split(".")
        if len(parts) > 1:
            return parts[-1].lower()
        return "unknown"
    except:
        return "error"

df["tld"] = df["url"].apply(extract_tld)

# === 2Ô∏è‚É£ An√°lisis de TLDs ===
tld_counts = df["tld"].value_counts().head(15)
print("üìä Principales TLDs:")
print(tld_counts)

# === 3Ô∏è‚É£ Presencia de marcas espa√±olas ===
spanish_brands = ["bbva", "santander", "caixabank", "bankinter", "unicaja", "correos", "aeat", "seg-social", "gobierno", "mapfre"]
df["spanish_brand_detected"] = df["url"].apply(lambda x: any(b in x.lower() for b in spanish_brands))
brand_counts = df["spanish_brand_detected"].value_counts()

print("\nüè¶ URLs con marcas espa√±olas:")
print(brand_counts)
print(f"Porcentaje con marca espa√±ola: {(brand_counts.get(True, 0) / len(df)) * 100:.2f}%")

# === 4Ô∏è‚É£ Tokens ling√º√≠sticos espa√±oles ===
spanish_tokens = ["tarjeta", "paquete", "entrega", "correo", "mensaj", "factura", "cliente", "banco", "movil", "seguridad", "iniciar", "acceso", "dni"]
def count_tokens(url):
    url_lower = url.lower()
    return sum(1 for token in spanish_tokens if token in url_lower)

df["spanish_token_count"] = df["url"].apply(count_tokens)
token_distribution = df["spanish_token_count"].value_counts().sort_index()

print("\nüî§ Distribuci√≥n de tokens espa√±oles detectados:")
print(token_distribution)

# === 5Ô∏è‚É£ URLs con fuerte se√±al espa√±ola (marca o ‚â•2 tokens ES) ===
df["es_signal_strong"] = (df["spanish_brand_detected"] == True) | (df["spanish_token_count"] >= 2)
strong_count = df["es_signal_strong"].sum()

print(f"\n‚úÖ URLs con se√±al espa√±ola fuerte: {strong_count} / {len(df)} ({(strong_count / len(df)) * 100:.2f}%)")

# === 6Ô∏è‚É£ Muestra ejemplos de cada tipo ===
print("\nüîπ Ejemplos con marcas espa√±olas:")
print(df[df["spanish_brand_detected"] == True]["url"].head(5).to_list())

print("\nüî∏ Ejemplos con tokens espa√±oles pero sin marca:")
print(df[(df["spanish_brand_detected"] == False) & (df["spanish_token_count"] >= 2)]["url"].head(5).to_list())


üìä Principales TLDs:
tld
com     5723
live    4593
page    1048
dev      738
org      375
net      282
123      259
app      233
me       227
es       200
ru       119
io        97
149       69
id        69
info      63
Name: count, dtype: int64

üè¶ URLs con marcas espa√±olas:
spanish_brand_detected
False    14442
True       565
Name: count, dtype: int64
Porcentaje con marca espa√±ola: 3.76%

üî§ Distribuci√≥n de tokens espa√±oles detectados:
spanish_token_count
0    14324
1      568
2      105
3       10
Name: count, dtype: int64

‚úÖ URLs con se√±al espa√±ola fuerte: 577 / 15007 (3.84%)

üîπ Ejemplos con marcas espa√±olas:
['http://0c4d4e6.wcomhost.com/banco-santander/particulares/iogin/home/sms_codigo.php', 'http://0c4d4e6.wcomhost.com/Banco-Santander/particulares/Iogin/home/sms_codigo.php', 'http://actividadesinusuales-santander-tarjetasonline.com/particulares', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/espera2.html', 'http://alerta-caixabank1.serveirc.com/ca

In [5]:
# =====================================================
# üá™üá∏ An√°lisis r√°pido de URLs con se√±al espa√±ola fuerte
# (requiere df cargado desde dataset_es_candidato.csv)
# =====================================================

spanish_brands = ["bbva", "santander", "caixabank", "bankinter", "unicaja", "correos", "aeat", "seg-social", "gobierno", "mapfre"]
spanish_tokens = ["tarjeta", "paquete", "entrega", "correo", "mensaj", "factura", "cliente", "banco", "movil", "seguridad", "iniciar", "acceso", "dni"]

df["spanish_brand_detected"] = df["url"].apply(lambda x: any(b in x.lower() for b in spanish_brands))
df["spanish_token_count"] = df["url"].apply(lambda x: sum(1 for token in spanish_tokens if token in x.lower()))
df_strong = df[(df["spanish_brand_detected"]) | (df["spanish_token_count"] >= 2)]

print(f"‚úÖ URLs con se√±al espa√±ola fuerte: {len(df_strong)} ({(len(df_strong)/len(df))*100:.2f}%)")

# üîπ Marcas m√°s comunes
print("\nüè¶ Marcas m√°s frecuentes:")
print(df_strong['url'].str.extract(r'(' + '|'.join(spanish_brands) + r')', expand=False).value_counts().head(10))

# üî∏ Tokens m√°s comunes
from collections import Counter
token_counter = Counter()
for url in df_strong["url"]:
    u = url.lower()
    for t in spanish_tokens:
        if t in u:
            token_counter[t] += 1
print("\nüî§ Tokens m√°s frecuentes:")
print(dict(sorted(token_counter.items(), key=lambda x: x[1], reverse=True)))

# üîç Ejemplos
print("\nüîπ Ejemplos de URLs representativas:")
print(df_strong["url"].head(10).to_list())



‚úÖ URLs con se√±al espa√±ola fuerte: 577 (3.84%)

üè¶ Marcas m√°s frecuentes:
url
santander    236
correos      175
bbva          70
caixabank     44
bankinter      7
unicaja        1
Name: count, dtype: int64

üî§ Tokens m√°s frecuentes:
{'correo': 179, 'paquete': 76, 'cliente': 43, 'banco': 32, 'tarjeta': 22, 'acceso': 22, 'movil': 20, 'seguridad': 7, 'iniciar': 3, 'entrega': 3, 'mensaj': 2}

üîπ Ejemplos de URLs representativas:
['http://0c4d4e6.wcomhost.com/banco-santander/particulares/iogin/home/sms_codigo.php', 'http://0c4d4e6.wcomhost.com/Banco-Santander/particulares/Iogin/home/sms_codigo.php', 'http://actividadesinusuales-santander-tarjetasonline.com/particulares', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/espera2.html', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/espera.html', 'http://alerta-caixabank1.serveirc.com/caixa-vbvfinal/home/tarjeta.html', 'http://alsheharymedical.com.ye/retail.santander.co.uk_LOGSUK_NS_ENS_BtoChannelDriver.ssobto_

In [6]:
# =====================================================
# üíæ Exportar subconjunto de se√±al espa√±ola fuerte
# =====================================================

OUTPUT_PATH = "dataset_es_signal_strong.csv"

df_strong.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ Archivo exportado: {OUTPUT_PATH}")
print(f"Total de URLs guardadas: {len(df_strong)}")


‚úÖ Archivo exportado: dataset_es_signal_strong.csv
Total de URLs guardadas: 577
