In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Load the engineered-feature table produced by the previous pipeline stage.
df = pd.read_csv("dataset_v2_features.csv")

# Columns that make up the V3 feature vector analysed in this notebook.
features_v3 = [
    "domain_complexity",
    "domain_whitelist",
    "trusted_token_context",
    "host_entropy",
    "infra_risk",
    "brand_in_path",
    "brand_match_flag",
]

# Defensive cast: make sure the target column is integer-typed before it is
# used as a grouping / hue variable.
df = df.astype({"label": int})

# Fail fast if the CSV does not carry every V3 feature column.
assert set(features_v3).issubset(df.columns), "Faltan columnas V3"



In [3]:
# Every figure produced by this cell is written under this directory.
img_dir = Path("EDA/img_v3")
img_dir.mkdir(parents=True, exist_ok=True)

# Per-class histogram of each V3 feature.
for feat in features_v3:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.histplot(
        data=df,
        x=feat,
        hue="label",
        bins=40,
        kde=False,
        stat="density",
        # Normalise each class on its own. With seaborn's default
        # (common_norm=True) the density is shared across hue levels, so the
        # minority class is scaled down by its share of rows and the shapes
        # of the two distributions cannot be compared.
        common_norm=False,
        palette="Set1",
        alpha=0.7,
        ax=ax,
    )
    ax.set_title(f"Distribución por clase: {feat}")
    fig.tight_layout()
    fig.savefig(img_dir / f"{feat}_hist.png")
    # Close explicitly so figures do not pile up in memory across the loop.
    plt.close(fig)

# Boxplots showing how well the continuous features separate the classes.
for feat in ["domain_complexity", "host_entropy", "infra_risk"]:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.boxplot(data=df, x="label", y=feat, ax=ax)
    ax.set_title(f"Boxplot {feat} por clase")
    # Same layout pass as the other figures; without it axis labels can be
    # clipped in the saved PNG at this small figure size.
    fig.tight_layout()
    fig.savefig(img_dir / f"{feat}_boxplot.png")
    plt.close(fig)

# Correlation heatmap restricted to the V3 feature columns.
corr = df[features_v3].corr()
fig, ax = plt.subplots(figsize=(8, 5))
# vmin/vmax pin the colour scale to the full [-1, 1] correlation range so the
# diverging palette is centred on zero.
sns.heatmap(corr, annot=True, cmap="RdBu_r", vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlación interna del vector v3")
fig.tight_layout()
fig.savefig(img_dir / "correlacion_heatmap.png")
plt.close(fig)



In [4]:
# Contract-coherence checks between V3 features.
# Each mask flags the rows that VIOLATE one implication of the feature contract.

# Rule 1: domain_whitelist == 1  ->  trusted_token_context == +1
mask_rule1 = (df["domain_whitelist"] == 1) & (df["trusted_token_context"] != 1)
check1 = df[mask_rule1]

# Rule 2: brand_match_flag == 1  ->  domain_whitelist == 1
mask_rule2 = (df["brand_match_flag"] == 1) & (df["domain_whitelist"] != 1)
check2 = df[mask_rule2]

# Rule 3: brand_in_path == 1  ->  domain_whitelist == 0
mask_rule3 = (df["brand_in_path"] == 1) & (df["domain_whitelist"] != 0)
check3 = df[mask_rule3]

# Union of all violations, one entry per offending dataset row.
# OR-ing the masks (instead of concat + drop_duplicates) avoids listing a row
# twice when it breaks several rules, while still keeping distinct rows that
# happen to hold identical values; value-based de-duplication would merge
# those and under-report the count. Rows come out in dataset order.
incoherencias = df[mask_rule1 | mask_rule2 | mask_rule3]

# Persist the offending rows for inspection and report how many were found.
Path("outputs").mkdir(exist_ok=True)
incoherencias.to_csv("outputs/eda_v3_incoherencias.csv", index=False)
print(f"Incoherencias detectadas: {incoherencias.shape[0]}")


Incoherencias detectadas: 0


In [5]:
# Thresholds used by the automatic summary below.
MIN_MEAN_SEPARATION = 0.07  # below this |mean diff| a feature is flagged as non-discriminative
MAX_ZERO_SHARE = 0.15       # max tolerated fraction of rows exactly equal to 0
MAX_ONE_SHARE = 0.10        # max tolerated fraction of rows exactly equal to 1

# Lines of the markdown report, accumulated in order.
resumen = []

# Feature discrimination: absolute difference between the per-class means.
for feat in ["domain_complexity", "host_entropy", "infra_risk"]:
    d0 = df.loc[df["label"] == 0, feat]
    d1 = df.loc[df["label"] == 1, feat]
    sep = abs(d0.mean() - d1.mean())
    resumen.append(f"{feat}: diferencia de medias legit/phishing = {sep:.3f}")
    if sep < MIN_MEAN_SEPARATION:
        resumen.append(f"⚠️ {feat} no discrimina clases (mean legit ≃ mean phish)")


def global_mean(f):
    """Return the mean of column ``f`` over the whole frame (not used in this cell)."""
    return df[f].mean()


# Extremes: flag features whose boundary values (0 or 1) are suspiciously common.
n_rows = len(df)
for feat in ["domain_complexity"]:
    zeros = int((df[feat] == 0).sum())
    ones = int((df[feat] == 1).sum())
    # `n_rows and ...` skips the share test on an empty frame instead of
    # dividing by zero.
    if n_rows and (zeros / n_rows > MAX_ZERO_SHARE or ones / n_rows > MAX_ONE_SHARE):
        resumen.append(f"⚠️ {feat}: valor extremo frecuente (0→{zeros}, 1→{ones})")

# List the URLs involved in contract incoherences, when the frame has a "url" column.
urls_problema = incoherencias["url"].unique() if "url" in incoherencias else []
resumen.extend([f"Incoherencia contractual en URL: {u}" for u in urls_problema])

# Write the report. The encoding is explicit because the text contains
# non-ASCII symbols (⚠️, ≃, →) that the platform default codec (e.g. cp1252
# on Windows) cannot encode, which would raise UnicodeEncodeError.
Path("docs/EDA").mkdir(parents=True, exist_ok=True)
with open("docs/EDA/eda_v3_resumen.md", "w", encoding="utf-8") as report_file:
    report_file.write("# Resumen EDA contractual v3\n\n")
    report_file.write("\n".join(resumen))
print("Resumen EDA v3 generado en docs/EDA/eda_v3_resumen.md")


Resumen EDA v3 generado en docs/EDA/eda_v3_resumen.md
