In [37]:
#  Introducción
"""
Feature Engineering – Prototipo Phishing
Objetivo: probar primeras features en dataset balanceado (100 phishing, 100 legítimas).
"""

import pandas as pd
import numpy as np
import tldextract
from collections import Counter
import math

# Cargar dataset
df = pd.read_csv("../data/dataset/dataset_prototipo.csv")
df.head()


Unnamed: 0,url,label,categoria,matched_target,notas,campaign
0,https://www.caixabank.es/particular/banca-digi...,0,banca,caixabank,home bancaria oficial,
1,https://accounts.google.com/ServiceLogin?servi...,0,saas,google,login Google oficial,
2,https://www.roblox.com/es/upgrades/robux?ctx=n...,0,gaming,roblox,portal Roblox oficial,
3,https://zoom.us/es/join,0,saas,zoom,página de reuniones Zoom oficial,
4,https://www.roblox.com/es/login,0,gaming,roblox,login Roblox oficial,


In [2]:
# 2. Funciones auxiliares

def domain_entropy(domain: str) -> float:
    """Calcula la entropía de un dominio."""
    counter = Counter(domain)
    probs = [freq/len(domain) for freq in counter.values()]
    return -sum(p * math.log2(p) for p in probs)


In [3]:
# 3. Feature: longitud de dominio
df["domain"] = df["url"].apply(lambda x: tldextract.extract(x).domain)
df["domain_length"] = df["domain"].str.len()

print("Distribución longitud de dominio:")
print(df.groupby("label")["domain_length"].describe())


Distribución longitud de dominio:
       count   mean       std  min  25%   50%   75%   max
label                                                    
0      100.0   8.14  3.840297  3.0  5.0   8.0  10.0  23.0
1      100.0  11.27  5.116531  2.0  7.0  11.0  14.0  24.0


In [4]:
# 4. Feature: profundidad de ruta
df["path_depth"] = df["url"].apply(lambda x: x.count("/"))
print("Distribución profundidad de ruta:")
print(df.groupby("label")["path_depth"].describe())


Distribución profundidad de ruta:
       count  mean       std  min  25%  50%  75%  max
label                                                
0      100.0  4.18  1.472852  2.0  3.0  4.0  5.0  9.0
1      100.0  3.32  1.081338  2.0  3.0  3.0  4.0  7.0


In [5]:
# 5. Feature: contiene @
df["contains_at"] = df["url"].apply(lambda x: 1 if "@" in x else 0)

print("Conteo de '@' por clase:")
print(df.groupby("label")["contains_at"].sum())


Conteo de '@' por clase:
label
0    0
1    2
Name: contains_at, dtype: int64


In [6]:
# Feature: entropía del dominio
df["domain_entropy"] = df["domain"].apply(domain_entropy)

print("Distribución entropía de dominio:")
print(df.groupby("label")["domain_entropy"].describe())


Distribución entropía de dominio:
       count      mean       std  min       25%       50%       75%       max
label                                                                        
0      100.0  2.474852  0.554276  1.5  1.921928  2.643599  2.924248  3.414441
1      100.0  2.891224  0.631226  1.0  2.584963  3.090379  3.277613  3.821928


In [7]:
# Feature: número de parámetros en query
from urllib.parse import urlparse

def count_params(url: str) -> int:
    parsed = urlparse(url)
    if parsed.query == "":
        return 0
    return len(parsed.query.split("&"))

df["num_params"] = df["url"].apply(count_params)

print("Distribución de número de parámetros:")
print(df.groupby("label")["num_params"].describe())


Distribución de número de parámetros:
       count  mean       std  min  25%  50%  75%  max
label                                                
0      100.0  0.04  0.242878  0.0  0.0  0.0  0.0  2.0
1      100.0  0.09  0.320826  0.0  0.0  0.0  0.0  2.0


In [8]:
# Feature: contiene '%'
df["contains_percent"] = df["url"].apply(lambda x: 1 if "%" in x else 0)

print("Conteo de '%' por clase:")
print(df.groupby("label")["contains_percent"].sum())

print("\nDistribución de '%' por clase:")
print(df.groupby("label")["contains_percent"].mean())


Conteo de '%' por clase:
label
0    0
1    6
Name: contains_percent, dtype: int64

Distribución de '%' por clase:
label
0    0.00
1    0.06
Name: contains_percent, dtype: float64


In [9]:
# Feature: número de '.' en toda la URL
df["dots_in_url"] = df["url"].apply(lambda x: x.count("."))

print("Distribución de puntos en URL completa:")
print(df.groupby("label")["dots_in_url"].describe())


Distribución de puntos en URL completa:
       count  mean       std  min   25%  50%  75%  max
label                                                 
0      100.0  2.12  0.477367  1.0  2.00  2.0  2.0  4.0
1      100.0  1.96  0.777460  1.0  1.75  2.0  2.0  5.0


In [10]:
# Feature: número de puntos en el dominio
df["dots_in_domain"] = df["url"].apply(lambda x: tldextract.extract(x).subdomain.count(".") + 1 if tldextract.extract(x).subdomain else 0)

print("Distribución de puntos en dominio:")
print(df.groupby("label")["dots_in_domain"].describe())


Distribución de puntos en dominio:
       count  mean       std  min  25%  50%  75%  max
label                                                
0      100.0  0.97  0.264193  0.0  1.0  1.0  1.0  2.0
1      100.0  0.69  0.706321  0.0  0.0  1.0  1.0  4.0


In [11]:
# Feature: contiene '='
df["contains_equal"] = df["url"].apply(lambda x: 1 if "=" in x else 0)

print("Conteo de '=' por clase:")
print(df.groupby("label")["contains_equal"].sum())

print("\nDistribución de '=' por clase:")
print(df.groupby("label")["contains_equal"].mean())


Conteo de '=' por clase:
label
0    3
1    8
Name: contains_equal, dtype: int64

Distribución de '=' por clase:
label
0    0.03
1    0.08
Name: contains_equal, dtype: float64


In [12]:
# Feature: protocolo (http = 0, https = 1)
df["protocol"] = df["url"].apply(lambda x: 1 if x.startswith("https") else 0)

print("Conteo de protocolos por clase:")
print(df.groupby("label")["protocol"].value_counts())

print("\nDistribución de https por clase:")
print(df.groupby("label")["protocol"].mean())


Conteo de protocolos por clase:
label  protocol
0      1           100
1      1            85
       0            15
Name: count, dtype: int64

Distribución de https por clase:
label
0    1.00
1    0.85
Name: protocol, dtype: float64


In [13]:
# Feature: TLD de la URL
df["tld"] = df["url"].apply(lambda x: tldextract.extract(x).suffix)

print("Top TLDs en legítimas:")
print(df[df["label"] == 0]["tld"].value_counts().head(10))

print("\nTop TLDs en phishing:")
print(df[df["label"] == 1]["tld"].value_counts().head(10))


Top TLDs en legítimas:
tld
es         59
com        34
gob.es      4
us          1
network     1
net         1
Name: count, dtype: int64

Top TLDs en phishing:
tld
com        44
es         15
me          5
app         5
com.es      5
net         4
digital     4
info        3
io          2
top         2
Name: count, dtype: int64


In [14]:
# Agrupamos en "es", "com" y "otros"
def tld_group(tld: str) -> str:
    if tld == "es":
        return "es"
    elif tld == "com":
        return "com"
    else:
        return "otros"

df["tld_group"] = df["tld"].apply(tld_group)

print("\nDistribución de TLD agrupados por clase:")
print(df.groupby("label")["tld_group"].value_counts(normalize=True))



Distribución de TLD agrupados por clase:
label  tld_group
0      es           0.59
       com          0.34
       otros        0.07
1      com          0.44
       otros        0.41
       es           0.15
Name: proportion, dtype: float64


In [15]:
# Feature: tokens sospechosos en la ruta
SUSPICIOUS_TOKENS = ["php", "html", "index", "view", "principal"]

def has_suspicious_token(url: str) -> int:
    path = urlparse(url).path.lower()
    return 1 if any(tok in path for tok in SUSPICIOUS_TOKENS) else 0

df["suspicious_path_token"] = df["url"].apply(has_suspicious_token)

print("Conteo de tokens sospechosos en ruta por clase:")
print(df.groupby("label")["suspicious_path_token"].sum())

print("\nDistribución de tokens sospechosos en ruta por clase:")
print(df.groupby("label")["suspicious_path_token"].mean())


Conteo de tokens sospechosos en ruta por clase:
label
0    10
1    20
Name: suspicious_path_token, dtype: int64

Distribución de tokens sospechosos en ruta por clase:
label
0    0.1
1    0.2
Name: suspicious_path_token, dtype: float64


In [16]:
feature_cols = ["domain_length", "path_depth", "domain_entropy", 
                "num_params", "contains_at", "contains_percent", 
                "contains_equal", "protocol", "tld_group", 
                "suspicious_path_token"]

df["features_sum"] = df[feature_cols].apply(lambda row: sum([1 if v != 0 else 0 for v in row]), axis=1)

print(df.groupby("label")["features_sum"].describe())


       count  mean       std  min  25%  50%  75%  max
label                                                
0      100.0  5.16  0.443129  5.0  5.0  5.0  5.0  7.0
1      100.0  5.29  0.807728  4.0  5.0  5.0  6.0  8.0


In [17]:
# Selección de columnas finales
feature_cols = [
    "domain_length",
    "path_depth",
    "domain_entropy",
    "num_params",
    "contains_at",
    "contains_percent",
    "contains_equal",
    "protocol",
    "tld_group",
    "suspicious_path_token"
]

# Dataset con features + label + url
features_final = df[["url", "label"] + feature_cols]

print("✅ Features exportadas a features/features_prototipo.csv")
print("Shape final:", features_final.shape)
features_final.head()


✅ Features exportadas a features/features_prototipo.csv
Shape final: (200, 12)


Unnamed: 0,url,label,domain_length,path_depth,domain_entropy,num_params,contains_at,contains_percent,contains_equal,protocol,tld_group,suspicious_path_token
0,https://www.caixabank.es/particular/banca-digi...,0,9,4,2.641604,0,0,0,0,1,es,1
1,https://accounts.google.com/ServiceLogin?servi...,0,6,3,1.918296,1,0,0,1,1,com,0
2,https://www.roblox.com/es/upgrades/robux?ctx=n...,0,6,5,2.251629,1,0,0,1,1,com,0
3,https://zoom.us/es/join,0,4,4,1.5,0,0,0,0,1,otros,0
4,https://www.roblox.com/es/login,0,6,4,2.251629,0,0,0,0,1,com,0


In [19]:
import matplotlib.pyplot as plt
import seaborn as sns

numeric_features = ["domain_length", "path_depth", "domain_entropy", "num_params"]

for col in numeric_features:
    plt.figure(figsize=(6,4))
    sns.boxplot(x="label", y=col, data=features_final)
    plt.title(f"Distribución de {col} por clase (0=legítima, 1=phishing)")
    plt.savefig(f"features/img/{col}_boxplot.png", dpi=120, bbox_inches="tight")
    plt.close()


In [20]:
binary_features = ["contains_at", "contains_percent", "contains_equal", 
                   "protocol", "tld_group", "suspicious_path_token"]

for col in binary_features:
    plt.figure(figsize=(6,4))
    sns.countplot(x=col, hue="label", data=features_final)
    plt.title(f"{col} por clase (0=legítima, 1=phishing)")
    plt.savefig(f"features/img/{col}_barplot.png", dpi=120, bbox_inches="tight")
    plt.close()


In [22]:
# Lista de dominios/hostings gratuitos comunes en phishing
FREE_HOSTINGS = [
    "webcindario.com", "000webhostapp.com", "rf.gd", "hol.es", "biz.nf",
    "blogspot.com", "wordpress.com", "weebly.com", "wix.com",
    "web.app", "firebaseapp.com", "sites.google.com", "godaddysites.com",
    "ead.me", "ucoz.net", "tk", "ml", "ga", "cf", "gq"
]

def is_free_hosting(url: str) -> int:
    domain = tldextract.extract(url).registered_domain.lower()
    return 1 if any(host in domain for host in FREE_HOSTINGS) else 0

df["free_hosting"] = df["url"].apply(is_free_hosting)

print("Conteo de free_hosting por clase:")
print(df.groupby("label")["free_hosting"].sum())

print("\nDistribución de free_hosting por clase:")
print(df.groupby("label")["free_hosting"].mean())


Conteo de free_hosting por clase:
label
0     0
1    22
Name: free_hosting, dtype: int64

Distribución de free_hosting por clase:
label
0    0.00
1    0.22
Name: free_hosting, dtype: float64


  domain = tldextract.extract(url).registered_domain.lower()


In [23]:
# Añadir la columna free_hosting al dataset final de features
features_final = df[[
    "url", "label", "domain_length", "domain_entropy", "num_params",
    "contains_at", "contains_percent", "contains_equal",
    "protocol", "tld_group", "suspicious_path_token", "free_hosting"
]]

# Guardar el CSV actualizado
features_final.to_csv("features/features_prototipo.csv", index=False)

print("✅ CSV actualizado con free_hosting guardado en features/features_prototipo.csv")
print("Shape final:", features_final.shape)
features_final.head()


✅ CSV actualizado con free_hosting guardado en features/features_prototipo.csv
Shape final: (200, 12)


Unnamed: 0,url,label,domain_length,domain_entropy,num_params,contains_at,contains_percent,contains_equal,protocol,tld_group,suspicious_path_token,free_hosting
0,https://www.caixabank.es/particular/banca-digi...,0,9,2.641604,0,0,0,0,1,es,1,0
1,https://accounts.google.com/ServiceLogin?servi...,0,6,1.918296,1,0,0,1,1,com,0,0
2,https://www.roblox.com/es/upgrades/robux?ctx=n...,0,6,2.251629,1,0,0,1,1,com,0,0
3,https://zoom.us/es/join,0,4,1.5,0,0,0,0,1,otros,0,0
4,https://www.roblox.com/es/login,0,6,2.251629,0,0,0,0,1,com,0,0


In [24]:
# Lista de tokens considerados "legitimadores"
TRUSTED_TOKENS = ["login", "clientes", "empresas", "banca", "seguridad"]

def has_trusted_token(url: str) -> int:
    path = urlparse(url).path.lower()
    return 1 if any(tok in path for tok in TRUSTED_TOKENS) else 0

df["trusted_path_token"] = df["url"].apply(has_trusted_token)

print("Conteo de trusted_path_token por clase:")
print(df.groupby("label")["trusted_path_token"].sum())

print("\nDistribución de trusted_path_token por clase:")
print(df.groupby("label")["trusted_path_token"].mean())

Conteo de trusted_path_token por clase:
label
0    33
1     5
Name: trusted_path_token, dtype: int64

Distribución de trusted_path_token por clase:
label
0    0.33
1    0.05
Name: trusted_path_token, dtype: float64


In [25]:
# Añadir la columna trusted_path_token al dataset final de features
features_final = df[[
    "url", "label", "domain_length", "domain_entropy", "num_params",
    "contains_at", "contains_percent", "contains_equal",
    "protocol", "tld_group", "suspicious_path_token",
    "free_hosting", "trusted_path_token"
]]

# Guardar el CSV actualizado
features_final.to_csv("features/features_prototipo.csv", index=False)

print("✅ CSV actualizado con trusted_path_token guardado en features/features_prototipo.csv")
print("Shape final:", features_final.shape)
features_final.head()


✅ CSV actualizado con trusted_path_token guardado en features/features_prototipo.csv
Shape final: (200, 13)


Unnamed: 0,url,label,domain_length,domain_entropy,num_params,contains_at,contains_percent,contains_equal,protocol,tld_group,suspicious_path_token,free_hosting,trusted_path_token
0,https://www.caixabank.es/particular/banca-digi...,0,9,2.641604,0,0,0,0,1,es,1,0,1
1,https://accounts.google.com/ServiceLogin?servi...,0,6,1.918296,1,0,0,1,1,com,0,0,1
2,https://www.roblox.com/es/upgrades/robux?ctx=n...,0,6,2.251629,1,0,0,1,1,com,0,0,0
3,https://zoom.us/es/join,0,4,1.5,0,0,0,0,1,otros,0,0,0
4,https://www.roblox.com/es/login,0,6,2.251629,0,0,0,0,1,com,0,0,1
