In [1]:
import sys
import os

# Add the project root to sys.path if it is not already there.
# Resolution order, so the notebook is not tied to one developer's machine:
#   1. the PHISHING_DETECTOR_ROOT environment variable, when set;
#   2. the parent of the working directory, when it contains a "features"
#      folder (the notebook already reads "../docs/..." relative to the cwd);
#   3. the original hard-coded location, kept as a last-resort fallback.
_FALLBACK_ROOT = "/Users/test/Desktop/phishing-detector"
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
PROJECT_ROOT = os.environ.get("PHISHING_DETECTOR_ROOT") or (
    _parent_dir
    if os.path.isdir(os.path.join(_parent_dir, "features"))
    else _FALLBACK_ROOT
)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from features.features_v2 import extract_features_v2, pretty_print_features
from features.features_constantes import (
    SUSPICIOUS_TOKENS_WEIGHT,
    SUSPICIOUS_TOKENS,
    TRUSTED_TOKENS,
    TLD_RISK,
    FREE_HOSTING,
)


print("Imports OK")


Imports OK


In [2]:
import pandas as pd

df = pd.read_csv("../docs/dominios_espanyoles.csv")

# Normalise the "domain" column (trimmed, lower-case) and discard missing or
# blank rows first, so the whitelist set contains only non-empty strings
# instead of picking up NaN / "" entries from incomplete CSV lines.
domains = df["domain"].dropna().astype(str).str.strip().str.lower()
whitelist = set(domains[domains != ""])


In [3]:
def show(url, domains=None):
    """Extract the v2 features for ``url``, print them, and return them.

    Args:
        url: URL string handed to ``extract_features_v2``.
        domains: Optional whitelist to score against. When omitted, the
            notebook-level ``whitelist`` (built from the CSV in the cell
            above) is used. The previous version read ``spanish_whitelist``,
            which is only assigned much further down the notebook, so the
            helper failed with NameError on a top-to-bottom run.

    Returns:
        The value returned by ``extract_features_v2`` for this URL.
    """
    if domains is None:
        domains = whitelist
    feats = extract_features_v2(url, domains)
    print(f"\nURL: {url}\n{feats}")
    return feats


In [4]:
# Smoke test: extract the features for one URL and print how many values come back.
features = extract_features_v2("https://www.amazon.es", whitelist)
print(len(features))


9


In [5]:
# Inspect the Python type of every element in the feature vector.
print([type(x) for x in features])


[<class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>, <class 'float'>]


In [6]:
# NaN check: NaN is the only float that compares unequal to itself, so this
# lists any NaN entries in the feature vector (an empty list means none).
[x for x in features if x != x]


[]

In [7]:
# Print the same vector in labelled, one-feature-per-line form.
pretty_print_features(features)


domain_complexity: 20.264662506490403
host_entropy: -0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [8]:
# Garbage input that is not a URL; the extractor returns a vector rather than raising.
invalid = extract_features_v2("%%%%", whitelist)
pretty_print_features(invalid)


domain_complexity: -0.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [9]:
# Brand-in-subdomain case: "bbva.es" appears as the leading host labels of a
# cloudflare.net host, scored against a small inline whitelist.
extract_features_v2(
    "https://bbva.es.cdn.cloudflare.net/clientes/area-cliente/login",
    {"bbva.es", "caixabank.es", "ing.es"}
)


[43.70699332842307, 2.94770277922009, 0.0, 0.0, 0.0, -1.0, 0.0, 1.0, 0.0]

In [10]:
# Comparison case: the bare cloudflare.net domain with a login path and the
# same inline whitelist.
extract_features_v2(
    "https://cloudflare.net/login/identificacion",
    {"bbva.es", "caixabank.es", "ing.es"}
)


[43.70699332842307, 0.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0]

In [11]:
# Hand-written three-domain whitelist, then a login URL hosted directly on one
# of those domains.
spanish_whitelist = {"caixabank.es", "bbva.es", "ing.es"}

extract_features_v2(
    "https://caixabank.es/particulares/area-cliente/login",
    spanish_whitelist
)


[31.69925001442312, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]

In [12]:
# Bare google.es with no path or query.
pretty_print_features(extract_features_v2("https://google.es", whitelist))


domain_complexity: 17.264662506490403
host_entropy: 0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [13]:
# Domain label made of a single repeated character.
pretty_print_features(extract_features_v2("https://aaaa.com", whitelist))


domain_complexity: -0.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [14]:
# Random-looking alphanumeric domain label.
pretty_print_features(extract_features_v2("https://ajsd92ksmqlf.com", whitelist))


domain_complexity: 54.692733344871826
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [15]:
# Degenerate URL: http scheme with an empty host.
pretty_print_features(extract_features_v2("http:///", whitelist))


domain_complexity: 0.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.3
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [16]:
# Repeat of the earlier bare google.es call.
pretty_print_features(extract_features_v2("https://google.es", whitelist))


domain_complexity: 17.264662506490403
host_entropy: 0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [17]:
# Same domain with a leading "www." label.
pretty_print_features(extract_features_v2("https://www.google.es", whitelist))


domain_complexity: 17.264662506490403
host_entropy: -0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [18]:
# Random-looking subdomain in front of google.es.
pretty_print_features(extract_features_v2("https://as9dk3pqz.google.es", whitelist))


domain_complexity: 17.264662506490403
host_entropy: 3.169925001442312
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [19]:
# amazon.es without the "www." label used in the first smoke test.
pretty_print_features(extract_features_v2("https://amazon.es", whitelist))


domain_complexity: 20.264662506490403
host_entropy: 0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [20]:
# .es domain whose label embeds the word "fake".
pretty_print_features(extract_features_v2("https://pepephonefake.es", whitelist))


domain_complexity: 43.50871241067167
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [21]:
# Lookalike host: reads as "amazon.es" but the host actually ends in es-login.com.
pretty_print_features(extract_features_v2("https://amazon.es-login.com", whitelist))


domain_complexity: 36.0
host_entropy: 2.2516291673878226
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 1.0
param_count_boost: 0.0


In [22]:
# Path made of Spanish payment-confirmation words on an unrelated .com domain.
pretty_print_features(extract_features_v2("https://malicioso.com/confirmar-pago", whitelist))


domain_complexity: 35.43124724097228
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 1.0
token_density: 0.5
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [23]:
# Neutral baseline: generic domain with a plain "/home" path.
pretty_print_features(extract_features_v2("https://example.com/home", whitelist))


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [24]:
# Path mixing "_" and a percent-encoded space ("%20") between words.
pretty_print_features(extract_features_v2("https://fake.com/verificar_cliente%20seguridad", whitelist))


domain_complexity: 16.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 1.0
token_density: 0.2222222222222222
trusted_token_context: -1.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [25]:
# Neutral two-segment path.
pretty_print_features(extract_features_v2("https://example.com/home/test", whitelist))


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [26]:
# Single-word path ("pago").
pretty_print_features(extract_features_v2("https://fake.com/pago", whitelist))


domain_complexity: 16.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 1.0
token_density: 0.5
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [27]:
# Four path segments, one word each.
pretty_print_features(
    extract_features_v2("https://fake.com/seguridad/cliente/verificar/pago", whitelist)
)


domain_complexity: 16.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 1.0
token_density: 0.5833333333333333
trusted_token_context: -1.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [28]:
# Login path served from bbva.es itself.
pretty_print_features(
    extract_features_v2("https://bbva.es/area-cliente/login", whitelist)
)


domain_complexity: 10.5
host_entropy: 0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 1.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [29]:
# Same login path, but on the bbva.es-login.com lookalike host.
pretty_print_features(
    extract_features_v2("https://bbva.es-login.com/area-cliente/login", whitelist)
)


domain_complexity: 36.0
host_entropy: 1.5
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: -1.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 1.0
param_count_boost: 0.0


In [30]:
# google.es with a neutral "/home" path.
pretty_print_features(
    extract_features_v2("https://google.es/home", whitelist)
)


domain_complexity: 17.264662506490403
host_entropy: 0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [31]:
# Plain-HTTP variant of google.es (the scheme is the only difference from the
# https calls above).
pretty_print_features(
    extract_features_v2("http://google.es", whitelist)
)


domain_complexity: 17.264662506490403
host_entropy: 0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.3
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [32]:
# Login path on a .live domain.
pretty_print_features(
    extract_features_v2("https://phishingsite.live/login", whitelist)
)


domain_complexity: 48.54160521752807
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: -1.0
infra_risk: 3.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [33]:
# Login path on a subdomain of rf.gd.
pretty_print_features(
    extract_features_v2("https://malware.rf.gd/login", whitelist)
)


domain_complexity: 5.0
host_entropy: 2.5216406363433186
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: -1.0
infra_risk: 1.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [34]:
# Plain-HTTP URL on the same rf.gd host with a "pago" path.
pretty_print_features(
    extract_features_v2("http://malware.rf.gd/pago", whitelist)
)


domain_complexity: 5.0
host_entropy: 2.5216406363433186
domain_whitelist_score: 0.0
suspicious_path_token: 1.0
token_density: 0.5
trusted_token_context: 0.0
infra_risk: 1.3
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [35]:
# Lookalike bbva.es-login.com host with a "/seguridad" path.
pretty_print_features(
    extract_features_v2("https://bbva.es-login.com/seguridad", whitelist)
)


domain_complexity: 36.0
host_entropy: 1.5
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.3333333333333333
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 1.0
param_count_boost: 0.0


In [36]:
# Exact repeat of the previous cell's call.
pretty_print_features(
    extract_features_v2("https://bbva.es-login.com/seguridad", whitelist)
)


domain_complexity: 36.0
host_entropy: 1.5
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.3333333333333333
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 1.0
param_count_boost: 0.0


In [37]:
# Repeat of the neutral example.com/home baseline, ahead of the query-string
# variants below.
pretty_print_features(
    extract_features_v2("https://example.com/home", whitelist)
)


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [38]:
# One query parameter.
pretty_print_features(
    extract_features_v2("https://example.com/home?x=1", whitelist)
)


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.5


In [39]:
# Two query parameters.
pretty_print_features(
    extract_features_v2("https://example.com/home?a=1&b=2", whitelist)
)


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.6666666666666666


In [40]:
# Three query parameters, the first with an empty value.
pretty_print_features(
    extract_features_v2("https://example.com/home?a=&b=1&c=2", whitelist)
)


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.75


In [41]:
# Input without a scheme.
pretty_print_features(
    extract_features_v2("www.bbva.es/login", whitelist)
)


domain_complexity: 10.5
host_entropy: -0.0
domain_whitelist_score: 1.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 1.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [42]:
# google.com (the .com sibling of the google.es cases).
pretty_print_features(
    extract_features_v2("https://google.com", whitelist)
)


domain_complexity: 19.182958340544893
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [43]:
# Very long URL: 300 one-letter path segments, then "login" and three query parameters.
long_url = "https://example.com/" + ("a/" * 300) + "login?x=1&y=2&z=3"
pretty_print_features(extract_features_v2(long_url, whitelist))


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: -1.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.75


In [44]:
# Sanity-check the generated URL: total length and its tail.
print(len(long_url))
print(long_url[-50:])  # last 50 chars


637
/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/a/login?x=1&y=2&z=3


In [45]:
# Same 300-segment path, ending in "verificar" with no query string.
long_url2 = "https://example.com/" + ("a/" * 300) + "verificar"
pretty_print_features(extract_features_v2(long_url2, whitelist))


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 1.0
token_density: 0.0033003300330033004
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [46]:
# Bare example.com with no path.
pretty_print_features(
    extract_features_v2("https://example.com", whitelist)
)


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [47]:
# Non-ASCII character in the host plus a path mixing "/", "%20", "_" and "-" separators.
pretty_print_features(
    extract_features_v2("https://exámple.com/a/b/c%20d/e_f/g-h?x=1", whitelist)
)


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.5


In [48]:
# Login path on a long TLD (.technology).
pretty_print_features(
    extract_features_v2("https://example.technology/login", whitelist)
)


domain_complexity: 45.38953145417973
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: -1.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [51]:
# Three query parameters, two of them with empty values.
pretty_print_features(
    extract_features_v2("https://example.com/test?a=&b=&c=1", whitelist)
)


domain_complexity: 27.738046999776504
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.75


In [52]:
# Another repeat of the bbva.es-login.com/seguridad call.
pretty_print_features(
    extract_features_v2("https://bbva.es-login.com/seguridad", whitelist)
)


domain_complexity: 36.0
host_entropy: 1.5
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.3333333333333333
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 1.0
param_count_boost: 0.0


In [53]:
# Malformed scheme ("ht!tp") and an invalid percent escape ("%ZZ").
pretty_print_features(
    extract_features_v2("ht!tp://%ZZ_bad_url", whitelist)
)


domain_complexity: 0.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [54]:
# None instead of a string.
pretty_print_features(
    extract_features_v2(None, whitelist)
)


domain_complexity: 0.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0


In [55]:
# Empty string.
pretty_print_features(
    extract_features_v2("", whitelist)
)

domain_complexity: 0.0
host_entropy: 0.0
domain_whitelist_score: 0.0
suspicious_path_token: 0.0
token_density: 0.0
trusted_token_context: 0.0
infra_risk: 0.0
fake_tld_in_subdomain_or_path: 0.0
param_count_boost: 0.0
