In [1]:
from urllib.parse import urlparse
from datetime import datetime
import pandas as pd
import re
import whois
import time
import requests
import os
from tqdm import tqdm
from scipy.stats import entropy
from bs4 import BeautifulSoup

In [2]:
from bs4 import BeautifulSoup


# -------- Función de extracción --------
def contar_digitos(texto):
    return sum(c.isdigit() for c in texto)

def obtener_tld(subdominio):
    return subdominio.split('.')[-1] if '.' in subdominio else ''

def string_entropy(s):
    prob = [s.count(c) / len(s) for c in set(s)]
    return entropy(prob, base=2)

def obtener_google_index(url):
    try:
        # Extraer solo el dominio
        parsed = urlparse(url)
        dominio = parsed.hostname  # Esto devuelve algo como "www.ejemplo.com"

        if dominio is None:
            return 0

        headers = {"API-Key": "01969e7f-04c8-744a-8245-79c2573fe845"}  # si tienes una
        params = {"q": f"domain:{dominio}", "size": 1}
        response = requests.get("https://urlscan.io/api/v1/search/", params=params, headers=headers)

        if response.status_code == 200:
            data = response.json()
            total = data.get("total", 0)
            return int(total > 0)
        else:
            return 0
    except:
        return 0

def obtener_page_rank(dominio, api_key="088o008o0gsgcw8k0444k8wswo84888cc0ck8kg4"):

    # Devuelve un valor entre 0 y 10. Devuelve -1 si hay error.

    try:
        url = "https://openpagerank.com/api/v1.0/getPageRank"
        headers = {"API-OPR": api_key}
        params = {"domains[]": dominio}
        response = requests.get(url, headers=headers, params=params)

        if response.status_code == 200:
            data = response.json()
            rank = data['response'][0].get("page_rank_integer", -1)
            return rank if rank is not None else -1
        else:
            return -1
    except:
        return -1

def extraer_features(url):
    # Asegura que la URL tenga esquema (http o https)
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    parsed = urlparse(url)
    hostname = parsed.hostname if parsed.hostname else ''
    path = parsed.path if parsed.path else ''

    features = {}
    features['longest_words_raw'] = max([len(word) for word in re.split(r'\W+', url)]) if url else 0
    features['nb_eq'] = url.count('=')
    features['length_hostname'] = len(hostname)
    features['length_url'] = len(url)

    # WHOIS para domain_age
    try:
        dominio_sin_www = hostname[4:] if hostname.startswith("www.") else hostname
        info = whois.whois(dominio_sin_www)
        creation_date = info.creation_date
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        if isinstance(creation_date, datetime):
            features['domain_age'] = (datetime.now() - creation_date).days
        else:
            features['domain_age'] = 0
    except:
        features['domain_age'] = 0

    features['nb_slash'] = url.count('/')
    path_words = re.split(r'\W+', path)
    features['longest_word_path'] = max([len(word) for word in path_words]) if path_words else 0
    hints = ['secure', 'account', 'update', 'login', 'verify', 'bank', 'confirm']
    features['phish_hints'] = sum(hint in url.lower() for hint in hints)
    features['nb_dots'] = url.count('.')
    host_words = hostname.split('.') if hostname else []
    features['shortest_word_host'] = min([len(w) for w in host_words]) if host_words else 0

    # 🆕 Verificar si está indexada en Google
    features['google_index'] = obtener_google_index(url)

    tld = obtener_tld(hostname)
    subdomain = hostname.split('.')[0] if hostname else ''
    features['tld_in_subdomain'] = int(tld in subdomain) if tld else 0
    digits_url = contar_digitos(url)
    features['ratio_digits_url'] = digits_url / len(url) if len(url) > 0 else 0
    features['prefix_suffix'] = int('-' in hostname) if hostname else 0
    features['ip'] = int(bool(re.fullmatch(r'(\d{1,3}\.){3}\d{1,3}', hostname)))
    features['nb_qm'] = url.count('?')
    digits_host = contar_digitos(hostname)
    features['ratio_digits_host'] = digits_host / len(hostname) if len(hostname) > 0 else 0
    features['nb_www'] = url.lower().count('www')
    features['page_rank'] = obtener_page_rank(hostname)

    # =======================
    # 🆕 HTML features agregadas
    # =======================
    try:
        response = requests.get(url, timeout=5)
        soup = BeautifulSoup(response.content, "html.parser")
    except:
        soup = BeautifulSoup("", "html.parser")

    title = soup.title.string.strip().lower() if soup.title and soup.title.string else ""
    features['domain_in_title'] = int(hostname in title)

    links = soup.find_all("a", href=True)
    features['nb_hyperlinks'] = len(links)

    ext_links = [a for a in links if a['href'].startswith(("http://", "https://")) and hostname not in a['href']]
    features['ratio_extHyperlinks'] = len(ext_links) / len(links) if links else 0

    time.sleep(5)  # Para evitar bloqueo por peticiones frecuentes

    return features

In [3]:
# -------- Cargar CSV --------
df_urls = pd.read_csv("Data/data_final.csv")  # Reemplaza con tu ruta

In [4]:
# -------- Aplicar extracción --------
tqdm.pandas()
features_list = df_urls["URL"].progress_apply(extraer_features)
df_features = pd.DataFrame(features_list.tolist())

100%|██████████| 1/1 [00:08<00:00,  8.41s/it]


In [5]:
#  -------- Unir todo --------
df_features['URL'] = df_urls['URL']
if 'Label' in df_urls.columns:
    df_features['Label'] = df_urls['Label']

In [6]:
# 🔃 Reordenar columnas según orden original de extracción
# orden_columnas = [
#     'page_rank','domain_age','google_index', 'shortest_word_host', 'nb_eq', 'ratio_digits_host',
#     'nb_slash', 'phish_hints', 'prefix_suffix', 'nb_qm', 'longest_words_raw',
#     'tld_in_subdomain', 'nb_dots', 'length_url', 'length_hostname', 'ratio_digits_url',
#     'nb_www', 'ip', 'longest_word_path', 'URL', 'Label',
# ]

# orden_columnas = [
#     'page_rank','domain_age','google_index','tld_in_subdomain','longest_words_raw','nb_slash','prefix_suffix','length_url','ratio_digits_url','ratio_digits_host','longest_word_path','nb_eq','phish_hints','nb_www','shortest_word_host','ip','length_hostname','nb_qm','nb_dots','URL', 'Label'
# ]

orden_columnas = [
    'google_index', 'page_rank', 'domain_age', 'nb_hyperlinks', 'nb_qm', 'domain_in_title', 'nb_eq', 'length_hostname', 'longest_word_path', 'tld_in_subdomain', 'ratio_digits_host', 'nb_www', 'ip', 'shortest_word_host', 'ratio_digits_url', 'nb_slash', 'length_url', 'longest_words_raw', 'prefix_suffix', 'nb_dots', 'phish_hints', 'ratio_extHyperlinks','URL','Label'
]

df_features = df_features[orden_columnas]

In [7]:
# -------- Guardar --------
df_features.to_csv("Data/dataset_procesado.csv", index=False)
print("Dataset guardado como dataset_procesado.csv")

Dataset guardado como dataset_procesado.csv


In [8]:
df_features

Unnamed: 0,google_index,page_rank,domain_age,nb_hyperlinks,nb_qm,domain_in_title,nb_eq,length_hostname,longest_word_path,tld_in_subdomain,...,ratio_digits_url,nb_slash,length_url,longest_words_raw,prefix_suffix,nb_dots,phish_hints,ratio_extHyperlinks,URL,Label
0,1,0,52,0,0,0,0,120,0,0,...,0.054688,3,128,35,0,10,0,0,http://signin.eday.co.uk.ws.edayisapi.dllsign....,0
