#  Detección de Phishing

In [1]:
#!pip install ydata-profiling

In [2]:
#Importar las librerías
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn import feature_extraction, tree, model_selection, metrics
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
%matplotlib inline
from ydata_profiling import ProfileReport
from yellowbrick.features import Rank2D
from yellowbrick.features import RadViz

  def hasna(x: np.ndarray) -> bool:


## Parte 1: Ingeniería de características

In [3]:
# Load the provided dataset and report its (rows, columns) shape.
df = pd.read_csv('dataset_pishing.csv')
print(df.shape)

# Show five randomly sampled rows labelled as phishing.
df.loc[df['status'] == 'phishing'].sample(5)

(11430, 2)


Unnamed: 0,url,status
2999,https://secureupdate.appleld.com.duilawyeryork...,phishing
9276,http://bizwebnature.wr02.dhrcenter.com/wp-cont...,phishing
3373,http://rxcmpd.com/forum/user.html,phishing
2668,https://wantzsolution.com/pu/new/wrong.html,phishing
6959,https://1drv.ms/w/s!At6aBCMxoQeQgRRAHazJu3fO1Ojj,phishing


In [4]:
# Show five randomly sampled rows labelled as legitimate.
df.loc[df['status'] == 'legitimate'].sample(5)

Unnamed: 0,url,status
3969,https://www.youtube.com/watch?v=a9qD50wsQCo,legitimate
9399,https://www.manchester.ac.uk/,legitimate
2537,https://www.megachords.com/gigi/chords/amnesia/,legitimate
4189,http://www.ayurvedicgyan.in/,legitimate
9576,http://aslancoaching.ie,legitimate


In [5]:
# Count how many observations carry each label in the 'status' column.
conteo_status = df['status'].value_counts()

# Report the per-class totals, one line per label.
for message, label in (
    ("Cantidad de observaciones etiquetadas como 'legitimate':", 'legitimate'),
    ("Cantidad de observaciones etiquetadas como 'phishing':", 'phishing'),
):
    print(message, conteo_status[label])

Cantidad de observaciones etiquetadas como 'legitimate': 5715
Cantidad de observaciones etiquetadas como 'phishing': 5715


R// Sí, el dataset está balanceado: contiene 5715 observaciones de cada clase.

### Derivación de características

1. **¿Qué ventajas tiene el análisis de una URL frente al análisis de otros datos, como el tiempo de vida del dominio o las características de la página web?**

- El análisis de una URL ofrece un mayor acercamiento a un sitio web específico que otros análisis, ya que la URL es un recurso único y propio del sitio evaluado, por lo que normalmente ofrece información sobre este sin necesidad de acceder a él. Se puede estudiar la longitud de la URL, la presencia de caracteres sospechosos, subdominios inusuales, la falta de protocolos cifrados como HTTPS, errores gramaticales u ortográficos y muchas otras características que permiten evaluar si un sitio web es peligroso.

2. **¿Qué características de una URL son más prometedoras para la detección de phishing?**

- Longitud de la URL (URLs acortadas).
- Conteo de subdominios.
- Dominios aleatorios.
- Cantidad de vocales o consonantes.
- Entropía de símbolos.
- Puerto, IP y tokens HTTPS.
- Cantidad de redirecciones.
- Extensiones del path.
- Uso de caracteres especiales.
- Uso de guiones y guiones bajos.
- Longitud de nombre del dominio.
- Palabras clave sensibles.
- Coincidencia con marcas conocidas.
- Tiempo de vida del dominio.
- Uso de dominios de nivel superior sospechosos.
- Presencia de cadenas numéricas aleatorias.

### Funciones

In [6]:
from urllib.parse import urlparse
from datetime import datetime
import requests
import re
import whois
import socket

def url_length(url):
    """Return the total number of characters in ``url``."""
    n_chars = len(url)
    return n_chars

def subdomain_count(url):
    """Count the dot-separated labels of the netloc beyond domain and TLD.

    Two labels (registered domain and top-level domain) are discounted, so
    a two-label host yields 0, a single-label host or an empty netloc
    yields -1, and every extra label (including ``www``) adds one.
    """
    netloc = urlparse(url).netloc
    # N dots split the netloc into N + 1 labels; discounting two leaves N - 1.
    return netloc.count('.') - 1
    
def vowel_consonant_ratio(x):
    """Return the ratio of ASCII vowels to ASCII consonants in ``x``.

    Matching is case-insensitive. Digits, punctuation and non-ASCII letters
    are ignored by both counts.

    Args:
        x: The string to analyse (typically a full URL).

    Returns:
        ``vowels / consonants`` as a float, or 0 when ``x`` contains no
        consonants (including the empty string).
    """
    lowered = x.lower()
    vowel_total = len(re.findall(r'[aeiou]', lowered))
    consonant_total = len(re.findall(r'[b-df-hj-np-tv-z]', lowered))
    # Explicit guard instead of a bare ``except``: only the zero-consonant
    # case is special, every other error should surface.
    if consonant_total == 0:
        return 0
    return vowel_total / consonant_total

def count_redirects(url, timeout=10):
    """Return how many HTTP redirects are followed when fetching ``url``.

    Performs a real GET request. Redirects must be followed for
    ``response.history`` to be populated; with ``allow_redirects=False``
    the history is always empty and the count would always be 0.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot block the whole run.

    Returns:
        The number of intermediate redirect responses.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts or redirect loops; callers decide how to handle them.
    """
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    return len(response.history)

def extract_path_extensions(url):
    """Count the path segments of ``url`` that contain at least one dot.

    Despite the name, the extensions themselves are not returned; only
    the number of dotted segments is.
    """
    dotted_segments = 0
    for segment in urlparse(url).path.split('/'):
        if '.' in segment:
            dotted_segments += 1
    return dotted_segments

def count_special_characters(url):
    """Count the characters in ``url`` that are not ASCII letters or digits."""
    return sum(1 for _ in re.finditer(r'[^a-zA-Z0-9]', url))

def domain_length(url):
    """Return the length of the network-location (netloc) part of ``url``.

    The netloc is whatever ``urlparse`` reports, so it is empty (length 0)
    for strings without a ``//`` authority section.
    """
    netloc = urlparse(url).netloc
    return len(netloc)

def sensitive_keywords(url):
    """Count occurrences of sensitive keywords anywhere in ``url``.

    Matching is case-insensitive and substring-based; every non-overlapping
    occurrence of every keyword contributes to the total.
    """
    lowered = url.lower()
    total = 0
    for keyword in ("login", "password", "bank", "paypal", "account", "secure", "verify"):
        total += lowered.count(keyword)
    return total

def domain_age(url):
    """Return the age of the URL's domain in days, or 0 when it is unknown.

    The creation date is obtained with a WHOIS lookup, so this performs
    network I/O and can be slow. The lookup is deliberately best-effort:
    any failure yields 0 so that applying the function over many URLs
    never aborts. Note that 0 therefore also means "age not available".

    Args:
        url: The URL whose host is looked up.

    Returns:
        Whole days elapsed since the reported creation date, or 0.
    """
    try:
        # ``hostname`` drops any userinfo/port that ``netloc`` may carry.
        domain = urlparse(url).hostname
        if not domain:
            return 0

        creation_date = whois.whois(domain).creation_date

        # The record may hold a list of dates; keep the first, as before.
        if isinstance(creation_date, list):
            creation_date = creation_date[0] if creation_date else None
        # Missing or unparsed dates cannot be turned into an age.
        if not isinstance(creation_date, datetime):
            return 0

        # Use a "now" with the same tz-awareness as the creation date;
        # mixing naive and aware datetimes raises TypeError.
        # NOTE(review): whether whois returns aware or naive dates seems to
        # depend on the installed version - confirm.
        now = datetime.now(creation_date.tzinfo)
        return int((now - creation_date).days)

    except Exception:
        # Best-effort by design: WHOIS/network/parsing failures map to 0.
        return 0
    
def count_hyphens_and_underscores(url):
    """Return the combined number of ``-`` and ``_`` characters in ``url``."""
    return sum(url.count(mark) for mark in ("-", "_"))

def check_suspicious_tld(url):
    """Return 1 if the URL's host ends in a suspicious TLD, otherwise 0.

    The TLD is taken from the parsed hostname rather than the raw netloc,
    so an explicit port (``example.tk:8080``), upper-case letters or a
    trailing root dot do not hide a suspicious TLD.

    Args:
        url: The URL to inspect.

    Returns:
        1 when the last host label is in the suspicious list, 0 otherwise
        (including URLs with no host).
    """
    suspicious_tlds = [".tk", ".ml", ".ga", ".cf", ".gq"]  # Example list of suspicious TLDs
    # ``hostname`` is lower-cased and stripped of userinfo/port; it is
    # None when the URL has no authority section.
    host = urlparse(url).hostname or ""
    tld = host.rstrip(".").rsplit(".", 1)[-1]
    return 1 if "." + tld in suspicious_tlds else 0

def has_random_numeric_strings(url):
    """Count the non-overlapping runs of five or more digits in ``url``.

    Despite the ``has_`` prefix the result is a count, not a boolean.
    """
    return sum(1 for _ in re.finditer(r'\d{5,}', url))

def uses_https(url):
    """Return 1 when the parsed scheme of ``url`` is ``https``, else 0."""
    scheme = urlparse(url).scheme
    if scheme == 'https':
        return 1
    return 0

def number_letter_ratio_in_path(url):
    """Return letters divided by digits for the path component of ``url``.

    Note that, despite the function name, the quotient is letters/digits.
    A path without digits yields 0 to avoid dividing by zero.
    """
    path = urlparse(url).path
    digit_total = sum(map(str.isdigit, path))
    if not digit_total:  # no digits: avoid division by zero
        return 0
    letter_total = sum(map(str.isalpha, path))
    return letter_total / digit_total

def letter_ratio_in_domain(url):
    """Return the fraction of netloc characters that are letters.

    An empty netloc yields 0 to avoid dividing by zero.
    """
    netloc = urlparse(url).netloc
    if not netloc:  # empty netloc: avoid division by zero
        return 0
    alpha_total = sum(1 for ch in netloc if ch.isalpha())
    return alpha_total / len(netloc)

### Preprocesamiento de los datos

In [7]:
# Derive one numeric feature column per helper from the raw URL string.
df['length'] = df['url'].apply(url_length)
df['vowels'] = df['url'].apply(vowel_consonant_ratio)
df['subdomain'] = df['url'].apply(subdomain_count)
df['paths'] = df['url'].apply(extract_path_extensions)
df['https'] = df['url'].apply(uses_https)
df['num_letter_ratio_path'] = df['url'].apply(number_letter_ratio_in_path)
df['num_letter_ratio_domain'] = df['url'].apply(letter_ratio_in_domain)
df['special_char'] = df['url'].apply(count_special_characters)
df['domain_length'] = df['url'].apply(domain_length)
df['sensitive_keywords'] = df['url'].apply(sensitive_keywords)
# Binary flag from the TLD check. This column must not be filled with
# domain_age: that gives a wrong feature and repeats every WHOIS lookup.
df['suspicious_tld'] = df['url'].apply(check_suspicious_tld)
df['rand_numstring'] = df['url'].apply(has_random_numeric_strings)
df['hyphens'] = df['url'].apply(count_hyphens_and_underscores)
# One WHOIS lookup per row: network-bound and by far the slowest column.
df['domain_age'] = df['url'].apply(domain_age)

# Encode the target: legitimate -> 1, phishing -> 0.
df['status'] = df['status'].replace(to_replace = 'legitimate', value=1)
df['status'] = df['status'].replace(to_replace = 'phishing', value=0)

print(df.head())

                                                 url  status  length  \
0              http://www.crestonwood.com/router.php       1      37   
1  http://shadetreetechnology.com/V4/validation/a...       0      77   
2  https://support-appleld.com.secureupdate.duila...       0     126   
3                                 http://rgipt.ac.in       1      18   
4  http://www.iracing.com/tracks/gateway-motorspo...       1      55   

     vowels  subdomain  paths  https  num_letter_ratio_path  \
0  0.363636          1      1      0               0.000000   
1  0.827586          0      0      0               1.588235   
2  0.517241          3      0      1               0.888889   
3  0.300000          1      0      0               0.000000   
4  0.363636          1      0      0               0.000000   

   num_letter_ratio_domain  special_char  domain_length  sensitive_keywords  \
0                 0.894737             7             19                   0   
1                 0.956522    

In [8]:
# Keep only the label and the derived numeric features, then persist them.
df_final = df.drop(columns=['url'])
df_final.to_csv('phishing_features.csv', index=False)
df_final.head()

Unnamed: 0,status,length,vowels,subdomain,paths,https,num_letter_ratio_path,num_letter_ratio_domain,special_char,domain_length,sensitive_keywords,suspicious_tld,rand_numstring,hyphens,domain_age
0,1,37,0.363636,1,1,0,0.0,0.894737,7,19,0,0,0,0,0
1,0,77,0.827586,0,0,0,1.588235,0.956522,7,23,0,0,0,0,0
2,0,126,0.517241,3,0,1,0.888889,0.9,19,50,1,0,0,3,0
3,1,18,0.3,1,0,0,0.0,0.818182,5,11,0,0,0,0,0
4,1,55,0.363636,1,0,0,0.0,0.866667,10,15,0,0,0,2,0


In [9]:
# Display the dtype of every column in the feature table.
df_final.dtypes

status                       int64
length                       int64
vowels                     float64
subdomain                    int64
paths                        int64
https                        int64
num_letter_ratio_path      float64
num_letter_ratio_domain    float64
special_char                 int64
domain_length                int64
sensitive_keywords           int64
suspicious_tld               int64
rand_numstring               int64
hyphens                      int64
domain_age                   int64
dtype: object

### Visualización de data

In [10]:
# Build a ydata-profiling report over the feature table; leaving `profile`
# as the last expression displays it as the cell output.
profile = ProfileReport(df_final, title="Profiling Report")
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

