In [1]:
# --- Carga de Datos ---
import pandas as pd
df = pd.read_csv('trainData.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,...,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened,phishing
0,55124,2,0,0,0,0,0,0,0,0,...,1,4,5,43200,0,0,0,0,0,0
1,44575,3,0,0,0,0,0,0,0,0,...,1,2,5,14399,1,1,0,0,0,0
2,87793,2,0,0,0,0,0,0,0,0,...,1,4,0,292,1,0,0,0,0,0
3,5689,2,0,0,0,0,0,0,0,0,...,1,3,1,3600,1,1,0,0,0,0
4,38932,2,0,0,0,0,0,0,0,0,...,1,2,1,21596,1,2,0,0,0,0


In [2]:
# --- Preprocesamiento: Verificar valores nulos y valores negativos ---
import numpy as np

# Reemplazar valores nulos por 0
df = df.fillna(0)

# Reemplazar valores negativos por 0 en todas las columnas numéricas
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].clip(lower=0)

In [3]:




# --- Modelamiento y Evaluación ---
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  

# Separar features y target
X = df.drop('phishing', axis=1)  
y = df['phishing'] 

# Definir métrica
scoring = 'f1'  # Puedes cambiar a 'precision', 'recall', o 'accuracy'

# 10-fold cross-validation
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # Menos folds para pruebas rápidas

# Normalización y balanceo dentro de cada fold

knn_params = {'knn__n_neighbors': list(range(1, 21))}
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier())
])

grid_knn = GridSearchCV(knn_pipeline, knn_params, cv=cv, scoring=scoring, n_jobs=-1)
grid_knn.fit(X, y)
print("Mejor K para KNN:", grid_knn.best_params_)
print("Mejor F1-Score:", grid_knn.best_score_)

# Esto se demora mucho en ejecutarse.(5min aprox)

# --- Resultado obtenido  ---
# Mejor K para KNN: {'knn__n_neighbors': 4} 
# Mejor F1-Score: 0.9215568833373402


[WinError 2] The system cannot find the file specified
  File "C:\Users\Valentin Malov\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executabl

Mejor K para KNN: {'knn__n_neighbors': 4}
Mejor F1-Score: 0.9215568833373402


In [4]:
# --- Árbol de Decisión ---
tree_params = {'tree__max_depth': list(range(1, 21))}
tree_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('tree', DecisionTreeClassifier(random_state=42))
])

grid_tree = GridSearchCV(tree_pipeline, tree_params, cv=cv, scoring=scoring, n_jobs=-1)
grid_tree.fit(X, y)
print("Mejor max_depth para Árbol:", grid_tree.best_params_)
print("Mejor F1-Score:", grid_tree.best_score_)
# --- Resultado obtenido ---
#   Mejor max_depth para Árbol: {'tree__max_depth': 14}
#   Mejor F1-Score: 0.935073970535818

Mejor max_depth para Árbol: {'tree__max_depth': 13}
Mejor F1-Score: 0.9377527024470051


In [5]:
# --- Naive Bayes ---
nb_params = {'nb__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]}
nb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('nb', GaussianNB())
])

grid_nb = GridSearchCV(nb_pipeline, nb_params, cv=cv, scoring=scoring, n_jobs=-1)
grid_nb.fit(X, y)
print("Mejor var_smoothing para NB:", grid_nb.best_params_)
print("Mejor F1-Score:", grid_nb.best_score_)

# --- Resultado Obtenido ---
# Mejor var_smoothing para NB: {'nb__var_smoothing': 0.01}
# Mejor F1-Score: 0.4459903427029562

Mejor var_smoothing para NB: {'nb__var_smoothing': 0.01}
Mejor F1-Score: 0.4464631131685433


In [None]:
# --- Regresión Logística: búsqueda del mejor penalty ---
log_params = {
    'log__penalty': ['l1', 'l2', 'elasticnet'],
    'log__solver': ['saga'],
    'log__C': [0.01, 0.1, 1, 10],
    'log__l1_ratio': [0.5]  # Solo se usa cuando penalty='elasticnet'
}
log_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('log', LogisticRegression(max_iter=1000, random_state=42))
])

grid_log = GridSearchCV(log_pipeline, log_params, cv=cv, scoring=scoring, n_jobs=-1, verbose=2)
grid_log.fit(X, y)
print("Mejor penalty para Regresión Logística:", grid_log.best_params_['log__penalty'])
print("Mejor combinación de hiperparámetros:", grid_log.best_params_)
print("Mejor F1-Score:", grid_log.best_score_)
#   -- Resultados Obtenidos-- (La ejecucion tomo 50min)
#   Mejor penalty para Regresión Logística: l1
#   Mejor combinación de hiperparámetros: {'log__C': 1, 'log__l1_ratio': 0.5, 'log__penalty': 'l1', 'log__solver': 'saga'}
#   Mejor F1-Score: 0.9040002993680263

Fitting 10 folds for each of 12 candidates, totalling 120 fits




Mejor penalty para Regresión Logística: l1
Mejor combinación de hiperparámetros: {'log__C': 1, 'log__l1_ratio': 0.5, 'log__penalty': 'l1', 'log__solver': 'saga'}
Mejor F1-Score: 0.9040002993680263
