In [11]:
import pandas as pd
from features.features import extract_features

# Step 1: read the curated list of URLs that will be scored.
df_urls = pd.read_csv("urls_inclusion_1.csv")

# Step 2: run the project's feature extractor once per URL; each result
# becomes one row of the feature frame.
features_list = list(map(extract_features, df_urls["url"]))
df_features = pd.DataFrame(features_list)

# Step 3: put the extracted features side by side with the original input
# columns so every row keeps its context (sector / entity, per the input file).
# Both frames are combined column-wise, aligned on their index.
df_final = pd.concat([df_urls, df_features], axis="columns")

# Step 4: persist the combined table as a CSV for the scoring step.
output_path = "features_inclusion1.csv"
df_final.to_csv(output_path, index=False, encoding="utf-8")

print(f"✅ Features generadas y guardadas en {output_path}")
df_final.head(5)


✅ Features generadas y guardadas en features_inclusion1.csv


Unnamed: 0,url,sector,entidad,domain_length,domain_entropy,suspicious_path_token,num_params,contains_equal,protocol,tld_group,trusted_path_token,contains_percent,free_hosting
0,http://000o8dc.wcomhost.com/www.santanderbanco...,banca,santander,8,2.75,1,0,0,0,com,0,0,0
1,http://044088d.wcomhost.com/www.santanader.es/...,banca,santander,8,2.75,1,0,0,0,com,0,0,0
2,http://122.114.173.242:30/bancosantander.es/pa...,,,15,2.389898,1,0,0,0,otros,0,0,0
3,http://correos.es-es-es-herramientas-localizad...,logistica,correos,6,2.251629,1,0,0,0,otros,0,0,0
4,http://correos-pay-paquete-tasa845246254.clien...,logistica,correos,12,3.084963,0,0,0,0,es,0,0,0


In [14]:
import pandas as pd
import joblib

# 1. Load the feature table written by the feature-generation cell.
df_test = pd.read_csv("features_inclusion1.csv")

# 2. Load the persisted model bundle.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code. Only load bundles that come from a trusted source.
obj = joblib.load("models/logreg_phishing_final.joblib")

# 3. Unpack the pieces this cell needs from the bundle.
model = obj["model"]
train_features = obj["features"]
threshold = obj["threshold"]

# 4. Put the columns in the order stored in the bundle.
# reindex(..., fill_value=0) creates an all-zero column for every requested
# name that the CSV lacks. That would silently hide a schema mismatch between
# training and this file, so report the affected names before relying on the
# predictions. The predictions themselves are computed exactly as before.
missing_features = [col for col in train_features if col not in df_test.columns]
if missing_features:
    print(f"⚠️ Features ausentes en el CSV, rellenadas con 0: {missing_features}")
X_test = df_test.reindex(columns=train_features, fill_value=0)

# 5. Positive-class probability (second column of predict_proba), then apply
# the decision threshold stored alongside the model.
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba >= threshold).astype(int)

# 6. Attach the results to the frame; later cells read these two columns.
df_test["prediccion"] = y_pred
df_test["probabilidad"] = y_proba

print(df_test[["url", "sector", "entidad", "prediccion", "probabilidad"]].head())


                                                 url     sector    entidad  \
0  http://000o8dc.wcomhost.com/www.santanderbanco...      banca  santander   
1  http://044088d.wcomhost.com/www.santanader.es/...      banca  santander   
2  http://122.114.173.242:30/bancosantander.es/pa...        NaN        NaN   
3  http://correos.es-es-es-herramientas-localizad...  logistica    correos   
4  http://correos-pay-paquete-tasa845246254.clien...  logistica    correos   

   prediccion  probabilidad  
0           1      0.971697  
1           1      0.971697  
2           1      0.997451  
3           1      0.991259  
4           1      0.887088  


In [13]:
import joblib

# Inspect what the persisted bundle contains before relying on its layout.
obj = joblib.load("models/logreg_phishing_final.joblib")
print(type(obj))

# A dict is summarised by its keys; any other object is printed as-is.
if isinstance(obj, dict):
    print(obj.keys())
else:
    print(obj)


<class 'dict'>
dict_keys(['model', 'threshold', 'features', 'metadata'])


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

# 1. Ground truth: every URL in this set is expected to be phishing (1).
y_true = [1] * len(df_test)
y_pred = df_test["prediccion"]

# Pin the label set explicitly. Without it scikit-learn infers the labels from
# the data: if the model flags every URL, only label 1 is present, the two
# target_names no longer match, and classification_report raises ValueError.
# The confusion matrix would likewise shrink to 1x1.
class_labels = [0, 1]

# 2. Classification report.
# Class 0 has no true samples here, so its recall is undefined;
# zero_division=0 reports it as 0.0 without emitting UndefinedMetricWarning.
print(
    classification_report(
        y_true,
        y_pred,
        labels=class_labels,
        target_names=["Legítima (0)", "Phishing (1)"],
        zero_division=0,
    )
)

# 3. Confusion matrix (rows = true class, columns = predicted class),
# always 2x2 thanks to the pinned labels.
print("Matriz de confusión:")
print(confusion_matrix(y_true, y_pred, labels=class_labels))


              precision    recall  f1-score   support

Legítima (0)       0.00      0.00      0.00         0
Phishing (1)       1.00      0.87      0.93        69

    accuracy                           0.87        69
   macro avg       0.50      0.43      0.47        69
weighted avg       1.00      0.87      0.93        69

Matriz de confusión:
[[ 0  0]
 [ 9 60]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# The evaluation treats every URL in this set as phishing, so any row the
# model predicted as 0 is a missed detection. List those rows with their score.
false_negatives = df_test.loc[df_test["prediccion"].eq(0)]
print(
    false_negatives[
        ["url", "sector", "entidad", "probabilidad"]
    ]
)


                                                  url     sector    entidad  \
26   https://area.clientes-ingdiirect.es-eu.org/login      banca        ing   
29  https://bancosantander.es.hotelparadis.es/avis...      banca  santander   
40  https://correosprepago.bnext.es/aco?token=0cwd...  logistica    correos   
41  https://correosprepago.bnext.es/register?from=...  logistica    correos   
44  https://draganov-8bb38d.ingress-erytho.easywp....      banca        ing   
54  https://ivresse.com/js/ionosv2/ionosv2/v/mail....     correo      ionos   
58  https://msantamobs.com/wp-admin/user/correos.e...  logistica    correos   
62  https://wififpt.com.vn/es/bankia.es/es/acceso-...      banca     bankia   
67  https://www.servicio-ing.es/ap/app-login/?view...      banca        ing   

    probabilidad  
26      0.117377  
29      0.337044  
40      0.131633  
41      0.103589  
44      0.242601  
54      0.029650  
58      0.387205  
62      0.221209  
67      0.158848  
