In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression    # recuerda luego rescatarlas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
#from sklearn.feature_selection import SelectKBest

In [13]:
path = "C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/processed/dataframe_limpio.csv"

In [14]:
df = pd.read_csv(path, index_col=0)   

In [15]:
df.shape

(10000, 16)

In [16]:
X = df.drop("CLASS_LABEL", axis=1)
y = df['CLASS_LABEL']

In [17]:
from sklearn import model_selection

In [18]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.20,random_state=5)

In [19]:
train = pd.concat([X_train,y_train], axis=1)
test = pd.concat([X_test,y_test], axis=1)

In [20]:
train.to_csv("C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/train/train.csv")
test, y_test.to_csv("C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/test/test.csv")

(      NumDots  PathLevel  NumDash  NumDashInHostname  HostnameLength  \
 7054        6          2        0                  0              21   
 442         3          2        0                  0              16   
 3954        1          4        0                  0              11   
 2288        2          3        0                  0              11   
 3196        3          8        4                  1              20   
 ...       ...        ...      ...                ...             ...   
 6133        3          1        0                  0              19   
 532         3          4        0                  0              14   
 1370        2          6        1                  0              19   
 6514        1          6        8                  0              19   
 3081        2          3        0                  0              22   
 
       NumSensitiveWords  PctExtHyperlinks  InsecureForms  \
 7054                  0          0.294118              1   


In [21]:
print(X.shape)
print(y.shape)

(10000, 15)
(10000,)


In [22]:
y_test.value_counts(normalize=True)

CLASS_LABEL
1    0.507
0    0.493
Name: proportion, dtype: float64

In [31]:
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ('classifier', RandomForestClassifier())
])

In [32]:
log_params = {
    "scaler" : [StandardScaler(), None],
    "selectkbest__k":np.arange(5,15),
    "classifier": [LogisticRegression()],
    "classifier__C": [0.1,1,10]                
}
rf_params = {
    "scaler" : [StandardScaler(), None],
    "selectkbest__k":np.arange(5,15),
    "classifier": [RandomForestClassifier()],
    "classifier__max_depth": [3,5,7]
}
gb_params = {
    "scaler" : [StandardScaler(), None],
    "selectkbest__k":np.arange(5,15),
    "classifier": [GradientBoostingClassifier()],
    "classifier__max_depth": [3,5,7]
}
knn_params = {
    "scaler" : [StandardScaler(), None],
    "selectkbest__k":np.arange(5,15),
    "classifier": [KNeighborsClassifier()],
    "classifier__n_neighbors": np.arange(5,15)
}
svm_params = {
    "scaler" : [StandardScaler(), None],
    "selectkbest__k":np.arange(5,15),
    "classifier": [SVC()],
    "classifier__C": [0.1,1,10]
}
pipe

In [35]:
search_space = [
    log_params,
    rf_params,
    gb_params,
    knn_params,
    svm_params]

In [36]:
clf_gs = GridSearchCV(estimator = pipe,
                  param_grid = search_space,
                  cv = 5,
                  scoring="recall",
                  verbose=1,
                  n_jobs=-1)

In [37]:
clf_gs.fit(X_train, y_train)

Fitting 5 folds for each of 440 candidates, totalling 2200 fits


In [38]:
print(clf_gs.best_estimator_)
print(clf_gs.best_params_)
print(clf_gs.best_score_)

Pipeline(steps=[('scaler', None), ('selectkbest', SelectKBest()),
                ('classifier', GradientBoostingClassifier(max_depth=7))])
{'classifier': GradientBoostingClassifier(), 'classifier__max_depth': 7, 'scaler': None, 'selectkbest__k': 10}
0.9746625031839322


In [39]:
final_model = clf_gs.best_estimator_
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

In [40]:
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

recall_score 0.9792899408284024
confusion_matrix
 [[953  33]
 [ 21 993]]


In [None]:
# pintar las confusiones matrix

In [None]:
# REVISAR NÚMEROS.

# Verdaderos Positivos (993): Son los casos en los que el modelo predijo correctamente que una URL es phishing (1)

# Falsos Positivos (33): Estos son los casos en los que el modelo predijo incorrectamente que una URL es phishing (1)
# cuando en realidad no lo es (0).

# Verdaderos Negativos (953): Son los casos en los que el modelo predijo correctamente que una URL no es phishing (0). 

# Falsos Negativos (21): Estos son los casos en los que el modelo predijo incorrectamente que una URL no es phishing (0) 
# cuando en realidad sí lo es (1). 

In [41]:
clf_rs = RandomizedSearchCV(estimator=pipe, param_distributions=search_space, cv=3, scoring="accuracy", verbose=3, n_jobs=-1)
clf_rs.fit(X_train, y_train)

print(clf_rs.best_estimator_)
print(clf_rs.best_score_)
print(clf_rs.best_params_)

final_model = clf_rs.best_estimator_
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Pipeline(steps=[('scaler', StandardScaler()),
                ('selectkbest', SelectKBest(k=14)),
                ('classifier', GradientBoostingClassifier(max_depth=5))])
0.9682495145721188
{'selectkbest__k': 14, 'scaler': StandardScaler(), 'classifier__max_depth': 5, 'classifier': GradientBoostingClassifier()}
recall_score 0.9783037475345168
confusion_matrix
 [[967  19]
 [ 22 992]]


Aunque los falsos positivos hayan disminuido considerablemente, aumenta en 1 los falsos negativos (casos en los que el modelo predijo incorrectamente que una URL no es phishing, cuando en realidad sí lo es) por lo que me quedo con el modelo que me da
el GridSearchCV

In [42]:
results = cross_val_score(final_model, X, y, cv=5, scoring="recall")
print(results)
print(results.mean())
print(results.std())

[0.959 0.98  0.969 0.965 0.953]
0.9652
0.00917387595294378


In [132]:
# PIPELINE CON RANDOMFORESC Y GRADIENTBOOSTINGC

In [43]:
pipe1 = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", RandomForestClassifier())
])
pipe1

In [44]:
pipe1.fit(X_train, y_train)                     

In [45]:
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

recall_score 0.9783037475345168
confusion_matrix
 [[967  19]
 [ 22 992]]


In [52]:
pipe2 = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", GradientBoostingClassifier())
])
pipe2

In [53]:
pipe2.fit(X_train, y_train)  

In [54]:
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

recall_score 0.9783037475345168
confusion_matrix
 [[967  19]
 [ 22 992]]


In [None]:
# INTENTO HACER MODELOS MÁS EXACTOS HIPERPARAMETRIZANDO:

In [None]:
# log_params = {
#     "scaler": [StandardScaler(), None],
#     "pca__n_components": [13, 14, 15],
#     "classifier__C": [0.1, 1, 10]
# }
# rf_params = {
#     "scaler": [StandardScaler(), None],
#     "pca__n_components": [13, 14, 15],
#     "classifier__max_depth": np.arange(2, 20),
#     "classifier__min_samples_leaf": [2, 3, 4]
# }
# gb_params = {
#     "scaler": [StandardScaler(), None],
#     "pca__n_components": [13, 14, 15],
#     "classifier__max_depth": np.arange(2, 20),
#     "classifier__min_samples_leaf": [2, 3, 4]
# }
# knn_params = {
#     "scaler": [StandardScaler(), None],
#     "pca__n_components": [13, 14, 15],
#     "classifier__n_neighbors": np.arange(2, 20)
# }
# svm_params = {
#     "scaler": [StandardScaler(), None],
#     "pca__n_components": [13, 14, 15],
#     "classifier__C": [0.1, 1, 10]
# }
# pipe1

In [47]:
# search_space = [
#     log_params,
#     rf_params,
#     gb_params,
#     knn_params,
#     svm_params   
# ]

In [48]:
# clf_rs1 = RandomizedSearchCV(pipe1, search_space, cv=5, scoring="recall", verbose=2, n_jobs=-1)

In [None]:
# clf_rs1.fit(X_train, y_train)        !! Aquí me da el error

In [None]:
# print(clf_rs.best_estimator_)
# print(clf_rs.best_score_)
# print(clf_rs.best_params_)

In [165]:
# final_model1 = clf_rs.best_estimator_
# final_model1.fit(X_train, y_train)
# y_pred = final_model1.predict(X_test)

In [None]:
# print("recall_score", recall_score(y_test, y_pred))                 
# print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

In [57]:
import pickle

In [58]:
with open('C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/models/modelo_01.pkl', "wb") as archivo_salida:
    pickle.dump(final_model, archivo_salida)

In [61]:
! pip install PyYAML




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [62]:
import yaml

# Para leer el contenido del archivo YAML
with open("../models/model_config.yaml", "r") as archivo:
    datos_yaml = yaml.safe_load(archivo)

In [63]:
datos_yaml

{'model_name': 'modelo_01.',
 'selectkbest__k': 10,
 'scaler': 'None',
 'classifier__max_depth': 7,
 'classifier': 'GradientBoostingClassifier()'}