In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

In [2]:
path = "C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/processed/dataframe_limpio.csv"

In [3]:
df = pd.read_csv(path, index_col=0)   

In [4]:
df.shape

(10000, 16)

In [5]:
X = df.drop("CLASS_LABEL", axis=1)
y = df['CLASS_LABEL']

In [6]:
from sklearn import model_selection

In [7]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.20,random_state=5)

In [8]:
train = pd.concat([X_train,y_train], axis=1)
test = pd.concat([X_test,y_test], axis=1)

In [9]:
train.to_csv("C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/train/train.csv")
test, y_test.to_csv("C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/test/test.csv")

(      NumDots  PathLevel  NumDash  NumDashInHostname  HostnameLength  \
 7054        6          2        0                  0              21   
 442         3          2        0                  0              16   
 3954        1          4        0                  0              11   
 2288        2          3        0                  0              11   
 3196        3          8        4                  1              20   
 ...       ...        ...      ...                ...             ...   
 6133        3          1        0                  0              19   
 532         3          4        0                  0              14   
 1370        2          6        1                  0              19   
 6514        1          6        8                  0              19   
 3081        2          3        0                  0              22   
 
       NumSensitiveWords  PctExtHyperlinks  InsecureForms  \
 7054                  0          0.294118              1   


In [10]:
print(X.shape)
print(y.shape)

(10000, 15)
(10000,)


In [11]:
y_test.value_counts(normalize=True)

CLASS_LABEL
1    0.507
0    0.493
Name: proportion, dtype: float64

In [12]:
# scaler = StandardScaler()
# scaler.fit(X_train)

# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [13]:
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=15)),                     
    ("classifier", RandomForestClassifier())
])
pipe

In [14]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [15]:
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

recall_score 0.9723865877712031
confusion_matrix
 [[963  23]
 [ 28 986]]


In [16]:
pipe_gs_rf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", RandomForestClassifier())
])
pipe_gs_rf

In [17]:
params = {
    "scaler" : [StandardScaler(), None],
    "pca__n_components": [13,14,15],
    "classifier__max_depth": np.arange(2,20),
    "classifier__min_samples_leaf": [3,4,5]
}

In [18]:
gs = RandomizedSearchCV (pipe_gs_rf, params, cv=5, scoring="recall", verbose=1, n_jobs=-1)    

gs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [19]:
print(gs.best_params_)
print(gs.best_score_)
print(gs.best_estimator_)

{'scaler': StandardScaler(), 'pca__n_components': 15, 'classifier__min_samples_leaf': 3, 'classifier__max_depth': 18}
0.9613660877413107
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=15)),
                ('classifier',
                 RandomForestClassifier(max_depth=18, min_samples_leaf=3))])


In [20]:
final_model1 = gs.best_estimator_.fit(X_train, y_train)                # me sale peor la confusión. Revisar hiperparámetros
y_pred = final_model1.predict(X_test)
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

recall_score 0.9674556213017751
confusion_matrix
 [[962  24]
 [ 33 981]]


In [21]:
# Verdaderos Positivos (981): Son los casos en los que el modelo predijo correctamente que una URL es phishing (1)

# Falsos Positivos (24): Estos son los casos en los que el modelo predijo incorrectamente que una URL es phishing (1)
# cuando en realidad no lo es (0).

# Verdaderos Negativos (962): Son los casos en los que el modelo predijo correctamente que una URL no es phishing (0). 

# Falsos Negativos (33): Estos son los casos en los que el modelo predijo incorrectamente que una URL no es phishing (0) cuando en realidad sí lo es (1). 

In [22]:
import pickle

In [23]:
with open('C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/models/modelo_01.pkl', "wb") as archivo_salida:
    pickle.dump(final_model1, archivo_salida)