In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [46]:
path = "C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/processed/dataframe_limpio.csv"

In [47]:
df = pd.read_csv(path, index_col=0)   

In [48]:
df.shape

(10000, 16)

In [49]:
train = df[:8000]
train.shape

(8000, 16)

In [50]:
test = df[8000:]
test.shape

(2000, 16)

In [51]:
train.to_csv("C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/train/train.csv")
test.to_csv("C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/data/test/test.csv")

In [52]:
X = df.drop("CLASS_LABEL", axis=1)
y = df['CLASS_LABEL']

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc= RandomForestClassifier()
rfc.fit(X, y)

In [None]:
importancias_caracteristicas = rfc.feature_importances_
importancias_df = pd.DataFrame({'Caracteristica': X.columns, 'Importancia': importancias_caracteristicas})
importancias_df = importancias_df.sort_values(by='Importancia', ascending=False)

print(importancias_df)

In [None]:
df_importancias_top10= importancias_df.head(10)
df_importancias_top10

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
print(X.shape)
print(y.shape)

(10000, 15)
(10000,)


In [55]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [57]:
# En el mismo Pipeline, todos los modelos. Los supervisados y el no supervisado
from sklearn.decomposition import PCA

In [58]:
pca = PCA(n_components=15)
X_pca = pca.fit(X_scaled)

In [59]:
pca.explained_variance_ratio_       # por qué ponemos esto ??

array([7.89197382e-01, 1.19247759e-01, 4.26867497e-02, 1.88642653e-02,
       1.08376221e-02, 4.24758304e-03, 3.46393983e-03, 2.39597584e-03,
       2.30523784e-03, 1.85237177e-03, 1.56147786e-03, 1.28675250e-03,
       1.17942600e-03, 8.35170391e-04, 3.82871059e-05])

In [60]:
pca.explained_variance_ratio_.cumsum()          

array([0.78919738, 0.90844514, 0.95113189, 0.96999616, 0.98083378,
       0.98508136, 0.9885453 , 0.99094128, 0.99324651, 0.99509889,
       0.99666036, 0.99794712, 0.99912654, 0.99996171, 1.        ])

In [61]:
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=5)),
    ("classifier", RandomForestClassifier())
])
pipe

In [62]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

In [63]:
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

recall_score 0.9377470355731226
confusion_matrix
 [[916  72]
 [ 63 949]]


In [64]:
pipe_gs_rf = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("pca", PCA()),
    ("classifier", RandomForestClassifier())
])
pipe_gs_rf

In [65]:
params = {
    "scaler" : [StandardScaler(), None],
    "pca__n_components": [5,6,7],
    "classifier__max_depth": np.arange(3,7),
    "classifier__min_samples_leaf": [2,3,4]
}

In [66]:
gs = GridSearchCV(pipe_gs_rf, params, cv=5, scoring="recall")

gs.fit(X_train, y_train)

In [67]:
print(gs.best_params_)
print(gs.best_score_)
print(gs.best_estimator_)

{'classifier__max_depth': 6, 'classifier__min_samples_leaf': 4, 'pca__n_components': 6, 'scaler': StandardScaler()}
0.930294368292123
Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=6)),
                ('classifier',
                 RandomForestClassifier(max_depth=6, min_samples_leaf=4))])


In [68]:
final_model1 = gs.best_estimator_.fit(X_train, y_train)
y_pred = final_model1.predict(X_test)
print("recall_score", recall_score(y_test, y_pred))
print("confusion_matrix\n", confusion_matrix(y_test, y_pred))

recall_score 0.91600790513834
confusion_matrix
 [[889  99]
 [ 85 927]]


In [69]:
import pickle

In [71]:
with open('C:/Users/Ana/Documents/BOOTCAMP/Ana Fdz/3-Machine_Learning/Detector de Phising_ML/models/modelo_01.pkl', "wb") as archivo_salida:
    pickle.dump(final_model1, archivo_salida)