# **PhiUSIIL Phishing URL (Website)**

## **Part 2:**

- For this section, we aim to predict whether a URL page is legitimate or phishing using a Naive Bayes model.

#### **Step 1: Loading the dataset**

In [6]:
import pandas as pd

# Load the processed train and test CSV files into DataFrames.

TRAIN_CSV = "/workspaces/proyecto-final-Phishing-URL/data/processed/X_train_sel.csv"
TEST_CSV = "/workspaces/proyecto-final-Phishing-URL/data/processed/X_test_sel.csv"

train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,URLSimilarityIndex,CharContinuationRate,URLCharProb,LetterRatioInURL,DegitRatioInURL,NoOfOtherSpecialCharsInURL,SpacialCharRatioInURL,IsHTTPS,HasTitle,DomainTitleMatchScore,...,Robots,IsResponsive,HasDescription,HasSocialNet,HasSubmitButton,HasHiddenFields,Pay,HasCopyrightInfo,NoOfJS,label
0,100.0,1.0,0.066724,0.5,0.0,1.0,0.038,1.0,1.0,100.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,17.0,1
1,100.0,0.692308,0.048056,0.481,0.0,2.0,0.074,1.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.0,1
2,89.623464,0.72,0.045218,0.568,0.162,2.0,0.054,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,26.11451,1.0,0.064391,0.695,0.0,4.0,0.102,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,100.0,1.0,0.054861,0.552,0.0,1.0,0.034,1.0,1.0,100.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,23.0,1


#### **Step 2: Build a Naive Bayes model:**

In [7]:
# Split each dataset into its feature matrix (every column except `label`)
# and its target vector (the `label` column).

X_train, y_train = train_data.drop(columns = 'label'), train_data['label']
X_test, y_test = test_data.drop(columns = 'label'), test_data['label']

In [8]:
# Baseline Bernoulli Naive Bayes, built with the estimator's default arguments.

from sklearn.naive_bayes import BernoulliNB

# `fit` returns the estimator itself, so construction and training can be chained.
model = BernoulliNB().fit(X_train, y_train)
model

In [10]:
from sklearn.metrics import accuracy_score

# Predict the test labels with the fitted model.
y_pred = model.predict(X_test)

# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true = y_test, y_pred = y_pred)

0.9795640905807877

In [11]:
# Hyperparameter search setup for the Bernoulli model.

from sklearn.model_selection import GridSearchCV

# Candidate values to try for each hyperparameter.
hyperparams = dict(
    alpha = [0.1, 0.5, 1.0, 2.0],
    binarize = [0.0, 0.5, 1.0],
    fit_prior = [True, False],
)

# Grid search over every combination, scored by accuracy with 5-fold cross-validation.
grid = GridSearchCV(estimator = model, param_grid = hyperparams, scoring = "accuracy", cv = 5)
grid

In [12]:
# Run the grid search on the training data, then report the best combination found.

grid.fit(X = X_train, y = y_train)

print(f"Best hyperparameters: {grid.best_params_}")

Best hyperparameters: {'alpha': 0.1, 'binarize': 0.0, 'fit_prior': True}


In [13]:
# Train a fresh Bernoulli model with the best hyperparameters reported by the grid search.

model_grid = BernoulliNB(alpha=0.1, binarize=0.0, fit_prior=True).fit(X_train, y_train)
model_grid

In [15]:
# Predict the test labels with the retrained model.
y_pred = model_grid.predict(X_test)

# Accuracy of the retrained model on the test data.
accuracy_score(y_true = y_test, y_pred = y_pred)

0.9796703063262098

In [16]:
# Save the optimized Bernoulli model:

from pickle import dump

# Persist `model_grid` (the estimator retrained with alpha=0.1, binarize=0.0,
# fit_prior=True, as the file name states) rather than `model`, which is the
# untuned default BernoulliNB fitted earlier.
# The `with` block closes the file handle once the dump finishes.
with open("/workspaces/proyecto-final-Phishing-URL/models/nbayes_bernoulli_opt_alpha-0.1_bina-0.0_prior-true.sav", "wb") as model_file:
    dump(model_grid, model_file)

In [17]:
# Baseline Gaussian Naive Bayes, built with the estimator's default arguments.
from sklearn.naive_bayes import GaussianNB

# `fit` returns the estimator itself, so construction and training can be chained.
model = GaussianNB().fit(X_train, y_train)
model

In [18]:
# Predict the test labels with the fitted model.
y_pred = model.predict(X_test)

# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true = y_test, y_pred = y_pred)

0.9983430343714152

In [19]:
# Hyperparameter search setup for the Gaussian model.

from sklearn.model_selection import GridSearchCV

# Candidate values for `var_smoothing`, one per decade from 1e-2 down to 1e-15.
parameters = dict(
    var_smoothing = [1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15],
)

# Grid search over every candidate, scored by accuracy with 5-fold cross-validation.
grid = GridSearchCV(estimator = model, param_grid = parameters, scoring = "accuracy", cv = 5)
grid

In [20]:
# Run the grid search on the training data, then report the best value found.

grid.fit(X = X_train, y = y_train)

print(f"Best parameter: {grid.best_params_}")

Best parameter: {'var_smoothing': 1e-05}


In [21]:
# Train a fresh Gaussian model with the best `var_smoothing` reported by the grid search.

model_grid = GaussianNB(priors=None, var_smoothing=1e-05).fit(X_train, y_train)
model_grid

In [22]:
# Predict the test labels with the retrained model.
y_pred = model_grid.predict(X_test)

# Accuracy of the retrained model on the test data.
accuracy_score(y_true = y_test, y_pred = y_pred)

0.9992140034838765

In [23]:
# Save the optimized Gaussian model:

from pickle import dump

# Persist `model_grid` (the estimator retrained with var_smoothing=1e-05, as
# the file name states) rather than `model`, which is the untuned default
# GaussianNB fitted earlier.
# The `with` block closes the file handle once the dump finishes.
with open("/workspaces/proyecto-final-Phishing-URL/models/nbayes_gaussian_opt_var_smoothing-1e-05.sav", "wb") as model_file:
    dump(model_grid, model_file)

Conclusions:
When comparing the test-set accuracy of both optimized Naive Bayes models, we have:
- Optimized Bernoulli model:  0.9796703063262098
- Optimized Gaussian model: **0.9992140034838765**