In [None]:
#importing required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import metrics 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from "phishing.csv" (path relative to the working directory) into a DataFrame.

data = pd.read_csv("phishing.csv")
# Preview the first rows as a quick sanity check of the load.
data.head()

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple.

data.shape

In [None]:
# Column labels of the dataset.

data.columns

In [None]:
# Print a concise summary of the dataframe (columns, dtypes, non-null counts).

data.info()

In [None]:
# Number of distinct values in each column.

data.nunique()

In [None]:
# Drop the 'Index' column from the dataframe.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.

data = data.drop(columns=['Index'], errors='ignore')

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.

data.describe().T

In [None]:
# Annotated heatmap of the pairwise correlations between the dataframe's columns.

plt.figure(figsize=(15, 15))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Pie chart showing the share of each value in the 'class' column.

class_counts = data['class'].value_counts()
class_counts.plot(kind='pie', autopct='%1.2f%%')
plt.title("Phishing Count")
plt.show()

In [None]:
# Separate the target column 'class' (y) from the remaining predictor columns (X).

y = data["class"]
X = data.drop(columns=["class"])

In [None]:
# Hold out 20% of the samples as a test set; random_state=42 makes the split repeatable.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Display the shapes of the four pieces produced by the split.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

In [None]:
# Parallel lists holding one entry per stored model; position i in every list
# belongs to the same model.
ML_Model, accuracy, f1_score, recall, precision = [], [], [], [], []


def storeResults(model, a, b, c, d):
  """Append one model's scores to the module-level result lists.

  model -- display name of the model
  a, b, c, d -- accuracy, f1_score, recall and precision, in that order;
                each is rounded to 3 decimals before being stored.
  """
  ML_Model.append(model)
  # The holders are looked up at call time, in the same order as the arguments.
  holders = (accuracy, f1_score, recall, precision)
  for holder, score in zip(holders, (a, b, c, d)):
    holder.append(round(score, 3))

In [None]:
# Random Forest classifier: build and fit on the training split.
from sklearn.ensemble import RandomForestClassifier

# 100 trees; random_state=42 fixes the seed so repeated runs give the same forest.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on the training split (as the cell's last expression, the fitted estimator is displayed).
forest.fit(X_train,y_train)

In [None]:
# Predict labels for the training split and then the test split with the fitted forest.
y_train_forest, y_test_forest = (
    forest.predict(features) for features in (X_train, X_test)
)

In [None]:
# Evaluate the random forest: accuracy, f1_score, recall and precision on the
# training and test splits, then record the test-set scores.

acc_train_forest = metrics.accuracy_score(y_train,y_train_forest)
acc_test_forest = metrics.accuracy_score(y_test,y_test_forest)
print("Random Forest : Accuracy on training Data: {:.3f}".format(acc_train_forest))
print("Random Forest : Accuracy on test Data: {:.3f}".format(acc_test_forest))
print()

f1_score_train_forest = metrics.f1_score(y_train,y_train_forest)
f1_score_test_forest = metrics.f1_score(y_test,y_test_forest)
print("Random Forest : f1_score on training Data: {:.3f}".format(f1_score_train_forest))
print("Random Forest : f1_score on test Data: {:.3f}".format(f1_score_test_forest))
print()

recall_score_train_forest = metrics.recall_score(y_train,y_train_forest)
recall_score_test_forest = metrics.recall_score(y_test,y_test_forest)
print("Random Forest : Recall on training Data: {:.3f}".format(recall_score_train_forest))
print("Random Forest : Recall on test Data: {:.3f}".format(recall_score_test_forest))
print()

precision_score_train_forest = metrics.precision_score(y_train,y_train_forest)
precision_score_test_forest = metrics.precision_score(y_test,y_test_forest)
print("Random Forest : precision on training Data: {:.3f}".format(precision_score_train_forest))
print("Random Forest : precision on test Data: {:.3f}".format(precision_score_test_forest))

# Record the test-set scores so Random Forest appears in the results table;
# without this call the model is evaluated but never stored.
# NOTE: storeResults appends, so re-running this cell adds a duplicate row.
storeResults('Random Forest', acc_test_forest, f1_score_test_forest,
             recall_score_test_forest, precision_score_test_forest)

In [None]:
# Print the classification report for the random forest's test-set predictions.

print(metrics.classification_report(y_test, y_test_forest))

In [None]:
# Sweep max_depth for the random forest and compare training vs. test accuracy.
training_accuracy = []
test_accuracy = []
# try max_depth from 1 to 19 (range excludes its upper bound)
depth = range(1,20)
for n in depth:
    forest_test = RandomForestClassifier(n_estimators=100, max_depth=n, random_state=42)

    forest_test.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(forest_test.score(X_train, y_train))
    # record generalization accuracy
    test_accuracy.append(forest_test.score(X_test, y_test))


# plotting the training & testing accuracy for max_depth from 1 to 19
plt.figure()
plt.plot(depth, training_accuracy, label="training accuracy")
plt.plot(depth, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
# The swept hyperparameter is max_depth; n_estimators stays fixed at 100.
plt.xlabel("max_depth")
plt.title("Training & Testing Accuracy for Random Forest")
plt.legend()
plt.show()

In [None]:
# Logistic regression: build and fit on the training split.
from sklearn.linear_model import LogisticRegression

# Model with scikit-learn's default hyperparameters.
log = LogisticRegression()

# Fit on the training split (as the cell's last expression, the fitted estimator is displayed).
log.fit(X_train,y_train)

In [None]:
# Predict labels for the training split and then the test split with the fitted logistic regression.

y_train_log, y_test_log = (
    log.predict(features) for features in (X_train, X_test)
)

In [None]:
# Score the logistic regression predictions on both splits, then print the scores.

# Compute every metric first (training split, then test split) ...
acc_train_log, acc_test_log = (
    metrics.accuracy_score(y_train, y_train_log),
    metrics.accuracy_score(y_test, y_test_log),
)
f1_score_train_log, f1_score_test_log = (
    metrics.f1_score(y_train, y_train_log),
    metrics.f1_score(y_test, y_test_log),
)
recall_score_train_log, recall_score_test_log = (
    metrics.recall_score(y_train, y_train_log),
    metrics.recall_score(y_test, y_test_log),
)
precision_score_train_log, precision_score_test_log = (
    metrics.precision_score(y_train, y_train_log),
    metrics.precision_score(y_test, y_test_log),
)

# ... then report them, one blank line between metric groups.
print("Logistic Regression : Accuracy on training Data: {:.3f}".format(acc_train_log))
print("Logistic Regression : Accuracy on test Data: {:.3f}".format(acc_test_log))
print()
print("Logistic Regression : f1_score on training Data: {:.3f}".format(f1_score_train_log))
print("Logistic Regression : f1_score on test Data: {:.3f}".format(f1_score_test_log))
print()
print("Logistic Regression : Recall on training Data: {:.3f}".format(recall_score_train_log))
print("Logistic Regression : Recall on test Data: {:.3f}".format(recall_score_test_log))
print()
print("Logistic Regression : precision on training Data: {:.3f}".format(precision_score_train_log))
print("Logistic Regression : precision on test Data: {:.3f}".format(precision_score_test_log))

In [None]:
# Print the classification report for the logistic regression's test-set predictions.

print(metrics.classification_report(y_test, y_test_log))

In [None]:

# Sweep the regularization parameter C and compare training vs. test accuracy.

# Lists to store training and testing accuracy
training_accuracy_logreg = []
test_accuracy_logreg = []

# Try different values of the regularization parameter C
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

for C in C_values:
    # Fit a fresh model for this C. Scoring the already-fitted `log` instead
    # would give the same accuracy for every C and a flat curve.
    log_test = LogisticRegression(C=C)
    log_test.fit(X_train, y_train)

    # Record training set accuracy
    training_accuracy_logreg.append(log_test.score(X_train, y_train))

    # Record generalization accuracy
    test_accuracy_logreg.append(log_test.score(X_test, y_test))

# Plotting the training & testing accuracy for different values of C
plt.plot(C_values, training_accuracy_logreg, label="Training accuracy")
plt.plot(C_values, test_accuracy_logreg, label="Testing accuracy")
plt.xscale('log')  # C spans several orders of magnitude, so use a log axis
plt.xlabel("Regularization Parameter (C)")
plt.ylabel("Accuracy")
plt.title("Training & Testing Accuracy for Logistic Regression")
plt.legend()
plt.show()

In [None]:
# Store the logistic regression results. storeResults takes
# (model name, accuracy, f1_score, recall, precision) in that order.
# All four scores come from the test split so the stored row is internally consistent.

storeResults('Logistic Regression',acc_test_log,f1_score_test_log,
             recall_score_test_log,precision_score_test_log)

In [None]:
# Decision tree classifier: build and fit on the training split.
from sklearn.tree import DecisionTreeClassifier

# Depth capped at 30; random_state=42 makes the fitted tree reproducible
# across runs, matching the seed used for the train/test split.
tree = DecisionTreeClassifier(max_depth=30, random_state=42)

# Fit on the training split (as the cell's last expression, the fitted estimator is displayed).
tree.fit(X_train, y_train)

In [None]:
# Predict labels for the training split and then the test split with the fitted decision tree.

y_train_tree, y_test_tree = (
    tree.predict(features) for features in (X_train, X_test)
)

In [None]:
# Score the decision tree predictions on both splits, then print the scores.

# Compute every metric first (training split, then test split) ...
acc_train_tree, acc_test_tree = (
    metrics.accuracy_score(y_train, y_train_tree),
    metrics.accuracy_score(y_test, y_test_tree),
)
f1_score_train_tree, f1_score_test_tree = (
    metrics.f1_score(y_train, y_train_tree),
    metrics.f1_score(y_test, y_test_tree),
)
recall_score_train_tree, recall_score_test_tree = (
    metrics.recall_score(y_train, y_train_tree),
    metrics.recall_score(y_test, y_test_tree),
)
precision_score_train_tree, precision_score_test_tree = (
    metrics.precision_score(y_train, y_train_tree),
    metrics.precision_score(y_test, y_test_tree),
)

# ... then report them, one blank line between metric groups.
print("Decision Tree : Accuracy on training Data: {:.3f}".format(acc_train_tree))
print("Decision Tree : Accuracy on test Data: {:.3f}".format(acc_test_tree))
print()
print("Decision Tree : f1_score on training Data: {:.3f}".format(f1_score_train_tree))
print("Decision Tree : f1_score on test Data: {:.3f}".format(f1_score_test_tree))
print()
print("Decision Tree : Recall on training Data: {:.3f}".format(recall_score_train_tree))
print("Decision Tree : Recall on test Data: {:.3f}".format(recall_score_test_tree))
print()
print("Decision Tree : precision on training Data: {:.3f}".format(precision_score_train_tree))
print("Decision Tree : precision on test Data: {:.3f}".format(precision_score_test_tree))

In [None]:
# Print the classification report for the decision tree's test-set predictions.

print(metrics.classification_report(y_test, y_test_tree))

In [None]:
# Sweep max_depth for the decision tree and compare training vs. test accuracy.
training_accuracy = []
test_accuracy = []
# try max_depth from 1 to 29 (range excludes its upper bound)
depth = range(1,30)
for n in depth:
    # Fixed seed so the curve is identical on every run.
    tree_test = DecisionTreeClassifier(max_depth=n, random_state=42)

    tree_test.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(tree_test.score(X_train, y_train))
    # record generalization accuracy
    test_accuracy.append(tree_test.score(X_test, y_test))


# plotting the training & testing accuracy for max_depth from 1 to 29
plt.plot(depth, training_accuracy, label="training accuracy")
plt.plot(depth, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("max_depth")
plt.title("Training & Testing Accuracy for Decision Tree")
plt.legend()
plt.show()

In [None]:
# Store the decision tree results. storeResults takes
# (model name, accuracy, f1_score, recall, precision) in that order.
# All four scores come from the test split so the stored row is internally consistent.

storeResults('Decision Tree',acc_test_tree,f1_score_test_tree,
             recall_score_test_tree,precision_score_test_tree)

In [None]:
# Gaussian Naive Bayes classifier: build and fit on the training split.
from sklearn.naive_bayes import GaussianNB

# Model with scikit-learn's default settings.
nb = GaussianNB()

# Fit on the training split (as the cell's last expression, the fitted estimator is displayed).
nb.fit(X_train, y_train)

In [None]:
# Predict labels for the training split and then the test split with the fitted Naive Bayes model.
y_train_nb, y_test_nb = (
    nb.predict(features) for features in (X_train, X_test)
)



In [None]:
# Score the Gaussian Naive Bayes predictions on both splits, then print the scores.

# Compute every metric first (training split, then test split) ...
acc_train_nb, acc_test_nb = (
    metrics.accuracy_score(y_train, y_train_nb),
    metrics.accuracy_score(y_test, y_test_nb),
)
f1_score_train_nb, f1_score_test_nb = (
    metrics.f1_score(y_train, y_train_nb),
    metrics.f1_score(y_test, y_test_nb),
)
recall_score_train_nb, recall_score_test_nb = (
    metrics.recall_score(y_train, y_train_nb),
    metrics.recall_score(y_test, y_test_nb),
)
precision_score_train_nb, precision_score_test_nb = (
    metrics.precision_score(y_train, y_train_nb),
    metrics.precision_score(y_test, y_test_nb),
)

# ... then report them, one blank line between metric groups.
print("Gaussian Naive Bayes: Accuracy on training Data: {:.3f}".format(acc_train_nb))
print("Gaussian Naive Bayes: Accuracy on test Data: {:.3f}".format(acc_test_nb))
print()
print("Gaussian Naive Bayes: f1_score on training Data: {:.3f}".format(f1_score_train_nb))
print("Gaussian Naive Bayes: f1_score on test Data: {:.3f}".format(f1_score_test_nb))
print()
print("Gaussian Naive Bayes: Recall on training Data: {:.3f}".format(recall_score_train_nb))
print("Gaussian Naive Bayes: Recall on test Data: {:.3f}".format(recall_score_test_nb))
print()
print("Gaussian Naive Bayes: precision on training Data: {:.3f}".format(precision_score_train_nb))
print("Gaussian Naive Bayes: precision on test Data: {:.3f}".format(precision_score_test_nb))


In [None]:

# Print the classification report for the Gaussian Naive Bayes test-set predictions.
print(metrics.classification_report(y_test, y_test_nb))


In [None]:

# Compare training vs. test accuracy of Gaussian Naive Bayes under different class priors.
# None means no priors are passed; the other entries are explicit two-class priors.
priors = [None, [0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5]]
training_accuracy_nb = []
test_accuracy_nb = []
for prior in priors:
    # fit() returns the estimator itself, so construction and fitting can be chained
    nb_test = GaussianNB(priors=prior).fit(X_train, y_train)
    # accuracy on the data the model was fitted on
    training_accuracy_nb.append(nb_test.score(X_train, y_train))
    # accuracy on the held-out data
    test_accuracy_nb.append(nb_test.score(X_test, y_test))

# One x position per prior setting; the tick labels show the prior itself.
prior_positions = range(len(priors))
prior_labels = [str(prior) for prior in priors]
plt.plot(prior_positions, training_accuracy_nb, label="training accuracy")
plt.plot(prior_positions, test_accuracy_nb, label="test accuracy")
plt.xticks(prior_positions, prior_labels)
plt.ylabel("Accuracy")
plt.xlabel("Priors")
plt.legend();


In [None]:

# Store the Gaussian Naive Bayes results. storeResults takes
# (model name, accuracy, f1_score, recall, precision) in that order.
# All four scores come from the test split so the stored row is internally consistent.
storeResults('Gaussian Naive Bayes', acc_test_nb, f1_score_test_nb,
             recall_score_test_nb, precision_score_test_nb)

In [None]:
# Multi-layer Perceptron classifier: build and fit on the training split.
from sklearn.neural_network import MLPClassifier

# Default architecture; random_state=42 fixes the seed so the reported scores
# are reproducible across runs.
mlp = MLPClassifier(random_state=42)

# Fit on the training split (as the cell's last expression, the fitted estimator is displayed).
mlp.fit(X_train,y_train)


In [None]:

# Predict labels for the training split and then the test split with the fitted MLP.
y_train_mlp, y_test_mlp = (
    mlp.predict(features) for features in (X_train, X_test)
)


In [None]:

# Evaluate the MLP: accuracy, f1_score, recall and precision on the training and test splits.

acc_train_mlp  = metrics.accuracy_score(y_train,y_train_mlp)
acc_test_mlp = metrics.accuracy_score(y_test,y_test_mlp)
print("Multi-layer Perceptron : Accuracy on training Data: {:.3f}".format(acc_train_mlp))
print("Multi-layer Perceptron : Accuracy on test Data: {:.3f}".format(acc_test_mlp))
print()

f1_score_train_mlp = metrics.f1_score(y_train,y_train_mlp)
f1_score_test_mlp = metrics.f1_score(y_test,y_test_mlp)
print("Multi-layer Perceptron : f1_score on training Data: {:.3f}".format(f1_score_train_mlp))
# Report the test-split score here (the training score was previously printed twice).
print("Multi-layer Perceptron : f1_score on test Data: {:.3f}".format(f1_score_test_mlp))
print()

recall_score_train_mlp = metrics.recall_score(y_train,y_train_mlp)
recall_score_test_mlp = metrics.recall_score(y_test,y_test_mlp)
print("Multi-layer Perceptron : Recall on training Data: {:.3f}".format(recall_score_train_mlp))
print("Multi-layer Perceptron : Recall on test Data: {:.3f}".format(recall_score_test_mlp))
print()

precision_score_train_mlp = metrics.precision_score(y_train,y_train_mlp)
precision_score_test_mlp = metrics.precision_score(y_test,y_test_mlp)
print("Multi-layer Perceptron : precision on training Data: {:.3f}".format(precision_score_train_mlp))
print("Multi-layer Perceptron : precision on test Data: {:.3f}".format(precision_score_test_mlp))


In [None]:

# Print the classification report for the MLP's test-set predictions.

print(metrics.classification_report(y_test, y_test_mlp))


In [None]:

# Sweep the MLP's initial learning rate and compare training vs. test accuracy.
training_accuracy = []
test_accuracy = []
# Step multipliers 1..19; the variable name is kept because it is a notebook-level name.
depth=range(1,20)
# Actual learning_rate_init values tried: 0.1, 0.2, ..., 1.9
learning_rates = [n*0.1 for n in depth]
for rate in learning_rates:
    mlp_test = MLPClassifier(learning_rate_init=rate, max_iter=200, random_state=42)

    mlp_test.fit(X_train, y_train)
    # Record training set accuracy
    training_accuracy.append(mlp_test.score(X_train, y_train))
    # Record generalization accuracy
    test_accuracy.append(mlp_test.score(X_test, y_test))

# Plotting the training & testing accuracy for different learning rates.
# The x-axis uses the real learning rates, not the 1..19 multipliers, so the
# "Learning Rate" label matches the values shown.
plt.figure(figsize=(10, 6))
plt.plot(learning_rates, training_accuracy, label="Training accuracy")
plt.plot(learning_rates, test_accuracy, label="Testing accuracy")
plt.xlabel("Learning Rate")
plt.ylabel("Accuracy")
plt.title("Training & Testing Accuracy for MLP")
plt.legend()
plt.show()


In [None]:

# Store the MLP results. storeResults takes
# (model name, accuracy, f1_score, recall, precision) in that order.
# All four scores come from the test split so the stored row is internally consistent.

storeResults('Multi-layer Perceptron',acc_test_mlp,f1_score_test_mlp,
             recall_score_test_mlp,precision_score_test_mlp)
             


In [None]:
# Assemble the stored per-model scores into one table, one row per stored model.
result_columns = {
    'ML Model': ML_Model,
    'Accuracy': accuracy,
    'f1_score': f1_score,
    'Recall': recall,
    'Precision': precision,
}
result = pd.DataFrame(result_columns)

In [None]:
# Rank the stored models: highest Accuracy first, ties broken by f1_score; row labels renumbered from 0.
ranked_result = result.sort_values(by=['Accuracy', 'f1_score'], ascending=False)
sorted_result = ranked_result.reset_index(drop=True)

In [None]:
# Names and test accuracies of all five models, in matching order.
# NOTE: this rebinds the notebook-level ML_Model and accuracy lists that
# storeResults appends to.
ML_Model = [
    'Random Forest',
    'LogisticRegression',
    'DecisionTreeClassifier ',
    'Gaussian Naive Bayes',
    'Multi-layer Perceptron',
]
accuracy = [
    acc_test_forest,
    acc_test_log,
    acc_test_tree,
    acc_test_nb,
    acc_test_mlp,
]

In [None]:
# Two-column table pairing each model name with its accuracy.
accuracy_columns = {'ML Model': ML_Model, 'Accuracy': accuracy}
results = pd.DataFrame(accuracy_columns)

In [None]:

# Rank the models by Accuracy, best first, and renumber the row labels from 0.
ranked_results = results.sort_values(by=['Accuracy'], ascending=False)
sorted_results = ranked_results.reset_index(drop=True)


In [None]:
# Bar chart comparing the test accuracy of the models.
# Plots sorted_results (the five-model accuracy table), not sorted_result,
# which only holds the models recorded through storeResults.
plt.figure(figsize=(10, 6))
bars = plt.bar(sorted_results['ML Model'], sorted_results['Accuracy'],
               color=['skyblue', 'lightgreen', 'lightcoral', 'lightsalmon', 'plum'])
plt.title('Testing Accuracy Comparison of ML Models')
plt.xlabel('ML Model')
plt.ylabel('Testing Accuracy')
plt.ylim(0, 1)  # Set the y-axis limit between 0 and 1 for accuracy values
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability

# Displaying precise values on top of each bar
for bar, value in zip(bars, sorted_results['Accuracy']):
    plt.text(bar.get_x() + bar.get_width() / 2 - 0.1, bar.get_height() + 0.01, f'{value:.3f}', ha='center', color='black')

plt.show()

In [None]:
# Display the results table in the order the models were stored.
result

In [None]:
# Display the results table sorted by Accuracy, then f1_score, best first.
sorted_result

In [None]:
# Final Multi-layer Perceptron model, fitted on the training split.
from sklearn.neural_network import MLPClassifier

# A single hidden layer with 4 neurons and an initial learning rate of 0.7.
# random_state=42 fixes the seed so the fitted model is reproducible across runs.
# NOTE(review): these hyperparameters are hard-coded here rather than derived
# from the comparison results -- confirm they are the intended choice.
Bestmodel = MLPClassifier(hidden_layer_sizes=(4,), learning_rate_init=0.7, random_state=42)

# Fit on the training split (as the cell's last expression, the fitted estimator is displayed).
Bestmodel.fit(X_train, y_train)


Storing the pickled model



In [None]:
import os
import pickle

# Create the output directory if needed; exist_ok makes this a no-op when it
# already exists and avoids a separate check-then-create step.
directory = 'pickle1'
os.makedirs(directory, exist_ok=True)

# Serialize the fitted model. The with-block closes the file handle, so the
# pickle is fully flushed to disk even if dump raises.
with open(os.path.join(directory, 'model.pkl'), 'wb') as model_file:
    pickle.dump(Bestmodel, model_file)
