In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## The goal of this notebook is to analyse how accuracy, precision, and recall change as the training percentage changes

# Import the libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from keras.models import Sequential
from keras import Input
from keras.layers import Dense

# Read the data

In [None]:
# Load the phishing-URL dataset, report its shape, and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../input/phishing-site-urls/phishing_site_urls.csv")
print(df.shape)
df.head(n=5)

In [None]:
# Number of rows per class label in the full dataset.
df["Label"].value_counts()

In [None]:
# Bar chart of the class balance in the full dataset.
sns.countplot(x="Label", data=df)
plt.show()

# Shuffle the data

In [None]:
# Shuffle the rows with a fixed seed so the slice taken from df_shuffled is the same on every run.
df_shuffled = shuffle(df, random_state=42)

# Choose 5000 samples

In [None]:
# Number of shuffled rows kept for the experiments.
data_size = 5000

In [None]:
# Keep the first `data_size` shuffled rows as an independent copy.
df_used = df_shuffled.iloc[:data_size].copy()

# Make sure the distribution is the same after taking the sample

In [None]:
# Bar chart of the class balance in the sampled subset.
sns.countplot(x="Label", data=df_used)
plt.show()

In [None]:
# Summary of the sampled frame's columns, dtypes, and non-null counts.
df_used.info()

# Replace the labels

In [None]:
# Encode the target as integers: 'good' -> 0, 'bad' -> 1.
# Only the Label column is rewritten, so a URL whose text is exactly "good" or
# "bad" stays a string for the tokenizer. The cell stays safe to re-run:
# replace() leaves already-encoded values alone, and astype(int) raises if a
# label other than 'good'/'bad' is present.
df_used['Label'] = df_used['Label'].replace({'good': 0, 'bad': 1}).astype(int)

In [None]:
# Number of rows per encoded label in the sample.
df_used["Label"].value_counts()

# Divide the data into features and labels

In [None]:
# Features: a one-column frame holding the raw URL text. Target: the encoded label.
X = df_used.loc[:, ['URL']].copy()
y = df_used['Label'].copy()

# Initialize the tokenizer, stemmer, and Vectorizer

In [None]:
# Tokenizer keeps runs of ASCII letters only; the stemmer and vectorizer use English/default settings.
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')
stemmer = SnowballStemmer(language="english")
cv = CountVectorizer()

# Prepare the data

In [None]:
def prepare_data(X) :
    """Tokenize, stem, and vectorize the ``URL`` column of ``X``.

    The work is done on a copy, so the frame passed in by the caller is left
    unchanged; use the returned frame to access the derived columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a ``URL`` column of strings.

    Returns
    -------
    tuple
        ``(frame, features)``. ``frame`` is a copy of ``X`` with the added
        columns ``text_tokenized`` (list of tokens), ``text_stemmed`` (list of
        stems) and ``text_sent`` (stems joined by single spaces). ``features``
        is the output of ``cv.fit_transform`` on ``text_sent``.

    Notes
    -----
    Uses the module-level ``tokenizer``, ``stemmer`` and ``cv`` objects. ``cv``
    is re-fitted on every call, so its vocabulary reflects only the most recent
    input.
    NOTE(review): the vocabulary is fitted on every row passed in, i.e. before
    any train/test split performed by the caller.
    """
    X = X.copy()
    X['text_tokenized'] = X['URL'].map(tokenizer.tokenize)
    X['text_stemmed'] = X['text_tokenized'].map(lambda words: [stemmer.stem(word) for word in words])
    X['text_sent'] = X['text_stemmed'].map(' '.join)
    features = cv.fit_transform(X['text_sent'])
    return X, features

In [None]:
# Add the tokenized/stemmed/sentence columns to X and build the CountVectorizer feature matrix.
X, features = prepare_data(X)

# Import and initialize the models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# One estimator per algorithm under comparison.
# random_state pins the randomised estimators (tree, forest) so that reruns
# produce the same metrics, matching the fixed seed used for the data split.
# max_iter=1000 gives the logistic-regression solver more room to converge
# than the default of 100 iterations.
logreg = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier()
dtree = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
svc = SVC()

# Train and test the models using different training percentages

In [None]:
def train_test_model(model, X, y, training_percentage) :
    """Fit ``model`` on a stratified training split and score it on the rest.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X : array-like or sparse matrix
        Feature matrix.
    y : array-like
        Labels; label ``1`` is treated as the positive class.
    training_percentage : float
        Fraction of the rows used for training, strictly between 0 and 1.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)`` measured on the held-out rows.
    """
    # train_size is passed directly rather than test_size = 1 - training_percentage:
    # the floating-point complement (e.g. 1 - 0.7 == 0.30000000000000004) is
    # rounded up by scikit-learn and moves one extra row into the test split.
    trainX, testX, trainY, testY = train_test_split(
        X, y, train_size=training_percentage, stratify=y, random_state=42
    )
    model.fit(trainX, trainY)
    predY = model.predict(testX)
    accuracy = accuracy_score(testY, predY)
    precision = precision_score(testY, predY, pos_label=1)
    recall = recall_score(testY, predY, pos_label=1)
    return accuracy, precision, recall

In [None]:
# Training fractions to sweep: 0.1, 0.2, ..., 0.9.
training_sizes = np.arange(1, 10) / 10

In [None]:
def model_results(model) :
    """Evaluate ``model`` at every fraction in ``training_sizes``.

    Returns a DataFrame with one row per training fraction (in the order of
    ``training_sizes``) and the columns ``Accuracy``, ``Precision``, ``Recall``.
    Reads the module-level ``features``, ``y`` and ``training_sizes``.
    """
    metric_rows = [
        train_test_model(model, features, y, fraction)
        for fraction in training_sizes
    ]
    return pd.DataFrame(metric_rows, columns=['Accuracy', 'Precision', 'Recall'])

In [None]:
# Run the training-size sweep once per estimator, in the same order as before.
logreg_results, knn_results, dtree_results, rfc_results, svc_results = (
    model_results(estimator) for estimator in (logreg, knn, dtree, rfc, svc)
)

In [None]:
models = ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'SVM']
model_results = [logreg_results, knn_results, dtree_results, rfc_results, svc_results]

In [None]:
accuracies = []
precisions = []
recalls = []
for model in model_results :
    accuracies.append(model.Accuracy.values)
    precisions.append(model.Precision.values)
    recalls.append(model.Recall.values)

In [None]:
# Turn each per-model list into a table: one row per training percentage, one column per model.
accuracies = pd.DataFrame(np.asarray(accuracies).T, index=training_sizes * 100, columns=models)
precisions = pd.DataFrame(np.asarray(precisions).T, index=training_sizes * 100, columns=models)
recalls = pd.DataFrame(np.asarray(recalls).T, index=training_sizes * 100, columns=models)

In [None]:
# Accuracy table: rows are training percentages, columns are the models.
accuracies

In [None]:
# Precision table: rows are training percentages, columns are the models.
precisions

In [None]:
# Recall table: rows are training percentages, columns are the models.
recalls

# Plot the accuracy, precision, and recall of the 5 models

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.set_style('whitegrid')
# One line per model, every line drawn with the same circular marker.
g = sns.lineplot(data=accuracies, markers=['o'] * accuracies.shape[1])
# Ticks include the upper axis limits (x = 100, y = 1.0). linspace is used for the
# y ticks because np.arange with a float step is half-open and its length is
# sensitive to rounding.
g.set(xlim=(0, 100), ylim=(0.6, 1), xticks=np.arange(0, 101, 10), yticks=np.linspace(0.6, 1.0, 9))
g.set_title("Accuracy vs Training Percentage for the Machine Learning Algorithms")
g.set_xlabel("Training Percentage")
g.set_ylabel("Accuracy")
# Render the figure explicitly so the cell does not echo the Text object of the last call.
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.set_style('whitegrid')
# One line per model, every line drawn with the same circular marker.
g = sns.lineplot(data=precisions, markers=['o'] * precisions.shape[1])
# Ticks include the upper axis limits (x = 100, y = 1.0). linspace is used for the
# y ticks because np.arange with a float step is half-open and its length is
# sensitive to rounding.
g.set(xlim=(0, 100), ylim=(0.4, 1), xticks=np.arange(0, 101, 10), yticks=np.linspace(0.4, 1.0, 13))
g.set_title("Precision vs Training Percentage for the Machine Learning Algorithms")
g.set_xlabel("Training Percentage")
g.set_ylabel("Precision")
# Render the figure explicitly so the cell does not echo the Text object of the last call.
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.set_style('whitegrid')
# One line per model, every line drawn with the same circular marker.
g = sns.lineplot(data=recalls, markers=['o'] * recalls.shape[1])
# Ticks include the upper axis limits (x = 100, y = 1.0). linspace is used for the
# y ticks because np.arange with a float step is half-open and its length is
# sensitive to rounding.
g.set(xlim=(0, 100), ylim=(0, 1), xticks=np.arange(0, 101, 10), yticks=np.linspace(0.0, 1.0, 21))
g.set_title("Recall vs Training Percentage for the Machine Learning Algorithms")
g.set_xlabel("Training Percentage")
g.set_ylabel("Recall")
# Render the figure explicitly so the cell does not echo the Text object of the last call.
plt.show()

# Let's do the same with a feed forward neural network
We can also test the impact of the number of hidden units.

In [None]:
# Hidden-layer widths to sweep: every even number from 2 to 18.
hidden_units = list(range(2, 19, 2))

In [None]:
def train_test_nn(X, y, training_percentage, hidden_units) :
    """Train a one-hidden-layer network on a stratified split and evaluate it.

    Parameters
    ----------
    X : sparse matrix
        Feature matrix; each split is densified with ``.toarray()``.
    y : array-like
        Labels, converted to a NumPy array. Assumed to be binary 0/1, given the
        sigmoid output and binary cross-entropy loss.
    training_percentage : float
        Fraction of the rows used for training, strictly between 0 and 1.
    hidden_units : int
        Number of units in the single hidden ``Dense`` layer.

    Returns
    -------
    list
        ``[loss, accuracy, precision, recall]`` on the held-out rows, as
        returned by ``model.evaluate``.
    """
    # train_size is passed directly rather than test_size = 1 - training_percentage:
    # the floating-point complement (e.g. 1 - 0.7 == 0.30000000000000004) is
    # rounded up by scikit-learn and moves one extra row into the test split.
    trainX, testX, trainY, testY = train_test_split(
        X, y, train_size=training_percentage, stratify=y, random_state=42
    )
    # Densify the sparse splits and convert the labels before handing them to Keras.
    trainX = trainX.toarray()
    testX = testX.toarray()
    trainY = np.array(trainY)
    testY = np.array(testY)
    model = Sequential()
    model.add(Input(shape=(trainX.shape[1], ), name='Input-Layer'))
    model.add(Dense(hidden_units, activation='relu', name='Hidden-Layer'))
    model.add(Dense(1, activation='sigmoid', name='Output-Layer'))
    # 'accuracy' is the lowercase alias Keras documents for choosing the accuracy
    # variant that matches the loss/output (binary accuracy here).
    # NOTE(review): the capitalised 'Accuracy' may resolve to the exact-match
    # Accuracy metric on some Keras versions - confirm against the installed one.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])
    model.fit(trainX, trainY, batch_size = 10, epochs = 10, verbose=0)
    # The default evaluation batch size is used: the metrics are accumulated over
    # all test rows, so one-row batches only add time without changing them.
    score = model.evaluate(testX, testY, verbose=2)
    return score

In [None]:
# Sweep every (training fraction, hidden-unit count) pair.
# Each outer list gets one row per training fraction; each row has one value per hidden-unit count.
accuracies_nn, precisions_nn, recalls_nn = [], [], []
for ts in training_sizes :
    scores = [train_test_nn(features, y, ts, hn) for hn in hidden_units]
    # Each score is [loss, accuracy, precision, recall]; the loss at index 0 is not kept.
    accuracies_nn.append([s[1] for s in scores])
    precisions_nn.append([s[2] for s in scores])
    recalls_nn.append([s[3] for s in scores])

In [None]:
# Tabulate the sweep: one row per training percentage, one column per hidden-unit count.
accuracies_nn_df = pd.DataFrame(data=accuracies_nn, index=training_sizes * 100, columns=hidden_units)
precisions_nn_df = pd.DataFrame(data=precisions_nn, index=training_sizes * 100, columns=hidden_units)
recalls_nn_df = pd.DataFrame(data=recalls_nn, index=training_sizes * 100, columns=hidden_units)

In [None]:
# Neural-network accuracy: rows are training percentages, columns are hidden-unit counts.
accuracies_nn_df

In [None]:
# Neural-network precision: rows are training percentages, columns are hidden-unit counts.
precisions_nn_df

In [None]:
# Neural-network recall: rows are training percentages, columns are hidden-unit counts.
recalls_nn_df

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.set_style('whitegrid')
# One line per hidden-unit count, every line drawn with the same circular marker.
g = sns.lineplot(data=accuracies_nn_df, markers=['o'] * accuracies_nn_df.shape[1])
# Ticks include the upper axis limits (x = 100, y = 1.0). linspace is used for the
# y ticks because np.arange with a float step is half-open and its length is
# sensitive to rounding.
g.set(xlim=(0, 100), ylim=(0.65, 1), xticks=np.arange(0, 101, 10), yticks=np.linspace(0.65, 1.0, 8))
g.set_title("Accuracy vs Training Percentage for the Feed Forward Neural Network")
g.set_xlabel("Training Percentage")
g.set_ylabel("Accuracy")
# Render the figure explicitly so the cell does not echo the Text object of the last call.
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.set_style('whitegrid')
# One line per hidden-unit count, every line drawn with the same circular marker.
g = sns.lineplot(data=precisions_nn_df, markers=['o'] * precisions_nn_df.shape[1])
# Ticks include the upper axis limits (x = 100, y = 1.0). linspace is used for the
# y ticks because np.arange with a float step is half-open and its length is
# sensitive to rounding.
g.set(xlim=(0, 100), ylim=(0, 1), xticks=np.arange(0, 101, 10), yticks=np.linspace(0.0, 1.0, 21))
g.set_title("Precision vs Training Percentage for the Feed Forward Neural Network")
g.set_xlabel("Training Percentage")
g.set_ylabel("Precision")
# Render the figure explicitly so the cell does not echo the Text object of the last call.
plt.show()

In [None]:
sns.set(rc={'figure.figsize':(15,8)})
sns.set_style('whitegrid')
# One line per hidden-unit count, every line drawn with the same circular marker.
g = sns.lineplot(data=recalls_nn_df, markers=['o'] * recalls_nn_df.shape[1])
# Ticks include the upper axis limits (x = 100, y = 1.0). linspace is used for the
# y ticks because np.arange with a float step is half-open and its length is
# sensitive to rounding.
g.set(xlim=(0, 100), ylim=(0, 1), xticks=np.arange(0, 101, 10), yticks=np.linspace(0.0, 1.0, 21))
g.set_title("Recall vs Training Percentage for the Feed Forward Neural Network")
g.set_xlabel("Training Percentage")
g.set_ylabel("Recall")
# Render the figure explicitly so the cell does not echo the Text object of the last call.
plt.show()

In [None]:
# df_used2 = df_shuffled[:50000].copy()
# df_used2.replace({'good':0, 'bad':1}, inplace=True)
# X2 = df_used2[['URL']].copy()
# y2 = df_used2.Label.copy()
# X2, features2 = prepare_data(X2)

In [None]:
# logreg2 = LogisticRegression(max_iter=1000)
# knn2 = KNeighborsClassifier()
# dtree2 = DecisionTreeClassifier()
# rfc2 = RandomForestClassifier()
# svc2 = SVC()

In [None]:
# print(train_test_model(logreg2, features2, y2, 0.8))
# print(train_test_model(knn2, features2, y2, 0.8))
# print(train_test_model(dtree2, features2, y2, 0.8))
# print(train_test_model(rfc2, features2, y2, 0.8))
# print(train_test_model(svc2, features2, y2, 0.8))