In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
import json
import xgboost

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score


In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset_small.csv')

l = list(df.columns)

# Group the column names by a substring / suffix match on the name itself.
URL_based_features = [name for name in l if 'url' in name]
Domain_based_features = [name for name in l if 'domain' in name]
Page_based_features = [name for name in l if name.endswith(('params', 'directory', 'file'))]
# Stored as a list like the other feature groups, so iterating it yields
# column names rather than the individual characters of a string.
Content_based_features = ['email_in_url']

def Uploat_data_to_MongoDB():
    """Insert every row of the module-level ``df`` into MongoDB, one document per row.

    The connection string is taken from the ``MONGODB_URI`` environment
    variable so that no credentials live in the notebook. Documents are
    written to ``client['database']['PhishingDomainDetection']``.

    Raises:
        RuntimeError: if ``MONGODB_URI`` is unset or empty.
    """
    # Imported locally so the notebook's import cell keeps working on machines
    # without pymongo installed; only this helper needs it.
    import os
    import pymongo

    uri = os.environ.get('MONGODB_URI')
    if not uri:
        raise RuntimeError(
            'Set the MONGODB_URI environment variable to the MongoDB connection string.'
        )

    client = pymongo.MongoClient(uri)
    try:
        collection = client['database']['PhishingDomainDetection']
        # Round-trip through JSON so every value is a plain Python object;
        # orient='records' yields one dict per row.
        documents = json.loads(df.to_json(orient='records'))
        # insert_many rejects an empty batch, so skip the call for an empty frame.
        if documents:
            collection.insert_many(documents)
    finally:
        # Always release the connection, even if the insert fails.
        client.close()

# --- Exploratory look at the raw frame ---
# First rows of the dataset.
df.head()

# missingno matrix of the frame.
msno.matrix(df)

# Box plot over the frame's columns.
px.box(df)

# --- Separate predictors from the target column ---
x = df.drop(columns='phishing')
y = df.loc[:, 'phishing']

# Class balance of the target.
y.value_counts()

# Pairwise correlation of the predictors, annotated with the coefficients.
sns.heatmap(x.corr(), annot=True)

# Columns whose population variance (ddof=0, matching np.var) is exactly zero
# are constant and carry no information for the models.
# The variance is computed once, with an explicit axis: np.var(DataFrame)
# forwards axis=None to pandas, and recent pandas versions warn that the
# per-column result for axis=None will change to a scalar.
column_variances = x.var(axis=0, ddof=0)

redundant_cols = []
redundant_cols.extend(column_variances.index[(column_variances == 0).to_numpy()])

# Non-inplace drop: rebinding x keeps the step easy to re-run and to follow.
x = x.drop(columns=redundant_cols)

# Hold out 33% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# First pass: an unrestricted PCA on the scaled training data, used only to
# see how the explained variance accumulates.
pca = PCA().fit(x_train_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
cum_explained_variance = explained_variance_ratio.cumsum()
# argmax on a boolean array gives the index of the first True, i.e. the first
# component at which the cumulative ratio reaches 0.95; +1 turns it into a count.
n = np.argmax(cum_explained_variance >= 0.95) + 1
plt.plot(cum_explained_variance)

# Second pass: keep only the first n components.
# Note that x_train / x_test are rebound to the PCA-projected arrays here.
pca = PCA(n_components=n)
x_train = pca.fit_transform(x_train_scaled)
x_test = pca.transform(x_test_scaled)


In [None]:
# Seed shared by the stochastic estimators so the accuracy ranking below is
# the same on every run (same value as the train/test split seed).
RANDOM_STATE = 42

# Candidate classifiers, keyed by class name. All other hyper-parameters are
# left at the library defaults.
models = {
    "AdaBoostClassifier": AdaBoostClassifier(random_state=RANDOM_STATE),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForestClassifier": RandomForestClassifier(random_state=RANDOM_STATE),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # SVC is left unseeded: its random_state only matters when probability=True.
    "SVC": SVC(),
    "XGBClassifier": XGBClassifier(random_state=RANDOM_STATE),
}

# Test-set accuracy of each candidate, keyed by model name.
model_list = {}

for model_name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    pred = model.fit(x_train, y_train).predict(x_test)
    score = accuracy_score(y_test, pred)
    model_list[model_name] = score

print(model_list)

# Position of the highest accuracy (first one wins on ties), mapped back to
# its name and to the fitted estimator object.
# NOTE(review): the winner is chosen on the same test set that is later used
# to report accuracy — consider a separate validation split.
i = np.argmax(list(model_list.values()))
best_model_name = list(model_list)[i]
best_model_obj = models[best_model_name]

import yaml

def read_yaml(path='E:\\ML_projects\\PhishingDomainDetection\\config\\Model.yaml'):
    """Load a YAML file and return its parsed content.

    Args:
        path: Location of the YAML file. The default is the original
            machine-specific location, kept so that existing no-argument
            calls behave as before; pass a path to run elsewhere.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Hyper-parameter search for the best-scoring model. The grid is looked up in
# the YAML config under model_selection -> model -> <name> -> search_param_grid.
search_param_grid = read_yaml()['model_selection']['model'][best_model_name]['search_param_grid']
grid = GridSearchCV(best_model_obj, param_grid=search_param_grid, cv=5)
grid.fit(x_train, y_train)
parameters = grid.best_params_

# best_model_obj is an estimator *instance*, so calling it as
# best_model_obj(**parameters) raises TypeError. GridSearchCV refits by
# default, so best_estimator_ is already a clone of the model with the best
# parameters, trained on all of x_train.
clf = grid.best_estimator_
pred = clf.predict(x_test)
accuracy_score(y_test, pred)