In [1]:
%pip install numpy pandas requests joblib beautifulsoup4 tensorflow scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import requests
import joblib
from bs4 import BeautifulSoup
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LeakyReLU
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
discriminator = load_model(
    "discriminator_model.h5",
    custom_objects={"LeakyReLU": LeakyReLU}  
)
ensemble_model = joblib.load("voting_classifier_soft_model.joblib")  
scaler = joblib.load("scaler.pkl") 

In [None]:
def preprocess_url(url):
    features = {
        "url_length": len(url),
        "num_digits": sum(c.isdigit() for c in url),
        "num_special_chars": sum(c in "!@#$%^&*()_+=" for c in url),
        "num_subdomains": url.count("."),
        "has_https": 1 if "https" in url else 0,
        "contains_phishing_keyword": 1 if any(word in url.lower() for word in ["login", "secure", "bank", "verify", "update"]) else 0,
        "digit_ratio": sum(c.isdigit() for c in url) / len(url)
    }
    return pd.DataFrame([features])

In [None]:
def classify_url(url):
    url_features = preprocess_url(url)
    url_features_scaled = scaler.transform(url_features)
    prediction = discriminator.predict(url_features_scaled)
    return prediction[0][0]

In [None]:

def is_website_accessible(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return True
    except requests.exceptions.RequestException as e:
        print(f"Website is not accessible: {e}")
        return False

In [None]:
def extract_content_features(url):
    try:
        # Fetch the website content
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract features
        features = {
            "has_title": 1 if soup.title else 0,
            "has_input": 1 if soup.find("input") else 0,
            "has_button": 1 if soup.find("button") else 0,
            "has_image": 1 if soup.find("img") else 0,
            "has_submit": 1 if soup.find("input", {"type": "submit"}) else 0,
            "has_link": 1 if soup.find("a") else 0,
            "has_password": 1 if soup.find("input", {"type": "password"}) else 0,
            "has_email_input": 1 if soup.find("input", {"type": "email"}) else 0,
            "has_hidden_element": 1 if soup.find("input", {"type": "hidden"}) else 0,
            "has_audio": 1 if soup.find("audio") else 0,
            "has_video": 1 if soup.find("video") else 0,
            "number_of_inputs": len(soup.find_all("input")),
            "number_of_buttons": len(soup.find_all("button")),
            "number_of_images": len(soup.find_all("img")),
            "number_of_option": len(soup.find_all("option")),
            "number_of_list": len(soup.find_all("li")),
            "number_of_th": len(soup.find_all("th")),
            "number_of_tr": len(soup.find_all("tr")),
            "number_of_href": len(soup.find_all("a", href=True)),
            "number_of_paragraph": len(soup.find_all("p")),
            "number_of_script": len(soup.find_all("script")),
            "length_of_title": len(soup.title.string) if soup.title else 0,
            "has_h1": 1 if soup.find("h1") else 0,
            "has_h2": 1 if soup.find("h2") else 0,
            "has_h3": 1 if soup.find("h3") else 0,
            "length_of_text": len(soup.get_text()),
            "number_of_clickable_button": len(soup.find_all("button")) + len(soup.find_all("input", {"type": "submit"})),
            "number_of_a": len(soup.find_all("a")),
            "number_of_img": len(soup.find_all("img")),
            "number_of_div": len(soup.find_all("div")),
            "number_of_figure": len(soup.find_all("figure")),
            "has_footer": 1 if soup.find("footer") else 0,
            "has_form": 1 if soup.find("form") else 0,
            "has_text_area": 1 if soup.find("textarea") else 0,
            "has_iframe": 1 if soup.find("iframe") else 0,
            "has_text_input": 1 if soup.find("input", {"type": "text"}) else 0,
            "number_of_meta": len(soup.find_all("meta")),
            "has_nav": 1 if soup.find("nav") else 0,
            "has_object": 1 if soup.find("object") else 0,
            "has_picture": 1 if soup.find("picture") else 0,
            "number_of_sources": len(soup.find_all("source")),
            "number_of_span": len(soup.find_all("span")),
            "number_of_table": len(soup.find_all("table"))
        }
        return pd.DataFrame([features])
    except Exception as e:
        print(f"Error extracting content features: {e}")
        return None

In [None]:
def classify_content(content_features):
    prediction = ensemble_model.predict_proba(content_features)[0][1]  # Probability of phishing
    return prediction

In [None]:
url_weight = 0.4  
content_weight = 0.6 
y_true = []
y_pred = []
test_data = pd.read_csv("test_data.csv")  

In [None]:
for index, row in test_data.iterrows():
    url = row["url"]
    true_label = row["label"]
    y_true.append(true_label)
    url_prob = classify_url(url)
    if is_website_accessible(url):
        content_features = extract_content_features(url)
        if content_features is not None:
            content_prob = classify_content(content_features)
        else:
            content_prob = 1.0  
    else:
        content_prob = 1.0  
    combined_prob = (url_weight * url_prob) + (content_weight * content_prob)
    final_decision = 1 if combined_prob > 0.5 else 0  #threshold=0.5
    y_pred.append(final_decision)
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Scaler not found. Fitting scaler on training data...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
Website is not accessible: 404 Client Error: Not Found for url: http://carrorosabreas.blogspot.com/
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Website is not accessible: 410 Client Error: Gone for url: https://app-degenchain.com/
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Website is not accessible: 403 Client Error: Forbidden for url: http://storageapi.fleek.co/3a6ea15a-585f-4859-a754-100e256992d1-bucket/ggg/index.html
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
Website is not accessible: 403 Client Error: Forbidden for url: https://storageapi.fleek.co/3a6ea15a-585f-4859-a754-100e256992d1-bucket/jhy/index.html
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Website is not accessible: HTTPSConnectionPool(host='chatbotlmkndndjkcnjkdcn.page', port=443): Max

In [None]:
# Print wrongly misclassified URLs
print("\nMisclassified URLs:")
for i in range(len(y_true)):
    if y_true[i] != y_pred[i]:  # Check for misclassification
        print(f"Index: {i}, URL: {test_data.iloc[i]['url']}, True Label: {y_true[i]}, Predicted Label: {y_pred[i]}")



Misclassified URLs:
Index: 33, URL: http://ww-magiceden-web-page2.blogspot.com/, True Label: 1, Predicted Label: 0
Index: 48, URL: http://zx-kyber-swap-swap-acessoo-y0.blogspot.com/, True Label: 1, Predicted Label: 0
Index: 92, URL: http://impervadns.net/, True Label: 0, Predicted Label: 1
