#Imports

In [None]:
# Mount Google Drive when the notebook runs inside Google Colab, so that paths
# under /content/drive become available to the cells below.
# Outside Colab the `google.colab` package cannot be imported; in that case the
# mount is skipped and a notice is printed instead of failing.
# NOTE(review): the import cell that follows still does an unguarded
# `from google.colab import files`, so the notebook as a whole will not run
# outside Colab despite this fallback.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive is gekoppeld!")  # Dutch user message: "Google Drive is mounted!"
except ModuleNotFoundError:
    # Raised by the import above when `google.colab` is not installed.
    print("Niet in Google Colab, slaan Google Drive-mount over.")  # Dutch: "Not in Google Colab, skipping the Google Drive mount."


import codecs
import io
import os

import chardet
import gensim.downloader as api
import joblib
import numpy as np
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
from tqdm import tqdm

from google.colab import files



Mounted at /content/drive
Google Drive is gekoppeld!


#Load FastText & Scaler

In [None]:
# Load the pre-trained word vectors through gensim's downloader API (`api`).
# The resulting object is used later for vocabulary lookups and to turn the
# words of a page into an embedding.
# NOTE(review): presumably 300-dimensional, judging by the model name — confirm.
print("Loading FastText pre-trained model...")
fasttext_model = api.load('fasttext-wiki-news-subwords-300')
print("FastText model loaded!")

# Load the MinMaxScaler you previously saved:
# It must be the same scaler that was fitted during training, because the
# prediction cell applies it to the embedding before calling the classifier.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load .pkl files from a trusted location.
scaler_path = "/content/drive/MyDrive/Afstuderen/Deadline/models/scaler.pkl"
scaler = joblib.load(scaler_path)
print("Scaler loaded from:", scaler_path)


Loading FastText pre-trained model...
FastText model loaded!
Scaler loaded from: /content/drive/MyDrive/Afstuderen/Deadline/models/scaler.pkl


#Feature Extraction

In [None]:
def extract_text_from_html(local_html_path):
    """
    Extract the text content of an HTML file using BeautifulSoup.

    The file is read once as raw bytes, its encoding is guessed with chardet,
    and the decoded markup is parsed with the 'lxml' parser.

    Parameters
    ----------
    local_html_path : str or path-like
        Path of the HTML file to read.

    Returns
    -------
    str
        The document's text strings, each stripped of surrounding whitespace
        and joined with a single space.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    with open(local_html_path, 'rb') as f:
        raw_data = f.read()

    # chardet returns None as encoding when it cannot decide (e.g. empty file).
    detected_encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'

    # chardet may name an encoding for which Python has no codec; decoding
    # with such a name would raise LookupError, so fall back to UTF-8.
    try:
        codecs.lookup(detected_encoding)
    except LookupError:
        detected_encoding = 'utf-8'

    # Decode the bytes already in memory instead of opening the file a second
    # time. TextIOWrapper applies the same newline translation and
    # errors='ignore' handling as a text-mode open() would.
    with io.TextIOWrapper(io.BytesIO(raw_data),
                          encoding=detected_encoding,
                          errors='ignore') as file:
        soup = BeautifulSoup(file, 'lxml')
    return soup.get_text(separator=" ", strip=True)

def generate_fasttext_vector(text, model):
    """
    Build one embedding for `text` by averaging its word vectors.

    The text is split on whitespace; only tokens present in
    `model.key_to_index` contribute to the average.

    Parameters
    ----------
    text : str
        Raw text to embed.
    model : object
        Word-vector model exposing `key_to_index`, `vector_size` and
        item access by word (e.g. a gensim KeyedVectors instance).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched word vectors, or a zero vector of
        length `model.vector_size` when no token is in the vocabulary
        (this includes empty text).
    """
    vocabulary = model.key_to_index
    token_vectors = []
    for token in text.split():
        if token in vocabulary:
            token_vectors.append(model[token])

    # Nothing matched: empty text or only out-of-vocabulary tokens.
    if not token_vectors:
        return np.zeros(model.vector_size)

    return np.mean(token_vectors, axis=0)





#Upload an HTML File & Load One of the SGD (Online) Models

In [None]:
# Step A: Upload the HTML file
# `files` (google.colab) is imported in the notebook's import cell.
print("Please upload a single HTML file for classification:")
uploaded = files.upload()  # dict keyed by the name of each uploaded file
if not uploaded:
    # The upload dialog was cancelled or no file was selected; fail with a
    # clear message instead of an IndexError on the lookup below.
    raise RuntimeError("No file was uploaded; re-run this cell and select an HTML file.")
# Only one file is expected; if several were uploaded, the first one is used.
html_filename = next(iter(uploaded))
print("Uploaded:", html_filename)

# Step B: Choose which scenario model to load
scenario_choice = 1  # or 2 or 3

# Directory on the mounted Drive that holds the trained scenario models.
MODEL_DIR = "/content/drive/MyDrive/Afstuderen/Deadline/models"
model_path_map = {
    scenario: f"{MODEL_DIR}/sgd_online_scenario{scenario}.pkl"
    for scenario in (1, 2, 3)
}

# Reject an unknown scenario here rather than letting joblib fail on a None path.
if scenario_choice not in model_path_map:
    raise ValueError(
        f"Unknown scenario_choice {scenario_choice!r}; "
        f"expected one of {sorted(model_path_map)}."
    )
chosen_model_path = model_path_map[scenario_choice]
print(f"Loading SGD (Online) model for scenario {scenario_choice} from:", chosen_model_path)

# Load the model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted location.
sgd_online_model = joblib.load(chosen_model_path)
print("SGD (Online) model loaded!")


Please upload a single HTML file for classification:


Saving 7455455.html to 7455455.html
Uploaded: 7455455.html
Loading SGD (Online) model for scenario 1 from: /content/drive/MyDrive/Afstuderen/Deadline/models/sgd_online_scenario1.pkl
SGD (Online) model loaded!


#Process the Uploaded File & Predict

In [None]:
# Classify the uploaded HTML file: extract its text, embed it, scale the
# embedding, and print the model's verdict twice — once from predict() and once
# from predict_proba() with a confidence percentage.

# 1) Extract text
raw_text = extract_text_from_html(html_filename)

# 2) Convert to FastText embedding
# NOTE(review): generate_fasttext_vector returns an all-zero vector when no
# word of the page is in the model vocabulary; the prediction below is then
# made on that zero vector without any warning.
embedding = generate_fasttext_vector(raw_text, fasttext_model)

# 3) Scale the embedding using the SAME scaler from training
# reshape(1, -1) turns the 1-D embedding into a single-row 2-D array, i.e. one
# sample, before it is handed to the scaler and the model.
embedding_2d = embedding.reshape(1, -1)
embedding_scaled = scaler.transform(embedding_2d)

# 4) Predict with the loaded SGD (Online) model
prediction = sgd_online_model.predict(embedding_scaled)

# 5) Interpret the result
# Only one sample was passed in, so the first element is its label.
label = prediction[0]  # 0 or 1
if label == 1:
    print("Prediction: PHISHING")
else:
    print("Prediction: BENIGN (Not Phishing)")




# Because we used 'log_loss' in SGD, we have predict_proba available:
# [0] selects the probability row of the single sample.
# NOTE(review): the indexing below assumes column 0 is the benign class and
# column 1 the phishing class — confirm against sgd_online_model.classes_.
probabilities = sgd_online_model.predict_proba(embedding_scaled)[0]  # array([prob_not_phish, prob_phish])
phish_prob = probabilities[1]
benign_prob = probabilities[0]

# Report the verdict with the probability of the winning class as a percentage;
# a phishing probability of exactly 0.5 is reported as phishing.
if phish_prob >= 0.5:
    print("Result: PHISHING")
    print(f"Confidence: {phish_prob*100:.2f}% (phishing)")
else:
    print("Result: BENIGN")
    print(f"Confidence: {benign_prob*100:.2f}% (benign)")





Prediction: PHISHING
Result: PHISHING
Confidence: 98.33% (phishing)
