# Dataset Download

This data set is from [circl-phishing-dataset-01](https://www.circl.lu/opendata/datasets/circl-phishing-dataset-01/), Introduction [here](https://www.circl.lu/opendata/circl-phishing-dataset-01/).

It contains 457 phishing webpage screenshots.

In [13]:
import os
import requests
from bs4 import BeautifulSoup

# URL of the dataset page
url = "https://www.circl.lu/opendata/datasets/circl-phishing-dataset-01/"

# Directory to save the downloaded files
output_dir = "circl_phishing_dataset"
os.makedirs(output_dir, exist_ok=True)

def download_file(file_url, save_path):
    """Download a file from a URL."""
    response = requests.get(file_url, stream=True)
    if response.status_code == 200:
        with open(save_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {save_path}")
    else:
        print(f"Failed to download: {file_url} (Status code: {response.status_code})")

# Scrape the page to find downloadable files
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    # Find all links to downloadable files
    for link in soup.find_all("a", href=True):
        file_link = link["href"]
        if file_link.endswith((".json",".txt")):  
            file_url = url + file_link if not file_link.startswith("http") else file_link
            file_name = os.path.basename(file_link)
            download_file(file_url, os.path.join(output_dir, file_name))
else:
    print(f"Failed to access the URL: {url} (Status code: {response.status_code})")


Downloaded: circl_phishing_dataset/Clean_phishing_references.json
Downloaded: circl_phishing_dataset/ground_truth_dataturks.json
Downloaded: circl_phishing_dataset/ground_truth_visjs.json
Downloaded: circl_phishing_dataset/labels_dataturks.txt
Downloaded: circl_phishing_dataset/visjs_graph.json


In [11]:
import os
import requests
from bs4 import BeautifulSoup

# URL of the dataset page
url = "https://www.circl.lu/opendata/datasets/circl-phishing-dataset-01/Clean_phishing/"

# Directory to save the downloaded files
output_dir = "circl_phishing_dataset/Clean_phishing"
os.makedirs(output_dir, exist_ok=True)

def download_file(file_url, save_path):
    """Download a file from a URL."""
    response = requests.get(file_url, stream=True)
    if response.status_code == 200:
        with open(save_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"Downloaded: {save_path}")
    else:
        print(f"Failed to download: {file_url} (Status code: {response.status_code})")

# Scrape the page to find downloadable files
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    # Find all links to downloadable files
    for link in soup.find_all("a", href=True):
        file_link = link["href"]
        if file_link.endswith(("png")):  # Add other extensions as needed
            file_url = url + file_link if not file_link.startswith("http") else file_link
            file_name = os.path.basename(file_link)
            download_file(file_url, os.path.join(output_dir, file_name))
else:
    print(f"Failed to access the URL: {url} (Status code: {response.status_code})")


Downloaded: circl_phishing_dataset/Clean_phishing/abashed-careless-ordinary-crew.png
Downloaded: circl_phishing_dataset/Clean_phishing/ablaze-jazzy-tangy-file.png
Downloaded: circl_phishing_dataset/Clean_phishing/ablaze-lean-grubby-particular.png
Downloaded: circl_phishing_dataset/Clean_phishing/able-yellow-defective-variety.png
Downloaded: circl_phishing_dataset/Clean_phishing/abstracted-colossal-rich-race.png
Downloaded: circl_phishing_dataset/Clean_phishing/abstracted-excellent-plastic-insect.png
Downloaded: circl_phishing_dataset/Clean_phishing/abundant-various-adjoining-tap.png
Downloaded: circl_phishing_dataset/Clean_phishing/abusive-gabby-statuesque-nurse.png
Downloaded: circl_phishing_dataset/Clean_phishing/accidental-eager-grumpy-writer.png
Downloaded: circl_phishing_dataset/Clean_phishing/accurate-scintillating-picayune-administration.png
Downloaded: circl_phishing_dataset/Clean_phishing/adamant-accidental-hellish-feature.png
Downloaded: circl_phishing_dataset/Clean_phishing/

# Data Preperation

## OCR to extract textual content

We OCR the images, which are screenshots of phishing webpages, to extract textual content.

In [5]:
import os
import pytesseract
from PIL import Image
import pandas as pd

# Define the directory containing PNG files and the output CSV file
image_dir = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/Clean_phishing"  
output_csv = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/extracted_texts.csv" 

# List to store extracted text data
data = []

# Process each PNG file in the directory
for file_name in os.listdir(image_dir):
    if file_name.lower().endswith(".png"):
        image_path = os.path.join(image_dir, file_name)
        try:
            # Open the image and extract text using OCR
            text = pytesseract.image_to_string(Image.open(image_path))
            # Append the filename and extracted text to the list
            data.append({"file_name": file_name, "extracted_text": text})
            print(f"Processed: {file_name}")
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

# Convert the data list to a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv(output_csv, index=False)

print(f"OCR completed. Extracted text saved to {output_csv}.")


Processed: abashed-careless-ordinary-crew.png
Processed: ablaze-jazzy-tangy-file.png
Processed: ablaze-lean-grubby-particular.png
Processed: able-yellow-defective-variety.png
Processed: abstracted-colossal-rich-race.png
Processed: abstracted-excellent-plastic-insect.png
Processed: abundant-various-adjoining-tap.png
Processed: abusive-gabby-statuesque-nurse.png
Processed: accidental-eager-grumpy-writer.png
Processed: accurate-scintillating-picayune-administration.png
Processed: adamant-accidental-hellish-feature.png
Processed: adamant-spicy-absurd-economics.png
Processed: adhesive-concerned-expensive-competition.png
Processed: adhoc-racial-condemned-letter.png
Processed: adjoining-handsomely-female-scene.png
Processed: adventurous-sleepy-public-joke.png
Processed: aggressive-defiant-imminent-bottle.png
Processed: alike-idiotic-shy-fail.png
Processed: alleged-abashed-clumsy-life.png
Processed: amazing-daily-handy-speech.png
Processed: ambitious-curious-picayune-shock.png
Processed: amuck

In [7]:
import json
import pandas as pd

# Load the ground_truth_visjs.json file
json_file_path = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/ground_truth_visjs.json"  
with open(json_file_path, "r") as file:
    clusters = json.load(file)

# Create a mapping of file names to their corresponding cluster (brand)
file_to_cluster = {}
for cluster_data in clusters:
    cluster_name = cluster_data["cluster"]
    members = cluster_data["members"]
    for file_name in members:
        file_to_cluster[file_name] = cluster_name

# Load the CSV file into a DataFrame
csv_file_path = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/extracted_texts.csv" 
#df = pd.read_csv(csv_file_path)

# Map the "brand" based on the file_to_cluster dictionary
df["brand"] = df["file_name"].map(file_to_cluster)

# Save the updated DataFrame back to a CSV file
output_csv_path = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/extracted_texts_with_brands.csv"  
df.to_csv(output_csv_path, index=False)

print(f"Updated DataFrame saved to {output_csv_path}.")


Updated DataFrame saved to /Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/extracted_texts_with_brands.csv.


In [9]:
df['brand'].value_counts()

brand
uncomplete         64
unloaded           42
N26                39
microsoft          36
errorMessage       25
CounterStrike      22
misc               19
Advanzia           17
CirclWebServer     15
ackouperm          13
BancoInter         12
Office             11
Steam              11
TaggingServer      11
Outlook            11
formular           10
Oldcircl            9
SPankki             8
KBC                 7
WeTransfer          6
news                6
fibank              5
EmiratesNBD         5
OrangeWebmail       5
paypal              5
OneDrive            4
multilogo           4
CarreBlue           3
ForeignLanguage     3
Windows             3
google              3
rackspace           3
dhl                 2
bitcoin             2
Post                2
hsbc                2
Netflix             2
FrImpots            2
WellsFargo          2
DropBox             2
android             2
AmericanExpress     2
Name: count, dtype: int64

In [17]:
# show brand = android
df[df['brand']=='CirclWebServer'].head()

Unnamed: 0,file_name,extracted_text,brand
1,ablaze-jazzy-tangy-file.png,ol Confirm you r account circl lu password j...,CirclWebServer
16,aggressive-defiant-imminent-bottle.png,4 m ad Confirm your account taggingserver com...,CirclWebServer
95,dispensable-snotty-luxuriant-policy.png,ol Confirm you r account circl lu password j...,CirclWebServer
167,halting-abounding-vigorous-log.png,4 m ad Confirm your account taggingserver com...,CirclWebServer
206,kaput-foamy-nippy-charge.png,ol Confirm you r account circl lu password j...,CirclWebServer


## Textual Data Cleaning

In [16]:
# Remove newlines, tabs, and excessive spaces

import re

# Function to clean extracted text
def clean_text(text):
    if isinstance(text, str):
        # Remove newlines, tabs, and excessive spaces
        text = re.sub(r'\s+', ' ', text.strip())  # Replace multiple spaces/newlines with a single space
        text = re.sub(r'[^\x20-\x7E]', '', text)  # Remove non-ASCII characters (optional)
        text = text.replace("\n", " ").replace("\t", " ").replace("\r", " ").replace("@", " ").replace(":", " ").replace(";", " ").replace("!", " ").replace("#", " ").replace("$", " ").replace("%", " ").replace("^", " ").replace("&", " ").replace("*", " ").replace("(", " ").replace(")", " ").replace("-", " ").replace("_", " ").replace("+", " ").replace("=", " ").replace("{", " ").replace("}", " ").replace("[", " ").replace("]", " ").replace("|", " ").replace("\\", " ").replace(":", " ").replace("\"", " ").replace("'", " ").replace("<", " ").replace(">", " ").replace(",", " ").replace(".", " ").replace("?", " ").replace("/", " ")
    return text

# Apply cleaning to the 'extracted_text' column
df['extracted_text'] = df['extracted_text'].apply(clean_text)

# Save the cleaned DataFrame back to the file
cleaned_csv_path = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/extracted_texts_cleaned.csv"  # Update the path as needed
df.to_csv(cleaned_csv_path, index=False)

print(f"Cleaned DataFrame saved to {cleaned_csv_path}.")


Cleaned DataFrame saved to /Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/extracted_texts_cleaned.csv.


# Extract Features (visual + textual) via CLIP

In [19]:
import os
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Initialize the CLIP model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Paths and data
image_dir = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/Clean_phishing"

# Function to get image embeddings
def get_image_embedding(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)
        image_features = model.get_image_features(**inputs)
        return image_features.detach().cpu().numpy().flatten()
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

# Function to get text embeddings
def get_text_embedding(text):
    try:
        inputs = processor(text=text, return_tensors="pt", padding=True, truncation=True).to(device)
        text_features = model.get_text_features(**inputs)
        return text_features.detach().cpu().numpy().flatten()
    except Exception as e:
        print(f"Error processing text: {e}")
        return None

# Add columns for image and text embeddings to the DataFrame
image_embeddings = []
text_embeddings = []

for idx, row in df.iterrows():
    image_path = os.path.join(image_dir, row["file_name"])
    text = row["extracted_text"]
    
    # Extract image and text embeddings
    img_emb = get_image_embedding(image_path)
    text_emb = get_text_embedding(text)
    
    image_embeddings.append(img_emb)
    text_embeddings.append(text_emb)

# Add embeddings to the DataFrame
df["image_embedding"] = image_embeddings
df["text_embedding"] = text_embeddings

# Save embeddings to a file
output_csv_path = "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/embeddings.csv"
df.to_csv(output_csv_path, index=False)

print(f"Embeddings saved to {output_csv_path}.")


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Embeddings saved to /Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/embeddings.csv.


In [20]:
df

Unnamed: 0,file_name,extracted_text,brand,image_embedding,text_embedding
0,abashed-careless-ordinary-crew.png,N26 Actualiser les GmbH paramtres des 2019 c...,unloaded,"[-0.14099549, 0.18997173, 0.12000446, 0.205960...","[0.14271626, -0.23965064, -0.17083012, 0.11320..."
1,ablaze-jazzy-tangy-file.png,ol Confirm you r account circl lu password j...,CirclWebServer,"[0.14484096, 0.26118287, -0.1034108, 0.1293356...","[0.27832943, 0.19564445, -0.30090627, 0.206732..."
2,ablaze-lean-grubby-particular.png,IT Mobiles Entertainment Wissen Netzpolitik Wi...,unloaded,"[-0.12322151, 0.30594707, -0.007408384, 0.1513...","[-0.10782848, 0.34986025, -0.30775118, 0.02452..."
3,able-yellow-defective-variety.png,EE Microsoft Pick an account undefined b use a...,unloaded,"[-0.021512665, 0.0829493, -0.056524467, 0.3748...","[0.20450851, 0.18280731, -0.23862273, 0.319023..."
4,abstracted-colossal-rich-race.png,Transaktionen oo Spaces a Mein Konto N26 Bit...,N26,"[-0.09904538, 0.17214404, 0.0125989895, 0.0279...","[0.049980957, -0.027177861, 0.19742016, 0.1905..."
...,...,...,...,...,...
452,yummy-neat-frantic-sound.png,Need login a B a a a a a i,CounterStrike,"[0.07600027, -0.31320873, 0.0018638405, -0.302...","[0.13922371, 0.103531815, 0.048435986, -0.0286..."
453,zany-cloudy-true-part.png,Microsoft Taking you to your organization s si...,microsoft,"[-0.3799956, 0.39194575, -0.24593008, 0.250652...","[0.052628953, 0.23422095, -0.5313105, 0.151695..."
454,zealous-parsimonious-smiling-fun.png,Microsoft Enter passw D2018 Microsoft Terms of...,microsoft,"[0.3621599, -0.06515593, -0.21567273, 0.071415...","[0.007452556, -0.066111065, -0.3963314, 0.1852..."
455,zippy-piquant-fascinated-bag.png,1 Office 365 Verify your email password to con...,Office,"[-0.00078154635, -0.015230171, 0.14547199, -0....","[0.208157, -0.48072356, -0.6584283, 0.08933899..."


# Classifier 
(Evaluation Metrics: Accuracy, F1, Precsion, and Recall)

## Autogluon with multimodal data

In [33]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    top_k_accuracy_score,
    classification_report
)
from sklearn.preprocessing import label_binarize
import pandas as pd
import numpy as np

# Combine image and text embeddings into a single feature set
df["combined_embedding"] = df.apply(
    lambda row: np.hstack([row["image_embedding"], row["text_embedding"]]), axis=1
)

# Prepare DataFrame for AutoGluon
embedding_cols = pd.DataFrame(df["combined_embedding"].to_list())
embedding_cols.columns = [f"feature_{i}" for i in range(embedding_cols.shape[1])]
df_for_autogluon = pd.concat([embedding_cols, df["brand"]], axis=1)

# Split the data into training and testing sets
train_data = df_for_autogluon.sample(frac=0.8, random_state=42)
test_data = df_for_autogluon.drop(train_data.index)

# Define label column
label = "brand"

# Train AutoGluon model
predictor = TabularPredictor(label=label).fit(train_data)

# Get leaderboard
leaderboard = predictor.leaderboard(test_data, silent=True)
print("Leaderboard of Models:")
print(leaderboard)

# Test labels and data without the label column
test_labels = test_data[label]
test_data_no_label = test_data.drop(columns=[label])

# Unique labels (needed for ROC-AUC alignment)
unique_labels = predictor.class_labels

# Evaluate each model individually
for model_name in leaderboard["model"]:
    print(f"\nEvaluating Model: {model_name}")
    
    # Predict using the specific model
    predictions = predictor.predict(test_data_no_label, model=model_name)
    probs = predictor.predict_proba(test_data_no_label, model=model_name)

    # Ensure alignment of probabilities and labels
    probs_aligned = probs.loc[:, unique_labels].to_numpy()
    test_labels_binarized = label_binarize(test_labels, classes=unique_labels)

    # Calculate metrics
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions, average="weighted")
    recall = recall_score(test_labels, predictions, average="weighted")
    f1 = f1_score(test_labels, predictions, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(test_labels, predictions))


No path specified. Models will be saved in: "AutogluonModels/ag-20241208_000036"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.4.0: Wed Feb 21 21:44:06 PST 2024; root:xnu-10063.101.15~2/RELEASE_ARM64_T8103
CPU Count:          8
Memory Avail:       3.70 GB / 16.00 GB (23.2%)
Disk Space Avail:   112.74 GB / 931.48 GB (12.1%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions a

Leaderboard of Models:
                  model  score_test  score_val eval_metric  pred_time_test  \
0       NeuralNetFastAI    0.725275   0.780822    accuracy        0.014237   
1   WeightedEnsemble_L2    0.725275   0.780822    accuracy        0.022228   
2        ExtraTreesGini    0.714286   0.739726    accuracy        0.056415   
3              CatBoost    0.703297   0.698630    accuracy        0.034480   
4        ExtraTreesEntr    0.692308   0.726027    accuracy        0.061946   
5        NeuralNetTorch    0.681319   0.753425    accuracy        0.016790   
6        KNeighborsDist    0.681319   0.698630    accuracy        0.020085   
7      RandomForestEntr    0.681319   0.780822    accuracy        0.058026   
8      RandomForestGini    0.681319   0.753425    accuracy        0.059188   
9            LightGBMXT    0.670330   0.780822    accuracy        0.020411   
10       KNeighborsUnif    0.659341   0.657534    accuracy        0.012002   
11              XGBoost    0.637363   0.7

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.7033
Precision: 0.7030
Recall: 0.7033
F1 Score: 0.6844

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      1.00      1.00         5
    BancoInter       1.00      1.00      1.00         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.75      0.60      0.67         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.33      0.50         3
           N26       0.82      0.69      0.75        13
      Oldcircl       1.00      1.00      1.00         1
      OneDrive       0.00      0.00      0.00         1
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      0.67      0.80         3
    WellsFargo       0.00      0.00      0.00         1
     ackouperm       1.00      1.00      1.00         4
  errorMessage       0.50      1.00      0.67         5
      formul

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.6813
Precision: 0.6561
Recall: 0.6813
F1 Score: 0.6581

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      1.00      1.00         5
    BancoInter       0.75      1.00      0.86         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.57      0.80      0.67         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.67      0.80         3
           N26       0.82      0.69      0.75        13
        Office       0.00      0.00      0.00         0
      Oldcircl       0.33      1.00      0.50         1
      OneDrive       0.00      0.00      0.00         1
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      0.67      0.80         3
 TaggingServer       0.00      0.00      0.00         0
    WellsFargo       0.00      0.00      0.00         1
     ackoupe

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.6374
Precision: 0.6643
Recall: 0.6374
F1 Score: 0.6332

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      0.80      0.89         5
    BancoInter       0.75      1.00      0.86         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.50      0.60      0.55         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.33      0.50         3
           N26       0.75      0.69      0.72        13
        Office       0.00      0.00      0.00         0
      Oldcircl       0.50      1.00      0.67         1
      OneDrive       0.00      0.00      0.00         1
       Outlook       0.00      0.00      0.00         0
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      0.33      0.50         3
    WeTransfer       0.00      0.00      0.00         0
    WellsFar

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
# save classification_report(test_labels, predictions)
report = classification_report(test_labels, predictions, output_dict=True)
pd.DataFrame(report).transpose().to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_multimedia.csv")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Autogluon with image data

In [37]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    top_k_accuracy_score,
    classification_report
)
from sklearn.preprocessing import label_binarize
import pandas as pd
import numpy as np

# Prepare DataFrame for AutoGluon using only image embeddings
embedding_cols = pd.DataFrame(df["image_embedding"].to_list())
embedding_cols.columns = [f"feature_{i}" for i in range(embedding_cols.shape[1])]
df_for_autogluon = pd.concat([embedding_cols, df["brand"]], axis=1)

# Split the data into training and testing sets
train_data = df_for_autogluon.sample(frac=0.8, random_state=42)
test_data = df_for_autogluon.drop(train_data.index)

# Define label column
label = "brand"

# Train AutoGluon model
predictor = TabularPredictor(label=label).fit(train_data)

# Get leaderboard
leaderboard = predictor.leaderboard(test_data, silent=True)
print("Leaderboard of Models:")
print(leaderboard)

# Test labels and data without the label column
test_labels = test_data[label]
test_data_no_label = test_data.drop(columns=[label])

# Unique labels (needed for ROC-AUC alignment)
unique_labels = predictor.class_labels

# Evaluate each model individually
for model_name in leaderboard["model"]:
    print(f"\nEvaluating Model: {model_name}")
    
    # Predict using the specific model
    predictions = predictor.predict(test_data_no_label, model=model_name)
    probs = predictor.predict_proba(test_data_no_label, model=model_name)

    # Ensure alignment of probabilities and labels
    probs_aligned = probs.loc[:, unique_labels].to_numpy()
    test_labels_binarized = label_binarize(test_labels, classes=unique_labels)

    # Calculate metrics
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions, average="weighted")
    recall = recall_score(test_labels, predictions, average="weighted")
    f1 = f1_score(test_labels, predictions, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(test_labels, predictions))

# save classification_report(test_labels, predictions)
report = classification_report(test_labels, predictions, output_dict=True)
pd.DataFrame(report).transpose().to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_image.csv")

No path specified. Models will be saved in: "AutogluonModels/ag-20241208_001352"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.4.0: Wed Feb 21 21:44:06 PST 2024; root:xnu-10063.101.15~2/RELEASE_ARM64_T8103
CPU Count:          8
Memory Avail:       3.78 GB / 16.00 GB (23.6%)
Disk Space Avail:   112.62 GB / 931.48 GB (12.1%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions a

Leaderboard of Models:
                  model  score_test  score_val eval_metric  pred_time_test  \
0        ExtraTreesEntr    0.714286   0.753425    accuracy        0.055167   
1       NeuralNetFastAI    0.703297   0.780822    accuracy        0.009560   
2   WeightedEnsemble_L2    0.703297   0.794521    accuracy        0.068064   
3        NeuralNetTorch    0.692308   0.780822    accuracy        0.011495   
4        KNeighborsDist    0.681319   0.712329    accuracy        0.007625   
5      RandomForestEntr    0.681319   0.753425    accuracy        0.050731   
6      RandomForestGini    0.681319   0.726027    accuracy        0.061158   
7        ExtraTreesGini    0.681319   0.712329    accuracy        0.065458   
8            LightGBMXT    0.670330   0.739726    accuracy        0.023267   
9        KNeighborsUnif    0.659341   0.657534    accuracy        0.006299   
10             CatBoost    0.648352   0.712329    accuracy        0.038296   
11        LightGBMLarge    0.637363   0.6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.7033
Precision: 0.6663
Recall: 0.7033
F1 Score: 0.6774

Classification Report:
                precision    recall  f1-score   support

      Advanzia       0.83      1.00      0.91         5
    BancoInter       1.00      1.00      1.00         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.60      0.60      0.60         5
       DropBox       0.00      0.00      0.00         0
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.67      0.80         3
           N26       0.82      0.69      0.75        13
        Office       0.00      0.00      0.00         0
      Oldcircl       0.50      1.00      0.67         1
      OneDrive       0.00      0.00      0.00         1
       Outlook       0.00      0.00      0.00         0
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      1.00      1.00         3
 TaggingServ

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.6813
Precision: 0.6551
Recall: 0.6813
F1 Score: 0.6573

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      1.00      1.00         5
    BancoInter       1.00      1.00      1.00         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.60      0.60      0.60         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.67      0.80         3
           N26       0.75      0.69      0.72        13
      Oldcircl       1.00      1.00      1.00         1
      OneDrive       0.00      0.00      0.00         1
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      0.67      0.80         3
 TaggingServer       0.00      0.00      0.00         0
    WellsFargo       0.00      0.00      0.00         1
     ackouperm       1.00      1.00      1.00         4
  errorMessa

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

      Advanzia       1.00      0.80      0.89         5
    BancoInter       1.00      1.00      1.00         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.40      0.40      0.40         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.33      0.50         3
           N26       0.82      0.69      0.75        13
      Oldcircl       0.33      1.00      0.50         1
      OneDrive       0.00      0.00      0.00         1
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      0.33      0.50         3
 TaggingServer       0.00      0.00      0.00         0
    WellsFargo       0.00      0.00      0.00         1
     ackouperm       1.00      1.00      1.00         4
  errorMessage       0.40      0.80      0.53         5
      formular       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.6374
Precision: 0.6348
Recall: 0.6374
F1 Score: 0.6229

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      0.60      0.75         5
    BancoInter       1.00      1.00      1.00         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.57      0.80      0.67         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.33      0.50         3
           N26       0.64      0.69      0.67        13
        Office       0.00      0.00      0.00         0
      Oldcircl       1.00      1.00      1.00         1
      OneDrive       0.00      0.00      0.00         1
 OrangeWebmail       0.00      0.00      0.00         0
       SPankki       0.50      1.00      0.67         1
         Steam       1.00      0.67      0.80         3
 TaggingServer       0.00      0.00      0.00         0
    WellsFar

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

## Autogluon with textual data

In [38]:
from autogluon.tabular import TabularPredictor
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    top_k_accuracy_score,
    classification_report
)
from sklearn.preprocessing import label_binarize
import pandas as pd
import numpy as np

# Prepare DataFrame for AutoGluon using only image embeddings
embedding_cols = pd.DataFrame(df["text_embedding"].to_list())
embedding_cols.columns = [f"feature_{i}" for i in range(embedding_cols.shape[1])]
df_for_autogluon = pd.concat([embedding_cols, df["brand"]], axis=1)

# Split the data into training and testing sets
train_data = df_for_autogluon.sample(frac=0.8, random_state=42)
test_data = df_for_autogluon.drop(train_data.index)

# Define label column
label = "brand"

# Train AutoGluon model
predictor = TabularPredictor(label=label).fit(train_data)

# Get leaderboard
leaderboard = predictor.leaderboard(test_data, silent=True)
print("Leaderboard of Models:")
print(leaderboard)

# Test labels and data without the label column
test_labels = test_data[label]
test_data_no_label = test_data.drop(columns=[label])

# Unique labels (needed for ROC-AUC alignment)
unique_labels = predictor.class_labels

# Evaluate each model individually
for model_name in leaderboard["model"]:
    print(f"\nEvaluating Model: {model_name}")
    
    # Predict using the specific model
    predictions = predictor.predict(test_data_no_label, model=model_name)
    probs = predictor.predict_proba(test_data_no_label, model=model_name)

    # Ensure alignment of probabilities and labels
    probs_aligned = probs.loc[:, unique_labels].to_numpy()
    test_labels_binarized = label_binarize(test_labels, classes=unique_labels)

    # Calculate metrics
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions, average="weighted")
    recall = recall_score(test_labels, predictions, average="weighted")
    f1 = f1_score(test_labels, predictions, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(test_labels, predictions))

# save classification_report(test_labels, predictions)
report = classification_report(test_labels, predictions, output_dict=True)
pd.DataFrame(report).transpose().to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_text.csv")

No path specified. Models will be saved in: "AutogluonModels/ag-20241208_002022"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.8
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.4.0: Wed Feb 21 21:44:06 PST 2024; root:xnu-10063.101.15~2/RELEASE_ARM64_T8103
CPU Count:          8
Memory Avail:       3.56 GB / 16.00 GB (22.3%)
Disk Space Avail:   112.49 GB / 931.48 GB (12.1%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions a

Leaderboard of Models:
                  model  score_test  score_val eval_metric  pred_time_test  \
0        ExtraTreesEntr    0.648352   0.671233    accuracy        0.054372   
1        ExtraTreesGini    0.637363   0.657534    accuracy        0.065196   
2       NeuralNetFastAI    0.626374   0.671233    accuracy        0.009309   
3      RandomForestGini    0.626374   0.630137    accuracy        0.065241   
4      RandomForestEntr    0.615385   0.643836    accuracy        0.050736   
5        NeuralNetTorch    0.593407   0.630137    accuracy        0.010454   
6        KNeighborsDist    0.582418   0.589041    accuracy        0.010795   
7              LightGBM    0.582418   0.616438    accuracy        0.020953   
8              CatBoost    0.582418   0.684932    accuracy        0.037545   
9        KNeighborsUnif    0.571429   0.561644    accuracy        0.007898   
10  WeightedEnsemble_L2    0.571429   0.712329    accuracy        0.109503   
11           LightGBMXT    0.560440   0.6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.6374
Precision: 0.5923
Recall: 0.6374
F1 Score: 0.6011

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      1.00      1.00         5
    BancoInter       0.60      1.00      0.75         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.75      0.60      0.67         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.67      0.80         3
           N26       0.82      0.69      0.75        13
      Oldcircl       1.00      1.00      1.00         1
      OneDrive       0.00      0.00      0.00         1
       Outlook       0.00      0.00      0.00         0
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      1.00      1.00         3
 TaggingServer       0.00      0.00      0.00         0
    WellsFargo       0.00      0.00      0.00         1
     ackoupe

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.6154
Precision: 0.5983
Recall: 0.6154
F1 Score: 0.5924

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      0.80      0.89         5
    BancoInter       0.75      1.00      0.86         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.33      0.40      0.36         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.67      0.80         3
           N26       0.80      0.62      0.70        13
        Office       0.00      0.00      0.00         0
      Oldcircl       0.50      1.00      0.67         1
      OneDrive       0.00      0.00      0.00         1
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      0.67      0.80         3
    WellsFargo       0.00      0.00      0.00         1
     ackouperm       1.00      1.00      1.00         4
  errorMessa

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Accuracy: 0.5714
Precision: 0.5539
Recall: 0.5714
F1 Score: 0.5442

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      0.80      0.89         5
    BancoInter       0.75      1.00      0.86         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.33      0.40      0.36         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.67      0.80         3
           N26       0.80      0.62      0.70        13
        Office       0.00      0.00      0.00         0
      Oldcircl       0.00      0.00      0.00         1
      OneDrive       0.00      0.00      0.00         1
       SPankki       1.00      1.00      1.00         1
         Steam       1.00      0.67      0.80         3
    WellsFargo       0.00      0.00      0.00         1
     ackouperm       0.80      1.00      0.89         4
  errorMessa

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.5604
Precision: 0.5487
Recall: 0.5604
F1 Score: 0.5368

Classification Report:
                precision    recall  f1-score   support

      Advanzia       1.00      0.80      0.89         5
    BancoInter       0.75      1.00      0.86         3
     CarreBlue       0.00      0.00      0.00         2
CirclWebServer       1.00      1.00      1.00         2
 CounterStrike       0.40      0.40      0.40         5
   EmiratesNBD       0.00      0.00      0.00         1
           KBC       1.00      0.33      0.50         3
           N26       0.82      0.69      0.75        13
      Oldcircl       0.00      0.00      0.00         1
      OneDrive       0.00      0.00      0.00         1
 OrangeWebmail       0.00      0.00      0.00         0
       SPankki       1.00      1.00      1.00         1
         Steam       0.75      1.00      0.86         3
 TaggingServer       0.00      0.00      0.00         0
    WellsFargo       0.00      0.00      0.00         1
     ackoupe

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

## SVM with multimodal data

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np

# Prepare data for SVM
X = df['combined_embedding'].to_list()
y = df["brand"].factorize()[0]  # Encode labels

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(X_train, y_train)

# Evaluate SVM
y_pred = svm_model.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred))
#save report 
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose().to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_svm_multimodality.csv")


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.23      0.33        13
           1       0.00      0.00      0.00         1
           2       0.45      0.83      0.59         6
           3       0.36      1.00      0.53         4
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         3
          13       1.00      0.92      0.96        13
          14       0.75      1.00      0.86         3
          15       0.75      1.00      0.86         3
          16       0.00      0.00      0.00         4
          17       1.00      1.00      1.00         3
          18       0.67      1.00      0.80         2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## SVM with image data

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np

#
X = np.hstack([np.vstack(df["image_embedding"])])  # Combine embeddings
y = df["brand"].factorize()[0]  # Encode labels

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(X_train, y_train)

# Evaluate SVM
y_pred = svm_model.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred))
#save report 
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose().to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_svm_img.csv")


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.23      0.32        13
           1       0.00      0.00      0.00         1
           2       0.42      0.83      0.56         6
           3       0.38      0.75      0.50         4
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         3
          13       1.00      0.92      0.96        13
          14       0.75      1.00      0.86         3
          15       0.75      1.00      0.86         3
          16       0.00      0.00      0.00         4
          17       1.00      1.00      1.00         3
          18       0.67      1.00      0.80         2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## SVM with text data

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np

#
X = np.hstack([np.vstack(df["text_embedding"])])  # Combine embeddings
y = df["brand"].factorize()[0]  # Encode labels

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM
svm_model = SVC(kernel="linear", probability=True)
svm_model.fit(X_train, y_train)

# Evaluate SVM
y_pred = svm_model.predict(X_test)
print("SVM Classification Report:")
print(classification_report(y_test, y_pred))
#save report 
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose().to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_svm_text.csv")


SVM Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.23      0.27        13
           1       0.00      0.00      0.00         1
           2       0.43      1.00      0.60         6
           3       0.33      0.75      0.46         4
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           6       0.00      0.00      0.00         1
           7       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         3
          13       0.85      0.85      0.85        13
          14       0.40      0.67      0.50         3
          15       0.60      1.00      0.75         3
          16       0.00      0.00      0.00         4
          17       1.00      1.00      1.00         3
          18       1.00      1.00      1.00         2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## CNN with multimodal data

In [63]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

# Prepare data for CNN
class ImageDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

mul_embeddings = np.vstack(df["combined_embedding"])
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["brand"])

# Split data
X_train, X_test, y_train, y_test = train_test_split(mul_embeddings, labels, test_size=0.2, random_state=42)
train_dataset = ImageDataset(X_train, y_train)
test_dataset = ImageDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define CNN
class CNNClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNNClassifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        return self.fc(x)

# Train CNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on {device}")
model = CNNClassifier(input_dim=mul_embeddings.shape[1], num_classes=len(label_encoder.classes_)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Early stopping parameters
best_loss = float('inf')
patience = 10
early_stop_counter = 0

for epoch in range(1000):  # Max epochs
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

    # Early stopping check
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), "best_model.pth")  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

# Load the best model
model.load_state_dict(torch.load("best_model.pth"))

# Evaluate CNN
model.eval()
correct = 0
total = 0
all_predictions = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
        all_predictions.extend(predicted.cpu().numpy())

# Metrics
accuracy = accuracy_score(y_test, all_predictions)
precision = precision_score(y_test, all_predictions, average="weighted")
recall = recall_score(y_test, all_predictions, average="weighted")
f1 = f1_score(y_test, all_predictions, average="weighted")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Save report
report = classification_report(y_test, all_predictions, output_dict=True)
report_df = pd.DataFrame(report).transpose()
# Add overall metrics to the report
report_df.loc["overall"] = {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
report_df.to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_cnn_multimodality.csv")

print("Classification report saved.")


Training on cpu
Epoch 1, Loss: 3.2122
Epoch 2, Loss: 2.2428
Epoch 3, Loss: 1.7214
Epoch 4, Loss: 1.3607
Epoch 5, Loss: 1.0502
Epoch 6, Loss: 0.8116
Epoch 7, Loss: 0.6931
Epoch 8, Loss: 0.5481
Epoch 9, Loss: 0.4075
Epoch 10, Loss: 0.3389
Epoch 11, Loss: 0.2540
Epoch 12, Loss: 0.2506
Epoch 13, Loss: 0.1608
Epoch 14, Loss: 0.1663
Epoch 15, Loss: 0.1062
Epoch 16, Loss: 0.1262
Epoch 17, Loss: 0.0957
Epoch 18, Loss: 0.0935
Epoch 19, Loss: 0.0860
Epoch 20, Loss: 0.0726
Epoch 21, Loss: 0.0777
Epoch 22, Loss: 0.0661
Epoch 23, Loss: 0.0672
Epoch 24, Loss: 0.0583
Epoch 25, Loss: 0.0496
Epoch 26, Loss: 0.0698
Epoch 27, Loss: 0.0626
Epoch 28, Loss: 0.0431
Epoch 29, Loss: 0.0650
Epoch 30, Loss: 0.0655
Epoch 31, Loss: 0.0538
Epoch 32, Loss: 0.0320
Epoch 33, Loss: 0.0445
Epoch 34, Loss: 0.0626
Epoch 35, Loss: 0.0539
Epoch 36, Loss: 0.0735
Epoch 37, Loss: 0.0507
Epoch 38, Loss: 0.0506
Epoch 39, Loss: 0.0555
Epoch 40, Loss: 0.0717
Epoch 41, Loss: 0.0478
Epoch 42, Loss: 0.0529
Early stopping triggered.
T

  model.load_state_dict(torch.load("best_model.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## CNN with image data

In [64]:
image_embeddings = np.vstack(df["image_embedding"])
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["brand"])

# Split data
X_train, X_test, y_train, y_test = train_test_split(image_embeddings, labels, test_size=0.2, random_state=42)
train_dataset = ImageDataset(X_train, y_train)
test_dataset = ImageDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


# Train CNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on {device}")
model = CNNClassifier(input_dim=image_embeddings.shape[1], num_classes=len(label_encoder.classes_)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Early stopping parameters
best_loss = float('inf')
patience = 10
early_stop_counter = 0

for epoch in range(1000):  # Max epochs
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

    # Early stopping check
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), "best_model.pth")  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

# Load the best model
model.load_state_dict(torch.load("best_model.pth"))

# Evaluate CNN
model.eval()
correct = 0
total = 0
all_predictions = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
        all_predictions.extend(predicted.cpu().numpy())

# Metrics
accuracy = accuracy_score(y_test, all_predictions)
precision = precision_score(y_test, all_predictions, average="weighted")
recall = recall_score(y_test, all_predictions, average="weighted")
f1 = f1_score(y_test, all_predictions, average="weighted")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Save report
report = classification_report(y_test, all_predictions, output_dict=True)
report_df = pd.DataFrame(report).transpose()
# Add overall metrics to the report
report_df.loc["overall"] = {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
report_df.to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_cnn_image.csv")

print("Classification report saved.")


Training on cpu
Epoch 1, Loss: 3.3001
Epoch 2, Loss: 2.4926
Epoch 3, Loss: 1.9622
Epoch 4, Loss: 1.5636
Epoch 5, Loss: 1.3660
Epoch 6, Loss: 1.1073
Epoch 7, Loss: 0.8743
Epoch 8, Loss: 0.7726
Epoch 9, Loss: 0.6399
Epoch 10, Loss: 0.5087
Epoch 11, Loss: 0.4326
Epoch 12, Loss: 0.3677
Epoch 13, Loss: 0.3163
Epoch 14, Loss: 0.2289
Epoch 15, Loss: 0.1737
Epoch 16, Loss: 0.1666
Epoch 17, Loss: 0.1740
Epoch 18, Loss: 0.1736
Epoch 19, Loss: 0.1331
Epoch 20, Loss: 0.1290
Epoch 21, Loss: 0.1242
Epoch 22, Loss: 0.0959
Epoch 23, Loss: 0.0977
Epoch 24, Loss: 0.1119
Epoch 25, Loss: 0.1140
Epoch 26, Loss: 0.0771
Epoch 27, Loss: 0.0742
Epoch 28, Loss: 0.0834
Epoch 29, Loss: 0.0662
Epoch 30, Loss: 0.0736
Epoch 31, Loss: 0.0666
Epoch 32, Loss: 0.0603
Epoch 33, Loss: 0.0782
Epoch 34, Loss: 0.0853
Epoch 35, Loss: 0.0864
Epoch 36, Loss: 0.0674
Epoch 37, Loss: 0.0590
Epoch 38, Loss: 0.0577
Epoch 39, Loss: 0.0539
Epoch 40, Loss: 0.0791
Epoch 41, Loss: 0.0756
Epoch 42, Loss: 0.0730
Epoch 43, Loss: 0.0579
Epoc

  model.load_state_dict(torch.load("best_model.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## CNN with text

In [65]:
text_embeddings = np.vstack(df["text_embedding"])
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["brand"])

# Split data
X_train, X_test, y_train, y_test = train_test_split(text_embeddings, labels, test_size=0.2, random_state=42)
train_dataset = ImageDataset(X_train, y_train)
test_dataset = ImageDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


# Train CNN
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on {device}")
model = CNNClassifier(input_dim=text_embeddings.shape[1], num_classes=len(label_encoder.classes_)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Early stopping parameters
best_loss = float('inf')
patience = 10
early_stop_counter = 0

for epoch in range(1000):  # Max epochs
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    epoch_loss /= len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

    # Early stopping check
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), "best_model.pth")  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break

# Load the best model
model.load_state_dict(torch.load("best_model.pth"))

# Evaluate CNN
model.eval()
correct = 0
total = 0
all_predictions = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()
        all_predictions.extend(predicted.cpu().numpy())

# Metrics
accuracy = accuracy_score(y_test, all_predictions)
precision = precision_score(y_test, all_predictions, average="weighted")
recall = recall_score(y_test, all_predictions, average="weighted")
f1 = f1_score(y_test, all_predictions, average="weighted")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Save report
report = classification_report(y_test, all_predictions, output_dict=True)
report_df = pd.DataFrame(report).transpose()
# Add overall metrics to the report
report_df.loc["overall"] = {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
report_df.to_csv("/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_cnn_text.csv")

print("Classification report saved.")


Training on cpu
Epoch 1, Loss: 3.3692
Epoch 2, Loss: 2.5785
Epoch 3, Loss: 2.1604
Epoch 4, Loss: 1.8357
Epoch 5, Loss: 1.5833
Epoch 6, Loss: 1.3058
Epoch 7, Loss: 1.1717
Epoch 8, Loss: 0.9984
Epoch 9, Loss: 0.8585
Epoch 10, Loss: 0.7301
Epoch 11, Loss: 0.6422
Epoch 12, Loss: 0.5354
Epoch 13, Loss: 0.4428
Epoch 14, Loss: 0.3980
Epoch 15, Loss: 0.3449
Epoch 16, Loss: 0.3549
Epoch 17, Loss: 0.3046
Epoch 18, Loss: 0.2890
Epoch 19, Loss: 0.3319
Epoch 20, Loss: 0.2701
Epoch 21, Loss: 0.2278
Epoch 22, Loss: 0.2278
Epoch 23, Loss: 0.2083
Epoch 24, Loss: 0.2545
Epoch 25, Loss: 0.2133
Epoch 26, Loss: 0.1859
Epoch 27, Loss: 0.1967
Epoch 28, Loss: 0.1872
Epoch 29, Loss: 0.1913
Epoch 30, Loss: 0.1865
Epoch 31, Loss: 0.1752
Epoch 32, Loss: 0.1881
Epoch 33, Loss: 0.1824
Epoch 34, Loss: 0.1913
Epoch 35, Loss: 0.1621
Epoch 36, Loss: 0.1696
Epoch 37, Loss: 0.1725
Epoch 38, Loss: 0.1812
Epoch 39, Loss: 0.1784
Epoch 40, Loss: 0.1966
Epoch 41, Loss: 0.2087
Epoch 42, Loss: 0.1850
Epoch 43, Loss: 0.1642
Epoc

  model.load_state_dict(torch.load("best_model.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## MLP

In [61]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import numpy as np

# Dataset Class
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Define Model
class DeepClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(DeepClassifier, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        return self.fc(x)

# Training and Evaluation Function
def train_and_evaluate(embedding_type, embeddings, labels, report_path):
    print(f"\nRunning for {embedding_type}...")
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42, stratify=labels)
    train_dataset = EmbeddingDataset(X_train, y_train)
    test_dataset = EmbeddingDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Model setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = DeepClassifier(input_dim=embeddings.shape[1], num_classes=len(np.unique(labels))).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    # Early stopping setup
    best_loss = float('inf')
    patience = 10
    early_stop_counter = 0

    # Training loop
    for epoch in range(1000):  # Maximum epochs; early stopping will determine actual limit
        model.train()
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss /= len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

        # Early stopping check
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), f"best_model_{embedding_type}.pth")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}.")
                break

    # Load best model
    model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
    model.eval()

    # Evaluation
    all_predictions = []
    with torch.no_grad():
        for X_batch, _ in test_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())

    # Metrics
    accuracy = accuracy_score(y_test, all_predictions)
    precision = precision_score(y_test, all_predictions, average="weighted")
    recall = recall_score(y_test, all_predictions, average="weighted")
    f1 = f1_score(y_test, all_predictions, average="weighted")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Save report
    report = classification_report(y_test, all_predictions, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.loc["overall"] = {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
    report_df.to_csv(report_path)
    print(f"Classification report saved to {report_path}")

# Prepare Data
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["brand"])

# Run for each embedding type
embedding_types = {
    "combined": {"data": np.vstack(df["combined_embedding"]), "path": "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_mlp_combined.csv"},
    "image": {"data": np.vstack(df["image_embedding"]), "path": "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_mlp_image.csv"},
    "text": {"data": np.vstack(df["text_embedding"]), "path": "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_mlp_text.csv"},
}

for embedding_type, info in embedding_types.items():
    train_and_evaluate(embedding_type, info["data"], labels, info["path"])



Running for combined...
Epoch 1, Loss: 3.2524
Epoch 2, Loss: 2.3090
Epoch 3, Loss: 1.7826
Epoch 4, Loss: 1.4085
Epoch 5, Loss: 1.0808
Epoch 6, Loss: 0.8316
Epoch 7, Loss: 0.6341
Epoch 8, Loss: 0.4804
Epoch 9, Loss: 0.3784
Epoch 10, Loss: 0.3081
Epoch 11, Loss: 0.2290
Epoch 12, Loss: 0.2202
Epoch 13, Loss: 0.1629
Epoch 14, Loss: 0.2075
Epoch 15, Loss: 0.1755
Epoch 16, Loss: 0.1795
Epoch 17, Loss: 0.1671
Epoch 18, Loss: 0.1262
Epoch 19, Loss: 0.1216
Epoch 20, Loss: 0.1466
Epoch 21, Loss: 0.1129
Epoch 22, Loss: 0.1053
Epoch 23, Loss: 0.0860
Epoch 24, Loss: 0.0993
Epoch 25, Loss: 0.1031
Epoch 26, Loss: 0.1170
Epoch 27, Loss: 0.0895
Epoch 28, Loss: 0.0988
Epoch 29, Loss: 0.1066
Epoch 30, Loss: 0.0786
Epoch 31, Loss: 0.0881
Epoch 32, Loss: 0.0801
Epoch 33, Loss: 0.0819
Epoch 34, Loss: 0.1201
Epoch 35, Loss: 0.0957
Epoch 36, Loss: 0.0745
Epoch 37, Loss: 0.0780
Epoch 38, Loss: 0.0843
Epoch 39, Loss: 0.0837
Epoch 40, Loss: 0.0871
Epoch 41, Loss: 0.0770
Epoch 42, Loss: 0.0633
Epoch 43, Loss: 0.

  model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6, Loss: 1.1292
Epoch 7, Loss: 0.9146
Epoch 8, Loss: 0.7483
Epoch 9, Loss: 0.6009
Epoch 10, Loss: 0.5185
Epoch 11, Loss: 0.4611
Epoch 12, Loss: 0.3328
Epoch 13, Loss: 0.2591
Epoch 14, Loss: 0.2392
Epoch 15, Loss: 0.2361
Epoch 16, Loss: 0.2190
Epoch 17, Loss: 0.2095
Epoch 18, Loss: 0.1921
Epoch 19, Loss: 0.1756
Epoch 20, Loss: 0.1295
Epoch 21, Loss: 0.1233
Epoch 22, Loss: 0.1346
Epoch 23, Loss: 0.1118
Epoch 24, Loss: 0.1135
Epoch 25, Loss: 0.1162
Epoch 26, Loss: 0.1197
Epoch 27, Loss: 0.1158
Epoch 28, Loss: 0.1178
Epoch 29, Loss: 0.1059
Epoch 30, Loss: 0.1009
Epoch 31, Loss: 0.1234
Epoch 32, Loss: 0.1430
Epoch 33, Loss: 0.1526
Epoch 34, Loss: 0.0992
Epoch 35, Loss: 0.0876
Epoch 36, Loss: 0.0805
Epoch 37, Loss: 0.0890
Epoch 38, Loss: 0.1053
Epoch 39, Loss: 0.0946
Epoch 40, Loss: 0.0994
Epoch 41, Loss: 0.0763
Epoch 42, Loss: 0.0833
Epoch 43, Loss: 0.0888
Epoch 44, Loss: 0.0713
Epoch 45, Loss: 0.0702
Epoch 46, Loss: 0.0882
Epoch 47, Loss: 0.0881
Epoch 48, Loss: 0.0840
Epoch 49, Loss:

  model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2, Loss: 2.6299
Epoch 3, Loss: 2.2127
Epoch 4, Loss: 1.8190
Epoch 5, Loss: 1.5981
Epoch 6, Loss: 1.3594
Epoch 7, Loss: 1.1462
Epoch 8, Loss: 1.0008
Epoch 9, Loss: 0.8331
Epoch 10, Loss: 0.7069
Epoch 11, Loss: 0.6221
Epoch 12, Loss: 0.5275
Epoch 13, Loss: 0.4736
Epoch 14, Loss: 0.4133
Epoch 15, Loss: 0.4102
Epoch 16, Loss: 0.4024
Epoch 17, Loss: 0.3907
Epoch 18, Loss: 0.3200
Epoch 19, Loss: 0.2713
Epoch 20, Loss: 0.2913
Epoch 21, Loss: 0.2568
Epoch 22, Loss: 0.2668
Epoch 23, Loss: 0.2608
Epoch 24, Loss: 0.2393
Epoch 25, Loss: 0.1980
Epoch 26, Loss: 0.2549
Epoch 27, Loss: 0.1937
Epoch 28, Loss: 0.2139
Epoch 29, Loss: 0.1902
Epoch 30, Loss: 0.1963
Epoch 31, Loss: 0.2148
Epoch 32, Loss: 0.1897
Epoch 33, Loss: 0.2203
Epoch 34, Loss: 0.2223
Epoch 35, Loss: 0.2275
Epoch 36, Loss: 0.2147
Epoch 37, Loss: 0.1944
Epoch 38, Loss: 0.2063
Epoch 39, Loss: 0.2048
Epoch 40, Loss: 0.2114
Epoch 41, Loss: 0.1938
Epoch 42, Loss: 0.1735
Epoch 43, Loss: 0.1980
Epoch 44, Loss: 0.1989
Epoch 45, Loss: 0.1

  model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Transformer

In [66]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import numpy as np

# Dataset Class
class EmbeddingDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

# Define Transformer Model
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, num_heads=8, num_layers=2, hidden_dim=256):
        super(TransformerClassifier, self).__init__()
        self.embedding_layer = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim * 4, dropout=0.3
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.transformer_encoder(x.unsqueeze(1)).squeeze(1)  # Add and remove sequence dimension
        x = self.classifier(x)
        return x

# Training and Evaluation Function
def train_and_evaluate(embedding_type, embeddings, labels, report_path):
    print(f"\nRunning for {embedding_type}...")
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42, stratify=labels)
    train_dataset = EmbeddingDataset(X_train, y_train)
    test_dataset = EmbeddingDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Model setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = TransformerClassifier(input_dim=embeddings.shape[1], num_classes=len(np.unique(labels))).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

    # Early stopping setup
    best_loss = float('inf')
    patience = 10
    early_stop_counter = 0

    # Training loop
    for epoch in range(1000):  # Maximum epochs; early stopping will determine actual limit
        model.train()
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss /= len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

        # Early stopping check
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), f"best_model_{embedding_type}.pth")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}.")
                break

    # Load best model
    model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
    model.eval()

    # Evaluation
    all_predictions = []
    with torch.no_grad():
        for X_batch, _ in test_loader:
            X_batch = X_batch.to(device)
            outputs = model(X_batch)
            _, predicted = torch.max(outputs, 1)
            all_predictions.extend(predicted.cpu().numpy())

    # Metrics
    accuracy = accuracy_score(y_test, all_predictions)
    precision = precision_score(y_test, all_predictions, average="weighted")
    recall = recall_score(y_test, all_predictions, average="weighted")
    f1 = f1_score(y_test, all_predictions, average="weighted")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Save report
    report = classification_report(y_test, all_predictions, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.loc["overall"] = {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
    report_df.to_csv(report_path)
    print(f"Classification report saved to {report_path}")

# Prepare Data
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["brand"])

# Run for each embedding type
embedding_types = {
    "combined": {"data": np.vstack(df["combined_embedding"]), "path": "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_transformer_combined.csv"},
    "image": {"data": np.vstack(df["image_embedding"]), "path": "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_transformer_image.csv"},
    "text": {"data": np.vstack(df["text_embedding"]), "path": "/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_transformer_text.csv"},
}

for embedding_type, info in embedding_types.items():
    train_and_evaluate(embedding_type, info["data"], labels, info["path"])



Running for combined...
Epoch 1, Loss: 3.5954




Epoch 2, Loss: 3.2525
Epoch 3, Loss: 2.9112
Epoch 4, Loss: 2.6301
Epoch 5, Loss: 2.3628
Epoch 6, Loss: 2.1040
Epoch 7, Loss: 1.8859
Epoch 8, Loss: 1.7143
Epoch 9, Loss: 1.5579
Epoch 10, Loss: 1.4087
Epoch 11, Loss: 1.3030
Epoch 12, Loss: 1.1674
Epoch 13, Loss: 1.0581
Epoch 14, Loss: 0.9728
Epoch 15, Loss: 0.8765
Epoch 16, Loss: 0.8110
Epoch 17, Loss: 0.7525
Epoch 18, Loss: 0.6936
Epoch 19, Loss: 0.6515
Epoch 20, Loss: 0.6067
Epoch 21, Loss: 0.5548
Epoch 22, Loss: 0.4971
Epoch 23, Loss: 0.4704
Epoch 24, Loss: 0.4358
Epoch 25, Loss: 0.4106
Epoch 26, Loss: 0.3965
Epoch 27, Loss: 0.3735
Epoch 28, Loss: 0.3487
Epoch 29, Loss: 0.3143
Epoch 30, Loss: 0.2858
Epoch 31, Loss: 0.2810
Epoch 32, Loss: 0.2566
Epoch 33, Loss: 0.2295
Epoch 34, Loss: 0.2400
Epoch 35, Loss: 0.2493
Epoch 36, Loss: 0.2252
Epoch 37, Loss: 0.2020
Epoch 38, Loss: 0.2181
Epoch 39, Loss: 0.2006
Epoch 40, Loss: 0.1873
Epoch 41, Loss: 0.1989
Epoch 42, Loss: 0.1735
Epoch 43, Loss: 0.1770
Epoch 44, Loss: 0.1670
Epoch 45, Loss: 0.1

  model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2, Loss: 3.2599
Epoch 3, Loss: 3.0543
Epoch 4, Loss: 2.7720
Epoch 5, Loss: 2.5198
Epoch 6, Loss: 2.3690
Epoch 7, Loss: 2.1089
Epoch 8, Loss: 1.9723
Epoch 9, Loss: 1.7811
Epoch 10, Loss: 1.6507
Epoch 11, Loss: 1.5469
Epoch 12, Loss: 1.4596
Epoch 13, Loss: 1.3807
Epoch 14, Loss: 1.2512
Epoch 15, Loss: 1.1736
Epoch 16, Loss: 1.1164
Epoch 17, Loss: 1.0246
Epoch 18, Loss: 0.9687
Epoch 19, Loss: 0.9285
Epoch 20, Loss: 0.8472
Epoch 21, Loss: 0.7944
Epoch 22, Loss: 0.7233
Epoch 23, Loss: 0.7070
Epoch 24, Loss: 0.6857
Epoch 25, Loss: 0.6180
Epoch 26, Loss: 0.5876
Epoch 27, Loss: 0.5729
Epoch 28, Loss: 0.5334
Epoch 29, Loss: 0.4786
Epoch 30, Loss: 0.5059
Epoch 31, Loss: 0.4488
Epoch 32, Loss: 0.4046
Epoch 33, Loss: 0.3738
Epoch 34, Loss: 0.4071
Epoch 35, Loss: 0.3627
Epoch 36, Loss: 0.3520
Epoch 37, Loss: 0.3285
Epoch 38, Loss: 0.2984
Epoch 39, Loss: 0.2946
Epoch 40, Loss: 0.2716
Epoch 41, Loss: 0.2921
Epoch 42, Loss: 0.2563
Epoch 43, Loss: 0.2542
Epoch 44, Loss: 0.2277
Epoch 45, Loss: 0.2

  model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2, Loss: 3.3974
Epoch 3, Loss: 3.1884
Epoch 4, Loss: 3.0139
Epoch 5, Loss: 2.7626
Epoch 6, Loss: 2.5922
Epoch 7, Loss: 2.4007
Epoch 8, Loss: 2.2336
Epoch 9, Loss: 2.0868
Epoch 10, Loss: 1.9423
Epoch 11, Loss: 1.8101
Epoch 12, Loss: 1.6717
Epoch 13, Loss: 1.5736
Epoch 14, Loss: 1.4885
Epoch 15, Loss: 1.4175
Epoch 16, Loss: 1.3052
Epoch 17, Loss: 1.2219
Epoch 18, Loss: 1.1603
Epoch 19, Loss: 1.0831
Epoch 20, Loss: 1.0221
Epoch 21, Loss: 0.9890
Epoch 22, Loss: 0.9126
Epoch 23, Loss: 0.8645
Epoch 24, Loss: 0.7960
Epoch 25, Loss: 0.7641
Epoch 26, Loss: 0.7128
Epoch 27, Loss: 0.6681
Epoch 28, Loss: 0.6555
Epoch 29, Loss: 0.6427
Epoch 30, Loss: 0.5975
Epoch 31, Loss: 0.5209
Epoch 32, Loss: 0.5265
Epoch 33, Loss: 0.5160
Epoch 34, Loss: 0.5130
Epoch 35, Loss: 0.4472
Epoch 36, Loss: 0.4512
Epoch 37, Loss: 0.4419
Epoch 38, Loss: 0.4084
Epoch 39, Loss: 0.4164
Epoch 40, Loss: 0.4123
Epoch 41, Loss: 0.3914
Epoch 42, Loss: 0.3788
Epoch 43, Loss: 0.3771
Epoch 44, Loss: 0.3535
Epoch 45, Loss: 0.3

  model.load_state_dict(torch.load(f"best_model_{embedding_type}.pth"))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [68]:
## BERT

In [67]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Training and Evaluation Function
def train_and_evaluate(texts, labels, report_path):
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

    # Tokenizer and Dataset
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    train_dataset = TextDataset(X_train, y_train, tokenizer)
    test_dataset = TextDataset(X_test, y_test, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    # Model Setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(np.unique(labels)))
    model = model.to(device)

    # Optimizer and Loss
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = torch.nn.CrossEntropyLoss()

    # Training Loop
    best_loss = float("inf")
    patience = 3
    early_stop_counter = 0

    for epoch in range(10):  # Maximum epochs
        model.train()
        epoch_loss = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        epoch_loss /= len(train_loader)
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")

        # Early stopping
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), "best_bert_model.pth")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}")
                break

    # Load Best Model
    model.load_state_dict(torch.load("best_bert_model.pth"))
    model.eval()

    # Evaluation
    all_predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            all_predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Metrics
    accuracy = accuracy_score(true_labels, all_predictions)
    precision = precision_score(true_labels, all_predictions, average="weighted")
    recall = recall_score(true_labels, all_predictions, average="weighted")
    f1 = f1_score(true_labels, all_predictions, average="weighted")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    # Save Report
    report = classification_report(true_labels, all_predictions, output_dict=True)
    report_df = pd.DataFrame(report).transpose()
    report_df.loc["overall"] = {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
    report_df.to_csv(report_path)
    print(f"Classification report saved to {report_path}")

# Prepare Data
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["brand"])
texts = df["extracted_text"].tolist()  # Use extracted text for BERT input

# Train and Evaluate
train_and_evaluate(
    texts,
    labels,
    report_path="/Volumes/T7/DLCOURSEWORK/circl_phishing_dataset/classification_report_bert.csv",
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 3.5482


KeyboardInterrupt: 