# Preprocess data

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')



In [None]:
# List the project folder on the mounted Drive (the CSV files read further down live here).
!ls "/content/drive/My Drive/720Project/"

In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote

In [None]:
# Punctuation that commonly trails a URL in running text and is not part of it.
_TRAILING_PUNCTUATION = '.,;:!?"\')'

# One or more URL characters. A quoted-printable soft line break ("=" directly
# followed by a newline, with more URL characters after it) is accepted as part
# of the URL so that links wrapped by the mail encoder are matched in one piece.
_URL_BODY = r'(?:=\r?\n(?=[^\s<>"\'])|[^\s<>"\'])+'

# Pattern 1: standard HTML hyperlinks <a href="http://...">text</a>
_HREF_RE = re.compile(r'<a\s+href=[\'\"]?([^\'\">]+)[\'\"]?[^>]*>.*?</a>', re.IGNORECASE)
# Pattern 2: plain URLs starting with http://, https:// or www.
_PLAIN_URL_RE = re.compile(r'(?:https?://|www\.)' + _URL_BODY, re.IGNORECASE)
# Pattern 3: Yahoo Groups URLs; kept separate so they are also found when they
# are embedded inside a longer URL that pattern 2 consumed as a whole.
_YAHOO_URL_RE = re.compile(r'http://(?:us\.click\.yahoo\.com|docs\.yahoo\.com)' + _URL_BODY, re.IGNORECASE)
# Quoted-printable soft line break, removed from every match to rejoin wrapped URLs.
_SOFT_BREAK_RE = re.compile(r'=\r?\n')


def extract_hyperlinks(text):
    """Return the distinct hyperlinks found in an email body, in order of first appearance.

    Links are collected from HTML ``<a href=...>`` tags, from plain
    ``http://`` / ``https://`` / ``www.`` URLs and from Yahoo Groups URLs.
    Each candidate is then normalised:

    * quoted-printable soft line breaks inside the URL are removed,
    * all trailing punctuation / closing quotes / closing parentheses are stripped,
    * percent-encoding is decoded and the quoted-printable ``=3D`` is turned back into ``=``.

    Args:
        text: The email body. Anything that is not a ``str`` (e.g. NaN for a
            missing body) yields an empty list.

    Returns:
        list[str]: Unique, non-empty cleaned URLs.

    Note:
        URLs are no longer "repaired" by gluing the following whitespace-separated
        words onto them. That heuristic turned ordinary prose after a link
        ("http://example.com for more info") into a second, bogus URL
        ("http://example.comformoreinfo"). Only breaks that carry an explicit
        soft-line-break marker are rejoined.
        NOTE(review): a URL that genuinely ends in "=" at the end of a line and
        is directly followed by text on the next line will be joined with that
        text; this is expected to be rare.
    """
    if not isinstance(text, str):
        return []

    candidates = list(_HREF_RE.findall(text))
    candidates.extend(_PLAIN_URL_RE.findall(text))
    candidates.extend(_YAHOO_URL_RE.findall(text))

    cleaned_hyperlinks = []
    for url in candidates:
        # Rejoin URLs that the mail encoder wrapped across lines.
        url = _SOFT_BREAK_RE.sub('', url)
        # Strip *all* trailing punctuation, e.g. both characters of ")." after a link.
        url = url.rstrip(_TRAILING_PUNCTUATION)
        # Decode URL-encoded characters
        url = unquote(url)
        # Remove any =3D encoding (common in email URLs)
        url = url.replace('=3D', '=')
        # A candidate consisting only of punctuation cleans down to nothing; skip it.
        if url:
            cleaned_hyperlinks.append(url)

    # dict preserves insertion order, so this de-duplicates in linear time
    # while keeping the first occurrence of every URL.
    return list(dict.fromkeys(cleaned_hyperlinks))


In [None]:
# Load the labelled email corpora and pool them into a single DataFrame.
# NOTE(review): an earlier plan noted here was "train on Ling / Nazario / Nigerian,
# evaluate on SpamAssassin, validation set undecided". This cell instead pools
# SpamAssassin, Nazario and Nigerian_Fraud; confirm which split is intended.

# CSV files to load (on the mounted Drive)
csv_files = [
    '/content/drive/My Drive/720Project/SpamAssasin.csv',
    '/content/drive/My Drive/720Project/Nazario.csv',
    '/content/drive/My Drive/720Project/Nigerian_Fraud.csv',
]

# Only these columns are read from each file.
columns_to_use = ['sender', 'receiver', 'subject', 'body', 'label']

# One frame per corpus, then stacked with a fresh 0..n-1 index.
df_list = [
    pd.read_csv(csv_path, usecols=columns_to_use)
    for csv_path in csv_files
]
combined_df = pd.concat(df_list, ignore_index=True)

# New column: list of hyperlinks extracted from each email body.
combined_df['hyperlinks'] = combined_df['body'].map(extract_hyperlinks)

In [None]:
# Count the number of hyperlinks per email
combined_df['hyperlink_count'] = combined_df['hyperlinks'].apply(len)

# Preview the result
print(combined_df.head())
print(f"There are missing values?: {combined_df.isna().any().any()}")
print(f"There are duplicated emails?: {combined_df['body'].duplicated().any()}") # TODO: implement a better way of removing duplicates
# Test the body column itself. The previous DataFrame.where(...).any().any() form
# reported the truthiness of the *other* columns on rows with an empty body, so a
# row whose remaining fields were missing or zero was reported as "no empty bodies".
print(f"There are empty strings in text column?: {(combined_df['body'] == '').any()}")

# Display some statistics about hyperlinks
print(f"\nEmails containing hyperlinks: {(combined_df['hyperlink_count'] > 0).sum()}")
print(f"Total hyperlinks found: {combined_df['hyperlink_count'].sum()}")
print(f"Maximum hyperlinks in a single email: {combined_df['hyperlink_count'].max()}")

# Prepare data for fine-tuning

In [None]:
# Install the `datasets` and `evaluate` packages into the runtime.
# NOTE(review): no versions are pinned, so a later re-run may resolve different releases.
!pip install datasets
!pip install evaluate

In [None]:
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments,
                          DataCollatorWithPadding, Trainer, pipeline)
from sklearn.model_selection import train_test_split
from transformers.pipelines.pt_utils import KeyDataset
from datasets import load_dataset, Dataset
import torch, wandb, evaluate
from tqdm.auto import tqdm
import numpy as np

In [None]:
# NOTE: both names are already imported in earlier cells; repeated here so the cell can run on its own.
from datasets import Dataset
import pandas as pd

# Convert the pooled pandas frame (including the hyperlink columns) into a Hugging Face Dataset.
dataset = Dataset.from_pandas(combined_df)

In [None]:
# Spot check: display the hyperlinks extracted for the email at index 1.
dataset['hyperlinks'][1]

In [None]:
# Load the tokenizer published with the same checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("cybersectony/phishing-email-detection-distilbert_v2.4.1")

In [None]:
def prepare_data(data):
    """Tokenize a batch of emails, splitting long bodies into chunks of at most 512 tokens.

    Intended for ``Dataset.map(..., batched=True)``, where every value in
    ``data`` is a list holding one entry per email in the batch.

    Args:
        data: Batch mapping of column name -> list of values. Must contain a
            ``"body"`` column.

    Returns:
        The tokenizer output with one row per chunk. Every original column
        (label, sender, body, ...) is repeated for each chunk produced from
        the corresponding email, so all columns have the same length.
    """
    # Tokenize every body separately. Calling str() on the whole batch list would
    # turn it into ONE string, so every chunk would map back to sample 0 and
    # inherit the first email's label and metadata.
    # A missing body becomes an empty text rather than the literal word "None".
    bodies = ["" if body is None else str(body) for body in data["body"]]
    body_tokens = tokenizer(bodies, truncation=True,
                            max_length=512, return_overflowing_tokens=True)

    # sample_map[j] is the index (within this batch) of the email chunk j came from.
    sample_map = body_tokens.pop("overflow_to_sample_mapping")
    for key, values in data.items():
        body_tokens[key] = [values[i] for i in sample_map]
    return body_tokens

# batched=True is required: the function returns more rows than it receives.
tokenized_dataset = dataset.map(prepare_data, batched=True)

In [None]:
# Collator that pads the examples of each batch using the tokenizer
# (prepare_data tokenizes without padding, so sequence lengths differ).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define evaluation metrics

In [None]:
# Bundle the evaluation metrics into one object: accuracy, precision, recall and the
# false-positive-rate metric referenced by the id "ealvaradob/false_positive_rate".
metrics = evaluate.combine(["accuracy", "precision", "recall", "ealvaradob/false_positive_rate"])

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the combined `metrics` object.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predictions hold one
            row of per-class scores for every example.

    Returns:
        Whatever ``metrics.compute`` returns for the predicted class ids
        versus the reference labels.
    """
    class_scores, references = eval_pred
    # The predicted class is the column with the highest score in each row.
    predicted_ids = np.argmax(class_scores, axis=1)
    return metrics.compute(predictions=predicted_ids, references=references)

# Fine-tune model

In [None]:
# Class-id -> name mapping handed to the model config; the reverse mapping is
# derived from it so the two can never disagree.
id2label = {0: "benign", 1: "phishing"}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Split at the email level, not the chunk level. Each row of the tokenized data is
# one <=512-token chunk, so a row-wise split can put chunks of the same email (or
# exact duplicate emails) into both sets and inflate the evaluation metrics.
df = tokenized_dataset.to_pandas()

# One row per distinct email body, with the label used for stratification.
emails = df.drop_duplicates(subset='body')[['body', 'label']]
# Stratifying needs at least two emails per class; otherwise fall back to a plain split.
can_stratify = emails['label'].value_counts().min() >= 2
train_emails, test_emails = train_test_split(
    emails,
    test_size=0.2,
    stratify=emails['label'] if can_stratify else None,
    random_state=42,
)

# Every chunk follows its email: chunks whose body was drawn for the test set go
# to `test`, all remaining chunks go to `train`.
in_test = df['body'].isin(test_emails['body'])
train = Dataset.from_pandas(df[~in_test], preserve_index=False)
test = Dataset.from_pandas(df[in_test], preserve_index=False)

In [None]:
# Load the pretrained phishing-detection checkpoint with a two-class head
# (benign / phishing, per id2label and label2id).
model = AutoModelForSequenceClassification.from_pretrained(
    "cybersectony/phishing-email-detection-distilbert_v2.4.1",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    # Allows loading even if the checkpoint's classifier shape differs from num_labels=2.
    # NOTE(review): presumably the mismatched head is then freshly initialised - confirm
    # against the checkpoint's config.
    ignore_mismatched_sizes=True
)

In [None]:
# Hyper-parameters and checkpoint policy for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="bert-large-finetuned-phishing",
    # --- optimisation ---
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    fp16=False,
    # --- evaluation and checkpointing, both once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # --- publishing ---
    push_to_hub=False,
)

# Wire model, data, collator and metric function into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

# Disabled: a separate evaluation pass after training.
# # Evaluate on the evaluation dataset
# results = trainer.evaluate()
# print(results)

In [None]:
# Save the trained model and the tokenizer into the same directory.
trainer.save_model("bert-large-finetuned-phishing")
tokenizer.save_pretrained("bert-large-finetuned-phishing")

In [None]:
# Zip the saved model directory and download the archive from the Colab runtime.
!zip -r bert-large-finetuned-phishing.zip bert-large-finetuned-phishing
from google.colab import files
files.download("bert-large-finetuned-phishing.zip")