In [2]:
!pip install tldextract
!pip install xgboost
!pip install pandas numpy beautifulsoup4 textblob scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install scipy
!pip install nltk
!pip install wordcloud
!pip install scipy
!pip install nltk
!pip install wordcloud
!pip install transformers torch

Collecting tldextract
  Using cached tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting idna (from tldextract)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.1.0 (from tldextract)
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting requests-file>=1.4 (from tldextract)
  Using cached requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting filelock>=3.0.8 (from tldextract)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting charset_normalizer<4,>=2 (from requests>=2.1.0->tldextract)
  Using cached charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl.metadata (35 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.1.0->tldextract)
  Using cached urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.1.0->tldextract)
  Using cached certifi-2025.7.14-py3-none-any.whl.metadata (2.4 kB)
Using cached tldextract-5.3.0-py3-none-any.whl

In [None]:
import pandas as pd
import re
import urllib.parse
from collections import Counter
import tldextract
import matplotlib.pyplot as plt
import numpy as np


In [None]:
#Creating phising email dataset
#Utilizing `7 Email Phising Datasets` and merging them into a single dataset (https://figshare.com/articles/dataset/Seven_Phishing_Email_Datasets/25432108)
import glob

# Path to your datasets; adjust the pattern if they are in a different folder or format
dataset_files = glob.glob("7PhisingEmailsDataset/*.csv")  # Example: all CSVs in a 'datasets' folder

# List for storing the reduced DataFrames
dataframes = []

for file in dataset_files:
    try:
        # Read only the columns you need; ignore others (extra columns will be dropped)
        df = pd.read_csv(file, usecols=['subject', 'body', 'label'])
        dataframes.append(df)
        print(f"Loaded {file} with {len(df)} rows")
    except Exception as e:
        print(f"Error loading {file}: {e}")

# Concatenate all reduced DataFrames into one
merged_df = pd.concat(dataframes, ignore_index=True)
print(f"\nMerged dataset shape: {merged_df.shape}")

# Save as CSV
merged_df.to_csv("merged_phishing_emails.csv", index=False)
print("Saved merged dataset to merged_phishing_emails.csv")

#Preview first few rows
print("\nSample data:")
print(merged_df.head())
print(merged_df.info())


In [None]:
#Next step is to take a look a the dataset and see if:
    #1. The dataset is balanced
    #2. The dataset is clean

#Load the dataset
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("merged_phishing_emails.csv")
print(f"Dataframe Shape Before Making Changes: {df.shape}")
#Preview the dataset
df.isnull().sum()
df.dropna(inplace=True)
df.info()
df.head()

#Check for duplicates
df.duplicated().sum()
df.drop_duplicates(inplace=True)
print(f"Dataframe Shape After Making Changes: {df.shape}")

#Check for balance 
df['label'].value_counts() 

df.to_csv("cleaned_phishing_emails.csv", index=False)


: 

In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from textblob import TextBlob
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

def count_keywords(text):
    # counts occurrences of suspicious keywords in text
    keywords = ["urgent", "verify your account", "click here", "login now", "password reset",
                "account suspended", "update your information", "confirm your identity",
                "secure your account", "action required"]
    text = text.lower()
    counts = {}
    for word in keywords:
        counts["count_" + word.replace(" ", "_")] = len(re.findall(r'\b' + word + r'\b', text))
    return counts

def check_greeting(text):
    # checks for generic greetings in first 200 characters
    greetings = ["dear customer", "dear user", "hello sir", "hello madam", "dear client"]
    first_bit = text.lower()[:200]
    for greeting in greetings:
        if greeting in first_bit:
            return 1
    return 0

def get_sentiment(text):
    # computes sentiment polarity and subjectivity
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

def persuasion_cues(text):
    # counts gain and loss persuasion phrases
    good_phrases = ["win", "prize", "bonus", "reward"]
    bad_phrases = ["lose", "suspended", "locked", "expired"]
    text = text.lower()
    good_count = 0
    bad_count = 0
    for phrase in good_phrases:
        good_count += len(re.findall(r'\b' + phrase + r'\b', text))
    for phrase in bad_phrases:
        bad_count += len(re.findall(r'\b' + phrase + r'\b', text))
    return good_count, bad_count

def get_lengths(subject, body):
    return len(subject), len(body.split())

def count_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return len(soup.find_all())

def count_urls(text):
    url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    return len(re.findall(url_pattern, text))

def count_attachments(text):
    return text.lower().count("content-disposition: attachment")

def count_exclamation(text):
    return text.count("!")

def get_bert_embeddings(texts, model_name="bert-base-uncased", max_length=512):
    print("Loading BERT model and tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Set model to evaluation mode
    model.eval()
    
    embeddings = []
    
    print("Generating BERT embeddings...")
    for i, text in enumerate(texts):
        if i % 1000 == 0:
            print(f"Processing text {i+1}/{len(texts)}")
        
        # Tokenize and encode the text
        inputs = tokenizer(
            text, 
            return_tensors="pt", 
            truncation=True, 
            max_length=max_length, 
            padding=True
        )
        
        # Generate embeddings
        with torch.no_grad():
            outputs = model(**inputs)
            # Use the [CLS] token embedding (first token) as the sentence representation
            embedding = outputs.last_hidden_state[:, 0, :].numpy().flatten()
            embeddings.append(embedding)
    
    return np.array(embeddings)

# Load the data
data = pd.read_csv('cleaned_phishing_emails.csv')
all_features = []

print("Extracting engineered features...")
for index, row in data.iterrows():
    subject = row['subject']
    body = row['body']
    features = {}

    features.update(count_keywords(subject + ' ' + body))
    features['generic_greeting'] = check_greeting(body)
    polarity, subjectivity = get_sentiment(body)
    features['polarity'] = polarity
    features['subjectivity'] = subjectivity
    good_count, bad_count = persuasion_cues(body)
    features['good_phrases'] = good_count
    features['bad_phrases'] = bad_count
    sub_len, body_len = get_lengths(subject, body)
    features['subject_length'] = sub_len
    features['body_length'] = body_len
    features['html_tags'] = count_html_tags(body)
    features['url_count'] = count_urls(body)
    features['attachment_count'] = count_attachments(body)
    features['exclamation_count'] = count_exclamation(subject + ' ' + body)

    all_features.append(features)

features_df = pd.DataFrame(all_features)

# Generate BERT embeddings for the combined subject and body text
print("Combining subject and body for BERT embeddings...")
combined_texts = data['subject'] + ' ' + data['body']

# Generate BERT embeddings
bert_embeddings = get_bert_embeddings(combined_texts)

# Create DataFrame for BERT embeddings
bert_columns = [f'bert_dim_{i}' for i in range(bert_embeddings.shape[1])]
bert_df = pd.DataFrame(bert_embeddings, columns=bert_columns)

# Combine engineered features with BERT embeddings
final_df = pd.concat([features_df, bert_df], axis=1)
final_df['label'] = data['label']
final_df = final_df.fillna(0)

# Save the final dataset
final_df.to_csv('final_phishing_dataset.csv', index=False)
print("Dataset saved as final_phishing_dataset.csv")
print(f"Dataset shape: {final_df.shape}")
print(f"Engineered features: {len(features_df.columns)}")
print(f"BERT features: {len(bert_df.columns)}")
print(f"Total features: {len(final_df.columns) - 1}")  # -1 for label column

# Save the BERT model info for later use
import json
bert_info = {
    "model_name": "bert-base-uncased",
    "embedding_dimensions": 768,
    "max_length": 512
}
with open('bert_info.json', 'w') as f:
    json.dump(bert_info, f)
print("BERT model info saved as bert_info.json")

  from .autonotebook import tqdm as notebook_tqdm


Extracting engineered features...



If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup.



    
  soup = BeautifulSoup(text, 'html.parser')


Combining subject and body for BERT embeddings...
Loading BERT model and tokenizer...
Generating BERT embeddings...
Processing text 1/200609


  return forward_call(*args, **kwargs)


Processing text 1001/200609
Processing text 2001/200609
Processing text 3001/200609
Processing text 4001/200609
Processing text 5001/200609
Processing text 6001/200609
Processing text 7001/200609
Processing text 8001/200609
Processing text 9001/200609
Processing text 10001/200609
Processing text 11001/200609
Processing text 12001/200609
Processing text 13001/200609
Processing text 14001/200609
Processing text 15001/200609
Processing text 16001/200609
Processing text 17001/200609
Processing text 18001/200609
Processing text 19001/200609
Processing text 20001/200609
Processing text 21001/200609
Processing text 22001/200609
Processing text 23001/200609
Processing text 24001/200609
Processing text 25001/200609
Processing text 26001/200609
Processing text 27001/200609
Processing text 28001/200609
Processing text 29001/200609
Processing text 30001/200609
Processing text 31001/200609
Processing text 32001/200609
Processing text 33001/200609
Processing text 34001/200609
Processing text 35001/2