# Importing The Libraries

In [1]:
# importing of library
import os
import pandas as pd
import numpy as np  
import re
from email import message_from_string
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from bs4 import BeautifulSoup
import html as ihtml  # stdlib html entity unescape

# Importing the dataset

In [None]:
# folder where everything is stored
root = r"C:\Users\tengt\Downloads\archive"

# folders for each type of email
folders = {
    "spam": os.path.join(root, r"spam_2\spam_2"),
    "easy_ham": os.path.join(root, r"easy_ham\easy_ham"),
    "hard_ham": os.path.join(root, r"hard_ham\hard_ham"),
}

data = []

# go through each folder
for label, folder in folders.items():

    # go through each file in the folder
    for file in os.listdir(folder):

        # full path to the file
        path = os.path.join(folder, file)

        # make sure it's a file
        if os.path.isfile(path): 
            
            # read the file
            with open(path, "r", encoding="latin-1", errors="ignore") as f:
                text = f.read()

            # spam = 1, ham = 0
            data.append({
                "label": 1 if label == "spam" else 0,
                "message": text
            })

# put into a dataframe
dataset = pd.DataFrame(data)

# show how many emails in each class
print("Total emails:", len(dataset), "\n")

# print the label distribution
print(dataset['label'].value_counts())

Note:
- `0` → ham (normal / not spam)
- `1` → spam (could be ads, scams, phishing, malware, etc.)

In [None]:
# Display the first few rows of the dataframe
display(dataset.head())

Unnamed: 0,label,message
0,1,From ilug-admin@linux.ie Tue Aug 6 11:51:02 ...
1,1,From lmrn@mailexcite.com Mon Jun 24 17:03:24 ...
2,1,From amknight@mailexcite.com Mon Jun 24 17:03...
3,1,From jordan23@mailexcite.com Mon Jun 24 17:04...
4,1,From merchantsworld2001@juno.com Tue Aug 6 1...


In [None]:
# Save the DataFrame to a CSV file named 'spamAssassin.csv'
dataset.to_csv(r'..\Datasets\spamAssassin.csv', index=False) # The index=False ensures the index is not saved

# Data Exploration

Exploring the dataset helps us better understand its structure and characteristics.

This dataset contains a collection of **ham (legitimate) and spam emails** made available by the **Spam Assassin Project**. The dataset is widely used for email filtering research and benchmarking. It includes plain-text emails without attachments, and the messages are organized into separate folders for spam and ham.

In total, the dataset contains **4,198 emails**, consisting of both spam and ham messages.
For this section, we will follow these steps:

1. Access a sample email from the dataset (first, middle, and last)  
2. Generate descriptive statistics  
3. Handle missing/null values  
4. Check for duplicate rows  
5. Check for empty emails  
6. Check for emails containing non-ASCII characters  

### Accessing Sample Emails from the Dataset (First, Middle, and Last)

The dataset contains 4198 rows (indexed 0 to 4197).  
We will examine the first, middle, and last emails to inspect their structure and determine the cleaning steps required.

In [None]:
# Accessing the content of the first email at index 0
print(dataset["message"][0])

From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002
Return-Path: <ilug-admin@linux.ie>
Delivered-To: yyyy@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD
	for <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)
Received: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for
    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100
Received: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org
    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100
Received: from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net
    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for
    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100
    [6

In [None]:
# Accessing the content of the middle email at index 258700
print(dataset["message"][2098])

From fork-admin@xent.com  Thu Sep 19 13:14:49 2002
Return-Path: <fork-admin@xent.com>
Delivered-To: yyyy@localhost.example.com
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id E6F3016F03
	for <jm@localhost>; Thu, 19 Sep 2002 13:14:47 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Thu, 19 Sep 2002 13:14:47 +0100 (IST)
Received: from xent.com ([64.161.22.236]) by dogma.slashnull.org
    (8.11.6/8.11.6) with ESMTP id g8JC7hC18737 for <jm@jmason.org>;
    Thu, 19 Sep 2002 13:07:43 +0100
Received: from lair.xent.com (localhost [127.0.0.1]) by xent.com (Postfix)
    with ESMTP id 2E69B2940FC; Thu, 19 Sep 2002 05:04:06 -0700 (PDT)
Delivered-To: fork@example.com
Received: from sunserver.permafrost.net (u172n16.hfx.eastlink.ca
    [24.222.172.16]) by xent.com (Postfix) with ESMTP id 4BE5029409E for
    <fork@xent.com>; Thu, 19 Sep 2002 05:03:15 -0700 (PDT)
Received: from [192.168.12

In [None]:
# Accessing the content of the last email at index 517400
print(dataset["message"][4197])

Return-Path: <test-admin@lists.sourceforge.net>
Received: from usw-sf-list2.sourceforge.net (usw-sf-fw2.sourceforge.net
	[216.136.171.252]) by home.sewingwitch.com (8.11.6/8.11.6) with ESMTP id
	g9208B729827 for <shiva+qpopper-webdev@sewingwitch.com>; Tue, 1 Oct 2002
	17:08:11 -0700
Received: from usw-sf-list1-b.sourceforge.net ([10.3.1.13]
	helo=usw-sf-list1.sourceforge.net) by usw-sf-list2.sourceforge.net with
	esmtp (Exim 3.31-VA-mm2 #1 (Debian)) id 17wX3l-0004o7-00 for
	<shiva+qpopper-webdev@sewingwitch.com>; Tue, 01 Oct 2002 17:08:13 -0700
Date: Tue, 01 Oct 2002 17:08:10 -0700
Subject: (SPAM? 08.00) lists.sourceforge.net mailing list memberships reminder
From: mailman-owner@lists.sourceforge.net
To: shiva+qpopper-webdev@sewingwitch.com
X-No-Archive: yes
X-Ack: no
Sender: test-admin@lists.sourceforge.net
Errors-To: test-admin@lists.sourceforge.net
X-BeenThere: test@lists.sourceforge.net
X-Mailman-Version: 2.0.9-sf.net
Precedence: bulk
Message-Id: <E17wX3l-0004o7-00@usw-sf-list2.sou

From inspecting the first, middle, and last emails, we can see the general structure and content of the dataset.  

Key observations include:
- Emails contain extensive headers and metadata, which are not needed for text analysis.
- There is inconsistent formatting, including line breaks, tabs, and spaces, which will need cleaning.
- All emails appear to use standard ASCII encoding, but we will still check for encoding issues.

These insights help us identify potential issues and guide the next steps in cleaning and parsing the dataset. Before proceeding, we will continue with data exploration to gain a better understanding of the dataset.

### Descriptive Statistics 

In [None]:
# Make a copy to prevent mutation
data_ds = dataset.copy()

# Descriptive statistics
print(data_ds.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4198 entries, 0 to 4197
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    4198 non-null   int64 
 1   message  4198 non-null   object
dtypes: int64(1), object(1)
memory usage: 65.7+ KB
None


### Handling Missing Values

In [None]:
# Check for missing values in the dataframe
print(data_ds.isna().sum().sort_values())

label      0
message    0
dtype: int64


### Check for Duplicate Rows

In [None]:
# Shape of data_ds before removing duplicates
print(f"Shape before removing duplicates: {data_ds.shape}")

# Removing duplicate rows
data_ds = data_ds.drop_duplicates(subset=["message"]).reset_index(drop=True)

# Shape of data_ds after removing duplicates
print(f"Shape after removing duplicates: {data_ds.shape}")

Shape before removing duplicates: (4198, 2)
Shape after removing duplicates: (4178, 2)


Based on the dataset summary from `info()`, all 4178 emails have non-null values in the `message` column, so there are no missing entries. However, this does not guarantee that all emails contain meaningful content, as some messages body could still be empty. Therefore, we performed a check to identify any emails with empty message bodies.

In addition, 20 duplicate emails (based on the `message` column) were removed, resulting in a cleaner dataset.

### Check for empty emails

In [None]:
# Check for completely empty emails without removing spaces for parsing
empty_rows = data_ds[data_ds['message'] == ""]
print(f"Number of completely empty emails: {empty_rows.shape[0]}")

Number of completely empty emails: 0


### Check to see if there is any emails in non-ASCII Characters

In [None]:
# Function to check if a text contains any non-ASCII characters
def non_ascii_check(text):
    """
    Check if a string contains any non-ASCII characters.
    ASCII range = 0–127
    """
    # Ensure the input is a string
    text = str(text)

    # Loop through each character in the text
    for char in text:
        # ord(char) gives the Unicode code point
        if ord(char) > 127:  
            # Found a non-ASCII character
            return True

    # If we finish the loop, all characters are ASCII
    return False

# Apply to the 'message' column
non_ascii_rows = dataset[dataset['message'].apply(non_ascii_check)]
print(f"Number of emails with non-ASCII characters: {non_ascii_rows.shape[0]}")

Number of emails with non-ASCII characters: 294


Based on the data exploration, we observed that the Spam Assassin dataset contains no null values in the `message` column, no completely empty emails, and 20 duplicate rows were found. After this cleaning, the dataset consists of 4178 unique emails. We also identified a small number of emails containing non-ASCII characters. Since we are building a phishing email detection system, we have decided **not to remove these emails** and will handle them appropriately during system development. This is beneficial, as such emails may help detect unusual or suspicious patterns while also verifying legitimate cases.  

Next, we proceed to clean the dataset to prepare the emails for parsing and analysis.

# Data Cleaning

In this section, we will clean the dataset using several methods:

1. Email Parsing  
2. Text Cleaning  
3. Post-Parsing Data Checks  

**Email parsing** involves extracting the meaningful content from each email, such as the body text, while removing unnecessary components like headers, metadata, or special formatting. This step is essential to prepare the emails for further cleaning, analysis, or natural language processing tasks.

### Email Parsing
Email parsing is essential to extract structured information from raw emails.  
We will split this process into three main sections:

1. **Header extraction:** Important fields like `Message-ID`, `Date`, `From`, `To`, `Subject`, etc will be extracted from the email headers.  
2. **Message body extraction:** The main content of the email will be isolated for further analysis, including text cleaning and phishing detection.  
3. **URL extraction:** Links are crucial for identifying suspicious or malicious content.  


In [None]:
# transform the email into correct format
message = dataset.loc[0]['message']
e = message_from_string(message)

e.items()

[('Return-Path', '<ilug-admin@linux.ie>'),
 ('Delivered-To', 'yyyy@localhost.netnoteinc.com'),
 ('Received',
  'from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD\n\tfor <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)'),
 ('Received',
  'from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)'),
 ('Received',
  'from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for\n    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100'),
 ('Received',
  'from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org\n    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100'),
 ('Received',
  'from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net\n    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for\n    <ilug@linux.ie>; Fri, 2 Aug 2

After examining a sample email, we found that the headers contain valuable information, including `Message-ID`, `Date`, `From`, `To`, `Subject`, and other relevant metadata such as `Sender`, `Return-Path`, and mailing list information (`List-Id`).  

To support further analysis and enable rule-based phishing detection, we extracted these fields from all emails and organized them into a structured DataFrame.

In [None]:
# Function to parse email and extract specified fields
def parse_email(raw_msg, fields=None):
    """
    Parse a raw email string and extract specified header fields.
    """

    # Extract fields from a raw email string
    if fields is None:
        fields = ["Message-ID", "Date", "From", "To", "Subject", "Sender", "List-Id"] # Standard fields to extract if none provided

    try:
        email_obj = message_from_string(raw_msg)
        result = {}

        for field in fields:
            # make field names easier to use in df (lowercase, underscores)
            key = field.lower().replace("-", "_") # X-to -> x_to
            result[key] = email_obj.get(field) # Extract field value or None if missing
        return result
    
    except Exception as e:
        # if parsing fails, just fill with None
        return {field.lower().replace("-", "_"): None for field in fields}


def build_email_dataframe(df, message_col="message", fields=None):
    """
    Parse a DataFrame column of raw email messages into structured fields.
    """

    # Parse emails in a DataFrame column into structured fields
    parsed_rows = []
    
    # Loop through each raw email in the DataFrame, show a progress bar while parsing,
    # and store the extracted fields as dictionaries in parsed_rows
    for msg in tqdm(df[message_col], total=len(df), desc="Parsing emails"):
        parsed_rows.append(parse_email(msg, fields))
    
    # Form a DataFrame from the list of parsed email dictionaries
    parsed_df = pd.DataFrame(parsed_rows, index=df.index)  # keep same index

    # Return the df with original and parsed fields
    return pd.concat([df, parsed_df], axis=1)  # merge with original

# extract specified fields from all emails in the dataset
extracted_df = build_email_dataframe(data_ds, message_col="message")

Parsing emails: 100%|██████████| 4178/4178 [00:00<00:00, 6079.99it/s]


#### Message Body Extraction

In [None]:
# Function to extract the body of each email
def body(messages):
    # Create an empty list to store email bodies
    column = []

    # Loop through each raw email message with a progress bar
    for message in tqdm(messages, total=len(messages), desc="Extracting email bodies"):
        # Parse the raw email string into an email object
        e = message_from_string(message)

        # Extract the body (payload) of the email
        column.append(e.get_payload())

    # Return the list of all extracted bodies
    return column

# Add a new column 'body' to the DataFrame by extracting the email body
extracted_df['body'] = body(data_ds['message'])

Extracting email bodies: 100%|██████████| 4178/4178 [00:00<00:00, 6804.53it/s]


In [None]:
# Display the first few rows of the new dataframe with extracted fields
display(extracted_df.head())

Unnamed: 0,label,message,message_id,date,from,to,subject,sender,list_id,body
0,1,From ilug-admin@linux.ie Tue Aug 6 11:51:02 ...,<1028311679.886@0.57.142>,"Fri, 02 Aug 2002 23:37:59 0530","""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,ilug-admin@linux.ie,Irish Linux Users' Group <ilug.linux.ie>,Greetings!\n\nYou are receiving this letter be...
1,1,From lmrn@mailexcite.com Mon Jun 24 17:03:24 ...,<B0000178595@203.129.205.5.205.129.203.in-addr...,"Mon, 28 Jul 1980 14:01:35",lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...",,,"<html>\n<body>\n<center>\n<h3>\n<font color=""b..."
2,1,From amknight@mailexcite.com Mon Jun 24 17:03...,<0845b5355070f52WEBCUST2@webcust2.hightowertec...,"Wed, 30 Jul 1980 18:25:49",amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...",,,"<html>\n<body>\n<center>\n<b>\n<font color=""bl..."
3,1,From jordan23@mailexcite.com Mon Jun 24 17:04...,<0925c5750200f52WEBCUST2@webcust2.hightowertec...,"Thu, 31 Jul 1980 07:20:54",jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...",,,"<html>\n<body>\n<center>\n<b>\n<font color=""bl..."
4,1,From merchantsworld2001@juno.com Tue Aug 6 1...,<200208040037.BAA09623@webnote.net>,"Sun, 19 Oct 1980 10:55:16",yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...",,,"<html><xbody>\n<hr width = ""100%"">\n<center><h..."


### Validation of Body and Header Extraction  

After extracting the body of each email, it is important to validate the results. This is because, we want to check if there is any formatting issues within the dataset, we are unknown of. As this may caused some of the emails to not be parse correctly. As a results, leading to:  

1. **Incomplete or incorrect body extraction**  
   - In certain cases, parts of the email headers may still remain inside the `body` field instead of being fully separated.  
   - This requires manual or programmatic checks to confirm that the `body` column truly contains only the message content.  

2. **Null or missing values in other headers fields**  
   - Some header fields such as `to`, `from`, or `subject` may appear as null after parsing.  
   - These values may still exist within the raw email text but were not properly extracted during parsing.  

To address this, we will:  
- Inspect a sample of emails to verify that the `body` field contains the actual message rather than residual headers.  
- Cross-check the raw `message` text for cases where header fields (e.g., `to`) are null, and attempt to recover these values if possible.  

This step ensures that the dataset is **accurately structured** before proceeding to further cleaning and analysis.  

In [None]:
# Random Sample email 1
random_index = np.random.randint(0, len(extracted_df))
print(f"Random Sample Email at index {random_index}:\n")
print(extracted_df['body'][random_index])

Random Sample Email at index 3638:

URL: http://jeremy.zawodny.com/blog/archives/000203.html
Date: 2002-09-30T17:22:34-08:00

This is an issue that comes up all the time at work. It is an issue for roughly 
four reasons: Yahoo is a FreeBSD shop Someone has heard that MySQL runs better 
on Linux Someone knows that we run some...





In [None]:
# Random Sample email 2
random_index = np.random.randint(0, len(extracted_df))
print(f"Random Sample Email at index {random_index}:\n")
print(extracted_df['body'][random_index])

Random Sample Email at index 4043:

<html>
<head>
<title>Tech Update Today</title>
</head>
<body style="margin:8px 9px 9px 12px" bgcolor="#ffffff" background="http://techupdate.zdnet.com/techupdate/i/bg_232850.gif" link="#003399" alink="#cc0000" vlink="#666699">

<div align="center">

<!-- main -->
<a name="top"></a>
<table width=612 bgcolor="#232850" cellpadding=0 cellspacing=0 border=0>
<tr valign=bottom>
<td width=440 colspan=4><a href="http://clickthru.online.com/Click?q=c1-PIljQpW7B4RlTh3wiOPYOLipujlR" ><img src="http://www.zdnet.com/techupdate/i/itnewsletter_today.gif" width="440" height="60" border="0"  alt="Tech Update Today"></a></td>
<td width=160 align=center valign=top rowspan=2 bgcolor="#ffffff">
      
        <!--tower -->
       <iframe src="http://www.zdnet.com/include/ads/ifc/RGROUP=2766" scrolling="no" frameborder="0" hspace="0" vspace="0" height="600" width="160" marginheight="0" marginwidth="0">
<script language="JavaScript" src="http://www.zdnet.com/include/ads/js

In [None]:
# Random Sample email 3
random_index = np.random.randint(0, len(extracted_df))
print(f"Random Sample Email at index {random_index}:\n")
print(extracted_df['body'][random_index])

Random Sample Email at index 808:

<html>
<body>
<p align="center"><b><font size="5" color="#FF0000">Fire Your Boss...</font></b><font size="5"><br>
</font>Say &quot;Goodbye&quot; to the 9-5!</p>
<font FACE="Times New Roman" SIZE="3">
<p align="center">Tired of working to make someone else wealthy?</p>
<p align="center">FREE tape teaches you how to make YOU wealthy!</p>
<p align="center"><a href="mailto:reachme@btamail.net.cn?subject=free-tape">Click here and 
send your name and mailing address for a free copy</a></p>
<p align="center">&nbsp;</p>
</font><font FACE="Times New Roman" size="2">
<p align="center"><a href="mailto:removals@btamail.net.cn">To unsubscribe click 
here</a></p>
</font>
</body>
</html> 

xgcqyahtsvdwhqnhjiuweimhfumiaiyawr





From the review of three sample emails, no major issues were identified. However, with over 4,000 emails in the dataset, manual inspection is not feasible. To ensure data quality, we will implement an automated validation process to:  

- Move any existing headers (if spotted) into their respective dataframe columns.  
- Extract the main body message.  

This process will produce a cleaned, standardized dataset that is ready for further analysis.

In [None]:
# Regex: normal headers that should not appear in the body
HEADER_RE = re.compile(
    r'^\s*(return-path|delivered-to|message-id|date|from|to|subject|sender|errors-to|list-id|reply-to|cc|bcc|mime-version|content-type|content-transfer-encoding|x-[\w-]+)\s*:',
    re.IGNORECASE
)

# Map raw header names -> your DataFrame columns
HEADER_TO_COL = {
    "message-id": "message_id",
    "date": "date",
    "from": "from",
    "to": "to",
    "subject": "subject",
    "sender": "sender",
    "list-id": "list_id",
}

# Split header block and body by the first blank line
def split_header_body(text):
    """
    Split a raw message-like string into (header_block, body_block).
    Uses the first blank line as the boundary. If none found, returns ("", whole_text_as_body).
    """
    # Check if text is empty
    if text is None:
        return "", ""
    
    # Normalize line endings 
    s = str(text).replace("\r\n", "\n").replace("\r", "\n")

    # Split into parts
    parts = s.split("\n\n", 1)

    # Check if its split into header block and body block
    if len(parts) == 2:
        return parts[0], parts[1]
    
    # no clear header/body split
    return "", s

# Parse the *top* header block (supports folded continuation lines)
def parse_header_block(header_block):
    """
    Return a dict of header_name_lower -> value (unfolded).
    We only capture the headers listed in HEADER_RE for safety.
    """
    # Initialise the variables
    headers = {}
    cur_name = None
    cur_val_parts = []

    # Check if there it is header blocks
    if not header_block:
        return headers

    # Split lines by '\n'
    lines = header_block.split("\n")

    # C
    def commit():
    # If we already have a header name being processed,
    # finalize it by joining its accumulated value parts
    # into a single string and save it in the headers dict.
        if cur_name is not None:
            headers[cur_name] = " ".join(cur_val_parts).strip()

    for ln in lines:
        if HEADER_RE.match(ln):
            # Case 1: Line looks like a new header (e.g., "Subject: Hello")
            # - Save the previous header first
            commit()

            # - Split the line into name and value at the first colon
            name, val = ln.split(":", 1)

            # - Store the header name in lowercase
            cur_name = name.strip().lower()

            # - Start a fresh list of value parts for this header
            cur_val_parts = [val.strip()]

        elif ln.startswith((" ", "\t")) and cur_name is not None:
            # Case 2: Continuation line (folded header)
            # - If the line starts with a space or tab, it belongs
            #   to the current header value
            cur_val_parts.append(ln.strip())

        else:
            # Case 3: Line doesn’t look like a header or continuation
            # - Just ignore it
            pass

    # After the loop, commit the last header in progress
    commit()

    # Return the dictionary of headers
    return headers

# Remove header-like lines that *leak* into the body (your original intent)
def strip_headerish_lines_in_body(body_text):
    """
    Remove lines inside the body that look like headers (e.g. 'From:' in quoted replies).
    Does NOT remove normal content.
    """
    if not body_text:
        return ""
    # Normalize line endings to '\n' and split into list of lines
    lines = body_text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    kept = []
    for ln in lines:
        ls = ln.strip()
        if HEADER_RE.match(ls):
            # If line looks like a header (e.g., "From:" / "Subject:"), skip it
            continue
        # Otherwise, keep the line as part of the body
        kept.append(ln)
    # Recombine into text, stripping leading/trailing whitespace
    return "\n".join(kept).strip()


# Minimal HTML → text (preserve content, just strip tags)
def html_to_text_preserve(html_or_text):
    """
    Minimal HTML-to-text conversion:
    - Removes tags (while preserving content)
    - Drops script/style/noscript/iframe blocks
    - Keeps line breaks
    """
    if html_or_text is None:
        return ""
    # Parse input with BeautifulSoup
    soup = BeautifulSoup(html_or_text, "html.parser")

    # Remove non-content tags completely
    for tag in soup(["script", "style", "noscript", "iframe"]):
        tag.decompose()

    # Extract text, keeping '\n' between blocks
    txt = soup.get_text("\n")

    # Decode HTML entities (e.g. "&amp;" → "&")
    txt = ihtml.unescape(txt)

    # Normalize line endings
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")

    # Strip only trailing spaces from each line, keep meaningful breaks
    return "\n".join(line.rstrip() for line in txt.split("\n")).strip()


# Your driver: fill header columns first, then clean body (strip headers-in-body, then strip HTML)
def apply_data(df, col_body="body", out_col_body_text="body_text"):
    """
    - Reads df[col_body] (which may contain raw headers+HTML or just HTML).
    - If a header block exists at the top, backfills header columns from it (HEADER_TO_COL).
    - Removes header-like lines leaked inside the body.
    - Produces a plain-text version (HTML stripped) in df[out_col_body_text].
    Returns fill_counts (how many cells were filled per header column).
    """

    # Track how many values we fill for each header-mapped column
    fill_counts = {col: 0 for col in df.columns if col in df.columns}
    out_texts = []

    for i, raw in enumerate(df[col_body]):
        # Split raw text into header block and body block
        header_block, body_block = split_header_body(raw)

        # Backfill header columns from *top* header block
        top_headers = parse_header_block(header_block)
        for hdr_name, col_name in HEADER_TO_COL.items():
            if col_name in df.columns and hdr_name in top_headers:
                # Only fill if the dataframe cell is missing/empty
                if pd.isna(df.at[i, col_name]) or (
                    isinstance(df.at[i, col_name], str)
                    and df.at[i, col_name].strip() == ""
                ):
                    df.at[i, col_name] = top_headers[hdr_name]
                    fill_counts[col_name] += 1

        # If no header block was detected, just keep the raw body
        body_core = body_block if body_block else (raw or "")

        # Remove header-like lines that leaked into the body text
        body_no_headerish = strip_headerish_lines_in_body(body_core)

        # Convert remaining content to plain text (strip HTML, preserve content)
        body_text = html_to_text_preserve(body_no_headerish)

        # Save clean plain-text body
        out_texts.append(body_text)

    # Write new plain-text body column (original "body" is untouched)
    df[out_col_body_text] = out_texts

    return fill_counts

# Fill headers first, create a plain-text body column
fill_counts = apply_data(extracted_df, col_body="body", out_col_body_text="body_text")

# Print how many cells were filled for each header column
for col, c in fill_counts.items():
    if c:
        print(f'Filled "{col}" in {c} rows.')


Filled "subject" in 2 rows.
Filled "sender" in 1 rows.
Filled "list_id" in 1 rows.


#### URL Extraction

Next, we extract all URLs contained in the email bodies.  

URLs are important for phishing detection because suspicious or malicious links are often key indicators of phishing attempts.  

By isolating the URLs, we can analyze them separately and apply rules to identify potentially harmful links.

In [None]:
# Function to extract URLs from dataset['message'] directly
def extract_urls_from_message(raw_msg):
    
    # Ensure the input is a string
    if not isinstance(raw_msg, str):
        return None
    
    # Regex pattern to capture links:
    url_pattern = r'((?:https?://|www\.)[^\s,\)\]>\[}]+)'

    # Find all URLs in the raw message
    urls = re.findall(url_pattern, raw_msg)

    return urls if urls else None

# apply directly on the raw message column
extracted_df['urls'] = data_ds['message'].apply(extract_urls_from_message)
extracted_df['num_urls'] = extracted_df['urls'].apply(lambda x: len(x) if x is not None else 0)

In [None]:
# Display the first few rows to verify extraction
display(extracted_df.head())

Unnamed: 0,label,message,message_id,date,from,to,subject,sender,list_id,body,body_text,urls,num_urls
0,1,From ilug-admin@linux.ie Tue Aug 6 11:51:02 ...,<1028311679.886@0.57.142>,"Fri, 02 Aug 2002 23:37:59 0530","""Start Now"" <startnow2002@hotmail.com>",ilug@linux.ie,[ILUG] STOP THE MLM INSANITY,ilug-admin@linux.ie,Irish Linux Users' Group <ilug.linux.ie>,Greetings!\n\nYou are receiving this letter be...,You are receiving this letter because you have...,[http://www.linux.ie/mailman/listinfo/ilug],1
1,1,From lmrn@mailexcite.com Mon Jun 24 17:03:24 ...,<B0000178595@203.129.205.5.205.129.203.in-addr...,"Mon, 28 Jul 1980 14:01:35",lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Ti...",,,"<html>\n<body>\n<center>\n<h3>\n<font color=""b...","IT'S GETTING TO BE SPRING AGAIN, PROTECT YOURS...",[http://www.geocities.com/realprotection_20022...,4
2,1,From amknight@mailexcite.com Mon Jun 24 17:03...,<0845b5355070f52WEBCUST2@webcust2.hightowertec...,"Wed, 30 Jul 1980 18:25:49",amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...",,,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",LOSE 30 POUNDS IN 30 DAYS... GUARANTEED!!!\n\...,[http://www.geocities.com/ultra_weightloss_200...,4
3,1,From jordan23@mailexcite.com Mon Jun 24 17:04...,<0925c5750200f52WEBCUST2@webcust2.hightowertec...,"Thu, 31 Jul 1980 07:20:54",jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...",,,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",LOSE 30 POUNDS IN 30 DAYS... GUARANTEED!!!\n\...,[http://www.geocities.com/ultra_weightloss_200...,4
4,1,From merchantsworld2001@juno.com Tue Aug 6 1...,<200208040037.BAA09623@webnote.net>,"Sun, 19 Oct 1980 10:55:16",yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...",,,"<html><xbody>\n<hr width = ""100%"">\n<center><h...","To Order by postal mail, please send $15.95 Pl...","[http://www.geocities.com/grantzone_2002/"", ht...",2


Based on the extracted dataset, all key headers, the email body, and URLs have been successfully captured and standardized. The `body` field is readable and normalized, while `message_id` preserves its original format. Although this sample shows no URLs, the dataset is structured to capture them if present in other emails.  

### Text Cleaning

In this step, we clean the relevant text fields in the dataset to prepare for analysis and phishing detection.  

The cleaning process includes:

1. **Removing content inside angle brackets (`<...>`)** for all columns except `message_id`  
   - Standardizes email addresses and header fields.  

2. **Normalizing whitespace and removing unwanted content**  
   - Replace multiple spaces, tabs, and newlines with a single space.  
   - Remove leading and trailing spaces.  
   - Remove separator lines such as `----------` and `************`.  
   - Remove embedded HTML code.  

3. **Reordering and dropping columns**  
   - Adjust column order to match the workflow for phishing detection.  
   - Drop any unnecessary or redundant columns to simplify the dataset.  
   - This makes the dataset more organized and easier to work with in subsequent steps.  

This process ensures all text fields are **clean, consistent, and ready** for further processing, while preserving important information for phishing detection, including URLs, attachments, and non-ASCII characters.

In [None]:
# Make a copy to prevent mutation
final_df = extracted_df.copy()

# Function to clean text fields for phishing detection analysis
def clean_text(x, keep_tags=False):
    """
    Clean text fields for phishing detection analysis.

    Steps:
    1. Collapse whitespace.
    2. Remove <...> entirely unless keep_tags=True (preserve for message_id).
    3. Remove [] but keep the content inside.
    4. Remove quotes ' and ".
    5. Remove common separator lines: ----------, ************.
    6. Remove embedded HTML tags.
    """
    if x is None:
        return None  # No change

    text = str(x)

    # Remove <...> unless we want to keep tags (e.g., message_id)
    if not keep_tags:
        text = re.sub(r'<[^>]*>', '', text)  

    # Remove separator lines
    text = re.sub(r'[-*]{4,}', ' ', text)  # sequences of 4+ - or *

    # Remove brackets [] but keep content inside
    text = re.sub(r'[\[\]]+', '', text)

    # Remove quotes ' and "
    text = re.sub(r"[\'\"]+", '', text)

    # Collapse multiple spaces, tabs, newlines into a single space, and strip
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# NEW: helper to normalize address fields (From / To / Sender)
EMAIL_RE = re.compile(r'[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}', re.IGNORECASE)

# Normalize email address fields
def normalize_address_field(x):
    """
    Return only the email(s). If <...> present, prefer those.
    If multiple addresses, join with ', '.
    """
    if x is None:
        return None
    s = str(x)

    # Prefer emails inside <...>
    in_angles = re.findall(r'<\s*([^<>@\s]+@[^<>@\s]+)\s*>', s)
    if in_angles:
        return ', '.join(e.strip() for e in in_angles)

    # Fallback: any email-looking substrings
    any_emails = EMAIL_RE.findall(s)
    if any_emails:
        return ', '.join(e.strip() for e in any_emails)

    # If nothing matched, return cleaned plain text
    return clean_text(s)

# Apply cleaning to relevant columns
columns_to_clean = ['message_id', 'date', 'from', 'to', 'subject','sender', 'list_id', 'body', 'urls']

for col in columns_to_clean:
    if col in ('from', 'to', 'sender'):
        final_df[col] = final_df[col].apply(normalize_address_field)
    elif col == 'message_id':
        final_df[col] = final_df[col].apply(lambda x: clean_text(x, keep_tags=True))
    else:
        final_df[col] = final_df[col].apply(clean_text)


In [None]:
# Specify the desired column order
cols = ['message_id', 'date', 'from', 'to', 'subject', 'sender', 'list_id', 'body', 'urls', 'num_urls', 'label']

# Drop the unnecessary column and rearranging columns for better readability
final_df = final_df[cols]

In [None]:
# display the cleaned dataframe
display(final_df.head())

Unnamed: 0,message_id,date,from,to,subject,sender,list_id,body,urls,num_urls,label
0,<1028311679.886@0.57.142>,"Fri, 02 Aug 2002 23:37:59 0530",startnow2002@hotmail.com,ilug@linux.ie,ILUG STOP THE MLM INSANITY,ilug-admin@linux.ie,Irish Linux Users Group,Greetings! You are receiving this letter becau...,http://www.linux.ie/mailman/listinfo/ilug,1,1
1,<B0000178595@203.129.205.5.205.129.203.in-addr...,"Mon, 28 Jul 1980 14:01:35",lmrn@mailexcite.com,ranmoore@cybertime.net,"Real Protection, Stun Guns! Free Shipping! Tim...",,,"The Need For Safety Is Real In 2002, You Might...",http://www.geocities.com/realprotection_200220...,4,1
2,<0845b5355070f52WEBCUST2@webcust2.hightowertec...,"Wed, 30 Jul 1980 18:25:49",amknight@mailexcite.com,cbmark@cbmark.com,"New Improved Fat Burners, Now With TV Fat Abso...",,,"Bonus Fat Absorbers As Seen On TV, Included Fr...",http://www.geocities.com/ultra_weightloss_2002...,4,1
3,<0925c5750200f52WEBCUST2@webcust2.hightowertec...,"Thu, 31 Jul 1980 07:20:54",jordan23@mailexcite.com,ranmoore@swbell.net,"New Improved Fat Burners, Now With TV Fat Abso...",,,"Bonus Fat Absorbers As Seen On TV, Included Fr...",http://www.geocities.com/ultra_weightloss_2002...,4,1
4,<200208040037.BAA09623@webnote.net>,"Sun, 19 Oct 1980 10:55:16",yyyy@pluriproj.pt,yyyy@pluriproj.pt,"Never Repay Cash Grants, $500 - $50,000, Secre...",,,"Government Grants E-Book 2002 edition, Just $1...","http://www.geocities.com/grantzone_2002/, http...",2,1


final### Post-Parsing Data Validation

After parsing and splitting the emails into separate columns, it is important to verify the integrity of the new dataset.  

We will:

1. **Inspect dataset summary** – Using `data.info()` to review column names, data types, and non-null counts.  
2. **Check for null values** – Some fields such as `subject` or `body` may be empty even if the original message was not null.  
3. **Check for duplicate rows** – Parsing may create redundant entries that should be removed.  

These steps ensure that the parsed dataset is **clean, consistent, and ready** for further analysis and phishing detection.

#### Descriptive Statistics

In [None]:
# Check the info of cleaned dataframe
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4178 entries, 0 to 4177
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   message_id  4176 non-null   object
 1   date        4177 non-null   object
 2   from        4177 non-null   object
 3   to          4008 non-null   object
 4   subject     4176 non-null   object
 5   sender      1978 non-null   object
 6   list_id     1723 non-null   object
 7   body        4178 non-null   object
 8   urls        3789 non-null   object
 9   num_urls    4178 non-null   int64 
 10  label       4178 non-null   int64 
dtypes: int64(2), object(9)
memory usage: 359.2+ KB


##### Handling of Null Values

In [None]:
# Check for missing values in the dataframe
print(final_df.isna().sum().sort_values())

body             0
num_urls         0
label            0
date             1
from             1
message_id       2
subject          2
to             170
urls           389
sender        2200
list_id       2455
dtype: int64


##### Checking for duplicates

In [None]:
# shape of dataset before removing duplicates
print(f"Shape before removing duplicates: {final_df.shape}") 

# Removing duplicate rows
final_df = final_df.drop_duplicates().reset_index(drop=True)

# Shape of dataset after removing duplicates
print(f"Shape after removing duplicates: {final_df.shape}")

Shape before removing duplicates: (4178, 11)
Shape after removing duplicates: (4090, 11)


# Save the cleaned dataset
After checking for duplicates and adding the label column, the cleaned dataset is saved to a CSV file for later processing.

In [None]:
# Save the DataFrame to a CSV file named 'cleaned_SA.csv'
final_df.to_csv(r'..\Datasets\cleaned_SA.csv', index=False) # The index=False ensures the index is not saved