<h1>Dataset preparation for model training</h1>
<h3>Steps</h3>
<h4>Load and clean emails.csv</h4>
    <em>Load</em>
    <em>Inspect</em>
    <em>Standardise</em>
<h4>Extract trusted chains of email</h4>
    <em>Sender receiver pairs</em>
    <em>Frequency of interactions</em>
    <em>Email grouping into chains</em>
<h4>Introduce phishing samples</h4>
    <em>Spoofed email generation</em>
    <em>Combining ligit and spoofed emails</em>
<h4>Dataset inspection and validate</h4>
    <em>Verify the combined and labelled dataset</em>
    <em>Sample and review the chains to make sure spoofed emails were injected correctly</em>
<h4>Feature engineering</h4>
    <em>Extract header features</em>
    <em>Extract content features</em>
    <em>Extract behaviour features</em>
<h4>Prepare for model training</h4>
    <em>Preprocess</em>
    <em>Train-Test split</em>

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler

<h2>Parse Data</h2>

In [None]:
##Loading data
emails_df = pd.read_csv('data/emails.csv')

In [73]:
#~~~PARSE MESSAGE~~~
def parse_message(raw_message):
    try:
        fields = {
            'Message-ID': re.search(r'Message-ID:\s*(.*)', raw_message).group(1) if re.search(r'Message-ID:\s*(.*)', raw_message) else None,
            'Date': re.search(r'Date:\s*(.*)', raw_message).group(1) if re.search(r'Date:\s*(.*)', raw_message) else None,
            'From': re.search(r'From:\s*(.*)', raw_message).group(1) if re.search(r'From:\s*(.*)', raw_message) else None,
            'To': re.search(r'To:\s*(.*)', raw_message).group(1) if re.search(r'To:\s*(.*)', raw_message) else None,
            'Subject': re.search(r'Subject:\s*(.*)', raw_message).group(1) if re.search(r'Subject:\s*(.*)', raw_message) else None,
        }
        body_match = re.split(r'\n\s*\n', raw_message, maxsplit=1)
        fields['Body'] = body_match[1] if len(body_match) > 1 else ""
        return fields
    except Exception as e:
        print(f"Error parsing message: {e}")
        return {}
    

In [74]:
## apply parser to message column
parsed_messages = emails_df['message'].apply(parse_message)

In [75]:
#convert parsed data into dataframe
parsed_df = pd.DataFrame(parsed_messages.tolist())

In [76]:
#merge with original; file column
emails_df = pd.concat([emails_df[['file']], parsed_df], axis=1)

In [77]:
#func to convert and clean dates
def parse_date(date_string):
    try:
        return pd.to_datetime(date_string, errors='coerce')
    except Exception:
        return None

In [78]:
#apply date parser
emails_df['Date'] = emails_df['Date'].apply(parse_date)

In [79]:
#drop rows with invalid dates.
emails_df = emails_df.dropna(subset=['Date'])

In [80]:
#metadata extraction
words2split = [
    'Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: ', 'Cc: ', 
    'Mime-Version: ', 'Content-Type: ', 'Content-Transfer-Encoding: ', 
    'Bcc: ', 'X-From: ', 'X-To: ', 'X-cc: ', 'X-bcc: ', 
    'X-Folder: ', 'X-Origin: ', 'X-FileName: '
]

In [81]:
features_naming = [i[:-2] for i in words2split]  # Strip ': ' for column names
split_condition = '|'.join(words2split)

In [82]:
#func to extract meta fields
def extract_metadata(pre_info, split_condition):
    fields = re.split(split_condition, pre_info)
    if len(fields) - 1 != len(features_naming):
        print(f"Warning: Metadata length mismatch in '{pre_info[:50]}...'")  # Log first 50 characters for context
    metadata = dict(zip(features_naming, fields[1:]))  # Skip first empty split
    return metadata


print(emails_df.head())
emails_df.to_csv('cleaned_emails.csv', index=False)

                       file                                     Message-ID  \
0     allen-p/_sent_mail/1.  <18782981.1075855378110.JavaMail.evans@thyme>   
1    allen-p/_sent_mail/10.  <15464986.1075855378456.JavaMail.evans@thyme>   
2   allen-p/_sent_mail/100.  <24216240.1075855687451.JavaMail.evans@thyme>   
3  allen-p/_sent_mail/1000.  <13505866.1075863688222.JavaMail.evans@thyme>   
4  allen-p/_sent_mail/1001.  <30922949.1075863688243.JavaMail.evans@thyme>   

                        Date                     From  \
0  2001-05-14 16:39:00-07:00  phillip.allen@enron.com   
1  2001-05-04 13:51:00-07:00  phillip.allen@enron.com   
2  2000-10-18 03:00:00-07:00  phillip.allen@enron.com   
3  2000-10-23 06:13:00-07:00  phillip.allen@enron.com   
4  2000-08-31 05:07:00-07:00  phillip.allen@enron.com   

                        To            Subject  \
0     tim.belden@enron.com  Mime-Version: 1.0   
1  john.lavorato@enron.com                Re:   
2   leah.arsdall@enron.com           Re: 

<h2> Standardization</h2>

In [83]:
emails_df['Body_Length'] = emails_df['Body'].apply(len)

In [84]:
emails_df['Num_Recipients'] = emails_df['To'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)


In [85]:
emails_df['Subject_Word_Count'] = emails_df['Subject'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

In [86]:
emails_df['Timestamp'] = emails_df['Date'].apply(lambda x: x.timestamp() if pd.notnull(x) else 0)

In [87]:
numerical_features = ['Body_Length', 'Num_Recipients', 'Subject_Word_Count', 'Timestamp']

In [88]:
#init scalar
scaler = StandardScaler()

In [89]:
#apply standardisation
emails_df[numerical_features] = scaler.fit_transform(emails_df[numerical_features])

In [90]:
#verification
print(emails_df[numerical_features].head())

   Body_Length  Num_Recipients  Subject_Word_Count  Timestamp
0    -0.222735       -0.342952           -0.948929   0.256478
1    -0.129466       -0.342952           -1.341740   0.225712
2    -0.221880       -0.342952           -0.948929  -0.377824
3    -0.202688       -0.342952           -0.948929  -0.362210
4    -0.221268       -0.342952           -0.948929  -0.523534


In [91]:
from collections import Counter


In [92]:
#extracting sender-receiver pairs
emails_df['To'] = emails_df['To'].fillna("")  # Handle missing 'To' values
emails_df['Recipients_List'] = emails_df['To'].apply(lambda x: x.split(','))

#use a list, faster to process
email_pairs = [(row['From'], recipient.strip()) 
               for _, row in emails_df.iterrows() 
               for recipient in row['Recipients_List']]

In [94]:
#count frequent interaction
pair_counter = Counter(email_pairs)

In [95]:
#define trusted pairs
trusted_pairs = [pair for pair, count in pair_counter.items() if count > 10]

In [96]:
#filter emails belonging to trusted pairs
emails_df['Is_Trusted'] = emails_df.apply(
    lambda row: any((row['From'], recipient) in trusted_pairs for recipient in row['Recipients_List']),
    axis=1
)

In [97]:
#group emails by trusted interactions
trusted_chains = emails_df[emails_df['Is_Trusted']]

In [98]:
#inspeect to confirm trusted sender and receiver pairs are cirrectly identified
print(f"Number of Trusted Pairs: {len(trusted_pairs)}")
print("Sample Trusted Pairs:")
print(trusted_pairs[:10])

Number of Trusted Pairs: 13445
Sample Trusted Pairs:
[('phillip.allen@enron.com', 'tim.belden@enron.com'), ('phillip.allen@enron.com', 'john.lavorato@enron.com'), ('phillip.allen@enron.com', 'stagecoachmama@hotmail.com'), ('phillip.allen@enron.com', 'keith.holst@enron.com'), ('phillip.allen@enron.com', 'paula.harris@enron.com'), ('phillip.allen@enron.com', 'ina.rangel@enron.com'), ('phillip.allen@enron.com', 'tim.heizenrader@enron.com'), ('phillip.allen@enron.com', 'pallen70@hotmail.com'), ('phillip.allen@enron.com', 'bs_stone@yahoo.com'), ('phillip.allen@enron.com', 'stouchstone@natsource.com')]


In [99]:
print("Number of Emails in Trusted Chains:", len(trusted_chains))
print("Sample Trusted Emails:")
print(trusted_chains[['From', 'To', 'Body']].head())

Number of Emails in Trusted Chains: 334803
Sample Trusted Emails:
                       From                          To  \
0   phillip.allen@enron.com        tim.belden@enron.com   
1   phillip.allen@enron.com     john.lavorato@enron.com   
11  phillip.allen@enron.com  stagecoachmama@hotmail.com   
12  phillip.allen@enron.com       keith.holst@enron.com   
13  phillip.allen@enron.com       keith.holst@enron.com   

                                                 Body  
0                           Here is our forecast\n\n   
1   Traveling to have a business meeting takes the...  
11  Lucy,\n\n Here are the rentrolls:\n\n\n\n Open...  
12  ---------------------- Forwarded by Phillip K ...  
13  ---------------------- Forwarded by Phillip K ...  


In [100]:
#cross check
sample_pair = trusted_pairs[0]
print(f"Emails for Trusted Pair {sample_pair}:")
print(trusted_chains[
    (trusted_chains['From'] == sample_pair[0]) & 
    (trusted_chains['Recipients_List'].apply(lambda x: sample_pair[1] in x))
][['Body']].head())

Emails for Trusted Pair ('phillip.allen@enron.com', 'tim.belden@enron.com'):
                                                  Body
0                            Here is our forecast\n\n 
153  Tim,\n Matt sent you a email with his attempt ...
186  ---------------------- Forwarded by Phillip K ...
269  forecast for socal demand/rec/storage.  Looks ...
277                             Here is our forecast\n


In [101]:
chain_count = 0


print("Number of Chains:", len(trusted_chains))
print("Sample Chains:")

# Loop over each chain 
for chain, emails in trusted_chains.groupby(['From', 'To']):
    if chain_count >= 4:  # Limit nmber of chains
        break
    print(f"Chain: {chain}")
    print(f"Number of Emails: {len(emails)}")
    print(emails['Body'].head(), "\n")
    chain_count += 1

Number of Chains: 334803
Sample Chains:
Chain: ('1.10043390.-2@multexinvestornetwork.com', 'jwillia@enron.com')
Number of Emails: 38
507999    In today's Daily Update, you'll find research ...
508007    From today's edition of the Daily Update, you ...
508011    In today's Daily Update, we feature research o...
508024    From today's Daily Update, you'll have the opp...
508036    From today's special weekend edition of the Da...
Name: Body, dtype: object 

Chain: ('1.10969419.-2@multexinvestornetwork.com', 'harry.arora@enron.com')
Number of Emails: 11
8041    Our gifts to you this week: Several reports th...
8051    Features: angels find investment opportunities...
8085    Read what independent analyst Charles Payne ha...
8127    To help celebrate the Holiday season and the N...
8141    In today's edition of the Daily Update, equity...
Name: Body, dtype: object 

Chain: ('1.11176403.-2@multexinvestornetwork.com', 'alewis@ect.enron.com')
Number of Emails: 18
279739    In today's Daily U

<h2> INTRODUCE PHISHING SAMPLES: </h2>

In [102]:
import random

In [103]:
#funct generate spoofed emails
def generate_spoofed_email(row):
    spoofed_email = row.copy()
    spoofed_email['From'] = re.sub(r'@.+$', '@spoofed.com', row['From'])  # Change domain
    spoofed_email['Body'] += "\nClick here: http://malicious-link.com"  # Add phishing content
    spoofed_email['Label'] = 1  # Mark as phishing
    return spoofed_email

In [104]:
#clone and spoof percentage of trusted emails
spoofed_emails = trusted_chains.sample(frac=0.2, random_state=42).apply(generate_spoofed_email, axis=1)


In [105]:
#combine legit and spoofed emails
emails_df['Label'] = 0  # Legitimate emails
final_dataset = pd.concat([emails_df, spoofed_emails])

In [106]:
#check the structof spoofed_emails dataframe
print("Spoofed Emails Sample:")
print(spoofed_emails.head())

Spoofed Emails Sample:
                                       file  \
246821      kean-s/discussion_threads/3115.   
244566      kean-s/discussion_threads/1069.   
326450          mcconnell-m/_sent_mail/359.   
275117         lenhart-m/all_documents/366.   
443986  skilling-j/discussion_threads/1075.   

                                           Message-ID  \
246821  <10170593.1075848177616.JavaMail.evans@thyme>   
244566  <28983404.1075848097229.JavaMail.evans@thyme>   
326450   <4093499.1075843986028.JavaMail.evans@thyme>   
275117   <6074789.1075849747487.JavaMail.evans@thyme>   
443986  <25520575.1075845504060.JavaMail.evans@thyme>   

                             Date                         From  \
246821  2001-04-27 02:48:00-07:00    jeff.dasovich@spoofed.com   
244566  2000-08-31 09:16:00-07:00     robert.frank@spoofed.com   
326450  2000-12-04 10:36:00-08:00   mike.mcconnell@spoofed.com   
275117  2001-02-08 03:47:00-08:00  matthew.lenhart@spoofed.com   
443986  2001-03-07 02

In [107]:
#confirm size of spoofed mails matches expected percentage of trusted
expected_count = int(len(trusted_chains) * 0.2)
actual_count = len(spoofed_emails)
print(f"Expected Spoofed Emails: {expected_count}, Actual: {actual_count}")

Expected Spoofed Emails: 66960, Actual: 66961


In [108]:
#confirm combination of ligit and spoofed.*
assert len(final_dataset) == len(emails_df) + len(spoofed_emails), "Mismatch in dataset sizes!"
assert final_dataset['Label'].value_counts()[1] == len(spoofed_emails), "Incorrect spoofed email count!"

In [60]:
#validate concat of dataframes

print("First 5 Rows of Final Dataset:")
print(final_dataset.head())
print("Last 5 Rows of Final Dataset:")
print(final_dataset.tail())

First 5 Rows of Final Dataset:
                       file                                     Message-ID  \
0     allen-p/_sent_mail/1.  <18782981.1075855378110.JavaMail.evans@thyme>   
1    allen-p/_sent_mail/10.  <15464986.1075855378456.JavaMail.evans@thyme>   
2   allen-p/_sent_mail/100.  <24216240.1075855687451.JavaMail.evans@thyme>   
3  allen-p/_sent_mail/1000.  <13505866.1075863688222.JavaMail.evans@thyme>   
4  allen-p/_sent_mail/1001.  <30922949.1075863688243.JavaMail.evans@thyme>   

                        Date                     From  \
0  2001-05-14 16:39:00-07:00  phillip.allen@enron.com   
1  2001-05-04 13:51:00-07:00  phillip.allen@enron.com   
2  2000-10-18 03:00:00-07:00  phillip.allen@enron.com   
3  2000-10-23 06:13:00-07:00  phillip.allen@enron.com   
4  2000-08-31 05:07:00-07:00  phillip.allen@enron.com   

                        To            Subject  \
0     tim.belden@enron.com  Mime-Version: 1.0   
1  john.lavorato@enron.com                Re:   
2   leah.a

In [109]:
emails_df['Label'] = 0#mark all legit emails as 0

In [110]:
spoofed_emails['Label'] = 1 #mark all spoofed emails as 1

In [111]:
#concat legit and spoofed into one dataset.
final_dataset = pd.concat([emails_df, spoofed_emails], ignore_index=True)

In [112]:
#check distribution
print("Label Distribution in Final Dataset:")
print(final_dataset['Label'].value_counts())

Label Distribution in Final Dataset:
Label
0    517401
1     66961
Name: count, dtype: int64


In [114]:
#save final dataset with labels for model prep
final_dataset.to_csv('labeled_emails_dataset.csv', index=False)
print("Dataset saved successfully.")

Dataset saved successfully.


In [115]:
#inspect the rows
print("Legitimate Emails Sample:")
print(final_dataset[final_dataset['Label'] == 0].head())


Legitimate Emails Sample:
                       file                                     Message-ID  \
0     allen-p/_sent_mail/1.  <18782981.1075855378110.JavaMail.evans@thyme>   
1    allen-p/_sent_mail/10.  <15464986.1075855378456.JavaMail.evans@thyme>   
2   allen-p/_sent_mail/100.  <24216240.1075855687451.JavaMail.evans@thyme>   
3  allen-p/_sent_mail/1000.  <13505866.1075863688222.JavaMail.evans@thyme>   
4  allen-p/_sent_mail/1001.  <30922949.1075863688243.JavaMail.evans@thyme>   

                        Date                     From  \
0  2001-05-14 16:39:00-07:00  phillip.allen@enron.com   
1  2001-05-04 13:51:00-07:00  phillip.allen@enron.com   
2  2000-10-18 03:00:00-07:00  phillip.allen@enron.com   
3  2000-10-23 06:13:00-07:00  phillip.allen@enron.com   
4  2000-08-31 05:07:00-07:00  phillip.allen@enron.com   

                        To            Subject  \
0     tim.belden@enron.com  Mime-Version: 1.0   
1  john.lavorato@enron.com                Re:   
2   leah.arsdal

In [116]:
#inspect the rows 
print("Phishing Emails Sample:")
print(final_dataset[final_dataset['Label'] == 1].head())

Phishing Emails Sample:
                                       file  \
517401      kean-s/discussion_threads/3115.   
517402      kean-s/discussion_threads/1069.   
517403          mcconnell-m/_sent_mail/359.   
517404         lenhart-m/all_documents/366.   
517405  skilling-j/discussion_threads/1075.   

                                           Message-ID  \
517401  <10170593.1075848177616.JavaMail.evans@thyme>   
517402  <28983404.1075848097229.JavaMail.evans@thyme>   
517403   <4093499.1075843986028.JavaMail.evans@thyme>   
517404   <6074789.1075849747487.JavaMail.evans@thyme>   
517405  <25520575.1075845504060.JavaMail.evans@thyme>   

                             Date                         From  \
517401  2001-04-27 02:48:00-07:00    jeff.dasovich@spoofed.com   
517402  2000-08-31 09:16:00-07:00     robert.frank@spoofed.com   
517403  2000-12-04 10:36:00-08:00   mike.mcconnell@spoofed.com   
517404  2001-02-08 03:47:00-08:00  matthew.lenhart@spoofed.com   
517405  2001-03-07 0

In [117]:
#check balance of distrubution
total_legitimate = len(emails_df)
total_phishing = len(spoofed_emails)
print(f"Legitimate: {total_legitimate}, Phishing: {total_phishing}")

Legitimate: 517401, Phishing: 66961
