<h1>Dataset preparation for model training</h1>
<h3>Steps</h3>
<h4>Load and clean emails.csv</h4>
    <em>Load</em>
    <em>Inspect</em>
    <em>Standardise</em>
<h4>Extract trusted chains of email</h4>
    <em>Sender receiver pairs</em>
    <em>Frequency of interactions</em>
    <em>Email grouping into chains</em>
<h4>Introduce phishing samples</h4>
    <em>Spoofed email generation</em>
    <em>Combining ligit and spoofed emails</em>
<h4>Dataset inspection and validate</h4>
    <em>Verify the combined and labelled dataset</em>
    <em>Sample and review the chains to make sure spoofed emails were injected correctly</em>
<h4>Feature engineering</h4>
    <em>Extract header features</em>
    <em>Extract content features</em>
    <em>Extract behaviour features</em>
<h4>Prepare for model training</h4>
    <em>Preprocess</em>
    <em>Train-Test split</em>

In [8]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler

<h2>Parse Data</h2>

In [9]:
##Loading data
emails_df = pd.read_csv('data/emails.csv')

In [10]:
#~~~PARSE MESSAGE~~~
def parse_message(raw_message):
    try:
        fields = {
            'Message-ID': re.search(r'Message-ID:\s*(.*)', raw_message).group(1) if re.search(r'Message-ID:\s*(.*)', raw_message) else None,
            'Date': re.search(r'Date:\s*(.*)', raw_message).group(1) if re.search(r'Date:\s*(.*)', raw_message) else None,
            'From': re.search(r'From:\s*(.*)', raw_message).group(1) if re.search(r'From:\s*(.*)', raw_message) else None,
            'To': re.search(r'To:\s*(.*)', raw_message).group(1) if re.search(r'To:\s*(.*)', raw_message) else None,
            'Subject': re.search(r'Subject:\s*(.*)', raw_message).group(1) if re.search(r'Subject:\s*(.*)', raw_message) else None,
        }
        body_match = re.split(r'\n\s*\n', raw_message, maxsplit=1)
        fields['Body'] = body_match[1] if len(body_match) > 1 else ""
        return fields
    except Exception as e:
        print(f"Error parsing message: {e}")
        return {}
    

In [11]:
## apply parser to message column
parsed_messages = emails_df['message'].apply(parse_message)

In [12]:
#convert parsed data into dataframe
parsed_df = pd.DataFrame(parsed_messages.tolist())

In [13]:
#merge with original; file column
emails_df = pd.concat([emails_df[['file']], parsed_df], axis=1)

In [14]:
#func to convert and clean dates
def parse_date(date_string):
    try:
        return pd.to_datetime(date_string, errors='coerce')
    except Exception:
        return None

In [15]:
#apply date parser
emails_df['Date'] = emails_df['Date'].apply(parse_date)

In [16]:
#drop rows with invalid dates.
emails_df = emails_df.dropna(subset=['Date'])

In [17]:
#metadata extraction
words2split = [
    'Message-ID: ', 'Date: ', 'From: ', 'To: ', 'Subject: ', 'Cc: ', 
    'Mime-Version: ', 'Content-Type: ', 'Content-Transfer-Encoding: ', 
    'Bcc: ', 'X-From: ', 'X-To: ', 'X-cc: ', 'X-bcc: ', 
    'X-Folder: ', 'X-Origin: ', 'X-FileName: '
]

In [18]:
features_naming = [i[:-2] for i in words2split]  # Strip ': ' for column names
split_condition = '|'.join(words2split)

In [19]:
#func to extract meta fields
def extract_metadata(pre_info, split_condition):
    fields = re.split(split_condition, pre_info)
    if len(fields) - 1 != len(features_naming):
        print(f"Warning: Metadata length mismatch in '{pre_info[:50]}...'")  # Log first 50 characters for context
    metadata = dict(zip(features_naming, fields[1:]))  # Skip first empty split
    return metadata


print(emails_df.head())
emails_df.to_csv('cleaned_emails.csv', index=False)

                       file                                     Message-ID  \
0     allen-p/_sent_mail/1.  <18782981.1075855378110.JavaMail.evans@thyme>   
1    allen-p/_sent_mail/10.  <15464986.1075855378456.JavaMail.evans@thyme>   
2   allen-p/_sent_mail/100.  <24216240.1075855687451.JavaMail.evans@thyme>   
3  allen-p/_sent_mail/1000.  <13505866.1075863688222.JavaMail.evans@thyme>   
4  allen-p/_sent_mail/1001.  <30922949.1075863688243.JavaMail.evans@thyme>   

                        Date                     From  \
0  2001-05-14 16:39:00-07:00  phillip.allen@enron.com   
1  2001-05-04 13:51:00-07:00  phillip.allen@enron.com   
2  2000-10-18 03:00:00-07:00  phillip.allen@enron.com   
3  2000-10-23 06:13:00-07:00  phillip.allen@enron.com   
4  2000-08-31 05:07:00-07:00  phillip.allen@enron.com   

                        To            Subject  \
0     tim.belden@enron.com  Mime-Version: 1.0   
1  john.lavorato@enron.com                Re:   
2   leah.arsdall@enron.com           Re: 

<h2> Standardization</h2>

In [20]:
emails_df['Body_Length'] = emails_df['Body'].apply(len)

In [21]:
emails_df['Num_Recipients'] = emails_df['To'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)


In [22]:
emails_df['Subject_Word_Count'] = emails_df['Subject'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

In [23]:
emails_df['Timestamp'] = emails_df['Date'].apply(lambda x: x.timestamp() if pd.notnull(x) else 0)

In [24]:
numerical_features = ['Body_Length', 'Num_Recipients', 'Subject_Word_Count', 'Timestamp']

In [25]:
#init scalar
scaler = StandardScaler()

In [26]:
#apply standardisation
emails_df[numerical_features] = scaler.fit_transform(emails_df[numerical_features])

In [27]:
#verification
print(emails_df[numerical_features].head())

   Body_Length  Num_Recipients  Subject_Word_Count  Timestamp
0    -0.222735       -0.342952           -0.948929   0.256478
1    -0.129466       -0.342952           -1.341740   0.225712
2    -0.221880       -0.342952           -0.948929  -0.377824
3    -0.202688       -0.342952           -0.948929  -0.362210
4    -0.221268       -0.342952           -0.948929  -0.523534


In [28]:
from collections import Counter

In [29]:
#extracting sender-receiver pairs
emails_df['To'] = emails_df['To'].fillna("")  # Handle missing 'To' values
emails_df['Recipients_List'] = emails_df['To'].apply(lambda x: x.split(','))

#use a list, faster to process
email_pairs = [(row['From'], recipient.strip()) 
               for _, row in emails_df.iterrows() 
               for recipient in row['Recipients_List']]

In [30]:
#count frequent interaction
pair_counter = Counter(email_pairs)

In [31]:
#define trusted pairs
trusted_pairs = [pair for pair, count in pair_counter.items() if count > 10]

In [32]:
#filter emails belonging to trusted pairs
emails_df['Is_Trusted'] = emails_df.apply(
    lambda row: any((row['From'], recipient) in trusted_pairs for recipient in row['Recipients_List']),
    axis=1
)

In [33]:
#group emails by trusted interactions
trusted_chains = emails_df[emails_df['Is_Trusted']]

In [34]:
#inspeect to confirm trusted sender and receiver pairs are cirrectly identified
print(f"Number of Trusted Pairs: {len(trusted_pairs)}")
print("Sample Trusted Pairs:")
print(trusted_pairs[:10])

Number of Trusted Pairs: 13445
Sample Trusted Pairs:
[('phillip.allen@enron.com', 'tim.belden@enron.com'), ('phillip.allen@enron.com', 'john.lavorato@enron.com'), ('phillip.allen@enron.com', 'stagecoachmama@hotmail.com'), ('phillip.allen@enron.com', 'keith.holst@enron.com'), ('phillip.allen@enron.com', 'paula.harris@enron.com'), ('phillip.allen@enron.com', 'ina.rangel@enron.com'), ('phillip.allen@enron.com', 'tim.heizenrader@enron.com'), ('phillip.allen@enron.com', 'pallen70@hotmail.com'), ('phillip.allen@enron.com', 'bs_stone@yahoo.com'), ('phillip.allen@enron.com', 'stouchstone@natsource.com')]


In [35]:
print("Number of Emails in Trusted Chains:", len(trusted_chains))
print("Sample Trusted Emails:")
print(trusted_chains[['From', 'To', 'Body']].head())

Number of Emails in Trusted Chains: 334803
Sample Trusted Emails:
                       From                          To  \
0   phillip.allen@enron.com        tim.belden@enron.com   
1   phillip.allen@enron.com     john.lavorato@enron.com   
11  phillip.allen@enron.com  stagecoachmama@hotmail.com   
12  phillip.allen@enron.com       keith.holst@enron.com   
13  phillip.allen@enron.com       keith.holst@enron.com   

                                                 Body  
0                           Here is our forecast\n\n   
1   Traveling to have a business meeting takes the...  
11  Lucy,\n\n Here are the rentrolls:\n\n\n\n Open...  
12  ---------------------- Forwarded by Phillip K ...  
13  ---------------------- Forwarded by Phillip K ...  


In [36]:
#cross check
sample_pair = trusted_pairs[0]
print(f"Emails for Trusted Pair {sample_pair}:")
print(trusted_chains[
    (trusted_chains['From'] == sample_pair[0]) & 
    (trusted_chains['Recipients_List'].apply(lambda x: sample_pair[1] in x))
][['Body']].head())

Emails for Trusted Pair ('phillip.allen@enron.com', 'tim.belden@enron.com'):
                                                  Body
0                            Here is our forecast\n\n 
153  Tim,\n Matt sent you a email with his attempt ...
186  ---------------------- Forwarded by Phillip K ...
269  forecast for socal demand/rec/storage.  Looks ...
277                             Here is our forecast\n


In [37]:
chain_count = 0


print("Number of Chains:", len(trusted_chains))
print("Sample Chains:")

# Loop over each chain 
for chain, emails in trusted_chains.groupby(['From', 'To']):
    if chain_count >= 4:  # Limit nmber of chains
        break
    print(f"Chain: {chain}")
    print(f"Number of Emails: {len(emails)}")
    print(emails['Body'].head(), "\n")
    chain_count += 1

Number of Chains: 334803
Sample Chains:
Chain: ('1.10043390.-2@multexinvestornetwork.com', 'jwillia@enron.com')
Number of Emails: 38
507999    In today's Daily Update, you'll find research ...
508007    From today's edition of the Daily Update, you ...
508011    In today's Daily Update, we feature research o...
508024    From today's Daily Update, you'll have the opp...
508036    From today's special weekend edition of the Da...
Name: Body, dtype: object 

Chain: ('1.10969419.-2@multexinvestornetwork.com', 'harry.arora@enron.com')
Number of Emails: 11
8041    Our gifts to you this week: Several reports th...
8051    Features: angels find investment opportunities...
8085    Read what independent analyst Charles Payne ha...
8127    To help celebrate the Holiday season and the N...
8141    In today's edition of the Daily Update, equity...
Name: Body, dtype: object 

Chain: ('1.11176403.-2@multexinvestornetwork.com', 'alewis@ect.enron.com')
Number of Emails: 18
279739    In today's Daily U

<h2> Introduce phishing samples: </h2>

In [38]:
import random

In [39]:
#funct generate spoofed emails
def generate_spoofed_email(row):
    spoofed_email = row.copy()
    spoofed_email['From'] = re.sub(r'@.+$', '@spoofed.com', row['From'])  # Change domain
    spoofed_email['Body'] += "\nClick here: http://malicious-link.com"  # Add phishing content
    spoofed_email['Label'] = 1  # Mark as phishing
    return spoofed_email

In [40]:
#clone and spoof percentage of trusted emails
spoofed_emails = trusted_chains.sample(frac=0.2, random_state=42).apply(generate_spoofed_email, axis=1)


In [41]:
#combine legit and spoofed emails
emails_df['Label'] = 0  # Legitimate emails
final_dataset = pd.concat([emails_df, spoofed_emails])

In [42]:
#check the structof spoofed_emails dataframe
print("Spoofed Emails Sample:")
print(spoofed_emails.head())

Spoofed Emails Sample:
                                       file  \
246821      kean-s/discussion_threads/3115.   
244566      kean-s/discussion_threads/1069.   
326450          mcconnell-m/_sent_mail/359.   
275117         lenhart-m/all_documents/366.   
443986  skilling-j/discussion_threads/1075.   

                                           Message-ID  \
246821  <10170593.1075848177616.JavaMail.evans@thyme>   
244566  <28983404.1075848097229.JavaMail.evans@thyme>   
326450   <4093499.1075843986028.JavaMail.evans@thyme>   
275117   <6074789.1075849747487.JavaMail.evans@thyme>   
443986  <25520575.1075845504060.JavaMail.evans@thyme>   

                             Date                         From  \
246821  2001-04-27 02:48:00-07:00    jeff.dasovich@spoofed.com   
244566  2000-08-31 09:16:00-07:00     robert.frank@spoofed.com   
326450  2000-12-04 10:36:00-08:00   mike.mcconnell@spoofed.com   
275117  2001-02-08 03:47:00-08:00  matthew.lenhart@spoofed.com   
443986  2001-03-07 02

In [43]:
#confirm size of spoofed mails matches expected percentage of trusted
expected_count = int(len(trusted_chains) * 0.2)
actual_count = len(spoofed_emails)
print(f"Expected Spoofed Emails: {expected_count}, Actual: {actual_count}")

Expected Spoofed Emails: 66960, Actual: 66961


In [44]:
#confirm combination of ligit and spoofed.*
assert len(final_dataset) == len(emails_df) + len(spoofed_emails), "Mismatch in dataset sizes!"
assert final_dataset['Label'].value_counts()[1] == len(spoofed_emails), "Incorrect spoofed email count!"

In [45]:
#validate concat of dataframes

print("First 5 Rows of Final Dataset:")
print(final_dataset.head())
print("Last 5 Rows of Final Dataset:")
print(final_dataset.tail())

First 5 Rows of Final Dataset:
                       file                                     Message-ID  \
0     allen-p/_sent_mail/1.  <18782981.1075855378110.JavaMail.evans@thyme>   
1    allen-p/_sent_mail/10.  <15464986.1075855378456.JavaMail.evans@thyme>   
2   allen-p/_sent_mail/100.  <24216240.1075855687451.JavaMail.evans@thyme>   
3  allen-p/_sent_mail/1000.  <13505866.1075863688222.JavaMail.evans@thyme>   
4  allen-p/_sent_mail/1001.  <30922949.1075863688243.JavaMail.evans@thyme>   

                        Date                     From  \
0  2001-05-14 16:39:00-07:00  phillip.allen@enron.com   
1  2001-05-04 13:51:00-07:00  phillip.allen@enron.com   
2  2000-10-18 03:00:00-07:00  phillip.allen@enron.com   
3  2000-10-23 06:13:00-07:00  phillip.allen@enron.com   
4  2000-08-31 05:07:00-07:00  phillip.allen@enron.com   

                        To            Subject  \
0     tim.belden@enron.com  Mime-Version: 1.0   
1  john.lavorato@enron.com                Re:   
2   leah.a

In [46]:
emails_df['Label'] = 0#mark all legit emails as 0

In [47]:
spoofed_emails['Label'] = 1 #mark all spoofed emails as 1

In [48]:
#concat legit and spoofed into one dataset.
final_dataset = pd.concat([emails_df, spoofed_emails], ignore_index=True)

In [49]:
#check distribution
print("Label Distribution in Final Dataset:")
print(final_dataset['Label'].value_counts())

Label Distribution in Final Dataset:
Label
0    517401
1     66961
Name: count, dtype: int64


In [50]:
#save final dataset with labels for model prep
final_dataset.to_csv('labelLed_emails_dataset.csv', index=False)
print("Dataset saved successfully.")

Dataset saved successfully.


In [51]:
#inspect the rows
print("Legitimate Emails Sample:")
print(final_dataset[final_dataset['Label'] == 0].head())


Legitimate Emails Sample:
                       file                                     Message-ID  \
0     allen-p/_sent_mail/1.  <18782981.1075855378110.JavaMail.evans@thyme>   
1    allen-p/_sent_mail/10.  <15464986.1075855378456.JavaMail.evans@thyme>   
2   allen-p/_sent_mail/100.  <24216240.1075855687451.JavaMail.evans@thyme>   
3  allen-p/_sent_mail/1000.  <13505866.1075863688222.JavaMail.evans@thyme>   
4  allen-p/_sent_mail/1001.  <30922949.1075863688243.JavaMail.evans@thyme>   

                        Date                     From  \
0  2001-05-14 16:39:00-07:00  phillip.allen@enron.com   
1  2001-05-04 13:51:00-07:00  phillip.allen@enron.com   
2  2000-10-18 03:00:00-07:00  phillip.allen@enron.com   
3  2000-10-23 06:13:00-07:00  phillip.allen@enron.com   
4  2000-08-31 05:07:00-07:00  phillip.allen@enron.com   

                        To            Subject  \
0     tim.belden@enron.com  Mime-Version: 1.0   
1  john.lavorato@enron.com                Re:   
2   leah.arsdal

In [52]:
#inspect the rows 
print("Phishing Emails Sample:")
print(final_dataset[final_dataset['Label'] == 1].head())

Phishing Emails Sample:
                                       file  \
517401      kean-s/discussion_threads/3115.   
517402      kean-s/discussion_threads/1069.   
517403          mcconnell-m/_sent_mail/359.   
517404         lenhart-m/all_documents/366.   
517405  skilling-j/discussion_threads/1075.   

                                           Message-ID  \
517401  <10170593.1075848177616.JavaMail.evans@thyme>   
517402  <28983404.1075848097229.JavaMail.evans@thyme>   
517403   <4093499.1075843986028.JavaMail.evans@thyme>   
517404   <6074789.1075849747487.JavaMail.evans@thyme>   
517405  <25520575.1075845504060.JavaMail.evans@thyme>   

                             Date                         From  \
517401  2001-04-27 02:48:00-07:00    jeff.dasovich@spoofed.com   
517402  2000-08-31 09:16:00-07:00     robert.frank@spoofed.com   
517403  2000-12-04 10:36:00-08:00   mike.mcconnell@spoofed.com   
517404  2001-02-08 03:47:00-08:00  matthew.lenhart@spoofed.com   
517405  2001-03-07 0

In [53]:
#check balance of distrubution
total_legitimate = len(emails_df)
total_phishing = len(spoofed_emails)
print(f"Legitimate: {total_legitimate}, Phishing: {total_phishing}")

Legitimate: 517401, Phishing: 66961


<h2>Feature Engineering</h2>

In [54]:
from datetime import datetime

In [55]:
dataset = pd.read_csv('labelled_emails_dataset.csv')

In [56]:
#***Header features***
#extract sender domain
dataset['Sender_Domain'] = dataset['From'].apply(lambda x: re.search(r'@([\w.-]+)', x).group(1) if isinstance(x, str) and '@' in x else "Unknown")

In [57]:
#count recipient
dataset['Recipient_Count'] = dataset['To'].apply(lambda x: len(x.split(',')) if isinstance(x, str) else 0)

In [58]:
#detect sus keywords in 'Subject'
suspicious_keywords = ['urgent', 'password', 'account', 'verify', 'login']
dataset['Suspicious_Subject'] = dataset['Subject'].apply(
    lambda x: any(keyword in str(x).lower() for keyword in suspicious_keywords)
)


In [59]:
#simulate DKIM + SPF status (real data would require external headers)
dataset['Auth_Status'] = dataset['From'].apply(lambda x: 1 if 'trusted.com' in x else 0)

In [60]:
#***CONTENT FEATURES***
#body lngth
dataset['Body_Length'] = dataset['Body'].apply(lambda x: len(str(x)) if isinstance(x, str) else 0)

In [61]:
#links?
dataset['Contains_Links'] = dataset['Body'].apply(lambda x: 1 if re.search(r'http[s]?://', str(x)) else 0)

In [62]:
#simulate presence of attachments (placeholder)
dataset['Has_Attachment'] = dataset['Body'].apply(lambda x: 1 if 'attachment' in str(x).lower() else 0)

In [63]:
#check phishing keywords in body
phishing_keywords = ['login', 'click', 'verify', 'bank', 'account']
dataset['Phishing_Keywords'] = dataset['Body'].apply(
    lambda x: any(keyword in str(x).lower() for keyword in phishing_keywords)
)

In [64]:
#***BEHAVIORAL FEATURES***
#simulate frequency of interaction (based on the From-To pair)
interaction_counts = dataset.groupby(['From', 'To']).size()
dataset['Interaction_Count'] = dataset.apply(
    lambda row: interaction_counts.get((row['From'], row['To']), 0), axis=1
)

In [76]:
#drop rows with invalid 'Date' vals
dataset = dataset.dropna(subset=['Date'])

In [77]:
#sort vals before calculating time deltas
dataset = dataset.sort_values(by=['From', 'To', 'Date'])

In [78]:
#calc time deltas between consecutive emails
dataset['Time_Delta'] = dataset.groupby(['From', 'To'])['Date'].diff()
dataset['Time_Delta'] = dataset['Time_Delta'].dt.total_seconds()  # Convert to seconds

In [79]:
#calc average time between emails
dataset['Avg_Time_Between_Emails'] = dataset.groupby(['From', 'To'])['Time_Delta'].transform('mean')

In [71]:
# Add a feature for emails sent at odd hours (e.g., late night)
dataset['Odd_Hours'] = dataset['Date'].apply(lambda x: 1 if x.hour < 6 or x.hour > 22 else 0)


In [74]:
#fill missing values in 'Time_Delta' and 'Avg_Time_Between_Emails' columns
dataset['Time_Delta'] = dataset['Time_Delta'].fillna(0)
dataset['Avg_Time_Between_Emails'] = dataset['Avg_Time_Between_Emails'].fillna(0)

In [75]:
#confirm dataset
print(dataset[['From', 'To', 'Date', 'Time_Delta', 'Avg_Time_Between_Emails', 'Odd_Hours']].head())

                                  From  \
406027   'todd'.delahoussaye@enron.com   
371016  --migrated--bmishkin@ercot.com   
509994              -nikole@excite.com   
509870              -nikole@excite.com   
509830              -nikole@excite.com   

                                                       To  \
406027  derek.bailey@enron.com, jean.bell@enron.com, r...   
371016                               mockmarket@ercot.com   
509994                            bill.williams@enron.com   
509870                            bill.williams@enron.com   
509830                            bill.williams@enron.com   

                            Date  Time_Delta  Avg_Time_Between_Emails  \
406027 2001-10-24 06:50:26-07:00         0.0                 0.000000   
371016 2001-10-22 15:10:32-07:00         0.0                 0.000000   
509994 2001-05-31 02:12:54-07:00         0.0            504852.714286   
509870 2001-06-19 15:49:37-07:00   1690603.0            504852.714286   
509830 2001-06-

In [80]:
#***SAVE FINAL DATASET***
#drop unnecessary columns to focus on features
columns_to_drop = ['file', 'message', 'Recipients_List']  # add others not required for modeling
dataset = dataset.drop(columns=[col for col in columns_to_drop if col in dataset])

In [81]:
#save engineered dataset
processed_filename = 'engineered_labeled_emails_dataset.csv'
dataset.to_csv(processed_filename, index=False)
print(f"Feature-engineered dataset saved to {processed_filename}")

Feature-engineered dataset saved to engineered_labeled_emails_dataset.csv


<h2>Preprocessing</h2>

<h4>Encoding categorical features</h4>

In [82]:
from sklearn.preprocessing import LabelEncoder

In [83]:
#create copies of dataset to avoid any overwriting
preprocessed_dataset = dataset.copy()

In [86]:
#init label encoders
label_encoder_from = LabelEncoder()
label_encoder_to = LabelEncoder()

In [87]:
#encode From and To columns
preprocessed_dataset['From_Encoded'] = label_encoder_from.fit_transform(preprocessed_dataset['From'])
preprocessed_dataset['To_Encoded'] = label_encoder_to.fit_transform(preprocessed_dataset['To'])

In [88]:
#drop og categorical columns if encoded version replaces
preprocessed_dataset = preprocessed_dataset.drop(columns=['From', 'To'])

In [89]:
#inspect
print(preprocessed_dataset[['From_Encoded', 'To_Encoded']].head())

        From_Encoded  To_Encoded
406027             0        7562
371016             1       21457
509994             2        3363
509870             2        3363
509830             2        3363


In [92]:
# save encoders for later use in Flask app (if need)
import pickle
with open('label_encoder_from.pkl', 'wb') as f:
    pickle.dump(label_encoder_from, f)

with open('label_encoder_to.pkl', 'wb') as f:
    pickle.dump(label_encoder_to, f)


In [93]:
#save the preprocessed dataset
preprocessed_dataset.to_csv('preprocessed_dataset.csv', index=False)
print("Categorical features encoded and dataset saved successfully.")

Categorical features encoded and dataset saved successfully.


<h2>Normalisation</h2>

In [94]:
numerical_features = ['Body_Length', 'Num_Recipients', 'Subject_Word_Count', 
                      'Timestamp', 'Time_Delta', 'Avg_Time_Between_Emails']


In [99]:
#init scaler
scaler = StandardScaler()

In [100]:
#apply scaler to normalize numerical features
dataset[numerical_features] = scaler.fit_transform(dataset[numerical_features])

In [101]:
#verify normalised values
print("Normalized Numerical Features:")
print(dataset[numerical_features].head())

Normalized Numerical Features:
        Body_Length  Num_Recipients  Subject_Word_Count  Timestamp  \
406027     0.188646        1.107863           -0.958488   1.285490   
371016    -0.227762       -0.321590           -0.958488   1.277355   
509994     0.147819       -0.321590           -1.349887   0.565915   
509870    -0.016689       -0.321590           -0.958488   0.662227   
509830     0.283642       -0.321590           -0.958488   0.686146   

        Time_Delta  Avg_Time_Between_Emails  
406027         NaN                      NaN  
371016         NaN                      NaN  
509994         NaN                -0.066012  
509870    0.385147                -0.066012  
509830   -0.060319                -0.066012  


<h2>Train test split</h2>

In [102]:
from sklearn.model_selection import train_test_split

In [103]:
#define features (X) and target (y)
X = dataset.drop(columns=['Label'])  #drop target column
y = dataset['Label']  #target column


In [104]:
#80-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [105]:
#verify split
print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])
print("Label distribution in training set:")
print(y_train.value_counts(normalize=True))
print("Label distribution in testing set:")
print(y_test.value_counts(normalize=True))

Training set size: 242912
Testing set size: 60729
Label distribution in training set:
Label
0    0.886193
1    0.113807
Name: proportion, dtype: float64
Label distribution in testing set:
Label
0    0.886199
1    0.113801
Name: proportion, dtype: float64


In [113]:
non_numeric_columns = X_train.select_dtypes(include=['object', 'category']).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['Message-ID', 'From', 'To', 'Subject', 'Body', 'Sender_Domain'], dtype='object')


In [114]:
#drop irrelevant non numeric columns
X_train = X_train.drop(columns=non_numeric_columns, errors='ignore')
X_test = X_test.drop(columns=non_numeric_columns, errors='ignore')


In [115]:
#ensure all features are numeric
print("X_train dtypes after dropping non-numeric columns:")
print(X_train.dtypes)

X_train dtypes after dropping non-numeric columns:
Date                       datetime64[ns, UTC-07:00]
Body_Length                                  float64
Num_Recipients                               float64
Subject_Word_Count                           float64
Timestamp                                    float64
Is_Trusted                                      bool
Recipient_Count                                int64
Suspicious_Subject                              bool
Auth_Status                                    int64
Contains_Links                                 int64
Has_Attachment                                 int64
Phishing_Keywords                               bool
Interaction_Count                              int64
Time_Delta                                   float64
Avg_Time_Between_Emails                      float64
Odd_Hours                                      int64
dtype: object


In [146]:
#check for remaining issues
print("Sample of X_train:", X_train.head())

Sample of X_train:               Date  Body_Length  Num_Recipients  Subject_Word_Count  \
36952   1003869992    -0.111685        0.631379            1.781310   
148686   987831720    -0.078197       -0.321590            0.998511   
91549    960152760    -0.048577       -0.321590           -0.958488   
283993   987687720    -0.217221       -0.321590            0.998511   
213916   971813460    -0.005082        0.154895           -0.958488   

        Timestamp  Is_Trusted  Recipient_Count  Suspicious_Subject  \
36952    1.281990        True                3               False   
148686   0.368311        True                1               False   
91549   -1.208523        True                1               False   
283993   0.360107        True                1               False   
213916  -0.544229       False                2               False   

        Auth_Status  Contains_Links  Has_Attachment  Phishing_Keywords  \
36952             0               0               0        

In [138]:
from sklearn.impute import SimpleImputer

In [139]:
#create an imputer object with strategy replacing NaN with mean of the column
imputer = SimpleImputer(strategy='mean')

#apply imputer to both the training and test data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

#verify
print(pd.isnull(X_train_imputed).sum())

0


<h2>SVM model training</h2>

In [140]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [141]:
#init model
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)  # 'rbf' kernel for non-linear classification

In [160]:
#weight adjustment after first result set.
svm_model = SVC(class_weight='balanced')

In [161]:
# Ttrain the SVM model
svm_model.fit(X_train_imputed, y_train)

In [162]:
#check for missing values in X_test
print(X_test.isnull().sum())

Date                           0
Body_Length                    0
Num_Recipients                 0
Subject_Word_Count             0
Timestamp                      0
Is_Trusted                     0
Recipient_Count                0
Suspicious_Subject             0
Auth_Status                    0
Contains_Links                 0
Has_Attachment                 0
Phishing_Keywords              0
Interaction_Count              0
Time_Delta                 13319
Avg_Time_Between_Emails     4877
Odd_Hours                      0
dtype: int64


In [163]:
#usesame imputer to transform the test data
X_test_imputed = imputer.transform(X_test)

#check again for missing values
print(pd.isnull(X_test_imputed).sum())


0


In [None]:
y_pred = svm_model.predict(X_test_imputed)

In [None]:
#evaluate the model
print("SVM Model Performance on Test Data:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


In [None]:
#confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

In [159]:

if emails_df.isnull().values.any():
    print("There are missing values in the dataset")
else:
    print("No missing values in the dataset")

No missing values in the dataset
