# Dataset Preprocessing
This notebook covers loading, cleaning, and preprocessing the phishing email datasets for use in machine learning models.

In [7]:
# Install required packages
%pip install numpy pandas nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# Import necessary libraries
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
# Load datasets
# Update the paths as needed for your environment
paths = [
    '../sample_data/CEAS_08.csv',
    '../sample_data/Nigerian_Fraud.csv',
    '../sample_data/SpamAssasin.csv',
    '../sample_data/Nazario.csv'
]
dfs = [pd.read_csv(path) for path in paths]
df = pd.concat(dfs, ignore_index=True)
print(f"Combined dataset shape: {df.shape}")
print(df.isnull().sum())

# check for missing values
print("Missing values in each column:")
# Randomly populate the subjects of emails that are missing but only if those emails are spam 
missing_spam_subjects = df['subject'].isnull() & (df['label'] == 1)
num_missing_spam_subjects = missing_spam_subjects.sum()
if num_missing_spam_subjects > 0:
    spam_subjects = df.loc[(df['label'] == 1) & df['subject'].notnull(), 'subject']
    df.loc[missing_spam_subjects, 'subject'] = np.random.choice(spam_subjects, size=num_missing_spam_subjects, replace=True)
#view the emails subjects and bodys of the missing values for subject 

print("Missing subjects and their corresponding bodies:")
missing_ham_subjects = df['subject'].isnull() & (df['label'] == 0)
titles = [
    "Pattern Recognition Contest (PRC)",
    "Family Greetings and Update",
    "Issue with Spam Filter Whitelist Configuration",
    "Ready for Monday's Competition!",
    "Exam and Student Feedback Update",
    "Scholarships & Awards Advertisement - 26 March 2008",
    "Re-sending Information on Playgroup",
    "Updates on Slide Show Functionality",
    "Request for Flash Support in Slide Show",
    "Flu Vaccine Reminder",
    "Massey University Auckland Graduation Ceremonies",
    "Order Jacs' Birthday Present",
    "ANZAC Day Poppies Availability",
    "Problem with Spamtrap - Missing Directory",
    "Spamtrap Lock Issue - File Exists",
    "Spamtrap Lock Issue - File Exists (Repeated)",
    "Wireless Network Configuration and Troubleshooting",
    "Wireless Network Bridging and Access Points Issue",
    "MSN Photos Service Overview"
]

# Set titles for the missing subjects 
df.loc[missing_ham_subjects, 'subject'] = titles[:num_missing_spam_subjects]

df.dropna(subset=['subject', 'body'], inplace=True)

print(df.isnull().sum())    





Combined dataset shape: (49860, 7)
sender       331
receiver    2092
date         483
subject       87
body           1
label          0
urls           0
dtype: int64
Missing values in each column:
Missing subjects and their corresponding bodies:
sender       331
receiver    2092
date         483
subject        0
body           0
label          0
urls           0
dtype: int64


In [10]:
# Clean text data: lowercase, remove punctuation, digits, extra spaces, and stopwords
def clean_text(text):
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS])
    return text
print(df.isnull().sum())    

df['subject'] = df['subject'].apply(clean_text)
df['body'] = df['body'].apply(clean_text)
print(df.isnull().sum())    

print("Applied text cleaning and stopword removal on 'subject' and 'body'.")



sender       331
receiver    2092
date         483
subject        0
body           0
label          0
urls           0
dtype: int64
sender       331
receiver    2092
date         483
subject        0
body           0
label          0
urls           0
dtype: int64
Applied text cleaning and stopword removal on 'subject' and 'body'.


In [None]:
# Save the preprocessed dataset for use in model training
print(df.isnull().sum())    
df.to_csv('../sample_data/preprocessed_dataset.csv', index=False)
print("Preprocessed dataset saved to '../sample_data/preprocessed_dataset.csv'")
# load the preprocessed dataset
df = pd.read_csv('../sample_data/preprocessed_dataset.csv')
print(df.isnull().sum())
# drop missing values
df.dropna(subset=['subject', 'body'], inplace=True)
print("Dropped missing values from the dataset.")
#save the cleaned dataset
df.to_csv('../sample_data/cleaned_dataset.csv', index=False)
df = pd.read_csv('../sample_data/cleaned_dataset.csv')
print("Cleaned dataset saved to '../sample_data/cleaned_dataset.csv'")
print(df.isnull().sum())
# Check the distribution of labels
print("Label distribution:")
print(df['label'].value_counts())
# total number of emails




sender       331
receiver    2092
date         483
subject        0
body           0
label          0
urls           0
dtype: int64
Preprocessed dataset saved to '../sample_data/preprocessed_dataset.csv'
sender       331
receiver    2092
date         483
subject     1085
body          12
label          0
urls           0
dtype: int64
Dropped missing values from the dataset.
Cleaned dataset saved to '../sample_data/cleaned_dataset.csv'
sender       330
receiver    2067
date         479
subject        0
body           0
label          0
urls           0
dtype: int64
Label distribution:
label
1    27391
0    21371
Name: count, dtype: int64


In [12]:
total_emails = df.shape[0]
print(f"Total number of emails: {total_emails}")

Total number of emails: 48762
