In [2]:

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter


In [3]:
# Fetch the NLTK data packages used by the tokenizer, stop-word list and
# lemmatizer imported in the first cell.
nltk.download('punkt')
# NLTK 3.9+ loads word_tokenize's Punkt model from the 'punkt_tab' package
# instead of 'punkt'; without it tokenization raises LookupError there.
nltk.download('punkt_tab')
nltk.download('stopwords')
# Kept as the final expression so the cell still displays the download result.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


True

In [5]:
# Read the three regular CSV exports with pandas' Python parsing engine.
# The order of the paths matches the order of the names being assigned.
human_legit, human_phishing, llm_legit = [
    pd.read_csv(csv_path, engine="python")
    for csv_path in (
        r"C:\Users\user\Desktop\Phishing_email\data\human_legit.csv",
        r"C:\Users\user\Desktop\Phishing_email\data\human_phishing.csv",
        r"C:\Users\user\Desktop\Phishing_email\data\llm_legit.csv",
    )
]




In [6]:


def load_llm_phishing_correctly(path):
    """Load a two-field ``text,label`` file by splitting each row at its LAST comma.

    The file is read line by line rather than through a CSV parser: everything
    before the final comma of a line is taken as the email text (which may
    itself contain commas) and everything after it as the integer label.
    The first line is treated as a header and discarded; blank lines are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded file.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``label`` (int), one row per non-blank data line.
        A file with no data rows (or no content at all) yields an empty frame
        with those two columns.

    Raises
    ------
    ValueError
        If a non-blank data line contains no comma, or the part after its last
        comma is not an integer. The message names the offending line number.
    """
    texts = []
    labels = []

    with open(path, "r", encoding="utf-8") as f:
        # Skip the header; the default keeps a completely empty file from
        # raising StopIteration.
        next(f, None)

        # Data starts on physical line 2 (line 1 is the header).
        for line_no, line in enumerate(f, start=2):
            line = line.strip()
            if not line:
                continue

            # Split only at the LAST comma so commas inside the text survive.
            text, separator, label = line.rpartition(",")
            if not separator:
                raise ValueError(
                    f"{path}: line {line_no} has no comma separating text from label"
                )

            try:
                label_value = int(label)
            except ValueError:
                raise ValueError(
                    f"{path}: line {line_no} has a non-integer label {label!r}"
                ) from None

            texts.append(text)
            labels.append(label_value)

    return pd.DataFrame({
        "text": texts,
        "label": labels
    })


# Parse the LLM phishing file with the last-comma loader defined above,
# then preview the first rows followed by the (rows, columns) shape.
llm_phishing_path = r"C:\Users\user\Desktop\Phishing_email\data\llm_phishing.csv"
llm_phishing = load_llm_phishing_correctly(llm_phishing_path)

print(llm_phishing.head(), llm_phishing.shape, sep="\n")


                                                text  label
0  Dear User, We have received reports indicating...      1
1  Dear Sarah Thompson, I hope this email finds y...      1
2  Dear Michael, I hope this email finds you well...      1
3  Dear Sarah, I am Daniel Anderson, the Security...      1
4  Dear John, I hope this email finds you well. A...      1
(1000, 2)


In [7]:
# Report (rows, columns) for each of the four source frames, one per line.
for frame in (human_legit, human_phishing, llm_legit, llm_phishing):
    print(frame.shape)


(1000, 7)
(1000, 7)
(1000, 2)
(1000, 2)


In [8]:
# Show the column index of each human-written frame under its own heading,
# with a blank line after each listing.
for heading, frame in (
    ("Human Legit Columns:", human_legit),
    ("Human Phishing Columns:", human_phishing),
):
    print(heading)
    print(frame.columns)
    print()


Human Legit Columns:
Index(['sender', 'receiver', 'date', 'subject', 'body', 'urls', 'label'], dtype='str')

Human Phishing Columns:
Index(['sender', 'receiver', 'date', 'subject', 'body', 'urls', 'label'], dtype='str')



In [9]:
# Show the column index of each LLM-generated frame under its own heading,
# with a blank line after each listing.
llm_column_report = {
    "LLM Legit Columns:": llm_legit,
    "LLM Phishing Columns:": llm_phishing,
}
for heading, frame in llm_column_report.items():
    print(heading, frame.columns, "", sep="\n")


LLM Legit Columns:
Index(['text', 'label'], dtype='str')

LLM Phishing Columns:
Index(['text', 'label'], dtype='str')



In [10]:
# Combine subject + body into a single text column.
# Missing values are replaced with "" before concatenating. Without that,
# depending on the pandas version, a missing subject/body either becomes the
# literal word "nan" in the text (object dtype) or stays missing and makes the
# whole combined text missing (pandas 3 `str` dtype).
human_legit['text'] = (
    human_legit['subject'].fillna("").astype(str)
    + " "
    + human_legit['body'].fillna("").astype(str)
)
human_phishing['text'] = (
    human_phishing['subject'].fillna("").astype(str)
    + " "
    + human_phishing['body'].fillna("").astype(str)
)

# Assign final labels
human_legit['final_label'] = 0      # Legitimate
human_phishing['final_label'] = 1   # Human Phishing


In [11]:
# Tag each LLM-generated frame with its class id in the merged label scheme:
#   LLM Legitimate -> 0
#   LLM Phishing   -> 2
for frame, class_id in ((llm_legit, 0), (llm_phishing, 2)):
    frame['final_label'] = class_id


In [12]:
# Preview the first rows of the combined text and assigned label for the
# human-written frames; a blank line separates the two previews.
sample_cols = ['text', 'final_label']

print("Human Legit Sample:")
print(human_legit.head()[sample_cols], end="\n\n")

print("Human Phishing Sample:")
print(human_phishing.head()[sample_cols])


Human Legit Sample:
                                                text  final_label
0  Starting IC with wizard Hi\r\n\r\n\t\tI am run...            0
1  Trade Me -- A question on your auction: Auctio...            0
2  Trade Me - A request from a Trade Me member. A...            0
3  RE: NorthTec Account/Password Hi Tony\r\nNot s...            0
4  2008 timetable Kindly suggest changes\r\n\r\n-...            0

Human Phishing Sample:
                                                text  final_label
0  Your MetaMask wallet will be suspended Verify ...            1
1  Your MetaMask wallet will be suspended Verify ...            1
2  Your shipment is on the way Announcing JotForm...            1
3  Your shipment is on the way Announcing JotForm...            1
4  Netflix : We're having some trouble with your ...            1


In [13]:
# Sanity check: each human-written frame should carry a single class id.
# The empty string in the first call produces the blank separator line.
print("Human Legit Label Counts:", human_legit['final_label'].value_counts(), "", sep="\n")
print("Human Phishing Label Counts:", human_phishing['final_label'].value_counts(), sep="\n")


Human Legit Label Counts:
final_label
0    1000
Name: count, dtype: int64

Human Phishing Label Counts:
final_label
1    1000
Name: count, dtype: int64


In [14]:
# Preview the first rows of text and assigned label for the LLM-generated
# frames; a blank line is printed between (not after) the previews.
llm_samples = [
    ("LLM Legit Sample:", llm_legit),
    ("LLM Phishing Sample:", llm_phishing),
]
for position, (heading, frame) in enumerate(llm_samples):
    if position > 0:
        print()
    print(heading)
    print(frame[['text', 'final_label']].head())


LLM Legit Sample:
                                                text  final_label
0  Dear Michael, I hope this message finds you we...            0
1  Dear Jennifer, We hope you're doing well. We'r...            0
2  Dear Robert, Your attention is urgently requir...            0
3  Dear Emily, We're writing to remind you of the...            0
4  Dear William, We need your immediate attention...            0

LLM Phishing Sample:
                                                text  final_label
0  Dear User, We have received reports indicating...            2
1  Dear Sarah Thompson, I hope this email finds y...            2
2  Dear Michael, I hope this email finds you well...            2
3  Dear Sarah, I am Daniel Anderson, the Security...            2
4  Dear John, I hope this email finds you well. A...            2


In [15]:
# Sanity check: each LLM-generated frame should carry a single class id.
legit_counts = llm_legit['final_label'].value_counts()
phishing_counts = llm_phishing['final_label'].value_counts()

print("LLM Legit Label Counts:")
print(legit_counts, end="\n\n")

print("LLM Phishing Label Counts:")
print(phishing_counts)


LLM Legit Label Counts:
final_label
0    1000
Name: count, dtype: int64

LLM Phishing Label Counts:
final_label
2    1000
Name: count, dtype: int64


In [16]:
# Reduce every frame to the two columns shared by all four sources so they
# can be stacked. Each name is rebound to the trimmed copy of its own frame.
shared_cols = ['text', 'final_label']
human_legit, human_phishing, llm_legit, llm_phishing = (
    frame[shared_cols]
    for frame in (human_legit, human_phishing, llm_legit, llm_phishing)
)


In [17]:
# Stack the four sources (human legit, human phishing, LLM legit, LLM phishing)
# into one frame with a fresh 0..n-1 index, then summarise size and class balance.
frames_in_order = [human_legit, human_phishing, llm_legit, llm_phishing]
df = pd.concat(frames_in_order, ignore_index=True)

print("Final Dataset Shape:", df.shape)
label_counts = df['final_label'].value_counts()
print(label_counts)


Final Dataset Shape: (4000, 2)
final_label
0    2000
1    1000
2    1000
Name: count, dtype: int64


In [18]:
# Confirm the merged frame's column names and preview its first rows.
merged_columns = list(df.columns)
print("Merged Dataset Columns:", merged_columns, df.head(), sep="\n")


Merged Dataset Columns:
['text', 'final_label']
                                                text  final_label
0  Starting IC with wizard Hi\r\n\r\n\t\tI am run...            0
1  Trade Me -- A question on your auction: Auctio...            0
2  Trade Me - A request from a Trade Me member. A...            0
3  RE: NorthTec Account/Password Hi Tony\r\nNot s...            0
4  2008 timetable Kindly suggest changes\r\n\r\n-...            0


In [20]:
# Persist the merged dataset (text + label only) without the row index.
merged_csv_path = r"C:\Users\user\Desktop\Phishing_email\data\merged_dataset.csv"
df = df.loc[:, ['text', 'final_label']]
df.to_csv(merged_csv_path, index=False)


In [21]:
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   text         3998 non-null   str  
 1   final_label  4000 non-null   int64
dtypes: int64(1), str(1)
memory usage: 62.6 KB
None
