# Phishing emails detection

## Libraries

In [28]:
import nltk
# Fetch the NLTK stop-word corpus needed by stopwords.words('english') in the EDA;
# when the corpus is already present, NLTK only reports that it is up to date.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/tofeha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
import numpy as numpy
import pandas as pd

import os
from collections import Counter
import re
from nltk.corpus import stopwords


# Data directory. The trailing slash matters: the shell cells build paths as {DATA}<file name>.
DATA = "data/"
# Class id (as a string key) -> label text found in the raw CSV.
# The preprocessing cell inverts this mapping to encode labels as integers.
MAPPING = {
    "0": "Safe Email",
    "1": "Phishing Email",
}

# Fail fast: the cells below read their inputs from, and write their outputs to, this directory.
if not os.path.exists(DATA):
    raise FileNotFoundError(f"Data directory {DATA} does not exist. Please create it and add the data files.")

In [30]:
# The data is here, unzip it if not already done.
# Extraction only runs when Phishing_Email.csv is missing; -o overwrites existing files without prompting.
!if [ ! -f {DATA}Phishing_Email.csv ]; then unzip -o {DATA}phishing_email.zip -d {DATA}; fi


## Preprocessing (Execute ONLY ONCE)

These cells will create the `phishing_email_cleaned.csv` file, and zip it (for GitHub).

No need to run these cells again, unless you want to update the dataset.

In [31]:
# Load the raw dataset; the preprocessing cell below uses its "Email Text" and "Email Type" columns.
emails = pd.read_csv(os.path.join(DATA, "Phishing_Email.csv"))

In [32]:
my_columns = ["Email Text", "Email Type"]

# Map label strings to integers: "Safe Email" -> 0, "Phishing Email" -> 1
label_map = {v: int(k) for k, v in MAPPING.items()}

# Build the cleaned frame in one chain so the cell is idempotent and `emails` is left untouched.
cleaned_emails = (
    emails[my_columns]
    # Rename columns
    .rename(columns={
        my_columns[0]: "text",
        my_columns[1]: "label",
    })
    .assign(
        # NA emails are just empty (only the text column is filled; labels stay NaN when missing).
        text=lambda df: df["text"].fillna("").astype(str),
        # Labels that are not in MAPPING (or are missing) become NaN here.
        label=lambda df: df["label"].map(label_map),
    )
    # Remove rows whose label is unknown BEFORE the integer cast:
    # casting a column that still contains NaN to int8 raises.
    .dropna(subset=["label"])
    .astype({"label": "int8"})
)



### Save as CSV file in a zip file


In [33]:
# Build the destination path once and reuse it for both the write and the log message.
cleaned_csv_path = os.path.join(DATA, "phishing_email_cleaned.csv")
cleaned_emails.to_csv(cleaned_csv_path, index=False)  # index=False: the row index is not part of the dataset
print(f"Cleaned emails saved to {cleaned_csv_path}")

Cleaned emails saved to data/phishing_email_cleaned.csv


In [34]:
! if [ ! -f {DATA}phishing_email_cleaned.zip ]; then zip -r {DATA}phishing_email_cleaned.zip {DATA}phishing_email_cleaned.csv; fi
print(f"Cleaned emails zipped to {os.path.join(DATA, 'phishing_email_cleaned.zip')}")

  adding: data/phishing_email_cleaned.csv (deflated 64%)
Cleaned emails zipped to data/phishing_email_cleaned.zip


## EDA

Each email has the following features (column names after preprocessing):
- `text` (originally `Email Text`): The content of the email.
- `label` (originally `Email Type`): The label indicating whether the email is legitimate (`0`) or phishing (`1`).

We will separate the analysis into both classification types.

Analyses that cover both types will not focus on numerical properties (safe emails are more numerous), but rather on text properties.


In [35]:
# Check if the CSV file exists, if not, unzip the provided zip file
! if [ ! -f {DATA}phishing_email_cleaned.csv ]; then unzip -o {DATA}phishing_email_cleaned.zip -d {DATA}; fi

In [60]:
# Load the cleaned dataset produced by the preprocessing section.
emails = pd.read_csv(os.path.join(DATA, "phishing_email_cleaned.csv"))
# Empty texts are stored as empty CSV fields and come back as NaN; restore "" first,
# otherwise astype(str) would turn them into the literal word "nan" and skew the word statistics.
emails["text"] = emails["text"].fillna("").astype(str)
# Independent copies per class so that columns can be added to them later without touching `emails`.
safe_emails = emails[emails["label"] == 0].copy()
phishing_emails = emails[emails["label"] == 1].copy()

### General information

In [61]:
# Peek at the last rows to check the cleaned columns (text, integer label).
emails.tail(5)

Unnamed: 0,text,label
18645,date a lonely housewife always wanted to date ...,1
18646,request submitted : access request for anita ....,0
18647,"re : important - prc mtg hi dorn & john , as y...",0
18648,press clippings - letter on californian utilit...,0
18649,empty,1


### Most common words

In [62]:
def get_most_common_words(all_words, len_minimum=0, top_n=10, stop_words=True, special_chars=False):
    """
    Rank the words of a corpus by frequency (case-insensitive).

    Args:
        all_words (list): Every word of the corpus, in order of appearance.
        len_minimum (int): Words shorter than this are ignored.
        top_n (int): How many (word, count) pairs to return.
        stop_words (bool): When True, English stop words (NLTK list) are ignored.
        special_chars (bool): When False, only purely alphabetic (a-z, A-Z) words are kept;
            when True, any non-empty word is accepted.
    Returns:
        list: Up to ``top_n`` tuples ``(lowercased word, count)``, most frequent first.
    """
    # An empty exclusion set makes the stop-word test a no-op when filtering is disabled.
    excluded = set(stopwords.words('english')) if stop_words else set()
    pattern = re.compile(r'.+' if special_chars else r'[a-zA-Z]+')

    tally = Counter()
    for token in all_words:
        lowered = token.lower()
        if lowered in excluded:
            continue
        # Length and shape are checked on the original token; only the counting key is lowercased.
        if len(token) < len_minimum or pattern.fullmatch(token) is None:
            continue
        tally[lowered] += 1

    return tally.most_common(top_n)


#### Safe emails

In [63]:
# Flatten every safe email into one list of whitespace-separated tokens.
all_safe_words = []
for email_text in safe_emails["text"]:
    all_safe_words.extend(email_text.split())

# Top 10 words, without any minimum length.
get_most_common_words(all_safe_words, len_minimum=0, top_n=10)

[('enron', 19266),
 ('university', 17162),
 ('language', 16795),
 ('one', 11085),
 ('ect', 11002),
 ('would', 10816),
 ('new', 10240),
 ('information', 10033),
 ('please', 9881),
 ('linguistics', 8908)]

In [64]:
# Same ranking, restricted to words of at least 4 characters.
get_most_common_words(all_safe_words, len_minimum=4, top_n=10)

[('enron', 19266),
 ('university', 17162),
 ('language', 16795),
 ('would', 10816),
 ('information', 10033),
 ('please', 9881),
 ('linguistics', 8908),
 ('also', 8310),
 ('conference', 7954),
 ('papers', 6882)]

#### Phishing emails

In [65]:
# Flatten every phishing email into one list of whitespace-separated tokens.
all_phishing_words = []
for email_text in phishing_emails["text"]:
    all_phishing_words.extend(email_text.split())

# Top 10 words, without any minimum length.
get_most_common_words(all_phishing_words, len_minimum=0, top_n=10)

[('email', 5126),
 ('com', 3937),
 ('free', 3936),
 ('get', 3822),
 ('please', 3753),
 ('company', 3746),
 ('information', 3535),
 ('money', 3490),
 ('one', 3466),
 ('business', 3258)]

In [66]:
# Same ranking, restricted to words of at least 4 characters.
get_most_common_words(all_phishing_words, len_minimum=4, top_n=10)

[('email', 5126),
 ('free', 3936),
 ('please', 3753),
 ('company', 3746),
 ('information', 3535),
 ('money', 3490),
 ('business', 3258),
 ('http', 3113),
 ('report', 2960),
 ('time', 2945)]

It seems that phishing emails are more likely to contain words related to urgency, action, and financial terms.

Safe emails, on the other hand, tend to be more neutral and informative.

### Words per email

In [67]:
# Add the per-email word count (whitespace-separated tokens) to both class-specific frames.
for class_frame in (safe_emails, phishing_emails):
    class_frame["number_of_words"] = class_frame["text"].apply(lambda text: len(text.split()))

#### Safe emails

In [None]:
# Summary statistics of the per-email word count for safe emails, rounded to 2 decimals.
safe_emails["number_of_words"].describe().round(2)

count      11322.00
mean         685.87
std        33160.24
min            1.00
25%           79.00
50%          179.00
75%          389.00
max      3527576.00
Name: number_of_words, dtype: float64

#### Phishing emails

In [71]:
# Summary statistics of the per-email word count for phishing emails, rounded to 2 decimals.
phishing_emails["number_of_words"].describe().round(2)

count     7328.00
mean       301.19
std        569.72
min          0.00
25%         68.00
50%        136.00
75%        290.00
max      11625.00
Name: number_of_words, dtype: float64

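Phishing emails tend to be shorter than safe emails (median of 136 words versus 179; the safe-email mean is inflated by a single outlier of about 3.5 million words, so the medians are the fairer comparison), which may indicate a more direct approach to trick the recipient.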