In [None]:
!pip install tldextract
!pip install xgboost
!pip install pandas numpy beautifulsoup4 textblob scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install scipy
!pip install nltk
!pip install wordcloud
!pip install scipy
!pip install nltk
!pip install wordcloud

In [None]:
import pandas as pd
import re
import urllib.parse
from collections import Counter
import tldextract
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Creating the phishing email dataset.
# Merges the "Seven Phishing Email Datasets" collection into a single dataset
# (https://figshare.com/articles/dataset/Seven_Phishing_Email_Datasets/25432108)
import glob

# Path to your datasets; adjust the pattern if they are in a different folder or format.
dataset_pattern = "7PhisingEmailsDataset/*.csv"

# glob returns files in arbitrary, filesystem-dependent order; sorting keeps the
# row order of the merged CSV identical from run to run and machine to machine.
dataset_files = sorted(glob.glob(dataset_pattern))

# List for storing the reduced DataFrames
dataframes = []

for file in dataset_files:
    try:
        # Read only the columns we need; any extra columns in the file are ignored.
        df = pd.read_csv(file, usecols=['subject', 'body', 'label'])
        dataframes.append(df)
        print(f"Loaded {file} with {len(df)} rows")
    except Exception as e:
        # Best-effort load: a file that cannot be read (for example because one of
        # the three required columns is missing) is reported and skipped.
        print(f"Error loading {file}: {e}")

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list; fail with a message that points at the actual cause instead.
if not dataframes:
    raise ValueError(
        f"No datasets could be loaded from {dataset_pattern!r}. Check the folder path "
        "and that each CSV contains 'subject', 'body' and 'label' columns."
    )

# Concatenate all reduced DataFrames into one
merged_df = pd.concat(dataframes, ignore_index=True)
print(f"\nMerged dataset shape: {merged_df.shape}")

# Save as CSV
merged_df.to_csv("merged_phishing_emails.csv", index=False)
print("Saved merged dataset to merged_phishing_emails.csv")

# Preview first few rows
print("\nSample data:")
print(merged_df.head())
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
merged_df.info()


In [None]:
# Next step is to take a look at the dataset and see if:
#   1. The dataset is balanced
#   2. The dataset is clean

# Load the dataset
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("merged_phishing_emails.csv")
print(f"Dataframe Shape Before Making Changes: {df.shape}")

# Preview the dataset.
# Only the LAST expression of a notebook cell is displayed automatically, so
# every intermediate check is printed explicitly; otherwise its result is lost.
print("\nMissing values per column:")
print(df.isnull().sum())
# Reassign instead of inplace=True: same result, no hidden in-place mutation.
df = df.dropna()
df.info()
print(df.head())

# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")
df = df.drop_duplicates()
print(f"Dataframe Shape After Making Changes: {df.shape}")

# Check for balance
print("\nLabel distribution:")
print(df['label'].value_counts())

df.to_csv("cleaned_phishing_emails.csv", index=False)


: 

In [None]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from textblob import TextBlob
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def count_keywords(text):
    """Count whole-phrase occurrences of suspicious keywords in ``text``.

    Matching is case-insensitive (the text is lower-cased first) and anchored on
    word boundaries, so "urgently" does not count as "urgent".

    Args:
        text: The string to scan.

    Returns:
        dict: One entry per keyword, keyed ``"count_<keyword with spaces as
        underscores>"``, in the fixed order of the keyword list below.
    """
    suspicious_phrases = (
        "urgent", "verify your account", "click here", "login now", "password reset",
        "account suspended", "update your information", "confirm your identity",
        "secure your account", "action required",
    )
    lowered = text.lower()
    return {
        "count_" + phrase.replace(" ", "_"): len(re.findall(r'\b' + phrase + r'\b', lowered))
        for phrase in suspicious_phrases
    }

def check_greeting(text):
    """Flag a generic greeting near the start of ``text``.

    Only the first 200 characters are inspected, case-insensitively.

    Returns:
        int: 1 if any generic greeting occurs in that opening span, else 0.
    """
    generic_greetings = ("dear customer", "dear user", "hello sir", "hello madam", "dear client")
    opening = text.lower()[:200]
    return int(any(phrase in opening for phrase in generic_greetings))

def get_sentiment(text):
    """Return TextBlob's sentiment scores for ``text``.

    Returns:
        tuple: ``(polarity, subjectivity)`` as reported by ``TextBlob(text).sentiment``.
    """
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

def persuasion_cues(text):
    """Count gain-framed and loss-framed persuasion words in ``text``.

    Matching is case-insensitive (the text is lower-cased first) and whole-word
    only, so "winner" is not counted as "win".

    Returns:
        tuple[int, int]: ``(gain word occurrences, loss word occurrences)``.
    """
    gain_words = ("win", "prize", "bonus", "reward")
    loss_words = ("lose", "suspended", "locked", "expired")
    lowered = text.lower()

    def total_hits(words):
        # Sum of whole-word occurrences over every word in the group.
        return sum(len(re.findall(r'\b' + word + r'\b', lowered)) for word in words)

    return total_hits(gain_words), total_hits(loss_words)

def get_lengths(subject, body):
    """Return the size of the subject and of the body.

    Note the units differ: the subject is measured in characters, the body in
    whitespace-separated words.

    Returns:
        tuple[int, int]: ``(subject character count, body word count)``.
    """
    subject_chars = len(subject)
    body_words = len(body.split())
    return subject_chars, body_words

def count_html_tags(text):
    """Count the HTML elements found when ``text`` is parsed with html.parser.

    Returns:
        int: Number of elements returned by an unfiltered ``find_all()``.
    """
    parsed = BeautifulSoup(text, 'html.parser')
    tags = parsed.find_all()
    return len(tags)

def count_urls(text):
    """Count URL-like substrings in ``text``.

    A match starts with ``http://``, ``https://`` or ``www.`` (case-sensitive)
    and runs until whitespace, ``<``, ``>`` or a double quote.

    Returns:
        int: Number of non-overlapping matches.
    """
    matches = re.finditer(r'https?://[^\s<>"]+|www\.[^\s<>"]+', text)
    return sum(1 for _ in matches)

def count_attachments(text):
    """Count attachment headers in ``text``.

    Counts case-insensitive occurrences of the literal
    ``content-disposition: attachment``.

    Returns:
        int: Number of occurrences.
    """
    marker = "content-disposition: attachment"
    lowered = text.lower()
    return lowered.count(marker)

def count_exclamation(text):
    """Return how many exclamation marks appear in ``text``."""
    return sum(1 for char in text if char == "!")

# Load the cleaned emails written by the cleaning step.
data = pd.read_csv('cleaned_phishing_emails.csv')


def _build_feature_row(subject, body):
    """Return the handcrafted feature dict for one email.

    The insertion order of the keys fixes the column order of ``features_df``.
    Keyword and exclamation counts use subject + body; every other feature is
    computed from the body alone (plus the subject for its length).
    """
    combined = subject + ' ' + body
    row_features = dict(count_keywords(combined))
    row_features['generic_greeting'] = check_greeting(body)
    row_features['polarity'], row_features['subjectivity'] = get_sentiment(body)
    row_features['good_phrases'], row_features['bad_phrases'] = persuasion_cues(body)
    row_features['subject_length'], row_features['body_length'] = get_lengths(subject, body)
    row_features['html_tags'] = count_html_tags(body)
    row_features['url_count'] = count_urls(body)
    row_features['attachment_count'] = count_attachments(body)
    row_features['exclamation_count'] = count_exclamation(combined)
    return row_features


# One feature dict per email, in the same row order as `data`.
all_features = [
    _build_feature_row(subject, body)
    for subject, body in zip(data['subject'], data['body'])
]
features_df = pd.DataFrame(all_features)


# TF-IDF (Term Frequency-Inverse Document Frequency) turns each email into a
# vector of term weights. The vocabulary is capped at 2500 terms via
# max_features, and English stop words (common words like "the", "and", "is")
# are dropped via stop_words='english'.
vectorizer = TfidfVectorizer(max_features=2500, stop_words='english')
text_for_tfidf = data['subject'] + ' ' + data['body']
tfidf_matrix = vectorizer.fit_transform(text_for_tfidf)
tfidf_columns = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_columns)

# Handcrafted features first, then the TF-IDF columns, then the target label.
final_df = pd.concat([features_df, tfidf_df], axis=1)
final_df['label'] = data['label']
final_df = final_df.fillna(0)

final_df.to_csv('final_phishing_dataset.csv', index=False)
print("Dataset saved as final_phishing_dataset.csv")

# Persist the fitted vectorizer so the same vocabulary and weights can be
# reloaded later.
from joblib import dump
dump(vectorizer, 'tfidf_vectorizer.joblib')
print("TF-IDF vectorizer saved as tfidf_vectorizer.joblib")