In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# English stop words removed by clean_text. On an environment where the NLTK
# "stopwords" corpus has not been installed, stopwords.words raises LookupError,
# so the corpus is fetched once and the lookup retried.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [4]:
# Read the raw CEAS_08 export; `df` is transformed by the cells below, while
# `df_original` keeps an independent (deep) copy of the frame as loaded.
df = pd.read_csv("CEAS_08.csv")
df_original = df.copy(deep=True)

In [20]:
# Print the raw frame's columns with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


# Preprocessing

In [21]:
def clean_text(text):
    """Normalize raw e-mail text into a space-joined string of lower-case tokens.

    Processing steps:
      1. remove HTML-style tags (``<...>``),
      2. drop every character that is not an ASCII letter, digit or whitespace,
      3. tokenize with ``word_tokenize`` and lower-case each token,
      4. discard tokens found in ``stop_words``.

    Args:
        text: the raw subject or body; non-string values are converted with ``str``.

    Returns:
        The cleaned text. Missing values (``None`` / NaN) give ``""`` instead of
        the literal token ``"nan"`` that ``str(float("nan"))`` would produce.
    """
    if text is None or (isinstance(text, float) and text != text):  # NaN != NaN
        return ""

    text = re.sub(r"<[^>]+>", "", str(text))
    # Keep only ASCII letters, digits and whitespace. Hyphens are removed as well,
    # so hyphenated words are fused (e.g. "e-mail" -> "email").
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    tokens = (word.lower() for word in word_tokenize(text))
    return " ".join(token for token in tokens if token not in stop_words).strip()


def _extract_domain(address):
    """Return the text after the last "@" of an address, minus one trailing ">".

    Addresses may be bare ("user@host") or wrapped ("Name <user@host>"); only the
    wrapped form ends in ">", so the last character is removed only in that case.
    Missing addresses yield "".
    """
    if pd.isna(address):
        return ""
    domain = address.split("@")[-1]
    return domain[:-1] if domain.endswith(">") else domain


df["clean_subject"] = df["subject"].apply(clean_text)
df["clean_body"] = df["body"].apply(clean_text)

# Extract sender and receiver domains with the same rule, so a bare sender
# address no longer loses the final character of its domain.
df["sender_domain"] = df["sender"].apply(_extract_domain)
df["receiver_domain"] = df["receiver"].apply(_extract_domain)

# Parse date (handle inconsistent formats). Each value is parsed on its own;
# unparseable values become NaT, and utc=True converts every timestamp to UTC,
# so `hour` and `day_of_week` below are UTC-based.
df["date"] = df["date"].apply(lambda x: pd.to_datetime(x, errors="coerce",utc = True))
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df['hour_normalized'] = df['hour'] / 23.0  # 0.0 at hour 0, 1.0 at hour 23

# Drop rows missing any field the later steps rely on (NaT dates included).
df = df.dropna(subset=["label", "clean_subject", "clean_body","receiver","subject","date"])

In [22]:
# Class distribution of the rows that survived preprocessing.
df['label'].value_counts()

label
1    21812
0    16842
Name: count, dtype: int64

# Balancing

In [23]:
# Undersample the larger class so both labels end up with the same row count.
# The larger/smaller label is read from the data instead of being assumed, so the
# cell keeps working if the class proportions change (binary label expected).
label_counts = df['label'].value_counts()  # sorted by count, largest first
majority_label, minority_label = label_counts.index[0], label_counts.index[-1]

# Separate the majority and minority classes
majority_class = df[df['label'] == majority_label]
minority_class = df[df['label'] == minority_label]

# Randomly sample from the majority class to match the size of the minority class
balanced_majority_class = majority_class.sample(len(minority_class), random_state=42)

# Combine the balanced majority class with the minority class
df_balanced = pd.concat([balanced_majority_class, minority_class])

# Shuffle the resulting dataframe
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

df_balanced['label'].value_counts()

label
1    16842
0    16842
Name: count, dtype: int64

In [24]:
# Remove the raw columns that have been replaced by derived features.
# errors="ignore" keeps the cell re-runnable: executing it a second time would
# otherwise raise KeyError because the columns are already gone.
df_balanced = df_balanced.drop(
    columns=["date", "sender", "receiver", "subject", "body", "hour"],
    errors="ignore",
)

In [25]:
# List the remaining columns of the balanced frame with non-null counts and dtypes.
df_balanced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33684 entries, 0 to 33683
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   label            33684 non-null  int64  
 1   urls             33684 non-null  int64  
 2   clean_subject    33684 non-null  object 
 3   clean_body       33684 non-null  object 
 4   sender_domain    33684 non-null  object 
 5   receiver_domain  33684 non-null  object 
 6   day_of_week      33684 non-null  float64
 7   hour_normalized  33684 non-null  float64
dtypes: float64(2), int64(2), object(4)
memory usage: 2.1+ MB


In [5]:
# Reuse the cached balanced dataset when the file exists; otherwise create the
# cache from the df_balanced built by the preceding cells. Either way the cell
# succeeds on a fresh Restart & Run All, with or without the CSV on disk.
BALANCED_CSV_PATH = "CEAS_08_cleaned_balanced.csv"
try:
    df_balanced = pd.read_csv(BALANCED_CSV_PATH)
except FileNotFoundError:
    df_balanced.to_csv(BALANCED_CSV_PATH, index=False)

# Vectorize

In [None]:
# Spot-check a single cleaned body by row position.
# NOTE(review): 33682 is a hard-coded position; .iloc raises IndexError if the
# frame has fewer rows than that.
df_balanced['clean_body'].iloc[33682]

'rel ys iab vk le dru yar gs xb tore high qua lzp lity lh di chk cinec wkn li qsj ck na'

In [28]:
# Preview the first rows of the balanced frame loaded from the CSV.
df_balanced.head()

Unnamed: 0,label,urls,clean_subject,clean_body,sender_domain,receiver_domain,day_of_week,hour_normalized
0,1,1,cnncom daily top 10,daily top 10 cnncom top videos stories aug 1 2...,2905.dk,gvc.ceas-challenge.cc,2.0,0.869565
1,1,0,canadian chemist trust j,find love stick gain click url ydrvl5a,tvgam.org.uk,gvc.ceas-challenge.cc,2.0,0.913043
2,0,0,ilug via epia server,newer generations via epia boxes withe c32 pro...,lincor.com,birdsnest.maths.tcd.ie,2.0,0.043478
3,0,0,opensuse amavisd warning failure,patrick shanahan wrote hylton conacher zr1hpc ...,conacher.co.za,opensuse.org,2.0,0.652174
4,1,1,cnncom daily top 10,daily top 10 cnncom top videos stories aug 1 2...,duluth.com,gvc.ceas-challenge.cc,3.0,0.434783


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Combine the subject and body into one text field for TF-IDF.
# After the CSV round trip an empty cleaned string is read back as NaN, and
# NaN + str stays NaN, which TfidfVectorizer rejects as an invalid document.
# Missing text is therefore replaced with "" before concatenating.
df_balanced['subjectAndBody'] = (
    df_balanced['clean_subject'].fillna('') + ' ' + df_balanced['clean_body'].fillna('')
)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced.drop(columns='label'), df_balanced['label'], test_size=0.2, random_state=50
)


# Vectorize: the vocabulary and IDF weights are learned from the training split
# only, then applied unchanged to the test split.
vectorizer = TfidfVectorizer()
X_train.info()
X_train_tfidf = vectorizer.fit_transform(X_train['subjectAndBody'])
X_test_tfidf = vectorizer.transform(X_test['subjectAndBody'])

<class 'pandas.core.frame.DataFrame'>
Index: 26947 entries, 20762 to 14000
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   urls             26947 non-null  int64  
 1   clean_subject    26947 non-null  object 
 2   clean_body       26947 non-null  object 
 3   sender_domain    26947 non-null  object 
 4   receiver_domain  26947 non-null  object 
 5   day_of_week      26947 non-null  float64
 6   hour_normalized  26947 non-null  float64
 7   subjectAndBody   26947 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.9+ MB


In [31]:
from sklearn.naive_bayes import MultinomialNB

In [32]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix
# (fit returns the estimator itself), then label the held-out split.
model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

In [33]:
from sklearn.metrics import classification_report
# sep="\n" puts the report on its own line. Passing it as a second print argument
# with the default separator inserted a space before the report, which shifted
# its header row one column to the right of the rows below it.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3386
           1       1.00      0.96      0.98      3351

    accuracy                           0.98      6737
   macro avg       0.98      0.98      0.98      6737
weighted avg       0.98      0.98      0.98      6737



In [10]:
# Preview the first rows of the balanced frame again before defining the
# heuristic score below (same call as the earlier preview cell).
df_balanced.head()

Unnamed: 0,label,urls,clean_subject,clean_body,sender_domain,receiver_domain,day_of_week,hour_normalized
0,1,1,cnncom daily top 10,daily top 10 cnncom top videos stories aug 1 2...,2905.dk,gvc.ceas-challenge.cc,2.0,0.869565
1,1,0,canadian chemist trust j,find love stick gain click url ydrvl5a,tvgam.org.uk,gvc.ceas-challenge.cc,2.0,0.913043
2,0,0,ilug via epia server,newer generations via epia boxes withe c32 pro...,lincor.com,birdsnest.maths.tcd.ie,2.0,0.043478
3,0,0,opensuse amavisd warning failure,patrick shanahan wrote hylton conacher zr1hpc ...,conacher.co.za,opensuse.org,2.0,0.652174
4,1,1,cnncom daily top 10,daily top 10 cnncom top videos stories aug 1 2...,duluth.com,gvc.ceas-challenge.cc,3.0,0.434783


In [None]:
# Free-mail providers whose domains are not penalized as "unknown sender".
_TRUSTED_SENDER_DOMAINS = frozenset({
    'gmail.com', 'outlook.com', 'yahoo.com', 'hotmail.com', 'protonmail.com',
    'icloud.com', 'aol.com', 'zoho.com', 'gmx.com', 'mail.com', 'tutanota.com',
    'fastmail.com', 'hushmail.com', 'runbox.com', 'posteo.de', 'disroot.org',
})

# Tokens that each add to the text-content part of the score.
_PHISHING_KEYWORDS = frozenset({
    'password', 'urgent', 'verify', 'account', 'login', 'bank', 'security',
    'suspended', 'confirm', 'fraud', 'update', 'alert', 'compromised',
    'immediately', 'limited', 'action', 'required', 'personal', 'information',
    'click', 'link', 'attachment', 'unauthorized', 'activity', 'locked',
    'expired', 'reactivate', 'invoice', 'payment', 'refund', 'transaction',
})

# Characters whose presence in the sender domain adds to the score.
# NOTE(review): only the digits 0-3 are listed; confirm whether every digit was intended.
_SUSPICIOUS_DOMAIN_CHARS = ('-', '0', '1', '2', '3')


def _normalize_text(value):
    """Return a stripped, lower-cased string; any non-string (None, NaN) becomes ""."""
    return value.strip().lower() if isinstance(value, str) else ""


def _hour_from_normalized(hour_normalized):
    """Recover the 0-23 hour from an ``hour / 23`` feature, or None when it is missing."""
    if hour_normalized is None or hour_normalized != hour_normalized:  # NaN != NaN
        return None
    return round(hour_normalized * 23)


def calculate_phishing_seriousness(
    clean_subject,
    clean_body,
    sender_domain,
    receiver_domain,
    day_of_week,
    hour_normalized,
    urls
):
    """Score how phishing-like an e-mail looks with a fixed-weight heuristic.

    Contributions (summed, then clamped to [0, 1]):
      * +0.35 when ``urls == 1``
      * +0.25 when the sender domain is not a known free-mail provider
      * +0.07 per distinct phishing keyword token in subject/body, capped at 0.30
      * +0.10 when sent on a weekend
      * +0.15 when sent before 06:00 or from 21:00 onwards
      * +0.15 when sender and receiver domains differ
      * +0.10 when the sender domain contains a suspicious character

    Args:
        clean_subject: cleaned, space-separated subject tokens (missing -> "").
        clean_body: cleaned, space-separated body tokens (missing -> "").
        sender_domain: domain of the sender address (compared case-insensitively).
        receiver_domain: domain of the receiver address (compared case-insensitively).
        day_of_week: day index in the pandas ``dt.dayofweek`` convention,
            Monday=0 ... Sunday=6, matching how the feature is built in this notebook.
        hour_normalized: hour of day divided by 23 (0.0 = hour 0, 1.0 = hour 23),
            matching the ``hour / 23.0`` feature built in this notebook.
        urls: URL indicator; only the value 1 adds to the score.

    Returns:
        float: the score in the range [0.0, 1.0].
    """
    subject = _normalize_text(clean_subject)
    body = _normalize_text(clean_body)
    sender = _normalize_text(sender_domain)
    receiver = _normalize_text(receiver_domain)

    score = 0.0

    # 1. URL presence (strongest single indicator)
    if urls == 1:
        score += 0.35

    # 2. Sender domain analysis (a missing sender counts as untrusted)
    if sender not in _TRUSTED_SENDER_DOMAINS:
        score += 0.25

    # 3. Phishing keywords, matched on whole tokens so that e.g. "transaction"
    #    does not also count as a hit for "action".
    tokens = set(f"{subject} {body}".split())
    keyword_hits = len(tokens & _PHISHING_KEYWORDS)
    score += min(keyword_hits * 0.07, 0.3)  # Max 0.3 for text content

    # 4. Timing analysis
    # Weekend check (5=Saturday, 6=Sunday in the Monday=0 convention)
    if day_of_week in {5, 6}:
        score += 0.1

    # Unusual hours (before 6 AM or after the 8 PM hour), evaluated on the real
    # 0-23 hour so the thresholds match the hour / 23 scaling of the feature.
    hour = _hour_from_normalized(hour_normalized)
    if hour is not None and (hour < 6 or hour > 20):
        score += 0.15

    # 5. Domain mismatch check
    if sender != receiver:
        score += 0.15

    # 6. Suspicious sender domain patterns
    if any(char in sender for char in _SUSPICIOUS_DOMAIN_CHARS):
        score += 0.1

    # Ensure score is within [0, 1]
    return max(0.0, min(score, 1.0))