In [11]:
import os

import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [4]:
# Load the labelled email dataset.
# The location can be overridden without touching the code by setting the
# PHISHING_EMAILS_CSV environment variable; when it is unset, the original
# machine-specific path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "PHISHING_EMAILS_CSV",
    r"C:\Users\HP\Documents\Stanley college\CyberSecurity\cyber projects\phishing-email-detector\data\emails.csv",
)
df = pd.read_csv(DATA_PATH)

# Quick sanity check: first rows and the available column names.
print(df.head())
print(df.columns)


   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  
Index(['Unnamed: 0', 'Email Text', 'Email Type'], dtype='object')


In [6]:
# Encode the target column: 0 = safe email, 1 = phishing email.
LABEL_MAP = {'Safe Email': 0, 'Phishing Email': 1}
df['label'] = df['Email Type'].map(LABEL_MAP)

# Series.map() yields NaN for any value that is not a key of LABEL_MAP.
# Stop here with a clear message rather than letting NaN labels reach the
# splitter / estimator and fail there with a less obvious error.
unmapped = df['label'].isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have an 'Email Type' outside "
        f"{sorted(LABEL_MAP)}; clean or drop them before training."
    )

X = df['Email Text']
y = df['label']

# Train/test split: 80/20, seeded, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Missing email bodies are replaced with empty strings before vectorising.
X_train = X_train.fillna("")
X_test = X_test.fillna("")

# Convert text to TF-IDF features. The vectorizer is fitted on the training
# split only and then reused, unchanged, on the test split.
vector = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vector.fit_transform(X_train)
X_test_vec = vector.transform(X_test)

# Train the classifier.
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(X_train_vec, y_train)

# Evaluate on the held-out split (predictions are computed once).
y_pred = lr.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9611260053619303

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.97      2264
           1       0.93      0.98      0.95      1466

    accuracy                           0.96      3730
   macro avg       0.96      0.96      0.96      3730
weighted avg       0.96      0.96      0.96      3730


Confusion Matrix:
 [[2155  109]
 [  36 1430]]


In [10]:
#testing on unseen samples
def predict_email(email_text):
    """Return the model's probability that a single email is phishing.

    Uses the notebook-level fitted ``vector`` (TF-IDF vectorizer) and ``lr``
    (classifier); both must already exist when this function is called.

    Parameters
    ----------
    email_text : str or None
        Raw text of one email. ``None`` and NaN are treated as an empty
        email, matching the ``fillna("")`` applied to the training data.

    Returns
    -------
    Probability for label 1 (phishing), read from column 1 of the first
    (only) row returned by ``lr.predict_proba``.
    """
    # Treat a missing body the same way training did (empty string).
    # `x != x` is true only for NaN, so no extra import is needed here.
    if email_text is None or (isinstance(email_text, float) and email_text != email_text):
        email_text = ""

    # The vectorizer expects an iterable of documents, hence the one-item list.
    email_vec = vector.transform([email_text])

    # One row per document, one column per class.
    probabilities = lr.predict_proba(email_vec)

    phishing_probability = probabilities[0][1]

    return phishing_probability


# Hand-written samples used for a quick smoke test of the classifier.
test_emails = [
    #safe email
    """
    Hi team,
    Just a reminder that our weekly sync meeting is scheduled for tomorrow at 10 AM.
    Please review the attached agenda beforehand.
    Thanks,
    Alex
    """,

    #phishing email
    """
    Subject: Congratulations! You've Won!
    
    Dear Winner,
    You have been selected as the winner of our monthly $1,000,000 prize!
    To claim your reward, please click the link below and verify your bank details.
    Click here: http://bit.ly/claim-your-prize-now
    Hurry, this offer expires in 24 hours!
    """,

    # tricky phishing email
    """
    Subject: Urgent Security Alert for Your Account
    
    Dear Valued Customer,
    We detected unusual sign-in activity on your account. For your protection, we have temporarily suspended access.
    Please re-activate your account immediately by verifying your information here: http://your-bank-secure-login.com/update
    If you do not verify within 12 hours, your account will be permanently closed.
    Sincerely,
    The Security Team
    """
]

# Score every sample and report the verdict; a probability above 0.5 is
# reported as phishing.
print("\ntesting email classifier on new emails")
separator = "-" * 30
for sample_no, sample_text in enumerate(test_emails, start=1):
    score = predict_email(sample_text)
    if score > 0.5:
        verdict = "Likely Phishing "
    else:
        verdict = "Likely Safe "

    print(f"Email Sample #{sample_no}")
    print(f"Phishing Probability: {score:.2%}")
    print(f"Result: {verdict}\n" + separator)


testing email classifier on new emails
Email Sample #1
Phishing Probability: 3.51%
Result: Likely Safe 
------------------------------
Email Sample #2
Phishing Probability: 95.34%
Result: Likely Phishing 
------------------------------
Email Sample #3
Phishing Probability: 89.19%
Result: Likely Phishing 
------------------------------


In [12]:
# Persist the two artefacts needed for inference: the trained classifier and
# the fitted vectorizer (which turns email text into the numeric features the
# classifier was trained on).
# Each run writes these two files to the current working directory.
email_model_filename = 'email_classifier_model.joblib'
email_vectorizer_filename = 'email_classifier_vectorizer.joblib'

# Save the model first, then the vectorizer, confirming each write.
for artifact, kind, destination in (
    (lr, "model", email_model_filename),
    (vector, "vectorizer", email_vectorizer_filename),
):
    joblib.dump(artifact, destination)
    print(f"Email {kind} saved successfully to '{destination}'")


Email model saved successfully to 'email_classifier_model.joblib'
Email vectorizer saved successfully to 'email_classifier_vectorizer.joblib'
