<h1>Classification Report: Early Sprints and Project Shaping</h1>

In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import re

In [2]:
# Restore the trained random-forest classifier and its TF-IDF vectorizer from disk.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code --
# only load artefacts that come from a trusted source.
rf_model_path = 'models/random_forest_model.pkl'
vectorizer_path = 'models/tfidf_vectorizer.pkl'
rf_model = joblib.load(rf_model_path)
vectorizer = joblib.load(vectorizer_path)

In [3]:
# Read the test email dataset into a DataFrame.
test_csv_path = 'data/test_email_dataset.csv'
test_data = pd.read_csv(test_csv_path)

In [4]:
def preprocess_email(text):
    """Normalise a raw email body into lowercase alphabetic text.

    The steps run in this order: lowercase, strip HTML tags, strip tokens
    starting with ``http``, then drop every character that is not ``a-z``
    or whitespace (digits and punctuation are removed as well).

    NOTE(review): presumably this has to mirror the preprocessing applied
    when the vectorizer was fitted -- confirm before changing the regexes.

    Args:
        text: Raw email body. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as an empty body; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The cleaned text. Whitespace is not collapsed, so removed
        tokens can leave runs of spaces behind.
    """
    # A float NaN is the only float that is not equal to itself; without this
    # guard a missing body would crash on ``.lower()``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Lowercase first so the URL pattern below also catches "HTTP://..." links.
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    return text

In [6]:
# Show the available columns before picking the text ('v2') and label ('v1') columns.
column_index = test_data.columns
print(column_index)

Index(['v2', 'v1', 'predicted_label', 'is_correct'], dtype='object')


In [8]:
# Clean every raw email body ('v2') and keep the result in a new column.
raw_bodies = test_data['v2']
test_data['processed_body'] = raw_bodies.map(preprocess_email)

In [9]:
# Turn the cleaned bodies into TF-IDF features using the loaded vectorizer
# (transform only -- nothing is re-fitted here).
cleaned_bodies = test_data['processed_body']
X_test = vectorizer.transform(cleaned_bodies)

In [10]:
# Ground-truth labels come from the 'v1' column (1 = phishing, 0 = non-phishing).
label_column = 'v1'
y_test = test_data[label_column]

In [11]:
# Predict a label for every test email with the trained model.
y_pred = rf_model.predict(X_test)

# Render per-class precision / recall / F1 and show it.
# NOTE(review): the recorded run shows 0.50 accuracy on a balanced 100/100
# split, i.e. chance level -- worth confirming that the 'v1' labels and the
# preprocessing match what the model was trained with.
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.50      0.17      0.25       100
           1       0.50      0.83      0.62       100

    accuracy                           0.50       200
   macro avg       0.50      0.50      0.44       200
weighted avg       0.50      0.50      0.44       200



In [12]:
# Render the classification report as a string so it can be written to disk.
report = classification_report(y_test, y_pred)

# Single source of truth for the output location, so the file that is written
# and the message that is printed cannot drift apart.
report_path = 'classification_report.txt'

# Explicit encoding: without it open() falls back to the platform default,
# which differs between machines.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report)

print(f"Classification report saved to '{report_path}'")

Classification report saved to 'classification_report.txt'
