In [1]:
#ThreatSense - email_content training
#Necessary packages imported for training the model and saving the results
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import string
import re
import joblib 

In [2]:
# Read the raw phishing-email dataset from disk
df = pd.read_csv('Phishing_Email.csv')

# Drop, in place, every row that contains at least one null/NA value
df.dropna(axis=0, how='any', inplace=True)

# Preprocessing the email content
# Translation table that deletes every ASCII punctuation character. It is built
# once here rather than on each call, because clean_text runs once per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize raw email text into a lowercase, space-separated word string.

    Steps: delete ASCII punctuation, lowercase the result, extract the runs of
    word characters, and join them with single spaces.

    Args:
        text: The email body. Normally a ``str``; ``None`` and float NaN are
            treated as empty text, and any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        The cleaned text as a ``str`` (``''`` when there is nothing to keep).
    """
    # Missing values would otherwise fail on .translate(); NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Punctuation is removed first, then case is folded for consistent analysis
    text = text.translate(_PUNCTUATION_TABLE).lower()
    # Keep only the word tokens; this also collapses any run of whitespace
    words = re.findall(r'\b\w+\b', text)
    return ' '.join(words)

# Replace each raw email body in the 'Email Text' column with its cleaned form
df['Email Text'] = df['Email Text'].map(clean_text)


In [3]:
# Hold out 20% of the cleaned emails for evaluation; the fixed random_state
# seeds the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    df['Email Text'],
    df['Email Type'],
    test_size=0.2,
    random_state=42,
)

# Fit a TF-IDF vectorizer (at most 5000 features) on the training text only,
# then encode the test text with that same fitted vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a random forest of 100 decision trees (seeded for repeatable results)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Predict the held-out emails and report the accuracy score
y_pred = rf_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9624362758250604


In [4]:
# Persist the trained classifier and the fitted vectorizer to disk so that
# future predictions can reuse them without retraining
joblib.dump(value=rf_classifier, filename='rf_classifier_email.pkl')
joblib.dump(value=vectorizer, filename='vectorizer_email.pkl')

['vectorizer_email.pkl']

In [None]:
#Helper for trying out the trained model directly in the notebook,
#without integrating it into any platform
def predict_email(input_text):
    """Classify a single email body with the trained model.

    The text is cleaned with ``clean_text``, encoded by the fitted
    ``vectorizer``, and passed to ``rf_classifier``.

    Args:
        input_text: Raw email text to classify.

    Returns:
        The predicted label for the email (first element of the model output).
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    features = vectorizer.transform([clean_text(input_text)])
    return rf_classifier.predict(features)[0]

# Read one email body from the user and classify it
input_email = input("Enter the email text: ")
if clean_text(input_email):
    prediction = predict_email(input_email)
    print("Prediction:", prediction)
else:
    # Nothing is left after cleaning (blank or punctuation-only entry), so the
    # model would have no words to score; report that instead of printing a
    # label that does not reflect any email content.
    prediction = None
    print("No email text entered; nothing to classify.")