In [1]:
pip install pandas numpy scikit-learn matplotlib seaborn nltk

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

# Read the raw e-mail corpus; the CSV is looked up relative to the working directory
DATA_PATH = "Phishing_Email.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [7]:
# Drop the unnecessary index column that was stored in the CSV.
# errors='ignore' makes the cell safe to re-run: once the column is gone the drop is a no-op
# instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Rename columns by name rather than by position, so the result does not depend on the
# column order and a second run (when the names are already 'email'/'label') changes nothing.
df = df.rename(columns={'Email Text': 'email', 'Email Type': 'label'})
df.head()


Unnamed: 0,email,label
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,the other side of * galicismos * * galicismo *...,Safe Email
2,re : equistar deal tickets are you still avail...,Safe Email
3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [9]:
# Encode the target as integers: 0 = safe, 1 = phishing.
# Values that are not keys of the mapping are passed through unchanged, so running this
# cell a second time keeps the 0/1 labels instead of turning every one of them into NaN.
LABEL_MAP = {'Safe Email': 0, 'Phishing Email': 1}
df['label'] = df['label'].map(lambda value: LABEL_MAP.get(value, value))
df['label'].value_counts()

label
0    11322
1     7328
Name: count, dtype: int64

In [11]:
# Preview the first rows to confirm the label column now holds the numeric encoding
df.head()

Unnamed: 0,email,label
0,"re : 6 . 1100 , disc : uniformitarianism , re ...",0
1,the other side of * galicismos * * galicismo *...,0
2,re : equistar deal tickets are you still avail...,0
3,\nHello I am your hot lil horny toy.\n I am...,1
4,software at incredibly low prices ( 86 % lower...,1


In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Make sure the NLTK stop-word corpus is available locally before it is read
nltk.download('stopwords')

# Shared helpers used by preprocess(): a Porter stemmer and the English stop-word set
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Clean and preprocess text
def preprocess(text):
    """Normalise one e-mail body into a space-separated string of stemmed tokens.

    Steps: lower-case the text, delete every character that is neither a word
    character nor whitespace, split on whitespace, discard tokens found in the
    module-level ``stop_words`` set, and stem the remainder with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : object
        Raw e-mail content. Anything that is not a ``str`` (for example a NaN
        placeholder for a missing value) is treated as an empty document.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``''`` when the input is
        not a string or no token survives.
    """
    if isinstance(text, str):
        stripped = re.sub(r'[^\w\s]', '', text.lower())
        kept = []
        for token in stripped.split():
            if token in stop_words:
                continue
            kept.append(stemmer.stem(token))
        return ' '.join(kept)
    return ''

# Run every raw message through preprocess() and keep the result beside the original text
df['clean_email'] = df['email'].map(preprocess)
df.loc[:, ['clean_email', 'label']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,clean_email,label
0,6 1100 disc uniformitarian 1086 sex lang dick ...,0
1,side galicismo galicismo spanish term name imp...,0
2,equistar deal ticket still avail assist robert...,0
3,hello hot lil horni toy one dream open mind pe...,1
4,softwar incred low price 86 lower draperi seve...,1


In [17]:
# Keep only the rows whose raw 'email' value is present (i.e. not missing)
df = df.loc[df['email'].notna()]

In [19]:
# Element-wise missing-value mask for the whole frame (True marks a missing cell)
df.isna()

Unnamed: 0,email,label,clean_email
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
18645,False,False,False
18646,False,False,False
18647,False,False,False
18648,False,False,False


In [21]:
from sklearn.model_selection import train_test_split

# Model input is the cleaned text; the target is the label column
X, y = df['clean_email'], df['label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio the same in
# both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary and weights on the training text only, then reuse that fitted
# vectorizer to transform the held-out text.
MAX_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a logistic-regression classifier with its default settings on the TF-IDF features
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Score the held-out messages and summarise the results
y_pred = model.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)


Accuracy: 0.9613630265629193

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.96      0.97      2265
           1       0.95      0.96      0.95      1462

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727



In [27]:
import joblib

# Persist the fitted classifier and the fitted vectorizer to the working directory.
# The two files belong together: new text must be transformed by this vectorizer
# before it is passed to this model.
joblib.dump(model, 'phishing_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [29]:
def predict_email(text):
    """Classify one e-mail body with the fitted ``vectorizer`` and ``model``.

    The text is cleaned by ``preprocess``, turned into a single-row feature matrix by
    the module-level ``vectorizer`` and scored by the module-level ``model``.

    Parameters
    ----------
    text : str
        Raw e-mail content.

    Returns
    -------
    str
        "Phishing Email" when the predicted label equals 1, otherwise "Safe Email".
    """
    features = vectorizer.transform([preprocess(text)])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Phishing Email"
    return "Safe Email"

# Example: classify a short message that asks the reader to verify an account via a link
predict_email("Please verify your account by clicking this link.")


'Phishing Email'

In [37]:
import joblib

# Load model and vectorizer back from the files written with joblib.dump.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
model = joblib.load('phishing_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Function to preprocess and predict
# NOTE(review): same logic as the predict_email defined in an earlier cell; running this
# cell rebinds the name. Consider keeping a single definition.
def predict_email(text):
    """Return "Phishing Email" or "Safe Email" for a raw e-mail string.

    Relies on the module-level ``preprocess``, ``vectorizer`` and ``model``:
    the text is cleaned, vectorized as a one-document batch, and the first
    (only) predicted label decides the returned name.
    """
    outcome = model.predict(vectorizer.transform([preprocess(text)]))
    names = {True: "Phishing Email", False: "Safe Email"}
    return names[bool(outcome[0] == 1)]

# Take email content as input from the user
user_input = input("Enter the email content:\n")

# An empty or whitespace-only entry contains no tokens, so the vectorizer would produce an
# all-zero row and the returned label would not reflect any content. Skip classification then.
if user_input.strip():
    prediction = predict_email(user_input)
    print("\nPrediction:", prediction)
else:
    prediction = None
    print("\nPrediction: no email content entered")


Enter the email content:
 Hi team, just a reminder that our weekly meeting is scheduled for 10 AM tomorrow in Conference Room A. Please be on time.



Prediction: Safe Email
