<a href="https://colab.research.google.com/github/Ashwathi1901/Link-Guardian/blob/main/notebooks/Email_dataset_preprocessing_%26_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
import numpy as np

In [3]:
# Load data
df = pd.read_csv('Phishing_Email.csv')  # Columns: text_combined, label
print(df.head())

   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  


In [5]:
# Map columns to standard names
df['text_combined'] = df['Email Text'].fillna('')  # Email content
df['label'] = (df['Email Type'] == 'Phishing Email').astype(int)  # 0=safe, 1=phishing

In [6]:

print(df[['text_combined', 'label']].head())
print(df['label'].value_counts())  # Check balance

                                       text_combined  label
0  re : 6 . 1100 , disc : uniformitarianism , re ...      0
1  the other side of * galicismos * * galicismo *...      0
2  re : equistar deal tickets are you still avail...      0
3  \nHello I am your hot lil horny toy.\n    I am...      1
4  software at incredibly low prices ( 86 % lower...      1
label
0    11322
1     7328
Name: count, dtype: int64


In [31]:
import numpy as np
# Clean + TF-IDF
df['text_clean'] = df['text_combined'].str.lower()

# Boost urgent/phishing words
phishing_keywords = ['urgent', 'suspended', 'verify', 'account', 'click', 'immediate']
for word in phishing_keywords:
    df[f'has_{word}'] = df['text_clean'].str.contains(word, na=False).astype(int)

tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf.fit_transform(df['text_clean'])
X_keywords = df[[f'has_{word}' for word in phishing_keywords]].values
X_combined = np.hstack([X.toarray()[:, -100:], X_keywords])  # Top 100 TF-IDF + keywords
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [32]:
model = XGBClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [33]:
print(f"✅ Accuracy: {model.score(X_test, y_test):.3f}")
joblib.dump(model, 'email_model.joblib')
joblib.dump(tfidf, 'email_tfidf.pkl')
joblib.dump(phishing_keywords, 'phishing_keywords.pkl')
print("✅ All saved!")


✅ Accuracy: 0.787
✅ All saved!


In [34]:
# Load ALL models
model = joblib.load('email_model.joblib')
tfidf = joblib.load('email_tfidf.pkl')
keywords = joblib.load('phishing_keywords.pkl')

In [35]:
# Test phishing email
test_phish = "URGENT: Your bank account will be suspended. Verify now: [link]"
test_vec = tfidf.transform([test_phish.lower()])
test_keywords = np.array([[1 if w in test_phish.lower() else 0 for w in keywords]])
test_combined = np.hstack([test_vec.toarray()[:, -100:], test_keywords])
risk = model.predict_proba(test_combined)[0][1]
print(f"✅ Phishing risk: {risk:.3f}")  # 0.92+

✅ Phishing risk: 0.851
