In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import os

In [None]:



def load_dataset(path="phishing_emails.csv", n_samples=1000):
    """Load the phishing e-mail dataset, simulating it first if it is missing.

    When ``path`` does not exist, a synthetic dataset is generated, written to
    ``path`` as CSV (without the index column) and returned. Otherwise the
    existing CSV at ``path`` is read and returned.

    Args:
        path: CSV file to read, or to create when it does not exist yet.
        n_samples: Number of rows to simulate when the file has to be
            created. Ignored when ``path`` already exists.

    Returns:
        A ``pandas.DataFrame``. A simulated frame has the columns
        ``email_text``, ``has_link``, ``has_attachment``,
        ``sender_reputation`` and ``is_phishing``.
    """
    if not os.path.exists(path):
        print("Dataset not found. Creating simulated email dataset...")

        texts = np.array([
            "Click here to reset your password urgently!",
            "Please verify your account login now.",
            "You've won a prize. Provide your bank details.",
            "Your invoice is attached. Thank you for your business.",
            "Join our webinar today and learn more.",
            "Limited time offer. Update your card now.",
            "Monthly performance report attached.",
            "Team meeting rescheduled to 3 PM.",
        ])

        # Parallel to ``texts``: 1 = phishing, 0 = safe.
        labels = np.array([1, 1, 1, 0, 0, 1, 0, 0])

        # Sample template indices once so every row's label is the one that
        # belongs to its text; drawing the label independently would make the
        # target pure noise with respect to the e-mail body.
        picks = np.random.randint(0, len(texts), n_samples)

        df = pd.DataFrame({
            "email_text": texts[picks],
            "has_link": np.random.randint(0, 2, n_samples),
            "has_attachment": np.random.randint(0, 2, n_samples),
            "sender_reputation": np.random.uniform(0, 1, n_samples),
            "is_phishing": labels[picks],
        })

        # Persist the simulation so later calls reload the same data.
        df.to_csv(path, index=False)
    else:
        df = pd.read_csv(path)

    print(f"Loaded dataset with shape: {df.shape}")
    return df


def preprocess(df):
    """Turn the e-mail frame into a scaled feature matrix and a target.

    The ``email_text`` column is vectorised with TF-IDF (at most 100 terms),
    the three metadata columns are appended to the right of the text
    features, and the combined matrix is standardised.

    Note that both the vectoriser and the scaler are fitted on every row of
    ``df`` that is passed in.

    Args:
        df: Frame with the columns ``email_text``, ``has_link``,
            ``has_attachment``, ``sender_reputation`` and ``is_phishing``.

    Returns:
        A tuple ``(X_scaled, y, scaler, tfidf)``: the standardised feature
        matrix, the ``is_phishing`` column, the fitted ``StandardScaler`` and
        the fitted ``TfidfVectorizer``.
    """
    meta_columns = ["has_link", "has_attachment", "sender_reputation"]

    # Text features: densified so they can be stacked with the metadata.
    tfidf = TfidfVectorizer(max_features=100)
    text_features = tfidf.fit_transform(df["email_text"]).toarray()

    # Column-wise stack: TF-IDF columns first, metadata columns last.
    meta_features = df[meta_columns].to_numpy()
    features = np.hstack((text_features, meta_features))

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features)

    target = df["is_phishing"]
    return X_scaled, target, scaler, tfidf
