# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Read the four e-mail corpora from disk, in the same order as the targets
df_spamassasin, df_ceas08, df_nazario, df_nigerian = (
    pd.read_csv(csv_name)
    for csv_name in (
        "SpamAssasin.csv",
        "CEAS_08.csv",
        "Nazario.csv",
        "Nigerian_Fraud.csv",
    )
)

# Every row of these two corpora is given the phishing label, 2
for df in (df_nigerian, df_nazario):
    df["label"] = 2

# Strip, in place, the columns that the text/label pipeline below never reads
for df in (df_spamassasin, df_ceas08, df_nazario, df_nigerian):
    df.drop(['sender', 'receiver', 'date', 'urls'], axis=1, inplace=True)

# Merge subject and body into one text column
def merge_subject_body(df):
    """Return a two-column frame (``text``, ``label``) built from *df*.

    ``text`` is the subject and the body joined by a single space; missing
    subjects or bodies are treated as empty strings. The input frame is left
    untouched: no columns are added to it and its missing values are not
    overwritten.

    Args:
        df: DataFrame with ``subject``, ``body`` and ``label`` columns.

    Returns:
        A new DataFrame with the columns ``text`` and ``label`` (in that
        order), sharing the index of *df*.

    Raises:
        KeyError: if ``subject``, ``body`` or ``label`` is missing from *df*.
    """
    # fillna returns new Series, so the caller's columns keep their NaNs.
    # astype(str) keeps the concatenation from failing on non-string cells.
    subject = df['subject'].fillna('').astype(str)
    body = df['body'].fillna('').astype(str)

    # assign() works on a copy of df instead of writing a new column into it.
    merged = df.assign(text=subject + ' ' + body)
    return merged[['text', 'label']]

# Reduce each corpus to its (text, label) view; calls run left to right
samdf, nazdf, nigdf, ceasdf = map(
    merge_subject_body,
    (df_spamassasin, df_nazario, df_nigerian, df_ceas08),
)

# Stack the four views row-wise into a single frame
frames_in_order = [samdf, ceasdf, nigdf, nazdf]
combined = pd.concat(frames_in_order)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Train/test split on the RAW text, before any fitting or resampling.
# Splitting first keeps the test set made of real, unseen e-mails only;
# vectorizing or oversampling beforehand would leak test information into
# training and inflate the reported scores.
# stratify keeps the class proportions the same in both partitions.
y = combined['label']
text_train, text_test, y_train_raw, y_test = train_test_split(
    combined['text'], y, test_size=0.2, random_state=42, stratify=y
)

# Step 2: TF-IDF vectorization -- vocabulary and IDF weights are learned from
# the training texts only; the test texts are merely transformed.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_raw = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Step 3: Apply SMOTE to the training partition only, so no synthetic sample
# is interpolated from (or ends up in) the test set.
# The sparse matrix is passed as-is; calling .toarray() would materialize a
# dense rows x 5000 array for no benefit.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# Step 4: Train a model (Random Forest as example)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 5: Predict and evaluate on the untouched, real-data test set
y_pred = model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
