# 📘 XGBoost Classifier Analysis
This notebook performs data loading, preprocessing, training, and evaluation using an XGBoost classifier on a spam email dataset.

## 📦 1. Imports and Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, RocCurveDisplay
from xgboost import XGBClassifier

# Apply seaborn's white-grid theme to every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

## 📊 2. Load and Explore the Dataset

In [None]:
# Path to the email CSV; change this if the file lives elsewhere.
file_path = "./emails.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows, then report the frame's (rows, columns) shape.
display(df.head(n=5))
print(f"Dataset Shape: {df.shape}")


## Class Distribution

In [None]:
# Count the rows per value of the `spam` column, then relabel the index
# entries 1 / 0 as 'Spam' / 'Ham' so the printed table is readable.
spam_counts = df['spam'].value_counts()
spam_counts = spam_counts.rename(index={1: 'Spam', 0: 'Ham'})

print("Class Distribution:")
print(spam_counts)

In [None]:
# Bar chart of how many rows fall in each value of the `spam` column,
# drawn on an explicitly created 6x4 inch axes.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='spam', ax=ax)

# Replace the tick labels at positions 0 and 1 with readable class names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Ham', 'Spam'])

ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
plt.show()

## 🪼 3. Preprocessing

In [None]:
# Build readable string labels from the `spam` column WITHOUT overwriting it:
# `df` stays exactly as loaded, so the earlier exploration cells (which look
# up the values 0/1) still work if they are re-run after this cell.
labels_text = df["spam"].astype(str).replace({"1": "spam", "0": "ham"})

# Extract raw documents and labels
X_raw = df["text"].values
y_raw = labels_text.values

# Encode labels as integers. LabelEncoder orders classes by sorted value, so
# with only "ham"/"spam" present this gives ham -> 0 and spam -> 1
# (assumes the column holds nothing but 0/1 -- verify against the data).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_raw)
class_names = label_encoder.classes_

# Split the RAW text first. The rows selected are determined by the sample
# count, `random_state` and `stratify`, not by the feature values, so this
# yields the same train/test partition as splitting a vectorized matrix.
X_raw_train, X_raw_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the vocabulary from the training documents only, then apply it to the
# test documents. Fitting on the full corpus would leak test-set vocabulary
# into the feature space and make the evaluation optimistic.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_raw_train)
X_test = vectorizer.transform(X_raw_test)

## 🚀 4. Train the XGBoost Classifier

In [None]:
# Hyperparameters gathered in one place: 50 boosting rounds of depth-1 trees
# with a learning rate of 0.1 and log loss as the evaluation metric.
xgb_params = dict(
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=1,
    learning_rate=0.1,
    n_estimators=50,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Column 1 of predict_proba holds the probability of the class encoded as 1;
# these scores feed the ROC curve, while the hard predictions feed the reports.
y_scores = model.predict_proba(X_test)[:, 1]
predictions = model.predict(X_test)

# Translate the integer labels back to their string names for reporting.
predictions_decoded = label_encoder.inverse_transform(predictions)
y_test_decoded = label_encoder.inverse_transform(y_test)

## 📟 5. Evaluation Report

In [None]:
# Rows are true classes and columns are predicted classes. Passing `labels`
# pins both to the order of `class_names` instead of relying on the implicit
# sorted order of whatever labels happen to appear.
print("Confusion Matrix:")
print(confusion_matrix(y_test_decoded, predictions_decoded, labels=class_names))

print("\nClassification Report:")
# zero_division=0: if a class is never predicted (or never present), its
# undefined precision/recall is reported as 0 rather than a misleading 1.0.
print(classification_report(y_test_decoded, predictions_decoded, zero_division=0))

## 🔲 6. Confusion Matrix Visualization

In [None]:
# Draw the confusion matrix from the decoded (string) labels and keep the
# returned display object so the title can be set on its own axes.
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_true=y_test_decoded,
    y_pred=predictions_decoded,
    display_labels=class_names,
    cmap=plt.cm.Blues,
    xticks_rotation=45,
)
cm_display.ax_.set_title("Confusion Matrix - XGBoost")
plt.show()

## 📈 7. ROC Curve and AUC

In [None]:
# ROC curve from the predicted probability of the class encoded as 1,
# and the area under that curve.
fpr, tpr, _ = roc_curve(y_test, y_scores, pos_label=1)
roc_auc = auc(fpr, tpr)

# Create the figure/axes once and hand the axes to the display. Calling
# plt.figure() and then .plot() without `ax` would leave a stray empty figure,
# because .plot() opens its own figure when no axes is supplied.
fig, ax = plt.subplots()
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot(ax=ax)
ax.set_title("ROC Curve - XGBoost")
plt.show()

print(f"AUC Score: {roc_auc:.2f}")

## 💾 Save the Trained Model

In [None]:
import joblib

# Persist the trained classifier (same file name as before).
joblib.dump(model, 'xgboost_Spam_email_model.pkl')
print("✅ Model saved as 'xgboost_Spam_email_model.pkl'")

# The classifier only accepts the count features produced by `vectorizer`, and
# its outputs are the integers produced by `label_encoder`. Save both so the
# model can be applied to new raw text and its predictions decoded later.
joblib.dump(vectorizer, 'xgboost_Spam_email_vectorizer.pkl')
joblib.dump(label_encoder, 'xgboost_Spam_email_label_encoder.pkl')
print("✅ Vectorizer saved as 'xgboost_Spam_email_vectorizer.pkl'")
print("✅ Label encoder saved as 'xgboost_Spam_email_label_encoder.pkl'")

## ✅ 8. Summary

In [None]:
# Final status line marking the end of the notebook run.
summary_message = "XGBoost classifier trained, evaluated, and visualized. All steps complete."
print(summary_message)