# Spam Email Analysis
This notebook walks through loading the SMS spam dataset, cleaning the message text, converting it to TF-IDF features, training a logistic regression classifier, and evaluating it with accuracy, precision, recall, F1 score, and a confusion matrix.

In [None]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load Data

In [None]:
# Read the raw dataset. Column names are supplied explicitly via `names`, so
# pandas treats the first row of the file as data rather than as a header
# (the file name suggests it has no header row).
csv_path = '../data/sms_spam_no_header.csv'
column_names = ['label', 'message']
df = pd.read_csv(csv_path, names=column_names)

# Preview the first rows to sanity-check the parse.
df.head()

## 2. Text Preprocessing

In [None]:
# Built once when the cell runs instead of on every call to clean_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# A regex is kept for digits because `\d` on str patterns also matches
# non-ASCII decimal digits, which a string.digits table would miss.
_DIGIT_RUN_RE = re.compile(r'\d+')


def clean_text(text):
    """Normalize a raw message before vectorization.

    The text is lower-cased, every character in ``string.punctuation`` and
    every run of digits is removed, and all whitespace runs (including
    leading/trailing whitespace) are collapsed to single spaces.

    Args:
        text: The raw message. Normally a ``str``; ``None`` and NaN (how
            pandas represents an empty CSV field) are treated as an empty
            message, and any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned message as a ``str`` (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower().translate(_PUNCTUATION_TABLE)
    text = _DIGIT_RUN_RE.sub('', text)
    # split() with no arguments drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return ' '.join(text.split())

# Run clean_text over every raw message and keep the result in a separate
# column, leaving the original 'message' column untouched.
raw_messages = df['message']
df['cleaned_message'] = raw_messages.apply(clean_text)

# Preview the raw and cleaned text side by side.
df.head()

## 3. Data Splitting and Vectorization

In [None]:
# Features are the cleaned message text; targets are the class labels.
X = df['cleaned_message']
y = df['label']

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split is reproducible. stratify=y keeps the class proportions the same
# in the train and test partitions, which matters when one class is much
# rarer than the other (typical for spam data -- confirm with
# df['label'].value_counts()).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The vocabulary and IDF weights are fitted on the training text only; the
# test text is merely transformed with them so no test information leaks
# into the features.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

## 4. Model Training

In [None]:
# Logistic regression with scikit-learn's default hyperparameters, fitted on
# the TF-IDF training matrix and the matching training labels.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

## 5. Model Evaluation

In [None]:
# Predict a label for every held-out message.
y_pred = model.predict(X_test)

# Accuracy is computed over all predictions; precision, recall and F1 are
# computed with 'spam' as the positive class.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    score_fn(y_test, y_pred, pos_label='spam')
    for score_fn in (precision_score, recall_score, f1_score)
)

# One metric per line, four decimal places each.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)

## 6. Confusion Matrix

In [None]:
# Use the classifier's own label order for both the matrix and the tick
# labels. Without an explicit `labels` argument, confusion_matrix orders rows
# and columns by the labels it finds in y_test/y_pred, which would not match
# model.classes_ if a class were missing from the test data.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(8, 6))
# Rows are the actual classes, columns the predicted classes; fmt='d' prints
# the counts as integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## 7. Performance Metrics Bar Chart

In [None]:
# Spam-class scores computed in the evaluation cell, keyed by display name.
metrics = {'Precision': precision, 'Recall': recall, 'F1 Score': f1}
metric_names = list(metrics)
metric_scores = list(metrics.values())

plt.figure(figsize=(8, 5))
plt.bar(metric_names, metric_scores)
# Fix the y-axis to 0-1 so bar heights are comparable across runs.
plt.ylim(0, 1)
plt.title('Model Performance Metrics')

# Write each score just above its bar, centered on the bar's x position.
for bar_index, score in enumerate(metric_scores):
    plt.text(bar_index, score + 0.02, f'{score:.4f}', ha='center')
plt.show()