# In [None]:
### Spam Detection with Machine Learning

# This script performs spam classification using machine learning.
# It loads a dataset from a TSV file, converts the message text to TF-IDF features,
# balances the classes with SMOTE, and trains Random Forest and Naive Bayes models.
# The models are evaluated with accuracy and per-class classification metrics,
# and their confusion matrices are visualized for side-by-side comparison.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Load dataset
# Expects a tab-separated file with at least a 'label' and a 'message' column
# (both columns are read below).
df = pd.read_csv("spam_data.tsv", sep='\t')  # Provide your file path

## 2. Show the distribution of 'ham' and 'spam'
print(f"Class distribution:\n{df['label'].value_counts()}")

## 3. Data preparation
X = df['message']  # Features (raw message text)
y = df['label']  # Target (spam or ham labels)

## 4. Split the raw data BEFORE any fitting or resampling
# Splitting first keeps the test set completely unseen: neither the TF-IDF
# vocabulary/IDF weights nor the SMOTE synthetic samples may be derived from
# test rows, otherwise the reported metrics are optimistically biased.
# `stratify=y` preserves the original class ratio in both splits.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

## 5. TF-IDF vectorization for text data
# Fit on the training messages only, then apply the same transform to the test set.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_raw)
X_test = tfidf.transform(X_test_raw)

## 6. Apply SMOTE for balanced classes (training data only)
# The test set keeps its natural class distribution so the evaluation reflects
# real-world performance rather than performance on synthetic samples.
smote = SMOTE(sampling_strategy='auto', random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train_tfidf, y_train_raw)
X_train, y_train = X_resampled, y_resampled

## 7. Create models (Random Forest and Naive Bayes)
rf = RandomForestClassifier(class_weight='balanced', random_state=0)
nb = MultinomialNB()

## 8. Train the Random Forest model
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

## 9. Train the Naive Bayes model
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

## 10. Evaluate the Random Forest model
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

## 11. Evaluate the Naive Bayes model
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb))


def _plot_confusion_matrix(y_true, y_pred, labels, title):
    """Compute a confusion matrix, draw it as an annotated heatmap, and return it.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.
        labels: Class labels fixing the row/column order of the matrix; the
            same sequence is used for the axis tick labels so the two can
            never get out of sync.
        title: Title shown above the heatmap.

    Returns:
        The confusion matrix produced by ``confusion_matrix``.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # Capitalize for display only (e.g. 'ham' -> 'Ham').
    tick_labels = [str(label).capitalize() for label in labels]
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    return cm


# Derive the label order from the data instead of hard-coding it, so the axis
# ticks always match the matrix rows/columns.
class_labels = sorted(y.unique())

## 12. Confusion Matrix visualization for Random Forest
cm_rf = _plot_confusion_matrix(y_test, y_pred_rf, class_labels, 'Random Forest - Confusion Matrix')

## 13. Confusion Matrix visualization for Naive Bayes
cm_nb = _plot_confusion_matrix(y_test, y_pred_nb, class_labels, 'Naive Bayes - Confusion Matrix')
