# 📧 SMS Spam Detection (Spam vs Ham)

This notebook performs SMS spam detection by combining several feature types (handcrafted text statistics, POS-tag counts, and TF-IDF vectors) and comparing multiple models. Dataset: UCI SMS Spam Collection.

In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Fetch the NLTK resources used below (tokenizer, WordNet, stop words, POS tagger).
# Recent NLTK releases load the tokenizer and tagger from 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', while older releases use the original
# names, so both generations are requested.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

## 📥 Load the Dataset

Download the dataset from the [UCI Repository](https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip) and extract the `SMSSpamCollection` file into the same directory as this notebook.

In [None]:
import csv

# The file is tab-separated and read without a header row: the first column
# becomes 'label', the second 'message'.
# QUOTE_NONE makes pandas treat double quotes inside a message as ordinary
# characters; with the default quoting a stray '"' can merge the following
# rows into a single message.
df = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    names=['label', 'message'],
    quoting=csv.QUOTE_NONE,
)
df.head()

## 🔍 Feature Engineering

In [None]:
# Handcrafted features: one numeric column per surface statistic of the raw message.
def _count_chars(text, predicate):
    """Return how many characters of *text* satisfy *predicate*."""
    return sum(1 for ch in text if predicate(ch))


raw_messages = df['message']
df['message_len'] = raw_messages.map(len)
df['num_digits'] = raw_messages.map(lambda msg: _count_chars(msg, str.isdigit))
df['num_uppercase'] = raw_messages.map(lambda msg: _count_chars(msg, str.isupper))
df['num_punctuation'] = raw_messages.map(
    lambda msg: _count_chars(msg, lambda ch: ch in string.punctuation)
)
# Words are whitespace-separated chunks.
df['num_words'] = raw_messages.map(lambda msg: len(msg.split()))

# Text preprocessing
# Translation table that deletes every ASCII punctuation character, and the
# digit-run pattern; both are built once instead of on every clean_text call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')

# clean_text strips punctuation *before* the stop-word filter runs, so a
# contraction such as "don't" arrives there as "dont". Adding the
# punctuation-free spelling of every stop word lets those tokens be filtered too.
stop_words = set(stopwords.words('english'))
stop_words |= {word.translate(_PUNCT_TABLE) for word in stop_words}
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Normalise one message for TF-IDF vectorisation.

    Lowercases the text, deletes digit runs and ASCII punctuation, tokenises
    with NLTK, drops stop words and lemmatises the remaining tokens.

    Args:
        text: Raw message string.

    Returns:
        The surviving lemmas joined by single spaces (may be an empty string).
    """
    text = _DIGIT_RUN.sub('', text.lower())
    text = text.translate(_PUNCT_TABLE)
    tokens = nltk.word_tokenize(text)
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


df['cleaned'] = df['message'].apply(clean_text)

In [None]:
# POS tag feature: count nouns, verbs and adjectives.
def pos_counts(text):
    """Count nouns, verbs and adjectives in *text*.

    The text is tokenised and tagged with ``nltk.pos_tag``. Tags are grouped
    by prefix, so every noun tag (NN, NNS, NNP, NNPS), verb tag (VB, VBD, VBG,
    VBN, VBP, VBZ) and adjective tag (JJ, JJR, JJS) is counted, not just the
    bare NN / VB / JJ tags.

    Args:
        text: String to analyse.

    Returns:
        A ``(noun_count, verb_count, adj_count)`` tuple of ints.
    """
    nouns = verbs = adjectives = 0
    for _, tag in nltk.pos_tag(nltk.word_tokenize(text)):
        if tag.startswith('NN'):
            nouns += 1
        elif tag.startswith('VB'):
            verbs += 1
        elif tag.startswith('JJ'):
            adjectives += 1
    return nouns, verbs, adjectives

# Tag every cleaned message, then spread the (noun, verb, adjective) tuples
# over three new columns, keeping df's index so rows stay aligned.
pos_columns = ['noun_count', 'verb_count', 'adj_count']
pos_tuples = df['cleaned'].map(pos_counts)
df[pos_columns] = pd.DataFrame(pos_tuples.tolist(), index=df.index)

## 🔡 TF-IDF Vectorization

In [None]:
# Build a TF-IDF representation of the cleaned messages, with the vocabulary
# capped at 3000 terms.
# NOTE(review): the vectorizer is fitted on every row, before the train/test
# split further down, so test-set term statistics leak into the features.
tfidf = TfidfVectorizer(max_features=3000)
X_tfidf = tfidf.fit_transform(df['cleaned'])

# Densify into a DataFrame that carries df's own index, so the column-wise
# concat with the handcrafted features pairs rows correctly even when df's
# index is not a plain 0..n-1 range.
X_tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    index=df.index,
    columns=tfidf.get_feature_names_out(),
)

In [None]:
# Combine TF-IDF and handcrafted features into one design matrix.
handcrafted_columns = [
    'message_len',
    'num_digits',
    'num_uppercase',
    'num_punctuation',
    'num_words',
    'noun_count',
    'verb_count',
    'adj_count',
]
feature_blocks = [X_tfidf_df, df[handcrafted_columns]]
features = pd.concat(feature_blocks, axis=1)

# Encode the target as integers: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}
labels = df['label'].map(label_to_id)

## 🤖 Model Training & Evaluation

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded so the forest, and therefore its reported scores, are reproducible.
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Fit each model on the same split and print its per-class metrics.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name} Results:")
    print(classification_report(y_test, y_pred))

## 🧹 Data Cleaning and Preprocessing

In [None]:
# Report missing values per column and the number of fully duplicated rows.
# NOTE(review): in this notebook's cell order this step runs after the
# features and models above were built, so it does not affect them.
null_counts = df.isnull().sum()
duplicate_count = df.duplicated().sum()
print("Null values:\n", null_counts)
print("\nDuplicate rows:", duplicate_count)

# Remove duplicate rows in place and renumber the index from 0.
df.drop_duplicates(inplace=True, ignore_index=True)

## 📊 Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of how many messages carry each label.
label_ax = sns.countplot(x='label', data=df)
label_ax.set_title("Distribution of Spam and Ham Messages")
plt.show()

In [None]:
# Histogram (50 bins, with KDE overlay) of message length, coloured by label.
length_ax = sns.histplot(x='message_len', hue='label', data=df, bins=50, kde=True)
length_ax.set_title("Message Length Distribution")
plt.show()

In [None]:
# Box plot comparing the per-message word count of each label.
words_ax = sns.boxplot(x='label', y='num_words', data=df)
words_ax.set_title("Word Count by Message Type")
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the handcrafted features.
numerics = [
    'message_len',
    'num_digits',
    'num_uppercase',
    'num_punctuation',
    'num_words',
    'noun_count',
    'verb_count',
    'adj_count',
]
feature_corr = df[numerics].corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix of Handcrafted Features")
plt.show()