# Spam Detection System

1. Import libraries


In [17]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

2. Load dataset

In [18]:
# Read the raw CSV, discard the three "Unnamed" columns, and give the two
# remaining columns descriptive names.
df = (
    pd.read_csv("spam.csv", encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'label', 'v2': 'text'})
)


3. Clean the text

In [19]:
# Normalise the message text in a single chain: lowercase, replace every
# character that is not a-z, 0-9 or whitespace with a space, then collapse
# whitespace runs to one space and trim both ends.
df['text'] = (
    df['text']
    .str.lower()
    .str.replace(r"[^a-z0-9\s]", " ", regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

4. Encode labels

In [20]:
# Fit the encoder on the string labels, then store the resulting integer
# codes in a separate column so the original labels stay available.
le = LabelEncoder()
le.fit(df['label'])
df['label_encoded'] = le.transform(df['label'])

5. TF-IDF Vectorization

In [21]:
tfidf = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=2, ngram_range=(1,2))
X = tfidf.fit_transform(df['text'])
y = df['label_encoded']

6. Split the data

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

7. Train models

In [23]:
# Fit each classifier on the training features, then predict the test set.

# Multinomial Naive Bayes with default settings.
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Random forest: 150 trees, fixed seed.
rf = RandomForestClassifier(n_estimators=150, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Linear-kernel SVM with probability=True, fixed seed.
svm = SVC(kernel='linear', probability=True, random_state=42).fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)


8. Evaluate models

In [24]:
def evaluate_model(name, y_true, y_pred):
    """Print a short classification report for one model.

    Parameters
    ----------
    name : label shown in the section header.
    y_true : ground-truth labels.
    y_pred : labels predicted by the model.

    Prints accuracy, precision, recall and F1 (each rounded to 4 decimals)
    followed by the confusion matrix. Returns nothing.
    """
    # Caption/scorer pairs, in the order they are printed.
    scorers = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1-Score :", f1_score),
    )
    print(f"\n--- {name} ---")
    for caption, scorer in scorers:
        print(caption, round(scorer(y_true, y_pred), 4))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Report every trained model against the same held-out labels.
for model_name, predictions in (
    ("Naive Bayes", y_pred_nb),
    ("Random Forest", y_pred_rf),
    ("SVM", y_pred_svm),
):
    evaluate_model(model_name, y_test, predictions)


--- Naive Bayes ---
Accuracy : 0.974
Precision: 1.0
Recall   : 0.8067
F1-Score : 0.893
Confusion Matrix:
 [[965   0]
 [ 29 121]]

--- Random Forest ---
Accuracy : 0.9776
Precision: 0.9921
Recall   : 0.84
F1-Score : 0.9097
Confusion Matrix:
 [[964   1]
 [ 24 126]]

--- SVM ---
Accuracy : 0.9776
Precision: 0.9771
Recall   : 0.8533
F1-Score : 0.911
Confusion Matrix:
 [[962   3]
 [ 22 128]]


9. Predict on new message

In [25]:
def predict_spam(message, model=None):
    """Classify a single raw message and return its original string label.

    Parameters
    ----------
    message : str
        Raw message text.
    model : fitted classifier exposing ``predict``, optional
        When omitted, the notebook-level ``nb`` is looked up at call time, so
        a model retrained by re-running the training cell is picked up
        without having to redefine this function.

    Returns
    -------
    The label produced by ``le.inverse_transform`` for the prediction.
    """
    if model is None:
        # Resolve the default lazily rather than freezing it at definition time.
        model = nb
    # Same cleaning steps as applied to df['text'] before vectorization:
    # lowercase, replace non-alphanumerics with spaces, collapse whitespace.
    cleaned = re.sub(r"[^a-z0-9\s]", " ", message.lower())
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    features = tfidf.transform([cleaned])
    predicted = model.predict(features)
    return le.inverse_transform(predicted)[0]

# Example usage: classify two hand-written messages with the default model.
test_message1 = "Congratulations! You've won a free cruise! Call now to claim your prize."
test_message2 = "Hey, just wanted to remind you about our meeting tomorrow."

print("\n")
for sample in (test_message1, test_message2):
    print(f"'{sample}' ➜ {predict_spam(sample)}")



'Congratulations! You've won a free cruise! Call now to claim your prize.' ➜ spam
'Hey, just wanted to remind you about our meeting tomorrow.' ➜ ham


10. Insights & interpretation  
- Discuss which model performed best and why.  
- Identify common words/features in spam vs. ham (for example, by inspecting the top TF-IDF features).  
- Understand false positives / false negatives and their impact.  
