<a href="https://colab.research.google.com/github/Abinaya7247/Classifields-web-application/blob/main/SMS_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Download the NLTK stop-word corpus used during preprocessing
nltk.download('stopwords')

# Load dataset: keep only the label (v1) and text (v2) columns of the CSV
data = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
data.columns = ['label', 'message']

# Convert ham/spam to 0/1
data['label_num'] = data['label'].map({'ham': 0, 'spam': 1})
# .map() yields NaN for any label other than 'ham'/'spam'; fail here with a
# clear message instead of deep inside model fitting.
assert data['label_num'].notna().all(), "unexpected value in 'label' column"

# Text preprocessing
ps = PorterStemmer()
# Build the stop-word lookup once. stopwords.words() returns a plain list, so
# calling it for every token would rebuild that list and scan it linearly for
# each word; a set gives the same membership answers in constant time.
stop_words = set(stopwords.words('english'))
corpus = []
for raw_message in data['message']:
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_message)  # keep only letters
    tokens = letters_only.lower().split()                 # lowercase + split words
    stems = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stems))

# Convert text to bag-of-words count features (vocabulary capped at 2500 terms)
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so test messages influence which terms are kept.
# Fitting on the training messages only would give a cleaner estimate.
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()
y = data['label_num'].values

# Train/test split (80/20, fixed seed so the reported metrics are reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions on the held-out messages
y_pred = model.predict(X_test)

# Evaluation: overall accuracy, confusion matrix, per-class precision/recall
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n📝 Classification Report:\n", classification_report(y_test, y_pred))

# Test custom SMS
def detect_spam(message):
    """Classify a single SMS text as spam or not spam.

    The text is reduced to letters, lowercased, stripped of English stop
    words and Porter-stemmed, then vectorized with the fitted ``cv`` and
    scored by the trained ``model``.

    Args:
        message: Raw SMS text (str).

    Returns:
        "🚨 Spam" when the model predicts class 1, otherwise "✅ Not Spam".
    """
    # Build the stop-word lookup once per call; stopwords.words() returns a
    # list, so calling it per token would rebuild and linearly scan it for
    # every word.
    stop_words = set(stopwords.words('english'))

    letters_only = re.sub('[^a-zA-Z]', ' ', message)
    tokens = letters_only.lower().split()
    cleaned = ' '.join(ps.stem(token) for token in tokens if token not in stop_words)

    vector = cv.transform([cleaned]).toarray()
    prediction = model.predict(vector)
    # predict() returns one label per input row and exactly one row is passed,
    # so read that element explicitly rather than relying on the truthiness
    # of an array comparison.
    return "🚨 Spam" if prediction[0] == 1 else "✅ Not Spam"

# Try the classifier on one promotional-style and one casual message
sample_messages = (
    "Congratulations! You've won $1000. Claim now!",
    "Hello bro, what's up?",
)
for sample in sample_messages:
    print(detect_spam(sample))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


✅ Accuracy: 0.9820627802690582

📊 Confusion Matrix:
 [[954  11]
 [  9 141]]

📝 Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.93      0.94      0.93       150

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

🚨 Spam
✅ Not Spam


# New Section