In [1]:
# ðŸ“Œ Step 1: Import Libraries
import pandas as pd
import numpy as np
import string
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Download stopwords (only once)
nltk.download('stopwords')

# ðŸ“Œ Step 2: Load Dataset
df = pd.read_csv('dataset/spam.csv', encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']  # Rename columns for clarity
df.head()

# ðŸ“Œ Step 3: Text Cleaning Function
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = text.split()
    cleaned = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned)

df['cleaned_text'] = df['text'].apply(clean_text)

# ðŸ“Œ Step 4: Encode Labels
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# ðŸ“Œ Step 5: TF-IDF Vectorization
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label_num']

# ðŸ“Œ Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ðŸ“Œ Step 7: Model Training
model = MultinomialNB()
model.fit(X_train, y_train)

# ðŸ“Œ Step 8: Evaluation
y_pred = model.predict(X_test)

print("âœ… Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nðŸ“Š Classification Report:\n", classification_report(y_test, y_pred))
print("\nðŸ“‰ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ðŸ“Œ Step 9: Test a Custom SMS
sample = ["Congratulations! You've won a free ticket. Call now!"]
sample_cleaned = tfidf.transform([clean_text(sample[0])])
prediction = model.predict(sample_cleaned)
print("\nðŸ“¬ Sample Prediction:", "Spam" if prediction[0] == 1 else "Ham")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


âœ… Accuracy Score: 0.9614349775784753

ðŸ“Š Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.71      0.83       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115


ðŸ“‰ Confusion Matrix:
 [[965   0]
 [ 43 107]]

ðŸ“¬ Sample Prediction: Ham
