In [9]:
pip install pandas scikit-learn nltk




In [16]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords

# Download stopwords

In [17]:
# Make sure the NLTK stopword corpus is available locally, then hold the
# English entries in a set so membership checks during preprocessing are fast.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Load Dataset

In [18]:
# Read the tab-separated file. No row is consumed as a header; the two
# columns are named explicitly here.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
)


# 2. Preprocessing Function

In [19]:
def preprocess_text(text):
    """Normalise a message for bag-of-words vectorisation.

    The text is lowercased, split on whitespace, stripped of ASCII
    punctuation (``string.punctuation``) and filtered against the
    module-level ``stop_words`` set.

    A token is treated as a stopword if either of these forms is in
    ``stop_words``:

    * the token with only its leading/trailing punctuation trimmed, so
      entries that themselves contain an apostrophe (e.g. "you've",
      "don't", when present in the list) still match, or
    * the token with all punctuation removed.

    Checking only the fully de-punctuated form would turn "you've" into
    "youve" before the lookup and let it slip through the filter.

    Args:
        text: The raw message string.

    Returns:
        The surviving, de-punctuated tokens joined by single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    kept_tokens = []
    for raw_token in text.lower().split():
        bare_token = raw_token.translate(punctuation_table)
        if not bare_token:
            # The token consisted of punctuation only; nothing left to keep.
            continue
        trimmed_token = raw_token.strip(string.punctuation)
        if trimmed_token in stop_words or bare_token in stop_words:
            continue
        kept_tokens.append(bare_token)
    return " ".join(kept_tokens)

In [20]:
# Replace every raw message with its cleaned form (element-wise).
df['message'] = df['message'].map(preprocess_text)

# 3. Convert labels to binary (ham=0, spam=1)

In [21]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
df['label'] = df['label'].map(dict(ham=0, spam=1))

# 4. Train-Test Split

In [22]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.3,
    random_state=42,
)


In [23]:
# 5. Vectorization
# Turn the cleaned messages into token-count matrices.
vectorizer = CountVectorizer()
# Fit on the training split only, so the test split is encoded with the
# vocabulary learned from training data.
X_train_vect = vectorizer.fit_transform(X_train)
# Transform (no fitting) the held-out messages with that same vectorizer.
X_test_vect = vectorizer.transform(X_test)

In [24]:
# 6. Train Naïve Bayes Classifier
# Multinomial Naive Bayes fitted on the training token counts and labels.
nb = MultinomialNB()
nb.fit(X=X_train_vect, y=y_train)

In [25]:
# 7. Predict and Evaluate
# Score the classifier on the held-out split.
y_pred = nb.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.9856459330143541

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1448
           1       0.96      0.93      0.95       224

    accuracy                           0.99      1672
   macro avg       0.97      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [26]:
# --- 8. Test with Custom Messages ---
print("\n--- Testing with Custom Messages ---")
custom_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim.", # Spam
    "Hey, how are you doing today? Let's catch up soon.",              # Ham
    "URGENT! Your account has been suspended. Verify your details now.", # Spam
    "Hi, just confirming our meeting for tomorrow at 10 AM.",          # Ham
    "Free entry to a contest! Text WIN to 12345.",                     # Spam
    "Call me back please, it's urgent."                                # Ham (can be tricky)
]

# Clean the messages with the same preprocessing used for the training data,
# then encode them with the CountVectorizer fitted during training.
# Only transform() is called: fitting again would build a different vocabulary
# that no longer lines up with the features the classifier was trained on.
processed_custom_messages = [preprocess_text(msg) for msg in custom_messages]
X_custom = vectorizer.transform(processed_custom_messages)

# Predict labels with the trained MultinomialNB classifier.
predictions_custom = nb.predict(X_custom)

# Map the integer classes back to their original string labels for display.
label_map = {0: 'ham', 1: 'spam'}
for msg, prediction in zip(custom_messages, predictions_custom):
    predicted_label = label_map[prediction]
    print(f"Message: '{msg}'\nPredicted: {predicted_label}\n")



--- Testing with Custom Messages ---
Message: 'Congratulations! You've won a free iPhone. Click here to claim.'
Predicted: spam

Message: 'Hey, how are you doing today? Let's catch up soon.'
Predicted: ham

Message: 'URGENT! Your account has been suspended. Verify your details now.'
Predicted: spam

Message: 'Hi, just confirming our meeting for tomorrow at 10 AM.'
Predicted: ham

Message: 'Free entry to a contest! Text WIN to 12345.'
Predicted: spam

Message: 'Call me back please, it's urgent.'
Predicted: ham

