In [1]:
# sms_spam_classifier.py

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors

# Fetch the NLTK data packages this notebook relies on: 'stopwords' feeds
# stopwords.words('english') below, and 'punkt' is presumably the tokenizer
# data behind nltk.word_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
# The second call is the cell's last expression, so its return value is what
# the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\divya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# 1. Load data: read spam.csv, keep only the raw 'v1'/'v2' columns and
#    expose them under the names 'Label' and 'Message'.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)

# 2. Clean & tokenize
# English stopwords held in a set so the membership test below is a hash lookup.
stopwords_set = set(stopwords.words('english'))


def preprocess_sms(text):
    """Turn one SMS string into a list of cleaned word tokens.

    The text is lower-cased and tokenized with nltk.word_tokenize; a token is
    kept only if it is purely alphabetic and not in ``stopwords_set``.

    Args:
        text: The raw message string.

    Returns:
        list[str]: Surviving tokens, in their original order.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Drop numbers, punctuation and mixed tokens such as "2nd".
        if not token.isalpha():
            continue
        if token in stopwords_set:
            continue
        kept.append(token)
    return kept

# Tokenize every message; each row of 'tokens' holds a list of strings.
df['tokens'] = df['Message'].apply(preprocess_sms)

In [12]:
import gensim.downloader as api

# 3. Load the pre-trained Google News Word2Vec model through gensim's
#    downloader API (this downloads and caches it).
#    The identifier suggests 300-dimensional vectors, which would match the
#    default size=300 used by text_to_vec — TODO confirm.
w2v_model = api.load("word2vec-google-news-300")

In [13]:
# 4. Message → average word vectors
def text_to_vec(tokens, model, size=300):
    """Embed a token list as the mean of its word vectors.

    Tokens that are not present in ``model`` are ignored. If no token is
    found (empty message or all out-of-vocabulary), a zero vector is
    returned so that every message still maps to a fixed-width feature row.

    Args:
        tokens: Iterable of word strings.
        model: Word-vector lookup supporting ``w in model`` and ``model[w]``
            (e.g. a gensim KeyedVectors instance or a plain dict).
        size: Width of the zero-vector fallback, used only when ``model``
            does not expose a ``vector_size`` attribute.

    Returns:
        numpy.ndarray: 1-D array holding the averaged embedding, or zeros.
    """
    # Take the fallback width from the model when it advertises one, so the
    # zero vector always has the same length as real embeddings and rows can
    # be stacked later regardless of which model was loaded.
    dim = getattr(model, 'vector_size', size)
    vecs = [model[w] for w in tokens if w in model]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

# Embed every tokenized message; w2v_model is forwarded as text_to_vec's
# second positional argument.
df['vec'] = df['tokens'].apply(text_to_vec, args=(w2v_model,))

In [24]:
# 5. Train/test split
# Stack the per-message vectors into one 2-D feature matrix and encode the
# label as 1 for 'spam', 0 otherwise.
X = np.vstack(df['vec'].tolist())
y = df['Label'].eq('spam').astype(int)

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified — if the classes are imbalanced,
# consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Train & evaluate
# fit() returns the estimator itself, so construction and training chain.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# score() reports mean accuracy on the held-out rows.
acc = clf.score(X_test, y_test)
print(f"SMS Spam Classification Accuracy: {acc:.4f}")

SMS Spam Classification Accuracy: 0.9417


In [21]:
# 7. Prediction function
def predict_message_class(classifier, w2v_model, message):
    """Classify a single raw message as 'spam' or 'ham'.

    The message goes through the same pipeline as the training data:
    preprocess_sms for tokens, then text_to_vec for the averaged embedding.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        w2v_model: Word-vector lookup passed on to text_to_vec.
        message: The raw message string.

    Returns:
        str: 'spam' when the predicted label equals 1, otherwise 'ham'.
    """
    features = text_to_vec(preprocess_sms(message), w2v_model)
    # predict expects a 2-D batch, so present the vector as a single row.
    label = classifier.predict(features.reshape(1, -1))[0]
    if label == 1:
        return 'spam'
    return 'ham'

In [23]:
# Two hand-written examples: one promotional message, one everyday message.
message1 = "Congratulations! You’ve won a $1000 Walmart gift card. Click here to claim now."
message2 = "Hey, we are still meeting for lunch today at 1 PM?"

# Classify both with the trained model.
prediction1 = predict_message_class(clf, w2v_model, message1)
prediction2 = predict_message_class(clf, w2v_model, message2)

# Report the results; the first print ends with an extra newline so the two
# reports are separated by a blank line.
print(f"Message: {message1}\nPredicted Class: {prediction1}\n")
print(f"Message: {message2}\nPredicted Class: {prediction2}")

Message: Congratulations! You’ve won a $1000 Walmart gift card. Click here to claim now.
Predicted Class: spam

Message: Hey, we are still meeting for lunch today at 1 PM?
Predicted Class: ham


# 