In [12]:
import pandas as pd
import numpy as np
import os

# Load dataset
df = pd.read_csv('../data/dataspam.csv', encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'text']

print("✅ Dataset Loaded")
df.head()


✅ Dataset Loaded


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize and remove stopwords, apply stemming
    tokens = text.split()
    cleaned = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned)

# Apply to entire column
df['clean_text'] = df['text'].apply(preprocess)
df[['text', 'clean_text']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\VIGNANI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,clean_text
0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Encode labels: ham=0, spam=1
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# TF-IDF
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['clean_text']).toarray()
y = df['label_num'].values

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✅ Data ready for training")


✅ Data ready for training


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("🎯 Precision:", precision_score(y_test, y_pred))
print("📢 Recall:", recall_score(y_test, y_pred))
print("\n🧾 Report:\n", classification_report(y_test, y_pred))


✅ Accuracy: 0.97847533632287
🎯 Precision: 1.0
📢 Recall: 0.84

🧾 Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [16]:
import pickle

# Ensure model folder exists
os.makedirs('../model', exist_ok=True)

# Save model
with open('../model/spam_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save vectorizer
with open('../model/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

print("✅ Saved to model/")


✅ Saved to model/


In [17]:
# Load model and vectorizer
with open('../model/spam_classifier.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

with open('../model/tfidf_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# Predict function
def predict_message(msg):
    cleaned = preprocess(msg)
    vect = loaded_vectorizer.transform([cleaned])
    pred = loaded_model.predict(vect)
    return "🚫 SPAM" if pred[0] == 1 else "✅ HAM (not spam)"

# Try examples
print(predict_message("Congrats! You've won a free iPhone. Click to claim."))
print(predict_message("Hey, are we still meeting at 5 PM today?"))


🚫 SPAM
✅ HAM (not spam)
