In [2]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

# Download NLTK resources (only needs to succeed once per machine).
nltk.download('punkt')
# NLTK >= 3.8.2 makes word_tokenize load its models from 'punkt_tab' rather
# than the pickled 'punkt' package; without it tokenizing raises LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load dataset: read spam.csv as latin-1, keep only columns v1 and v2, and
# rename them to 'target' and 'text'.
df = pd.read_csv("spam.csv", encoding="latin-1")
df = df[['v1', 'v2']].rename(columns={'v1': 'target', 'v2': 'text'})

# Encode target: ham -> 0, spam -> 1 (any other label becomes NaN).
df['target'] = df['target'].map({'ham': 0, 'spam': 1})

# Remove duplicates (rows that are identical in both columns).
df = df.drop_duplicates()

# Preprocessing
ps = PorterStemmer()

# Built once at definition time: stopwords.words() re-reads the word list on
# every call, and a set gives constant-time membership tests where the list
# had to be scanned for each token.
_STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Turn a raw message into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``; tokens
    that are not purely alphanumeric or that are English stop words are
    dropped, and the remaining tokens are stemmed with the Porter stemmer.

    Args:
        text: The raw message string.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives the filters).
    """
    tokens = nltk.word_tokenize(text.lower())
    # str.isalnum() already rejects any token containing punctuation, so a
    # separate string.punctuation test can never match and is not needed.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in _STOP_WORDS]
    return " ".join(ps.stem(tok) for tok in kept)

# Clean every message and keep the result in its own column.
df['transformed_text'] = df['text'].apply(transform_text)

# Vectorization: TF-IDF over the cleaned text, capped at 3000 features,
# converted to a dense array.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# split below, so its vocabulary and IDF weights also see the test messages.
# Consider fitting on the training portion only.
tfidf = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf.fit_transform(df['transformed_text'])
X = tfidf_matrix.toarray()
y = df['target'].values

# Train-test split: 20% held out, stratified on the label so spam also goes
# to the test set; random_state is fixed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=2, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Train Naive Bayes (fit returns the fitted estimator itself).
mnb = MultinomialNB().fit(X_train, y_train)

# Evaluation on the held-out portion.
y_pred = mnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Precision:", precision)

# Prediction function
def predict_spam(msg):
    """Classify a single message with the fitted vectorizer and model.

    Uses the module-level ``transform_text``, ``tfidf`` and ``mnb`` objects.

    Args:
        msg: The raw message string.

    Returns:
        A ``(label, probability)`` tuple: ``label`` is ``"Spam"`` when the
        predicted class is 1 and ``"Ham"`` otherwise; ``probability`` is the
        model's probability for that predicted class as a built-in float
        rounded to three decimals.
    """
    cleaned = transform_text(msg)
    features = tfidf.transform([cleaned]).toarray()
    # One inference pass: pick the most probable column, then translate the
    # column position to its class label via classes_ instead of assuming
    # the label value doubles as the column index.
    probabilities = mnb.predict_proba(features)[0]
    best = probabilities.argmax()
    label = mnb.classes_[best]
    # float() keeps the returned value a plain Python number rather than a
    # NumPy scalar.
    return "Spam" if label == 1 else "Ham", round(float(probabilities[best]), 3)

# Test examples: print the (label, probability) pair for two sample messages.
for sample in (
    "Congratulations! You've won a free lottery ticket.",
    "Hey, are we meeting tomorrow?",
):
    print(predict_spam(sample))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.9835589941972921
Confusion Matrix:
 [[903   0]
 [ 17 114]]
Precision: 1.0
('Ham', 0.578)
('Ham', 0.991)
