In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Read the raw SMS corpus from disk, decoding the file as latin-1
df = pd.read_csv(filepath_or_buffer='sms_spam.csv', encoding='latin-1')

# Keep the preview as the cell's final expression so the notebook renders it
df.head(n=5)


In [None]:
# Convert labels to binary (ham = 0, spam = 1).
# Series.map turns every value that is missing from the dict into NaN, so the
# mapping is applied only while the column still holds the raw string labels.
# This keeps the cell safe to re-run without wiping out the encoded labels.
label_map = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_map)

# Fail here, with a clear message, if any label was not 'ham' or 'spam'
# (such rows end up as NaN after the mapping).
assert df['label'].isin(list(label_map.values())).all(), \
    "df['label'] contains values other than 'ham'/'spam'"

# Split the data into features (X) and labels (y)
X = df['message']  # Features (SMS text)
y = df['label']    # Labels (0 = ham, 1 = spam)

# Split the data into training and testing sets (80% training, 20% testing);
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: every row must land in exactly one of the two sets
assert len(X_train) + len(X_test) == len(df)

# Check the shape of the training and test sets
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")


In [None]:
# Build the TF-IDF -> Multinomial Naive Bayes pipeline and fit it on the
# training split in one step (fit returns the fitted pipeline itself)
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(X_train, y_train)

# Score the fitted pipeline on the held-out test split and report it as a percentage
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy * 100:.2f}%")


In [None]:
# Function to predict whether a message is spam or ham
def predict_message(message):
    """Classify a single SMS message with the fitted ``model`` pipeline.

    Args:
        message: Raw SMS text to classify.

    Returns:
        A two-item list ``[spam_probability, label]``: the model's estimated
        probability that the message is spam (a float between 0 and 1) and
        the predicted class as the string ``'spam'`` or ``'ham'``.
    """
    # A single predict_proba call yields one probability per class; both the
    # score and the label are derived from it, so the text is vectorized once.
    class_probabilities = model.predict_proba([message])[0]

    # Probability columns follow the order of model.classes_, so locate the
    # spam class (encoded as 1) by value instead of assuming its position.
    classes = list(model.classes_)
    spam_probability = float(class_probabilities[classes.index(1)])

    # The predicted class is the most probable one.
    predicted_class = classes[class_probabilities.argmax()]
    label = 'spam' if predicted_class == 1 else 'ham'

    # Always report the spam probability, also for ham predictions, so the
    # number means the same thing regardless of the label (low for ham,
    # high for spam).
    return [spam_probability, label]


In [None]:
# Run a handful of hand-written messages through the classifier, printing one
# result per message
sample_messages = (
    "Congrats! You've won a free ticket to Bahamas. Claim now!",
    "Hey, let's catch up soon. Are you free tomorrow?",
    "Limited time offer! 50% off on all products. Hurry up!",
    "Hi, how are you doing today?",
)
for sample in sample_messages:
    print(predict_message(sample))


In [None]:
# Hard-coded example results, written as [probability, label]; these literals
# are not computed by the model. Presumably they illustrate the expected output
# for the four sample messages above (TODO confirm against a real run).
[0.98, 'spam']  # High probability of spam
[0.02, 'ham']   # Low probability of spam, hence 'ham'
[0.99, 'spam']  # High probability of spam
[0.01, 'ham']   # Low probability of spam, hence 'ham'


In [None]:
# Score the fitted pipeline on the held-out split and print it as a percentage
test_accuracy = model.score(X=X_test, y=y_test)
print(f"Test accuracy: {test_accuracy * 100:.2f}%")
