In [1]:
# Import the pandas library
import pandas as pd

# Load the dataset
# It's a tab-separated file, so we use sep='\t'
# We'll also name the columns for clarity
df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'message'])

# Display the first 5 rows of the data
print("First 5 rows of the dataset:")
print(df.head())

# Get some basic information about the dataset
print("\nDataset Info:")
df.info()

# See the distribution of labels
print("\nLabel Distribution:")
print(df['label'].value_counts())


First 5 rows of the dataset:
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB

Label Distribution:
label
ham     4825
spam     747
Name: count, dtype: int64


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Encode the labels
# Convert 'ham' and 'spam' into 0 and 1
encoder = LabelEncoder()
df['label_encoded'] = encoder.fit_transform(df['label'])

# Now, 'ham' is 0 and 'spam' is 1
print("\nFirst 5 rows with encoded labels:")
print(df.head())

# 2. Define our features (X) and target (y)
X = df['message']
y = df['label_encoded']

# 3. Split the data into training and testing sets
# We train the model on the training set and evaluate it on the unseen testing set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining data size: {len(X_train)}")
print(f"Testing data size: {len(X_test)}")

# 4. Create and fit the TF-IDF Vectorizer
# This learns the vocabulary from our training data and converts text to vectors
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Only transform the test data (using the vocabulary learned from training)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


First 5 rows with encoded labels:
  label                                            message  label_encoded
0   ham  Go until jurong point, crazy.. Available only ...              0
1   ham                      Ok lar... Joking wif u oni...              0
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...              1
3   ham  U dun say so early hor... U c already then say...              0
4   ham  Nah I don't think he goes to usf, he lives aro...              0

Training data size: 4457
Testing data size: 1115


In [4]:
# --- RUN THIS CELL FIRST! ---

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Naive Bayes classifier
model = MultinomialNB()

# Train the model on the TF-IDF transformed training data
model.fit(X_train_tfidf, y_train)

print("\nModel trained successfully!")


Model trained successfully!


In [5]:
# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

# Print a detailed classification report
# Precision: Of all messages predicted as spam, how many were actually spam?
# Recall: Of all the actual spam messages, how many did we correctly identify?
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Ham (Legitimate)', 'Spam (Phishing)']))

# Display the confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)


Model Accuracy: 97.13%

Classification Report:
                  precision    recall  f1-score   support

Ham (Legitimate)       0.97      1.00      0.98       966
 Spam (Phishing)       1.00      0.79      0.88       149

        accuracy                           0.97      1115
       macro avg       0.98      0.89      0.93      1115
    weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
[[966   0]
 [ 32 117]]


In [6]:
def predict_message(message):
    """
    Takes a string message and predicts if it's ham or spam.
    """
    # Transform the new message using the same vectorizer
    message_tfidf = tfidf_vectorizer.transform([message])
    
    # Make a prediction
    prediction = model.predict(message_tfidf)
    
    # Get the probability of it being spam
    prediction_prob = model.predict_proba(message_tfidf)[0][1]
    
    if prediction[0] == 0:
        return f"Prediction: HAM (Legitimate) | Spam Probability: {prediction_prob:.2%}"
    else:
        return f"Prediction: SPAM (Phishing) | Spam Probability: {prediction_prob:.2%}"

# --- Test with some examples ---

# Example 1: A likely legitimate message
test_ham = "Hey, are we still on for the meeting tomorrow at 2pm?"
print(f"\nTesting message: '{test_ham}'")
print(predict_message(test_ham))

# Example 2: A classic spam/phishing message
test_spam = "Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/scamlink to claim now."
print(f"\nTesting message: '{test_spam}'")
print(predict_message(test_spam))

# Example 3: Another spam message
test_spam_2 = "URGENT: Your account has been compromised. Please verify your identity by clicking here immediately."
print(f"\nTesting message: '{test_spam_2}'")
print(predict_message(test_spam_2))


Testing message: 'Hey, are we still on for the meeting tomorrow at 2pm?'
Prediction: HAM (Legitimate) | Spam Probability: 0.41%

Testing message: 'Congratulations! You've won a $1000 Walmart gift card. Go to http://bit.ly/scamlink to claim now.'
Prediction: SPAM (Phishing) | Spam Probability: 69.39%

Testing message: 'URGENT: Your account has been compromised. Please verify your identity by clicking here immediately.'
Prediction: SPAM (Phishing) | Spam Probability: 63.63%
