In [14]:
import pandas as pd

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Clean the message text
def clean_text(text):
    text = text.lower()                        # lowercase
    text = "".join([char for char in text if char not in string.punctuation])  # remove punctuation
    return text

df['clean_message'] = df['message'].apply(clean_text)

# Convert labels to binary values: spam = 1, ham = 0
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Split data
X = df['clean_message']
y = df['label_num']

# Vectorize text (Bag of Words model)
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

print("Preprocessing done. Training data shape:", X_train.shape)

Preprocessing done. Training data shape: (4457, 9544)


In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Create and train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       966
           1       0.93      0.94      0.93       149

    accuracy                           0.98      1115
   macro avg       0.96      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [17]:
def predict_message(msg):
    cleaned = clean_text(msg)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    return "Spam" if prediction == 1 else "Ham"

# Example usage:
test_msg = "Congratulations! You have won a free iPhone. Click now!"
print("Prediction:", predict_message(test_msg))

Prediction: Spam


In [18]:
def predict_message(msg):
    cleaned = clean_text(msg)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    return "Spam" if prediction == 1 else "yes"

# Example usage:
test_msg = "hey,are we meeting for lunch today"
print("Prediction:", predict_message(test_msg))

Prediction: Ham


In [19]:
def predict_message(msg):
    cleaned = clean_text(msg)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    return "Spam" if prediction == 1 else "yes"

# Example usage:
test_msg = "who are you?"
print("Prediction:", predict_message(test_msg))

Prediction: yes


In [21]:
def predict_message(msg):
    cleaned = clean_text(msg)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    return "Spam" if prediction == 1 else "Ham"

# Example usage:
test_msg = "URGENT! your account will be deactivated unless you verify yourself."
print("Prediction:", predict_message(test_msg))

Prediction: Spam
