1. loads the SMS Spam Collection dataset

In [18]:
import pandas as pd

# Download the dataset
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

# Show the first few rows
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


2. Preprocess the Text Data

In [19]:
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert labels to 0 (ham) and 1 (spam)
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Basic text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    return text

df['clean_message'] = df['message'].apply(clean_text)

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=5)
X = vectorizer.fit_transform(df['clean_message'])
y = df['label_num']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


3. Train and Test the Model - Logistic Regression Model

In [20]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_log = log_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("\nClassification Report:\n", classification_report(y_test, y_pred_log))

Accuracy: 0.968609865470852

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



4. Test the regression model with the same test

In [21]:
def predict_message(msg):
    clean = clean_text(msg)
    vec = vectorizer.transform([clean])
    pred = log_model.predict(vec)[0]  # ← use logistic regression model here
    return "Spam" if pred == 1 else "Not Spam"

# Test again
print(predict_message("Win a FREE iPhone now! Click here!"))
print(predict_message("Hey, are we still meeting at 5pm?"))

Spam
Not Spam


5. Save the model and vectorizer

In [23]:
import joblib

# Save model separately
model_path = "spam_classifier_model.pkl"
vectorizer_path = "tfidf_vectorizer.pkl"

joblib.dump(log_model, model_path)
joblib.dump(vectorizer, vectorizer_path)

print("Files saved:", model_path, vectorizer_path)

Files saved: spam_classifier_model.pkl tfidf_vectorizer.pkl


6. Download the files from Colab

In [24]:
from google.colab import files

files.download("spam_classifier_model.pkl")
files.download("tfidf_vectorizer.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>