📘 SMS Spam Classification Notebook

## Import libraries

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib
import warnings
warnings.filterwarnings("ignore")

## Load the dataset

In [None]:

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_table(url, header=0, names=["label", "message"])

##  Convert labels to binary

In [None]:

df['label'] = df['label'].map({'ham': 0, 'spam': 1})

## Features and labels

In [None]:

X = df['message']
y = df['label']

## Split into train/test

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Vectorize the text data using TF-IDF

In [None]:
tfidf = TfidfVectorizer(max_features=3000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


##  Define models

In [None]:

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': LinearSVC()
}

### Train the model

In [13]:
results = []
for name, model in models.items():
    # Train the model
    model.fit(X_train_vec, y_train)

    # Make predictions
    y_pred = model.predict(X_test_vec)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, pos_label=1)  # Use 1 if 'spam' is encoded as 1

    # Store results
    results.append({
        'Model': name,
        'Accuracy': acc,
        'F1 Score': f1
    })

# 5. Display results
results_df = pd.DataFrame(results)
print(results_df)

                 Model  Accuracy  F1 Score
0  Logistic Regression  0.970404  0.885813
1          Naive Bayes  0.976682  0.911565
2        Random Forest  0.982960  0.937294
3                  SVM  0.985650  0.948052


## View best model 

In [None]:

best_model_name = results_df.iloc[0]['Model']
print(f"\n✅ Best model: {best_model_name}")


✅ Best model: SVM


## Prediction

In [None]:
# Make a prediction on a new message
new_message = ["Congratulations! You've won a free iPhone! Click here to claim now."]

# Convert the new message into numbers using the same vectorizer
new_message_vec = tfidf.transform(new_message)


# Predict using the trained model
best_model = models[best_model_name] # create a reference to the best model
prediction = best_model.predict(new_message_vec) # make the prediction

# Print the result
print("Prediction:", prediction[0]) # 0 for 'ham', 1 for 'spam'

Prediction: 1
