In [2]:
# Import necessary libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
# Source: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
# Read the CSV as latin-1, keep only the `v1` and `v2` columns, and give
# them descriptive names (`v1` -> label, `v2` -> message).
df = (
    pd.read_csv("spam.csv", encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Step 2: Preprocess text (cleaning)
# Lowercase every message and delete all ASCII punctuation characters.
#
# A translation table is used instead of a regex character class built from
# string.punctuation: inside `[...]` the sequence `\]` is parsed as an
# escaped `]`, so the backslash itself was never part of the class and was
# silently left in the text. str.translate treats every character literally.
punctuation_table = str.maketrans('', '', string.punctuation)
df['message'] = df['message'].str.lower().str.translate(punctuation_table)

# Step 3: Split the data into training and testing sets
# 20% of the rows are held out for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Step 4: Convert text data into TF-IDF feature vectors
# English stop words are removed, and terms that occur in more than 70% of
# the training messages are ignored (max_df=0.7).
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Create and train the SVM model
# fit() returns the estimator itself, so construction and training chain.
model = svm.SVC(kernel='linear').fit(X_train_tfidf, y_train)

# Step 6: Make predictions on the held-out test features
y_pred = model.predict(X_test_tfidf)

# Step 7: Evaluate the performance
# Overall accuracy first, then per-class precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", report)

# Step 8: Try predicting with new messages
examples = [
    "Congratulations! You've won a free ticket to the Bahamas! tap here to claim now!",
    "Hey, are we still meeting for dinner tonight?"
]

# New messages must go through the same cleaning as the training messages
# (Step 2: lowercase + strip punctuation) before vectorizing. Otherwise a
# word such as "You've" is tokenized differently at inference time than it
# was during training, and may miss the learned vocabulary.
punctuation_table = str.maketrans('', '', string.punctuation)
cleaned_examples = [msg.lower().translate(punctuation_table) for msg in examples]

example_features = vectorizer.transform(cleaned_examples)
predictions = model.predict(example_features)

# Report against the original, uncleaned text so the output stays readable.
for msg, label in zip(examples, predictions):
    print(f"\nMessage: {msg}\nPrediction: {label}")


Model Accuracy: 0.9811659192825112

Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.98      0.87      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Message: Congratulations! You've won a free ticket to the Bahamas! tap here to claim now!
Prediction: spam

Message: Hey, are we still meeting for dinner tonight?
Prediction: ham
