In [None]:
# Install dependencies
!pip install pandas numpy scikit-learn streamlit joblib

# Download and load SMSSpamCollection dataset
import csv
import os
import pandas as pd
import urllib.request
import zipfile

# Download dataset zip.
# The fetch is skipped when the archive is already on disk so that re-running
# the cell does not hit the network again. The download goes to a temporary
# name first and is renamed only once complete, so an interrupted transfer
# can never be mistaken for a valid archive on the next run.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
if not os.path.exists("smsspamcollection.zip"):
    partial_path, _ = urllib.request.urlretrieve(url, "smsspamcollection.zip.part")
    os.replace(partial_path, "smsspamcollection.zip")

# Extract and read the file
with zipfile.ZipFile("smsspamcollection.zip", "r") as zip_ref:
    zip_ref.extractall()

# The file is raw tab-separated text: one "<label>\t<message>" record per line,
# with no header row and no quoting convention. Message bodies contain literal
# double-quote characters, so quoting must be disabled; with pandas' default
# quoting an unbalanced '"' makes the parser swallow the following line(s)
# into a single field, silently dropping messages.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"],
    quoting=csv.QUOTE_NONE,
)

# Label encoding and train/test partitioning
from sklearn.model_selection import train_test_split

# Turn the string labels into integers: spam -> 1, ham -> 0.
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

# Features are the raw message text; targets are the encoded labels.
X, y = df['text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Feature extraction (TF-IDF) and classifier training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are projected onto that same fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF features; fit() returns the fitted
# estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the classifier on the held-out split
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1 / support table
test_report = classification_report(y_test, y_pred)
print(test_report)

# Persist the fitted classifier and the fitted vectorizer; both are needed
# later, since new text must be vectorized with the same vocabulary before
# it can be passed to the model.
import joblib

joblib.dump(model, "spam_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")


Accuracy: 0.97847533632287
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



['tfidf_vectorizer.pkl']