# Spam Detection Project
This notebook trains a spam classifier using TF-IDF + Logistic Regression and saves artifacts for deployment.

In [None]:
# Imports and data (mock sample - replace with real CSV as needed)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# Mock dataset (replace with reading real dataset).
# `labels` is aligned with `texts` by position; 1 appears to mark spam and
# 0 legitimate messages (inferred from the sample texts).
texts = [
    'Win money now', 'Hello friend', 'Limited offer!!!', 'How are you?', 'Free coupons',
    'Congratulations, you have won a prize', 'Meeting at 10am', 'Important update about your account',
    'Claim your free gift', 'Lunch tomorrow?',
]
labels = [1, 0, 1, 0, 1, 1, 0, 0, 1, 0]

df = pd.DataFrame({'text': texts, 'label': labels})

X = df['text']
y = df['label']

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF over unigrams and bigrams (vocabulary capped at 5000 terms) feeding a
# logistic regression classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
    ('clf', LogisticRegression(max_iter=1000)),
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print('Classification report:')
print(classification_report(y_test, y_pred))

# ROC AUC is reported on a best-effort basis: roc_auc_score raises ValueError
# when the score is undefined for the given labels. Only that error is caught,
# and the reason is printed, so the metric never disappears silently and any
# other failure still surfaces.
positive_scores = pipeline.predict_proba(X_test)[:, 1]  # column 1 = second class (label 1)
try:
    roc_auc = roc_auc_score(y_test, positive_scores)
except ValueError as exc:
    print('ROC AUC unavailable:', exc)
else:
    print('ROC AUC:', roc_auc)

# Save artifacts: the fitted vectorizer and classifier individually, plus the
# complete pipeline, all written to the current working directory.
joblib.dump(pipeline.named_steps['tfidf'], 'vectorizer.pkl')
joblib.dump(pipeline.named_steps['clf'], 'model.pkl')
joblib.dump(pipeline, 'pipeline_full.joblib')
print('Saved vectorizer.pkl, model.pkl and pipeline_full.joblib')


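**Notes:** Replace the mock data with a real dataset, e.g. `df = pd.read_csv('spambase.csv')`, making sure it provides a `text` column (raw message strings) and a `label` column (1 = spam, 0 = not spam), then re-run the cells.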