In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
import joblib

# Load SMS Spam Collection Dataset
# Make sure to have a dataset file named 'spam.csv' with 'label' and 'message' columns
df = pd.read_csv('spam.csv',encoding='latin-1')
df = df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1)
df.columns = ['label', 'message']

# Encode labels: 'spam' = 1, 'ham' = 0
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df = df.dropna()
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.3, random_state=42)

# Define models and parameters for GridSearch
models = {
    'MultinomialNB': {
        'pipeline': Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', MultinomialNB())
        ]),
        'params': {
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'clf__alpha': [0.5, 1.0]
        }
    },
    'LogisticRegression': {
        'pipeline': Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', LogisticRegression(max_iter=200))
        ]),
        'params': {
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'clf__C': [0.1, 1, 10]
        }
    }
}

best_model = None
best_score = 0
best_name = ""
final_conf_matrix = None

# Train and evaluate models
for name, mp in models.items():
    print(f"\n🔍 Training {name}...")
    clf = GridSearchCV(mp['pipeline'], mp['params'], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)

    print(f"✅ Best Parameters for {name}: {clf.best_params_}")
    model = clf.best_estimator_

    # Cross-validation accuracy
    scores = cross_val_score(model, df['message'], df['label'], cv=5)
    print(f"📊 Cross-validation Accuracy for {name}: {np.mean(scores):.4f}")

    # Evaluate on test data
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(f"📈 Test Accuracy for {name}: {acc:.4f}")
    print(f"🧮 Confusion Matrix:\n{cm}")

    if acc > best_score:
        best_score = acc
        best_model = model
        best_name = name
        final_conf_matrix = cm

# Save best model
joblib.dump(best_model, "spam_ham_model.pkl")
print(f"\n✅ Best Model: {best_name} with Accuracy: {best_score:.4f}")
print(f"💾 Model saved as: spam_ham_model.pkl")
print(f"🧾 Confusion Matrix of Best Model:\n{final_conf_matrix}")



🔍 Training MultinomialNB...
✅ Best Parameters for MultinomialNB: {'clf__alpha': 0.5, 'tfidf__ngram_range': (1, 1)}
📊 Cross-validation Accuracy for MultinomialNB: 0.9745
📈 Test Accuracy for MultinomialNB: 0.9725
🧮 Confusion Matrix:
[[1452    1]
 [  45  174]]

🔍 Training LogisticRegression...
✅ Best Parameters for LogisticRegression: {'clf__C': 10, 'tfidf__ngram_range': (1, 2)}
📊 Cross-validation Accuracy for LogisticRegression: 0.9844
📈 Test Accuracy for LogisticRegression: 0.9809
🧮 Confusion Matrix:
[[1449    4]
 [  28  191]]

✅ Best Model: LogisticRegression with Accuracy: 0.9809
💾 Model saved as: spam_ham_model.pkl
🧾 Confusion Matrix of Best Model:
[[1449    4]
 [  28  191]]


In [10]:
spam = joblib.load("spam_ham_model.pkl")
pa = "congrats you have won 100000 dollar"
msg = spam.predict([pa])
print(msg)

[1]
