## Predict with All Models and Show User-Friendly Explanation

This cell runs the input message through all trained models, shows a table of results, and provides a plain-language summary anyone can understand.

In [1]:
# --- Predict with all models and show results in a user-friendly way ---
import numpy as np
import joblib
import pandas as pd
from collections import Counter

# List of model names and their constructors (must match training)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

model_defs = [
    ('SVC', SVC(kernel='sigmoid', gamma=1.0, probability=True)),
    ('KNeighbors', KNeighborsClassifier()),
    ('MultinomialNB', MultinomialNB()),
    ('DecisionTree', DecisionTreeClassifier(max_depth=5)),
    ('LogisticRegression', LogisticRegression(solver='liblinear', penalty='l1')),
    ('RandomForest', RandomForestClassifier(n_estimators=50, random_state=2)),
    ('AdaBoost', AdaBoostClassifier(n_estimators=50, random_state=2)),
    ('Bagging', BaggingClassifier(n_estimators=50, random_state=2)),
    ('ExtraTrees', ExtraTreesClassifier(n_estimators=50, random_state=2)),
    ('GradientBoosting', GradientBoostingClassifier(n_estimators=50, random_state=2)),
    ('XGBoost', XGBClassifier(n_estimators=50, random_state=2)),
]

# Load vectorizer
tfidf = joblib.load('../../models/main_model/vectorizer.pkl')

# Load or retrain models (for demo, we retrain here; in production, load from disk)
import os
model_dir = '../../models/main_model/'
trained_models = {}
for name, model in model_defs:
    model_path = os.path.join(model_dir, f'{name}.pkl')
    if os.path.exists(model_path):
        trained_models[name] = joblib.load(model_path)
    else:
        # Fallback: retrain on the spot (not recommended for production)
        try:
            model.fit(X_train, y_train)
            trained_models[name] = model
        except Exception as e:
            trained_models[name] = None

# Input message
sample_message = input('Enter a message to check for spam: ')
if 'transform_text' in globals():
    processed = transform_text(sample_message)
else:
    import re
    processed = sample_message.lower()
    processed = re.sub(r'[^a-zA-Z\s]', '', processed)
    processed = re.sub(r'\s+', ' ', processed).strip()

features = tfidf.transform([processed]).toarray()

# Run all models
results = []
for name, model in trained_models.items():
    if model is None:
        results.append({'Model': name, 'Prediction': 'N/A', 'Confidence': 'N/A'})
        continue
    try:
        pred = model.predict(features)[0]
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(features)[0]
            conf = max(proba)
        elif hasattr(model, 'decision_function'):
            # Convert decision function to pseudo-probability
            df = model.decision_function(features)
            conf = 1 / (1 + np.exp(-df))[0] if hasattr(df, '__getitem__') else float(1 / (1 + np.exp(-df)))
        else:
            conf = 'N/A'
        results.append({
            'Model': name,
            'Prediction': 'Spam' if pred == 1 else 'Ham',
            'Confidence': round(float(conf)*100, 1) if conf != 'N/A' else 'N/A'
        })
    except Exception as e:
        results.append({'Model': name, 'Prediction': 'Error', 'Confidence': 'N/A'})

# Show table
df_results = pd.DataFrame(results)
display(df_results)

# Consensus and summary
votes = [r['Prediction'] for r in results if r['Prediction'] in ['Spam', 'Ham']]
vote_counts = Counter(votes)
majority = vote_counts.most_common(1)[0][0] if vote_counts else 'Unknown'
majority_count = vote_counts[majority] if majority != 'Unknown' else 0
total_votes = sum(vote_counts.values())
confidence = round(majority_count / total_votes * 100, 1) if total_votes > 0 else 0

# Find top spammy/hammy words for explanation
spammy_words = ['prize', 'claim', 'click', 'win', 'free', 'offer', 'urgent', 'congratulations']
hammy_words = ['ok', 'thanks', 'see', 'home', 'love', 'good', 'yes', 'no']
found_spammy = [w for w in spammy_words if w in processed.split()]
found_hammy = [w for w in hammy_words if w in processed.split()]

# Plain-language summary
summary = ''
if majority == 'Spam':
    summary += f"Most models ({majority_count} out of {total_votes}) think this message is SPAM. "
    summary += f"Confidence: {confidence}%. "
    if found_spammy:
        summary += f"This is likely because the message contains words like: {', '.join(found_spammy)}. "
    summary += "Spam messages often use such words to get your attention or trick you."
elif majority == 'Ham':
    summary += f"Most models ({majority_count} out of {total_votes}) think this message is NOT spam (ham). "
    summary += f"Confidence: {confidence}%. "
    if found_hammy:
        summary += f"The message contains friendly or common words like: {', '.join(found_hammy)}. "
    summary += "Ham messages are usually personal or normal conversations."
else:
    summary += "The models could not agree on a result. Please try again."

print('\n--- Plain-language Summary ---')
print(summary)


Unnamed: 0,Model,Prediction,Confidence
0,SVC,,
1,KNeighbors,,
2,MultinomialNB,,
3,DecisionTree,,
4,LogisticRegression,,
5,RandomForest,,
6,AdaBoost,,
7,Bagging,,
8,ExtraTrees,,
9,GradientBoosting,,



--- Plain-language Summary ---
The models could not agree on a result. Please try again.
