In [None]:
# Cell 1: Imports and Setup
import pandas as pd
import time
import logging
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import os
from IPython.display import display

# Configure logging
# Two destinations for every record: a log file on disk and the console stream.
_log_handlers = [
    logging.FileHandler('spam_detection.log'),
    logging.StreamHandler(),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=_log_handlers,
)
# Module-level logger used by all functions in this file.
logger = logging.getLogger(__name__)

# Cell 2: Utility Functions
def log_time(message, start_time):
    """Log how long a step took and hand the duration back to the caller.

    Args:
        message: Label of the step; it is used as the prefix of the log entry.
        start_time: ``time.time()`` reading taken when the step began.

    Returns:
        Seconds elapsed between ``start_time`` and the moment of the call.
    """
    now = time.time()
    duration = now - start_time
    logger.info(f"{message} took {duration:.4f} seconds")
    return duration

def preview_file(file_path, num_lines=5):
    """Write the first ``num_lines`` lines of a text file to the log.

    This is a best-effort diagnostic: any failure (missing file, decoding
    problem, ...) is logged as an error and then swallowed.

    Args:
        file_path: Path of the UTF-8 text file to peek into.
        num_lines: Maximum number of leading lines to log.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            logger.info(f"Previewing first {num_lines} lines of {file_path}:")
            for line_no, raw_line in enumerate(handle, start=1):
                if line_no <= num_lines:
                    logger.info(f"Line {line_no}: {raw_line.strip()}")
                else:
                    break
    except Exception as e:
        logger.error(f"Error previewing file: {e}")

def clean_dataset(file_path, output_path='data/cleaned_dataset.csv'):
    """Rewrite a raw tab-separated dataset as strict ``label<TAB>text`` lines.

    For every non-blank input line, runs of tabs are collapsed to one tab and
    double quotes are removed. The first tab-separated field becomes the label
    and the remaining fields are joined with spaces to form the text, so each
    output line contains exactly one tab. Lines with fewer than two fields are
    skipped with a warning.

    Args:
        file_path: Path of the raw dataset (UTF-8, with or without a BOM).
        output_path: Where the cleaned file is written. Its parent directory is
            created when the path has one.

    Returns:
        ``output_path``, unchanged.

    Raises:
        Exception: Any error raised while reading or writing is logged and
            re-raised.
    """
    logger.info(f"Cleaning dataset: {file_path}")
    start = time.time()
    collapse_tabs = re.compile(r'\t+').sub  # compiled once, outside the line loop
    cleaned_lines = []
    try:
        # 'utf-8-sig' decodes plain UTF-8 as well, but drops a leading byte-order
        # mark that would otherwise stay attached to the first label.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                if not line.strip():
                    continue
                line = collapse_tabs('\t', line)
                line = line.replace('"', '')
                label, *message_parts = line.strip().split('\t')
                if message_parts:
                    text = ' '.join(message_parts)
                    cleaned_lines.append(f"{label}\t{text}")
                else:
                    logger.warning(f"Skipping malformed line: {line.strip()}")
        # os.path.dirname() is '' for a bare file name and os.makedirs('')
        # raises, so only create the directory when there is one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(cleaned_lines))
        logger.info(f"Cleaned dataset saved to {output_path}")
        log_time("Dataset cleaning", start)
        return output_path
    except Exception as e:
        logger.error(f"Error cleaning dataset: {e}")
        raise

# Cell 3: Data Loading and Preprocessing
def load_and_preprocess_data(file_path='data/dataset.csv'):
    """Preview, clean and load the dataset into a two-column DataFrame.

    The raw file is previewed in the log, cleaned with ``clean_dataset`` and
    read back as tab-separated text. Labels ``ham``/``spam`` are mapped to
    ``not_spam``/``spam``; rows with any other label or without text are
    dropped, and the number of dropped rows is logged.

    Args:
        file_path: Path of the raw tab-separated dataset.

    Returns:
        DataFrame with string columns ``label`` (``not_spam`` or ``spam``) and
        ``text``.

    Raises:
        Exception: Errors from cleaning propagate; errors while loading are
            logged and re-raised.
    """
    start = time.time()
    preview_file(file_path)
    cleaned_file_path = clean_dataset(file_path)
    try:
        df = pd.read_csv(
            cleaned_file_path,
            sep='\t',
            header=None,
            names=['label', 'text'],
            engine='python',
            encoding='utf-8',
            quoting=3,  # csv.QUOTE_NONE: quote characters are ordinary text
            dtype=str,
            # Messages are free text: without this, a message that is literally
            # "NA", "null", "None", ... is parsed as missing and then dropped.
            keep_default_na=False,
        )
        rows_read = len(df)
        df['label'] = df['label'].map({'ham': 'not_spam', 'spam': 'spam'})
        df = df.dropna(subset=['label', 'text'])
        # With default NA parsing disabled an empty field arrives as '' instead
        # of NaN; keep treating it as missing text.
        df = df[df['text'] != '']
        df = df[df['label'].isin(['not_spam', 'spam'])]
        rows_dropped = rows_read - len(df)
        if rows_dropped:
            logger.warning(
                f"Dropped {rows_dropped} rows with an unknown label or missing text"
            )
        logger.info(f"Loaded {len(df)} valid samples")
        logger.info(f"Class distribution: {df['label'].value_counts().to_dict()}")
        log_time("Data loading and preprocessing", start)
        return df
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise

# Cell 4: Text Vectorization
def vectorize_text(X_train, X_test):
    """Turn raw texts into TF-IDF feature matrices.

    The vectorizer is fitted on the training texts only and then applied to
    the test texts; both matrices are cast to ``float32``.

    Args:
        X_train: Training texts.
        X_test: Test texts.

    Returns:
        Tuple ``(vectorizer, X_train_vect, X_test_vect)``.
    """
    began = time.time()
    tfidf_options = {
        'stop_words': 'english',
        'max_features': 20000,
        'ngram_range': (1, 3),
        'min_df': 2,
    }
    vectorizer = TfidfVectorizer(**tfidf_options)

    train_matrix = vectorizer.fit_transform(X_train)
    X_train_vect = train_matrix.astype(np.float32)
    test_matrix = vectorizer.transform(X_test)
    X_test_vect = test_matrix.astype(np.float32)

    vocabulary_size = len(vectorizer.vocabulary_)
    logger.info(f"Vocabulary size: {vocabulary_size}")
    log_time("Text vectorization", began)
    return vectorizer, X_train_vect, X_test_vect

# Cell 5: Model Training
def train_naive_bayes(X_train, y_train):
    """Fit a multinomial Naïve Bayes classifier (``alpha=0.5``).

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    began = time.time()
    classifier = MultinomialNB(alpha=0.5)
    classifier.fit(X_train, y_train)
    logger.info("Naïve Bayes model trained successfully")
    log_time("Naïve Bayes training", began)
    return classifier

def train_svm(X_train, y_train):
    """Fit a linear SVM with balanced class weights.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    began = time.time()
    svm_options = {
        'C': 1.0,
        'max_iter': 1000,
        'dual': False,
        'class_weight': 'balanced',
    }
    classifier = LinearSVC(**svm_options)
    classifier.fit(X_train, y_train)
    logger.info("SVM model trained successfully")
    log_time("SVM training", began)
    return classifier

# Cell 6: Model Evaluation with Confusion Matrix
def evaluate_model(model, model_name, X_test, y_test):
    """Score a fitted model on the test set and report the results.

    Accuracy, the spam-class precision/recall/F1 and the confusion matrix are
    written to the log and also printed/displayed for the notebook.

    Args:
        model: Fitted classifier exposing ``predict``.
        model_name: Human-readable name used in the log and printed headings.
        X_test: Test feature matrix.
        y_test: True test labels (``not_spam`` / ``spam``).

    Returns:
        Tuple ``(preds, accuracy, report, cm)``: the predictions, the accuracy
        score, the ``classification_report`` dictionary and the confusion
        matrix with rows/columns ordered ``not_spam``, ``spam``.
    """
    start = time.time()
    class_labels = ['not_spam', 'spam']
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    # Pin the label set (as the confusion matrix does) so that report['spam']
    # exists even when spam is absent from both y_test and the predictions;
    # zero_division=0 scores such an empty class as 0 without a warning.
    report = classification_report(
        y_test, preds, labels=class_labels, output_dict=True, zero_division=0
    )

    # Compute confusion matrix
    cm = confusion_matrix(y_test, preds, labels=class_labels)
    cm_df = pd.DataFrame(
        cm,
        index=[f"Actual: {label}" for label in class_labels],
        columns=[f"Predicted: {label}" for label in class_labels]
    )

    spam_scores = report['spam']
    metric_lines = [
        f"Accuracy: {accuracy:.4f}",
        f"Precision (spam): {spam_scores['precision']:.4f}",
        f"Recall (spam): {spam_scores['recall']:.4f}",
        f"F1-score (spam): {spam_scores['f1-score']:.4f}",
    ]

    # Log results
    logger.info(f"\n--- {model_name} Results ---")
    for line in metric_lines:
        logger.info(line)
    logger.info(f"Confusion Matrix:\n{cm_df.to_string()}")

    # Display in notebook
    print(f"\n{model_name} Results:")
    for line in metric_lines:
        print(line)
    print("\nConfusion Matrix:")
    display(cm_df)

    log_time(f"{model_name} evaluation", start)
    return preds, accuracy, report, cm

# Cell 7: Model Comparison
def compare_models(nb_accuracy, svm_accuracy, nb_report, svm_report, nb_cm, svm_cm):
    """Report Naïve Bayes and SVM results side by side.

    The accuracies, their difference, a spam-class metric table and both
    confusion matrices are first written to the log and then printed/displayed
    for the notebook.

    Args:
        nb_accuracy: Accuracy of the Naïve Bayes model.
        svm_accuracy: Accuracy of the SVM model.
        nb_report: Classification report dictionary of the Naïve Bayes model.
        svm_report: Classification report dictionary of the SVM model.
        nb_cm: 2x2 confusion matrix of the Naïve Bayes model.
        svm_cm: 2x2 confusion matrix of the SVM model.
    """
    def summary_lines():
        # Lines shared by the log and the printed output, produced lazily so
        # each one is formatted at the moment it is emitted.
        yield f"Naïve Bayes Accuracy: {nb_accuracy:.4f}"
        yield f"SVM Accuracy: {svm_accuracy:.4f}"
        yield f"Accuracy Difference (SVM - NB): {svm_accuracy - nb_accuracy:.4f}"
        yield "\nSpam Class Metrics Comparison:"
        yield f"{'Metric':<15} {'Naïve Bayes':<15} {'SVM':<15}"
        yield '-' * 45
        for title, key in (
            ('Precision', 'precision'),
            ('Recall', 'recall'),
            ('F1-score', 'f1-score'),
        ):
            nb_value = nb_report['spam'][key]
            svm_value = svm_report['spam'][key]
            yield f"{title:<15} {nb_value:<15.4f} {svm_value:<15.4f}"

    def labelled(matrix):
        # Wrap a raw confusion matrix with readable row/column captions.
        return pd.DataFrame(
            matrix,
            index=['Actual: not_spam', 'Actual: spam'],
            columns=['Predicted: not_spam', 'Predicted: spam']
        )

    # Log everything first
    logger.info("\n=== Model Comparison ===")
    for entry in summary_lines():
        logger.info(entry)

    nb_cm_df = labelled(nb_cm)
    svm_cm_df = labelled(svm_cm)
    logger.info("\nNaïve Bayes Confusion Matrix:\n%s", nb_cm_df.to_string())
    logger.info("\nSVM Confusion Matrix:\n%s", svm_cm_df.to_string())

    # Then show the same content in the notebook
    print("\nModel Comparison:")
    for entry in summary_lines():
        print(entry)
    for heading, frame in (
        ("\nNaïve Bayes Confusion Matrix:", nb_cm_df),
        ("\nSVM Confusion Matrix:", svm_cm_df),
    ):
        print(heading)
        display(frame)

# Cell 8: Text Classification and Feature Explanation
def classify_text(model, model_name, vectorizer, text):
    """Classify one piece of text and time the prediction.

    Args:
        model: Fitted classifier exposing ``predict``.
        model_name: Name used in the timing log entry.
        vectorizer: Fitted vectorizer that turns text into features.
        text: The text to classify.

    Returns:
        Tuple ``(pred, elapsed)``: the predicted label and the seconds spent
        on vectorizing and predicting.
    """
    began = time.time()
    features = vectorizer.transform([text])
    features = features.astype(np.float32)
    predictions = model.predict(features)
    label = predictions[0]
    duration = log_time(f"{model_name} single text prediction", began)
    return label, duration

def explain_prediction(model, model_name, vectorizer, text):
    """Show the five most influential features of ``text`` for ``model``.

    Only features that actually occur in ``text`` are considered. For a model
    exposing ``feature_log_prob_`` (Naïve Bayes) they are ranked by their
    spam-class probability; otherwise (linear SVM) by the absolute value of
    their weight in ``coef_[0]``.

    Args:
        model: Fitted classifier.
        model_name: Name used in the log and printed output.
        vectorizer: Fitted vectorizer used to produce the model's features.
        text: The text whose prediction is explained.

    Returns:
        List of up to five ``(feature_name, score)`` tuples, best first.
    """
    start = time.time()
    text_vect = vectorizer.transform([text]).toarray()[0]
    feature_names = vectorizer.get_feature_names_out()
    present = np.where(text_vect > 0)[0]

    # Choose the branch from what the model exposes rather than from its
    # display name, so a differently spelled name cannot pick the wrong one.
    if hasattr(model, 'feature_log_prob_'):
        # Find the spam row by label; fall back to row 1 as before when the
        # label is not found.
        classes = list(getattr(model, 'classes_', []))
        spam_row = classes.index('spam') if 'spam' in classes else 1
        scores = np.exp(model.feature_log_prob_[spam_row])

        def rank(pair):
            return pair[1]
    else:  # SVM
        scores = model.coef_[0]

        def rank(pair):
            return abs(pair[1])

    # float() keeps the output readable instead of showing np.float64(...)
    top_features = sorted(
        [(feature_names[i], float(scores[i])) for i in present],
        key=rank,
        reverse=True
    )[:5]

    logger.info(f"Top features for {model_name} prediction: {top_features}")
    print(f"Top features for {model_name} prediction: {top_features}")
    log_time(f"{model_name} feature explanation", start)
    return top_features

# Cell 9: Interactive Mode for Jupyter
def interactive_mode(nb_model, svm_model, vectorizer):
    """Prompt for text in a loop and classify it with both models.

    Each entry is classified and explained with the Naïve Bayes model and then
    with the SVM model. The loop ends when the user types ``exit``, interrupts
    it, or when no input can be read at all. An error while processing an
    entry is reported and the loop continues.

    Args:
        nb_model: Fitted Naïve Bayes classifier.
        svm_model: Fitted SVM classifier.
        vectorizer: Fitted vectorizer shared by both models.
    """
    logger.info("\n=== Interactive Spam Detector ===")
    print("\n=== Interactive Spam Detector ===")
    print("Enter text to classify as spam or not_spam. Type 'exit' to quit.")

    models = (("Naïve Bayes", nb_model), ("SVM", svm_model))

    try:
        while True:
            try:
                user_input = input("\nEnter text to classify (or 'exit' to quit):\n> ").strip()
            except Exception as e:
                # input() itself failed (e.g. EOFError on a closed stdin).
                # Retrying would fail the same way forever, so stop here.
                logger.error(f"Cannot read input, exiting interactive mode: {e}")
                print(f"Cannot read input, exiting interactive mode: {e}")
                break

            if user_input.lower() == 'exit':
                logger.info("Exiting interactive mode")
                print("Exiting interactive mode")
                break
            if not user_input:
                logger.info("Empty input, please enter some text")
                print("Empty input, please enter some text")
                continue

            try:
                for name, model in models:
                    pred, elapsed = classify_text(model, name, vectorizer, user_input)
                    logger.info(f"{name} Prediction: {pred} (Time: {elapsed:.4f}s)")
                    print(f"{name} Prediction: {pred} (Time: {elapsed:.4f}s)")
                    explain_prediction(model, name, vectorizer, user_input)
            except Exception as e:
                logger.error(f"Error processing input: {e}")
                print(f"Error processing input: {e}")
                logger.info("Please try again or type 'exit' to quit")
                print("Please try again or type 'exit' to quit")
    except KeyboardInterrupt:
        # Raised while waiting for input or while classifying.
        logger.info("\nInteractive mode interrupted by user")
        print("\nInteractive mode interrupted by user")

# Cell 10: Main Execution
def main():
    """Run the whole pipeline: load, split, vectorize, train, compare, interact.

    The data is split 80/20 (stratified, ``random_state=42``), both models are
    trained and evaluated, their results are compared, the ten strongest spam
    features of each model are reported and finally the interactive classifier
    is started.

    Raises:
        Exception: Any failure is logged, printed and re-raised.
    """
    logger.info("Starting spam detection pipeline with Naïve Bayes and SVM")
    print("Starting spam detection pipeline with Naïve Bayes and SVM")
    try:
        # Load and preprocess data
        df = load_and_preprocess_data()
        X_train, X_test, y_train, y_test = train_test_split(
            df['text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
        )
        logger.info(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")
        print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

        # Vectorize text
        vectorizer, X_train_vect, X_test_vect = vectorize_text(X_train, X_test)

        # Train and evaluate Naïve Bayes
        nb_model = train_naive_bayes(X_train_vect, y_train)
        _, nb_accuracy, nb_report, nb_cm = evaluate_model(nb_model, "Naïve Bayes", X_test_vect, y_test)

        # Train and evaluate SVM
        svm_model = train_svm(X_train_vect, y_train)
        _, svm_accuracy, svm_report, svm_cm = evaluate_model(svm_model, "SVM", X_test_vect, y_test)

        # Compare models
        compare_models(nb_accuracy, svm_accuracy, nb_report, svm_report, nb_cm, svm_cm)

        # Log top features for both models
        feature_names = vectorizer.get_feature_names_out()

        def strongest(scores, count=10):
            # Highest-scoring features first; float() avoids np.float64(...)
            # wrappers in the logged/printed list.
            ranked = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
            return [(name, float(score)) for name, score in ranked[:count]]

        # Naïve Bayes top features
        spam_probs = np.exp(nb_model.feature_log_prob_[1])
        top_nb_features = strongest(spam_probs)
        logger.info("\nTop Naïve Bayes spam features: %s", top_nb_features)
        print("\nTop Naïve Bayes spam features:", top_nb_features)

        # SVM top features: rank by the signed weight, not its magnitude.
        # Positive weights favour classes_[1], which is 'spam' here, while a
        # large negative weight marks a ham feature and is not a spam feature.
        svm_weights = svm_model.coef_[0]
        top_svm_features = strongest(svm_weights)
        logger.info("Top SVM spam features: %s", top_svm_features)
        print("Top SVM spam features:", top_svm_features)

        # Start interactive mode
        interactive_mode(nb_model, svm_model, vectorizer)

    except Exception as e:
        logger.error(f"Pipeline failed: {e}")
        print(f"Pipeline failed: {e}")
        raise

# Run the pipeline only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-05-06 02:26:29,428 - INFO - Starting spam detection pipeline with Naïve Bayes and SVM
2025-05-06 02:26:29,429 - INFO - Previewing first 5 lines of data/dataset.csv:
2025-05-06 02:26:29,430 - INFO - Line 1: ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
2025-05-06 02:26:29,431 - INFO - Line 2: ham	Ok lar... Joking wif u oni...
2025-05-06 02:26:29,431 - INFO - Line 3: spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
2025-05-06 02:26:29,432 - INFO - Line 4: ham	U dun say so early hor... U c already then say...
2025-05-06 02:26:29,433 - INFO - Line 5: ham	Nah I don't think he goes to usf, he lives around here though
2025-05-06 02:26:29,434 - INFO - Cleaning dataset: data/dataset.csv
2025-05-06 02:26:29,456 - INFO - Cleaned dataset saved to data/cleaned_dataset.csv
2025-05-06 02:26:29,458 - INFO - Dataset cl

Starting spam detection pipeline with Naïve Bayes and SVM
Training set size: 4459, Test set size: 1115


2025-05-06 02:26:29,736 - INFO - Vocabulary size: 9977
2025-05-06 02:26:29,737 - INFO - Text vectorization took 0.2281 seconds
2025-05-06 02:26:29,743 - INFO - Naïve Bayes model trained successfully
2025-05-06 02:26:29,744 - INFO - Naïve Bayes training took 0.0058 seconds
2025-05-06 02:26:29,773 - INFO - 
--- Naïve Bayes Results ---
2025-05-06 02:26:29,775 - INFO - Accuracy: 0.9776
2025-05-06 02:26:29,776 - INFO - Precision (spam): 0.9921
2025-05-06 02:26:29,778 - INFO - Recall (spam): 0.8389
2025-05-06 02:26:29,779 - INFO - F1-score (spam): 0.9091
2025-05-06 02:26:29,787 - INFO - Confusion Matrix:
                  Predicted: not_spam  Predicted: spam
Actual: not_spam                  965                1
Actual: spam                       24              125



Naïve Bayes Results:
Accuracy: 0.9776
Precision (spam): 0.9921
Recall (spam): 0.8389
F1-score (spam): 0.9091

Confusion Matrix:


Unnamed: 0,Predicted: not_spam,Predicted: spam
Actual: not_spam,965,1
Actual: spam,24,125


2025-05-06 02:26:29,797 - INFO - Naïve Bayes evaluation took 0.0529 seconds
2025-05-06 02:26:29,817 - INFO - SVM model trained successfully
2025-05-06 02:26:29,820 - INFO - SVM training took 0.0210 seconds
2025-05-06 02:26:29,847 - INFO - 
--- SVM Results ---
2025-05-06 02:26:29,850 - INFO - Accuracy: 0.9848
2025-05-06 02:26:29,852 - INFO - Precision (spam): 0.9521
2025-05-06 02:26:29,855 - INFO - Recall (spam): 0.9329
2025-05-06 02:26:29,858 - INFO - F1-score (spam): 0.9424
2025-05-06 02:26:29,861 - INFO - Confusion Matrix:
                  Predicted: not_spam  Predicted: spam
Actual: not_spam                  959                7
Actual: spam                       10              139



SVM Results:
Accuracy: 0.9848
Precision (spam): 0.9521
Recall (spam): 0.9329
F1-score (spam): 0.9424

Confusion Matrix:


Unnamed: 0,Predicted: not_spam,Predicted: spam
Actual: not_spam,959,7
Actual: spam,10,139


2025-05-06 02:26:29,871 - INFO - SVM evaluation took 0.0498 seconds
2025-05-06 02:26:29,873 - INFO - 
=== Model Comparison ===
2025-05-06 02:26:29,875 - INFO - Naïve Bayes Accuracy: 0.9776
2025-05-06 02:26:29,876 - INFO - SVM Accuracy: 0.9848
2025-05-06 02:26:29,877 - INFO - Accuracy Difference (SVM - NB): 0.0072
2025-05-06 02:26:29,882 - INFO - 
Spam Class Metrics Comparison:
2025-05-06 02:26:29,886 - INFO - Metric          Naïve Bayes     SVM            
2025-05-06 02:26:29,888 - INFO - ---------------------------------------------
2025-05-06 02:26:29,889 - INFO - Precision       0.9921          0.9521         
2025-05-06 02:26:29,890 - INFO - Recall          0.8389          0.9329         
2025-05-06 02:26:29,891 - INFO - F1-score        0.9091          0.9424         
2025-05-06 02:26:29,894 - INFO - 
Naïve Bayes Confusion Matrix:
                  Predicted: not_spam  Predicted: spam
Actual: not_spam                  965                1
Actual: spam                       24      


Model Comparison:
Naïve Bayes Accuracy: 0.9776
SVM Accuracy: 0.9848
Accuracy Difference (SVM - NB): 0.0072

Spam Class Metrics Comparison:
Metric          Naïve Bayes     SVM            
---------------------------------------------
Precision       0.9921          0.9521         
Recall          0.8389          0.9329         
F1-score        0.9091          0.9424         

Naïve Bayes Confusion Matrix:


Unnamed: 0,Predicted: not_spam,Predicted: spam
Actual: not_spam,965,1
Actual: spam,24,125



SVM Confusion Matrix:


Unnamed: 0,Predicted: not_spam,Predicted: spam
Actual: not_spam,959,7
Actual: spam,10,139


2025-05-06 02:26:29,948 - INFO - 
Top Naïve Bayes spam features: [('free', np.float64(0.002682353036111534)), ('txt', np.float64(0.0018666035493384392)), ('stop', np.float64(0.0016746789821013616)), ('text', np.float64(0.0015970912900126447)), ('mobile', np.float64(0.0015794647590104522)), ('claim', np.float64(0.0015458991892335603)), ('reply', np.float64(0.0014794919805973343)), ('ur', np.float64(0.0014011788002742168)), ('www', np.float64(0.0013985124070268991)), ('prize', np.float64(0.0012837529283372296))]
2025-05-06 02:26:29,972 - INFO - Top SVM spam features: [('uk', np.float64(2.2732570594450903)), ('mobile', np.float64(2.1749408714253455)), ('txt', np.float64(1.9388065528128013)), ('claim', np.float64(1.8962492098377632)), ('150p', np.float64(1.780258693943252)), ('won', np.float64(1.7490170436618757)), ('50', np.float64(1.7380799895248473)), ('www', np.float64(1.6829639848904618)), ('com', np.float64(1.66966456450967)), ('video', np.float64(1.5602256383943296))]
2025-05-06 02: