In [None]:
import pandas as pd
import time
import logging
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import re
import os

# Logging setup: INFO-and-above records are sent to a log file
# ('spam_detection.log') and to the console, with a timestamped format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('spam_detection.log'), logging.StreamHandler()],
    level=logging.INFO,
)
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

# Timer utility
def log_time(message, start_time):
    """Log how long a step took and return the elapsed time.

    Args:
        message: Label of the step being timed; used as the log-line prefix.
        start_time: ``time.time()`` reading taken when the step began.

    Returns:
        Seconds elapsed between ``start_time`` and now.
    """
    duration = time.time() - start_time
    logger.info(f"{message} took {duration:.4f} seconds")
    return duration

def preview_file(file_path, num_lines=5):
    """Log the first few lines of a UTF-8 text file.

    This is a best-effort helper: any error while opening or reading the file
    is logged and swallowed rather than raised.

    Args:
        file_path: Path of the file to preview.
        num_lines: Maximum number of leading lines to log.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            logger.info(f"Previewing first {num_lines} lines of {file_path}:")
            for line_no, raw_line in enumerate(handle, start=1):
                # Stop as soon as the requested number of lines has been shown.
                if line_no > num_lines:
                    break
                logger.info(f"Line {line_no}: {raw_line.strip()}")
    except Exception as exc:
        logger.error(f"Error previewing file: {exc}")

def clean_dataset(file_path, output_path='data/cleaned_dataset.csv'):
    """Normalize a tab-separated ``label<TAB>text`` file and write the result.

    For each non-blank input line, runs of tabs are collapsed to a single tab,
    double quotes are removed, and every field after the first is joined with
    single spaces, so each output row has exactly two tab-separated fields.
    Lines with fewer than two fields are skipped with a warning.

    Args:
        file_path: Path of the raw UTF-8 dataset to read.
        output_path: Where to write the cleaned file. Parent directories are
            created when the path has a directory component; a bare file name
            is written to the current working directory.

    Returns:
        ``output_path``, unchanged.

    Raises:
        Exception: Any error raised while reading or writing is logged and
            re-raised.
    """
    logger.info(f"Cleaning dataset: {file_path}")
    start = time.time()
    cleaned_lines = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                line = re.sub(r'\t+', '\t', line)
                line = line.replace('"', '')
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    label, text = parts[0], ' '.join(parts[1:])
                    cleaned_lines.append(f"{label}\t{text}")
                else:
                    logger.warning(f"Skipping malformed line: {line.strip()}")
        # os.makedirs('') raises FileNotFoundError, so only create the parent
        # directory when output_path actually names one.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(cleaned_lines))
        logger.info(f"Cleaned dataset saved to {output_path}")
        log_time("Dataset cleaning", start)
        return output_path
    except Exception as e:
        logger.error(f"Error cleaning dataset: {e}")
        raise

def load_and_preprocess_data(file_path='data/dataset.csv'):
    """Preview, clean and load the dataset into a labelled DataFrame.

    The raw file is previewed in the log, cleaned with ``clean_dataset`` and
    then read as a two-column tab-separated file. The ``ham`` label is
    renamed to ``not_spam``; rows with any other label, or with a missing
    field, are dropped.

    Args:
        file_path: Path of the raw tab-separated dataset.

    Returns:
        A DataFrame with columns ``label`` (``'not_spam'`` or ``'spam'``) and
        ``text``.

    Raises:
        Exception: Errors from cleaning propagate directly; errors while
            reading or filtering are logged and re-raised.
    """
    start = time.time()
    preview_file(file_path)
    cleaned_file_path = clean_dataset(file_path)
    try:
        df = pd.read_csv(
            cleaned_file_path,
            sep='\t',
            header=None,
            names=['label', 'text'],
            engine='python',
            encoding='utf-8',
            quoting=3,
            # Without this, a message whose entire text is "NA", "None",
            # "null", "nan", ... is parsed as missing and then silently
            # removed by the dropna() below.
            keep_default_na=False
        )
        # Labels other than 'ham'/'spam' map to NaN and are dropped next.
        df['label'] = df['label'].map({'ham': 'not_spam', 'spam': 'spam'})
        df = df.dropna(subset=['label', 'text'])
        df = df[df['label'].isin(['not_spam', 'spam'])]
        logger.info(f"Loaded {len(df)} valid samples")
        logger.info(f"Class distribution: {df['label'].value_counts().to_dict()}")
        log_time("Data loading and preprocessing", start)
        return df
    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        raise

def vectorize_text(X_train, X_test):
    """Fit a TF-IDF vectorizer on the training texts and transform both splits.

    The vectorizer is fitted on ``X_train`` only and then applied to
    ``X_test``; both resulting matrices are cast to ``float32``.

    Args:
        X_train: Training texts.
        X_test: Test texts.

    Returns:
        A tuple ``(vectorizer, X_train_vect, X_test_vect)``.
    """
    started_at = time.time()
    tfidf = TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=20000,
        min_df=2,
        stop_words='english',
    )
    train_matrix = tfidf.fit_transform(X_train).astype(np.float32)
    test_matrix = tfidf.transform(X_test).astype(np.float32)
    logger.info(f"Vocabulary size: {len(tfidf.vocabulary_)}")
    log_time("Text vectorization", started_at)
    return tfidf, train_matrix, test_matrix

def train_naive_bayes(X_train, y_train):
    """Fit a ``MultinomialNB`` classifier (``alpha=0.5``) and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted model.
    """
    started_at = time.time()
    classifier = MultinomialNB(alpha=0.5)
    classifier.fit(X_train, y_train)
    logger.info("Naïve Bayes model trained successfully")
    log_time("Naïve Bayes training", started_at)
    return classifier

def train_svm(X_train, y_train):
    """Fit a class-balanced ``LinearSVC`` classifier and return it.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted model.
    """
    started_at = time.time()
    classifier = LinearSVC(
        C=1.0,
        class_weight='balanced',
        dual=False,
        max_iter=1000,
    )
    classifier.fit(X_train, y_train)
    logger.info("SVM model trained successfully")
    log_time("SVM training", started_at)
    return classifier

def evaluate_model(model, model_name, X_test, y_test):
    """Score a fitted model on the test split and log its spam-class metrics.

    Args:
        model: Fitted classifier exposing ``predict``.
        model_name: Display name used in the log output.
        X_test: Test feature matrix.
        y_test: True test labels.

    Returns:
        A tuple ``(preds, accuracy, report)`` where ``report`` is the
        dictionary form of ``classification_report``.
    """
    started_at = time.time()
    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    metrics = classification_report(y_test, predictions, output_dict=True)

    logger.info(f"\n--- {model_name} Results ---")
    logger.info(f"Accuracy: {acc:.4f}")
    # One log line per spam-class metric, in precision / recall / F1 order.
    spam_metrics = metrics['spam']
    for title, key in (
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1-score", 'f1-score'),
    ):
        logger.info(f"{title} (spam): {spam_metrics[key]:.4f}")

    log_time(f"{model_name} evaluation", started_at)
    return predictions, acc, metrics

def compare_models(nb_accuracy, svm_accuracy, nb_report, svm_report):
    """Log a side-by-side comparison of the Naïve Bayes and SVM results.

    Args:
        nb_accuracy: Accuracy of the Naïve Bayes model.
        svm_accuracy: Accuracy of the SVM model.
        nb_report: Classification-report dict for Naïve Bayes; its ``'spam'``
            entry is read.
        svm_report: Classification-report dict for the SVM; its ``'spam'``
            entry is read.
    """
    accuracy_gap = svm_accuracy - nb_accuracy
    logger.info("\n=== Model Comparison ===")
    logger.info(f"Naïve Bayes Accuracy: {nb_accuracy:.4f}")
    logger.info(f"SVM Accuracy: {svm_accuracy:.4f}")
    logger.info(f"Accuracy Difference (SVM - NB): {accuracy_gap:.4f}")

    # Fixed-width table: metric name followed by one column per model.
    logger.info("\nSpam Class Metrics Comparison:")
    logger.info(f"{'Metric':<15} {'Naïve Bayes':<15} {'SVM':<15}")
    logger.info(f"{'-'*45}")
    nb_spam = nb_report['spam']
    svm_spam = svm_report['spam']
    for title, key in (
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-score', 'f1-score'),
    ):
        logger.info(f"{title:<15} {nb_spam[key]:<15.4f} {svm_spam[key]:<15.4f}")

def classify_text(model, model_name, vectorizer, text):
    """Classify a single text and report how long the prediction took.

    Args:
        model: Fitted classifier exposing ``predict``.
        model_name: Display name used in the timing log line.
        vectorizer: Fitted vectorizer used to transform ``text``.
        text: The text to classify.

    Returns:
        A tuple ``(pred, elapsed)``: the predicted label and the seconds spent
        on vectorizing plus predicting.
    """
    started_at = time.time()
    features = vectorizer.transform([text]).astype(np.float32)
    label = model.predict(features)[0]
    duration = log_time(f"{model_name} single text prediction", started_at)
    return label, duration

def explain_prediction(model, model_name, vectorizer, text):
    """Log the five most influential features of ``text`` for a model.

    Only features that actually occur in ``text`` (non-zero TF-IDF weight) are
    considered. For a model exposing ``feature_log_prob_`` (Naïve Bayes) they
    are ranked by the class-1 feature probability; otherwise the model's
    ``coef_[0]`` weights are used and ranked by absolute value.

    The model kind is detected from its attributes rather than from
    ``model_name``, so the display name is free-form and a differently
    spelled name cannot send a model down the wrong branch.

    Args:
        model: Fitted classifier exposing either ``feature_log_prob_`` or
            ``coef_``.
        model_name: Display name used in the log output only.
        vectorizer: Fitted vectorizer used to transform ``text``.
        text: The text whose prediction is being explained.
    """
    start = time.time()
    text_vect = vectorizer.transform([text]).toarray()[0]
    feature_names = vectorizer.get_feature_names_out()
    present = np.where(text_vect > 0)[0]

    if hasattr(model, "feature_log_prob_"):
        # NOTE(review): row 1 is assumed to be the 'spam' class, which holds
        # when the labels are 'not_spam'/'spam' - confirm if labels change.
        spam_probs = np.exp(model.feature_log_prob_[1])
        top_features = sorted(
            [(feature_names[i], spam_probs[i]) for i in present],
            key=lambda x: x[1],
            reverse=True
        )[:5]
    else:  # linear model such as the SVM
        weights = model.coef_[0]
        top_features = sorted(
            [(feature_names[i], weights[i]) for i in present],
            key=lambda x: abs(x[1]),
            reverse=True
        )[:5]

    logger.info(f"Top features for {model_name} prediction: {top_features}")
    log_time(f"{model_name} feature explanation", start)

def interactive_mode(nb_model, svm_model, vectorizer):
    """Run a prompt loop that classifies user-entered text with both models.

    Each non-empty input is classified and explained by the Naïve Bayes model
    and then by the SVM. The loop ends when the user types ``exit``, presses
    Ctrl-C, or when input can no longer be read (for example stdin is closed).
    Errors raised while classifying a particular input are logged and the
    prompt is shown again.

    Args:
        nb_model: Fitted Naïve Bayes classifier.
        svm_model: Fitted SVM classifier.
        vectorizer: Fitted vectorizer shared by both models.
    """
    logger.info("\n=== Interactive Spam Detector ===")
    logger.info("Enter text to classify as spam or not_spam. Type 'exit' to quit.")
    while True:
        # Reading input is guarded separately from classification: a failed
        # read cannot succeed on retry, so looping on it would never end.
        try:
            user_input = input("\nEnter text to classify (or 'exit' to quit):\n> ").strip()
        except (KeyboardInterrupt, EOFError):
            logger.info("\nInteractive mode interrupted by user")
            break
        except Exception as e:
            logger.error(f"Error reading input: {e}")
            break

        if user_input.lower() == 'exit':
            logger.info("Exiting interactive mode")
            break
        if not user_input:
            logger.info("Empty input, please enter some text")
            continue

        try:
            # Classify with Naive Bayes
            nb_pred, nb_elapsed = classify_text(nb_model, "Naïve Bayes", vectorizer, user_input)
            logger.info(f"Naïve Bayes Prediction: {nb_pred} (Time: {nb_elapsed:.4f}s)")
            explain_prediction(nb_model, "Naïve Bayes", vectorizer, user_input)

            # Classify with SVM
            svm_pred, svm_elapsed = classify_text(svm_model, "SVM", vectorizer, user_input)
            logger.info(f"SVM Prediction: {svm_pred} (Time: {svm_elapsed:.4f}s)")
            explain_prediction(svm_model, "SVM", vectorizer, user_input)

        except KeyboardInterrupt:
            logger.info("\nInteractive mode interrupted by user")
            break
        except Exception as e:
            # A failure on one input should not end the session.
            logger.error(f"Error processing input: {e}")
            logger.info("Please try again or type 'exit' to quit")

def main():
    """Run the whole pipeline: load, train, evaluate, compare, then go interactive.

    The data is split 80/20 with stratification on the label, both models are
    trained on the same TF-IDF features, their results are compared, the ten
    top-ranked features of each model are logged, and finally the interactive
    prompt is started.

    Raises:
        Exception: Any failure is logged as a pipeline failure and re-raised.
    """
    logger.info("Starting spam detection pipeline with Naïve Bayes and SVM")
    try:
        # Load the data and make a stratified train/test split.
        dataset = load_and_preprocess_data()
        texts, labels = dataset['text'], dataset['label']
        X_train, X_test, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, random_state=42, stratify=labels
        )
        logger.info(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

        # Both models share the same TF-IDF representation.
        vectorizer, train_matrix, test_matrix = vectorize_text(X_train, X_test)

        # Naïve Bayes: train, then evaluate.
        nb_model = train_naive_bayes(train_matrix, y_train)
        _, nb_accuracy, nb_report = evaluate_model(
            nb_model, "Naïve Bayes", test_matrix, y_test
        )

        # SVM: train, then evaluate.
        svm_model = train_svm(train_matrix, y_train)
        _, svm_accuracy, svm_report = evaluate_model(
            svm_model, "SVM", test_matrix, y_test
        )

        compare_models(nb_accuracy, svm_accuracy, nb_report, svm_report)

        vocabulary = vectorizer.get_feature_names_out()

        # Naïve Bayes: ten features with the highest class-1 probability.
        nb_scores = np.exp(nb_model.feature_log_prob_[1])
        nb_ranked = sorted(
            zip(vocabulary, nb_scores), key=lambda pair: pair[1], reverse=True
        )
        top_nb_features = nb_ranked[:10]
        logger.info("\nTop Naïve Bayes spam features: %s", top_nb_features)

        # SVM: ten features with the largest absolute weight.
        svm_scores = svm_model.coef_[0]
        svm_ranked = sorted(
            zip(vocabulary, svm_scores), key=lambda pair: abs(pair[1]), reverse=True
        )
        top_svm_features = svm_ranked[:10]
        logger.info("Top SVM spam features: %s", top_svm_features)

        # Hand control to the user.
        interactive_mode(nb_model, svm_model, vectorizer)

    except Exception as exc:
        logger.error(f"Pipeline failed: {exc}")
        raise

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

2025-05-06 01:58:15,056 - INFO - Starting spam detection pipeline with Naïve Bayes and SVM
2025-05-06 01:58:15,058 - INFO - Previewing first 5 lines of data/dataset.csv:
2025-05-06 01:58:15,059 - INFO - Line 1: ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
2025-05-06 01:58:15,060 - INFO - Line 2: ham	Ok lar... Joking wif u oni...
2025-05-06 01:58:15,060 - INFO - Line 3: spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
2025-05-06 01:58:15,061 - INFO - Line 4: ham	U dun say so early hor... U c already then say...
2025-05-06 01:58:15,061 - INFO - Line 5: ham	Nah I don't think he goes to usf, he lives around here though
2025-05-06 01:58:15,062 - INFO - Cleaning dataset: data/dataset.csv
2025-05-06 01:58:15,079 - INFO - Cleaned dataset saved to data/cleaned_dataset.csv
2025-05-06 01:58:15,079 - INFO - Dataset cl