# Email Classification with RoBERTa using Kaggle Dataset

# Data Preprocessing

## Imports & Setup


In [6]:
import pandas as pd
import numpy as np
import torch
import os
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
import random
from collections import Counter
warnings.filterwarnings('ignore')

# Seed every RNG the notebook draws from (Python, NumPy, PyTorch) so runs are repeatable.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device: {}".format(device))

Using device: cuda


## Data Loading & Augmentation

In [2]:
class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one email per item.

    Each item is a dict with ``input_ids``, ``attention_mask`` (both flattened
    to 1-D) and ``label`` (a ``torch.long`` scalar). When ``augment`` is true,
    a small random fraction of items is perturbed before tokenization.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, augment=False):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Collapse every run of whitespace into a single space.
        cleaned = ' '.join(str(self.texts[idx]).strip().split())

        # Only 15% of training samples are handed to the augmenter.
        if self.augment and random.random() < 0.15:
            cleaned = self._augment_text(cleaned)

        encoded = self.tokenizer(
            cleaned,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
            add_special_tokens=True,
        )

        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return {
            'input_ids': encoded['input_ids'].reshape(-1),
            'attention_mask': encoded['attention_mask'].reshape(-1),
            'label': label,
        }

    def _augment_text(self, text):
        """Randomly drop one word and/or swap two words; short texts pass through."""
        tokens = text.split()
        if len(tokens) < 3:
            return text

        # Word dropout: fires on 10% of calls, and only when more than 3 words remain.
        if random.random() < 0.1 and len(tokens) > 3:
            del tokens[random.randint(0, len(tokens) - 1)]

        # Word swap: fires on 5% of calls, same length requirement.
        if random.random() < 0.05 and len(tokens) > 3:
            a, b = random.sample(range(len(tokens)), 2)
            tokens[a], tokens[b] = tokens[b], tokens[a]

        return ' '.join(tokens)

# Model Architecture

In [3]:
class OptimizedRoBERTaClassifier(nn.Module):
    """Pretrained encoder with a small two-layer classification head.

    The encoder's embeddings and first ``freeze_layers`` encoder layers are
    frozen; the head maps the first-token hidden state to ``num_classes`` logits.
    """

    def __init__(self, model_name='roberta-base', num_classes=2, dropout_rate=0.3, freeze_layers=6):
        super().__init__()

        self.roberta = AutoModel.from_pretrained(model_name)

        # Keep the lower part of the encoder fixed to limit overfitting.
        self._freeze_layers(freeze_layers)

        width = self.roberta.config.hidden_size
        bottleneck = width // 4

        # Head: dropout -> projection to a quarter of the width -> GELU -> lighter dropout -> logits.
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(width, bottleneck),
            nn.GELU(),
            nn.Dropout(dropout_rate * 0.7),
            nn.Linear(bottleneck, num_classes),
        )

        self._init_weights()

    def _freeze_layers(self, num_layers):
        """Disable gradients for the embeddings and the first ``num_layers`` encoder layers."""
        frozen_modules = [self.roberta.embeddings, *self.roberta.encoder.layer[:num_layers]]
        for module in frozen_modules:
            for weight in module.parameters():
                weight.requires_grad = False

    def _init_weights(self):
        """Xavier-initialise the head's linear weights and zero their biases."""
        linear_layers = (m for m in self.classifier if isinstance(m, nn.Linear))
        for linear in linear_layers:
            nn.init.xavier_uniform_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, input_ids, attention_mask):
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence serves as the sentence representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.classifier(first_token)

# Training & Evaluation functions

In [None]:
class EarlyStopping:
    """Stop training once a validation score stops improving.

    Call the instance once per epoch with the latest score (higher is better)
    and the model. It returns ``True`` when ``patience`` consecutive epochs
    have failed to beat the best score by at least ``min_delta``.

    Args:
        patience: Number of consecutive non-improving epochs tolerated.
        min_delta: Minimum increase over the best score that counts as an improvement.
        restore_best_weights: If true, the best snapshot is loaded back into
            the model at the moment stopping is signalled.
    """

    def __init__(self, patience=3, min_delta=0.001, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.counter = 0
        self.best_score = None
        self.best_weights = None

    def __call__(self, val_score, model):
        """Record ``val_score``; return ``True`` when training should stop."""
        if self.best_score is None:
            # First epoch: it is the best seen so far by definition.
            self.best_score = val_score
            self.save_checkpoint(model)
        elif val_score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                if self.restore_best_weights:
                    model.load_state_dict(self.best_weights)
                return True
        else:
            self.best_score = val_score
            self.counter = 0
            self.save_checkpoint(model)
        return False

    def save_checkpoint(self, model):
        """Snapshot the model's current parameters and buffers."""
        # state_dict() hands out references to the live tensors, and dict.copy()
        # is shallow, so the snapshot would silently follow later optimizer
        # steps. Clone each tensor so the saved values stay frozen.
        self.best_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

def train_fold(model, train_loader, val_loader, fold_num, max_epochs=5):
    """Train ``model`` on one cross-validation fold and validate after every epoch.

    Args:
        model: Classifier exposing ``roberta`` and ``classifier`` sub-modules.
        train_loader: Loader yielding dicts with ``input_ids``, ``attention_mask``, ``label``.
        val_loader: Loader of the same batch format used for validation.
        fold_num: Zero-based fold index (used only for the progress-bar label).
        max_epochs: Upper bound on epochs; early stopping may end training sooner.

    Returns:
        Tuple ``(best_score, val_predictions, val_labels, train_losses, val_accuracies)``.
        ``best_score`` is the best validation accuracy seen, and the
        predictions/labels are those of the epoch that achieved it, so
        downstream metrics agree with the reported score. The two history
        lists hold one entry per completed epoch.
    """

    # Separate learning rates: smaller for the pretrained encoder, larger for the new head.
    optimizer = AdamW([
        {'params': model.roberta.parameters(), 'lr': 2e-5},
        {'params': model.classifier.parameters(), 'lr': 5e-5}
    ], weight_decay=0.02, eps=1e-8)

    total_steps = len(train_loader) * max_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),  # 10% warmup
        num_training_steps=total_steps
    )

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # Label smoothing for regularization
    early_stopping = EarlyStopping(patience=3, min_delta=0.001)

    train_losses = []
    val_accuracies = []

    # Predictions/labels of the best epoch so far; stay empty if no epoch runs.
    best_predictions = []
    best_labels = []

    for epoch in range(max_epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        train_bar = tqdm(train_loader, desc=f"Fold {fold_num+1} Epoch {epoch+1}")

        for batch in train_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()

            # Gradient clipping for stability
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            total_train_loss += loss.item()
            train_bar.set_postfix({'loss': loss.item()})

        avg_train_loss = total_train_loss / len(train_loader)
        train_losses.append(avg_train_loss)

        # Validation phase
        model.eval()
        val_predictions = []
        val_labels = []
        total_val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids, attention_mask)
                val_loss = criterion(outputs, labels)
                total_val_loss += val_loss.item()

                predictions = torch.argmax(outputs, dim=1)
                val_predictions.extend(predictions.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_accuracy = accuracy_score(val_labels, val_predictions)
        val_f1 = f1_score(val_labels, val_predictions, average='weighted')
        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, "
              f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

        # Early stopping check
        should_stop = early_stopping(val_accuracy, model)

        # EarlyStopping leaves its counter at 0 exactly when this epoch became
        # the new best (first epoch or a real improvement). Keep that epoch's
        # outputs so the returned predictions match the returned best score.
        if early_stopping.counter == 0:
            best_predictions = val_predictions
            best_labels = val_labels

        if should_stop:
            print(f"Early stopping at epoch {epoch+1}")
            break

    return early_stopping.best_score, best_predictions, best_labels, train_losses, val_accuracies

def enhanced_cross_validate(df, n_folds=5):
    """Run stratified k-fold cross-validation of the classifier on ``df``.

    ``df`` must provide a ``text`` column and a ``spam`` label column. A fresh
    model is trained per fold; per-fold and pooled metrics are printed and
    plotted. Returns ``(fold_results, mean_accuracy, tokenizer)``.
    """

    # Fixed run configuration.
    MODEL_NAME = 'roberta-base'
    MAX_LENGTH = 128
    BATCH_SIZE = 24
    MAX_EPOCHS = 5
    DROPOUT_RATE = 0.4
    FREEZE_LAYERS = 6

    print("Configuration:")
    settings = (
        ("Model", MODEL_NAME),
        ("Max Length", MAX_LENGTH),
        ("Batch Size", BATCH_SIZE),
        ("Max Epochs", MAX_EPOCHS),
        ("Dropout Rate", DROPOUT_RATE),
        ("Frozen Layers", FREEZE_LAYERS),
    )
    for setting_name, setting_value in settings:
        print(f"{setting_name}: {setting_value}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    texts = df['text'].values
    labels = df['spam'].values

    # Report the class balance up front.
    print(f"Class distribution: {Counter(labels)}")

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    banner = '=' * 50

    fold_results = []
    all_predictions = []
    all_true_labels = []
    fold_histories = []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(texts, labels)):
        print(f"\n{banner}")
        print(f"FOLD {fold + 1}/{n_folds}")
        print(f"{banner}")

        train_texts, train_labels = texts[train_idx], labels[train_idx]
        val_texts, val_labels = texts[val_idx], labels[val_idx]

        print(f"Train size: {len(train_texts)} (Spam: {sum(train_labels)})")
        print(f"Val size: {len(val_texts)} (Spam: {sum(val_labels)})")

        # Augmentation is enabled for the training split only.
        train_loader = DataLoader(
            EmailDataset(train_texts, train_labels, tokenizer, MAX_LENGTH, augment=True),
            batch_size=BATCH_SIZE,
            shuffle=True,
        )
        val_loader = DataLoader(
            EmailDataset(val_texts, val_labels, tokenizer, MAX_LENGTH, augment=False),
            batch_size=BATCH_SIZE,
            shuffle=False,
        )

        # Every fold starts from a freshly loaded pretrained model.
        model = OptimizedRoBERTaClassifier(
            MODEL_NAME,
            num_classes=2,
            dropout_rate=DROPOUT_RATE,
            freeze_layers=FREEZE_LAYERS
        ).to(device)

        outcome = train_fold(model, train_loader, val_loader, fold, MAX_EPOCHS)
        fold_accuracy, fold_preds, fold_labels, train_losses, val_accuracies = outcome

        fold_results.append(fold_accuracy)
        all_predictions.extend(fold_preds)
        all_true_labels.extend(fold_labels)
        fold_histories.append({'train_losses': train_losses, 'val_accuracies': val_accuracies})

        print(f"Fold {fold + 1} Best Accuracy: {fold_accuracy:.4f}")

        # Release the model (and cached GPU memory) before the next fold.
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Aggregate over folds.
    mean_accuracy = np.mean(fold_results)
    std_accuracy = np.std(fold_results)
    overall_f1 = f1_score(all_true_labels, all_predictions, average='weighted')
    formatted_folds = [f'{acc:.4f}' for acc in fold_results]

    print(f"\n{banner}")
    print("FINAL CROSS VALIDATION RESULTS")
    print(f"{banner}")
    print(f"Mean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
    print(f"Overall F1 Score: {overall_f1:.4f}")
    print(f"Individual Fold Accuracies: {formatted_folds}")

    print("\nDetailed Classification Report:")
    print(classification_report(all_true_labels, all_predictions,
                                target_names=['Ham', 'Spam'], digits=4))

    # Visual summary of the run.
    plot_results(fold_results, fold_histories, all_true_labels, all_predictions, mean_accuracy)

    return fold_results, mean_accuracy, tokenizer

def plot_results(fold_results, fold_histories, true_labels, predictions, mean_accuracy):
    """Draw a 2x3 summary figure for a cross-validation run.

    Panels: per-fold accuracy bars, training-loss curves, validation-accuracy
    curves, confusion matrix, and overall accuracy / weighted F1 bars.
    """
    plt.figure(figsize=(15, 10))

    # Panel 1: accuracy of each fold, with the mean as a dashed line.
    plt.subplot(2, 3, 1)
    fold_numbers = range(1, len(fold_results) + 1)
    plt.bar(fold_numbers, fold_results, color='skyblue', alpha=0.7)
    plt.axhline(y=mean_accuracy, color='red', linestyle='--',
                label=f'Mean: {mean_accuracy:.4f}')
    plt.xlabel('Fold')
    plt.ylabel('Accuracy')
    plt.title('RoBERTa Accuracy by Fold')
    plt.legend()
    plt.grid(axis='y', alpha=0.3)

    # Panels 2 and 3: one line per fold for each recorded history series.
    curve_panels = (
        (2, 'train_losses', 'Training Loss', 'Training Loss Curves'),
        (3, 'val_accuracies', 'Validation Accuracy', 'Validation Accuracy Curves'),
    )
    for position, history_key, y_label, panel_title in curve_panels:
        plt.subplot(2, 3, position)
        for fold_number, history in enumerate(fold_histories, start=1):
            series = history[history_key]
            plt.plot(range(1, len(series) + 1), series, label=f'Fold {fold_number}', alpha=0.7)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.title(panel_title)
        plt.legend()
        plt.grid(alpha=0.3)

    # Panel 4: confusion matrix with the counts written into the cells.
    cm = confusion_matrix(true_labels, predictions)
    plt.subplot(2, 3, 4)
    plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.colorbar()

    # Use white text on dark cells and black text on light ones.
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            text_color = "white" if cm[row, col] > midpoint else "black"
            plt.text(col, row, format(cm[row, col], 'd'),
                     ha="center", va="center", color=text_color)

    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    class_names = ['Ham', 'Spam']
    plt.xticks([0, 1], class_names)
    plt.yticks([0, 1], class_names)

    # Panel 5: pooled accuracy and weighted F1.
    plt.subplot(2, 3, 5)
    overall_accuracy = accuracy_score(true_labels, predictions)
    weighted_f1 = f1_score(true_labels, predictions, average='weighted')
    metrics = ['Accuracy', 'F1-Score']
    values = [overall_accuracy, weighted_f1]

    bars = plt.bar(metrics, values, color=['green', 'orange'], alpha=0.7)
    plt.ylabel('Score')
    plt.title('Overall Performance Metrics')
    plt.ylim(0, 1)

    # Print each value just above its bar.
    for bar, value in zip(bars, values):
        x_center = bar.get_x() + bar.get_width()/2.
        plt.text(x_center, bar.get_height() + 0.01,
                 f'{value:.4f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

def _evaluate_on_loader(model, data_loader):
    """Run ``model`` over ``data_loader`` without gradients; return ``(predictions, labels)`` lists."""
    model.eval()
    predictions = []
    labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            predictions.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            labels.extend(batch['label'].cpu().numpy())

    return predictions, labels


def train_final_model(df, tokenizer):
    """Train the deployment model and save it to 'optimized_roberta_spam_classifier.pth'.

    The data is split 80/20 (stratified on ``spam``): the model is trained on
    the 80% part and evaluated on the held-out 20% after every epoch.

    Args:
        df: DataFrame with a ``text`` column and a ``spam`` label column.
        tokenizer: Tokenizer matching the 'roberta-base' encoder.

    Returns:
        Tuple ``(final_model, final_test_accuracy)``; the accuracy is measured
        on the weights that were actually saved.
    """
    MODEL_NAME = 'roberta-base'
    MAX_LENGTH = 128
    BATCH_SIZE = 24
    EPOCHS = 4
    DROPOUT_RATE = 0.4
    FREEZE_LAYERS = 6

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['spam'], random_state=42)

    # Create datasets
    train_dataset = EmailDataset(train_df['text'].values, train_df['spam'].values,
                                 tokenizer, MAX_LENGTH, augment=True)
    test_dataset = EmailDataset(test_df['text'].values, test_df['spam'].values,
                                tokenizer, MAX_LENGTH, augment=False)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    # Initialize model
    final_model = OptimizedRoBERTaClassifier(
        MODEL_NAME,
        num_classes=2,
        dropout_rate=DROPOUT_RATE,
        freeze_layers=FREEZE_LAYERS
    ).to(device)

    # Separate learning rates: smaller for the pretrained encoder, larger for the new head.
    optimizer = AdamW([
        {'params': final_model.roberta.parameters(), 'lr': 2e-5},
        {'params': final_model.classifier.parameters(), 'lr': 5e-5}
    ], weight_decay=0.02, eps=1e-8)

    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    # NOTE(review): early stopping is driven by the same split that is reported
    # as "test" accuracy, so that number is optimistic - consider a separate
    # validation split.
    early_stopping = EarlyStopping(patience=3, min_delta=0.001)
    stopped_early = False

    # Training loop
    for epoch in range(EPOCHS):
        final_model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Final Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = final_model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(final_model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        # Evaluate on the held-out split after each epoch
        test_predictions, test_labels = _evaluate_on_loader(final_model, test_loader)

        val_accuracy = accuracy_score(test_labels, test_predictions)
        print(f"Epoch {epoch+1}: Train Loss: {total_loss/len(train_loader):.4f}, "
              f"Test Accuracy: {val_accuracy:.4f}")

        if early_stopping(val_accuracy, final_model):
            stopped_early = True
            break

    # When early stopping fires it reloads the best snapshot into the model, so
    # the last epoch's predictions no longer describe the weights being saved.
    # Re-evaluate so the stored accuracy matches the stored parameters.
    if stopped_early and early_stopping.restore_best_weights:
        test_predictions, test_labels = _evaluate_on_loader(final_model, test_loader)

    # Final evaluation
    final_test_accuracy = accuracy_score(test_labels, test_predictions)
    final_f1 = f1_score(test_labels, test_predictions, average='weighted')

    print(f"\nFinal Optimized RoBERTa Model Performance:")
    print(f"Test Accuracy: {final_test_accuracy:.4f}")
    print(f"Test F1 Score: {final_f1:.4f}")

    # Save the weights together with the settings needed to rebuild the model
    torch.save({
        'model_state_dict': final_model.state_dict(),
        'tokenizer_name': MODEL_NAME,
        'max_length': MAX_LENGTH,
        'accuracy': final_test_accuracy,
        'dropout_rate': DROPOUT_RATE,
        'freeze_layers': FREEZE_LAYERS
    }, 'optimized_roberta_spam_classifier.pth')

    print(f"Model saved as 'optimized_roberta_spam_classifier.pth'")
    return final_model, final_test_accuracy

Model: roberta-base
Max Length: 256
Batch Size: 64
Epochs per fold: 4


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


FOLD 1/5
Train size: 4582 (Spam: 1094)
Val size: 1146 (Spam: 274)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 Epoch 1:   0%|          | 0/72 [00:00<?, ?it/s]

Epoch 1: Train Loss: 0.3794, Val Accuracy: 0.9869, Val F1: 0.9870


Fold 1 Epoch 2:   0%|          | 0/72 [00:00<?, ?it/s]

Epoch 2: Train Loss: 0.0388, Val Accuracy: 0.9930, Val F1: 0.9930


Fold 1 Epoch 3:   0%|          | 0/72 [00:00<?, ?it/s]

Epoch 3: Train Loss: 0.0148, Val Accuracy: 0.9948, Val F1: 0.9947


Fold 1 Epoch 4:   0%|          | 0/72 [00:00<?, ?it/s]

Epoch 4: Train Loss: 0.0030, Val Accuracy: 0.9921, Val F1: 0.9921
Fold 1 Best Accuracy: 0.9948

FOLD 2/5
Train size: 4582 (Spam: 1094)
Val size: 1146 (Spam: 274)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2 Epoch 1:   0%|          | 0/72 [00:00<?, ?it/s]

Epoch 1: Train Loss: 0.3282, Val Accuracy: 0.9625, Val F1: 0.9614


Fold 2 Epoch 2:   0%|          | 0/72 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Model Inferencing

In [4]:
def load_classifier():
    """Rebuild the saved classifier from 'optimized_roberta_spam_classifier.pth'.

    Returns ``(model, tokenizer, max_length)`` with the model on ``device``
    and switched to evaluation mode.
    """
    ckpt = torch.load('optimized_roberta_spam_classifier.pth', map_location=device)
    base_model_name = ckpt['tokenizer_name']
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Checkpoints without these keys fall back to the training defaults.
    dropout_rate = ckpt.get('dropout_rate', 0.4)
    freeze_layers = ckpt.get('freeze_layers', 6)

    model = OptimizedRoBERTaClassifier(
        base_model_name,
        num_classes=2,
        dropout_rate=dropout_rate,
        freeze_layers=freeze_layers,
    )
    model.load_state_dict(ckpt['model_state_dict'])
    model.to(device)
    model.eval()

    return model, tokenizer, ckpt['max_length']

def predict_email(text, model, tokenizer, max_length):
    """Classify a single email.

    Returns a dict with ``result`` ('Spam' or 'Ham'), ``confidence`` (the
    probability of the predicted class), ``spam_score`` and ``ham_score``.
    """
    # Same whitespace normalisation as used when building training items.
    cleaned = ' '.join(str(text).strip().split())

    encoding = tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
        add_special_tokens=True,
    )

    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        logits = model(input_ids, attention_mask)
        probs = torch.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()

    # Class index 1 is spam, index 0 is ham.
    label = 'Spam' if pred == 1 else 'Ham'
    return {
        'result': label,
        'confidence': probs[0][pred].item(),
        'spam_score': probs[0][1].item(),
        'ham_score': probs[0][0].item(),
    }

def batch_predict(texts, model, tokenizer, max_length, batch_size=32):
    """Classify many emails, ``batch_size`` at a time.

    Args:
        texts: Sliceable sequence of emails (list, tuple, NumPy array, pandas Series).
        model: Classifier returning one row of logits per input.
        tokenizer: Tokenizer accepting a list of strings.
        max_length: Token length every input is padded or truncated to.
        batch_size: Number of emails per forward pass.

    Returns:
        One dict per input, in input order, with keys ``text`` (the original
        item), ``result`` ('Spam' or 'Ham'), ``confidence``, ``spam_score``
        and ``ham_score``. An empty input yields an empty list.
    """
    results = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Normalise whitespace the same way as single-item prediction.
        processed_texts = [' '.join(str(text).strip().split()) for text in batch_texts]

        encodings = tokenizer(
            processed_texts,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
            add_special_tokens=True
        )

        with torch.no_grad():
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            probs = torch.softmax(outputs, dim=1)
            predictions = torch.argmax(probs, dim=1)

        # Iterate the batch positionally instead of indexing batch_texts[j]:
        # on a sliced pandas Series an integer key is a label lookup and
        # fails for every batch after the first.
        for original_text, row, predicted in zip(batch_texts, probs, predictions):
            pred = predicted.item()
            results.append({
                'text': original_text,
                'result': 'Spam' if pred == 1 else 'Ham',
                'confidence': row[pred].item(),
                'spam_score': row[1].item(),
                'ham_score': row[0].item()
            })

    return results

# Main Execution Pipeline

In [None]:
def main():
    """Load 'emails.csv', clean it, and run 5-fold cross-validation.

    Returns ``(df, tokenizer, mean_accuracy)`` where ``df`` is the cleaned
    frame with an added ``text_length`` column.
    """
    df = pd.read_csv('emails.csv')

    print(f"Dataset shape: {df.shape}")
    print(f"Label distribution:\n{df['spam'].value_counts()}")

    # Drop rows missing either field, normalise dtypes, and add a
    # character-length column.
    df = df.dropna(subset=['text', 'spam']).assign(
        text=lambda frame: frame['text'].astype(str),
        spam=lambda frame: frame['spam'].astype(int),
        text_length=lambda frame: frame['text'].str.len(),
    )

    print(f"Cleaned dataset shape: {df.shape}")
    print(f"Text length statistics:\n{df['text_length'].describe()}")

    # Only the mean accuracy and the tokenizer are passed on to the caller.
    _, final_accuracy, trained_tokenizer = enhanced_cross_validate(df, n_folds=5)

    return df, trained_tokenizer, final_accuracy

def check_model_exists():
    """Report whether the saved checkpoint is present in the working directory."""
    checkpoint_path = 'optimized_roberta_spam_classifier.pth'
    return os.path.exists(checkpoint_path)

# Sample emails used by both pipeline entry points to sanity-check a loaded model.
_DEMO_EMAILS = [
    # Clear Spam
    "CONGRATULATIONS! You've won $50,000 in our lottery! Click here to claim your prize immediately!",
    "URGENT: Your account will be suspended unless you verify your details within 24 hours!",
    "You've been pre-approved for a $10,000 loan with 0% interest! Apply now before offer expires!",

    # Phishing
    "Your Amazon order #AMZ-7449 for $899.99 has been shipped. Click to track or cancel.",
    "Security Alert: Unusual activity detected on your Google account. Verify identity now.",
    "Your PayPal account has been limited. Click here to restore access immediately.",

    # Clear Ham
    "Hi Sarah, please find attached the quarterly financial report for Q3 2024. Let me know if any questions.",
    "Meeting reminder: Team standup tomorrow at 10 AM in Conference Room B. See you there!",
    "Thanks for helping me move last weekend. I owe you dinner! How about next Friday?",
    "Your flight UA 1234 from LAX to JFK departs at 8:45 AM on March 15th. Have a safe trip!",

    # Marketing
    "Limited time offer: 50% off all items! Use code SAVE50 at checkout. Valid until Sunday.",
    "Free webinar: Learn advanced Excel techniques. Register now - only 100 spots available!",
    "Your warranty is about to expire. Extend your coverage for 3 more years at discounted price.",

    # Legitimate
    "Reminder: Your library books are due tomorrow. Renew online to avoid late fees.",
    "Newsletter: This week's top tech news and startup funding updates. Unsubscribe anytime."
]


def _print_demo_predictions(model, tokenizer, max_length):
    """Classify the sample emails one by one, then the first five as a batch, and print both."""
    print(f"\nOptimized RoBERTa Email Classification Results:")
    print("-" * 70)

    # Single predictions
    for i, email in enumerate(_DEMO_EMAILS, 1):
        result = predict_email(email, model, tokenizer, max_length)
        # "HAM " carries a trailing space so both tags have the same width.
        indicator = "SPAM" if result['result'] == 'Spam' else "HAM "

        print(f"{i:2}. [{indicator}] {result['confidence']:.3f} "
              f"(Spam: {result['spam_score']:.3f}) - {email[:50]}...")

    print("-" * 70)

    # Batch prediction example
    print(f"\nBatch Prediction Example:")
    batch_results = batch_predict(_DEMO_EMAILS[:5], model, tokenizer, max_length)

    for i, result in enumerate(batch_results, 1):
        indicator = "SPAM" if result['result'] == 'Spam' else "HAM "
        print(f"{i}. [{indicator}] {result['confidence']:.3f} - {result['text'][:50]}...")


def run_inference_only():
    """Load the saved model and print predictions for the sample emails.

    Returns:
        Tuple ``(model, tokenizer, max_length)`` ready for further inference.
    """
    print("✅ Found existing trained model!")
    print("Loading model for inference...")

    model, tokenizer, max_length = load_classifier()
    _print_demo_predictions(model, tokenizer, max_length)

    return model, tokenizer, max_length


def run_complete_pipeline():
    """Cross-validate, train and save the final model, then demo it on the sample emails.

    Returns:
        Tuple ``(model, tokenizer, max_length)`` reloaded from the saved checkpoint.
    """
    # This function also runs on a forced retrain, where a checkpoint may
    # already exist; only claim that none was found when that is true.
    if check_model_exists():
        print("🔄 Existing model found. Starting full training pipeline to replace it...")
    else:
        print("❌ No trained model found. Starting full training pipeline...")
    df, tokenizer, _ = main()

    # Train final model (the returned in-memory objects are not needed here)
    print("\nTraining final model...")
    train_final_model(df, tokenizer)

    # Reload from disk so the demo exercises exactly what was saved
    print("\nLoading model for inference...")
    model, tokenizer, max_length = load_classifier()
    _print_demo_predictions(model, tokenizer, max_length)

    return model, tokenizer, max_length

def interactive_predict(model, tokenizer, max_length):
    """Read emails from standard input and print their classification.

    The loop ends when the user types 'quit', 'exit' or 'q' (case and
    surrounding whitespace are ignored) or when input reaches end-of-file.
    Blank lines are skipped rather than classified.

    Args:
        model: Loaded classifier passed through to ``predict_email``.
        tokenizer: Tokenizer passed through to ``predict_email``.
        max_length: Token length passed through to ``predict_email``.
    """
    print("\nInteractive Email Classification")
    print("Enter emails to classify (type 'quit' to exit):")
    print("-" * 50)

    while True:
        try:
            email_text = input("\nEmail text: ")
        except EOFError:
            # Closed or piped stdin: end the session instead of crashing.
            print()
            break

        command = email_text.strip()
        if command.lower() in ('quit', 'exit', 'q'):
            break
        if not command:
            # Nothing to classify; prompt again.
            continue

        result = predict_email(email_text, model, tokenizer, max_length)
        print(f"Result: {result['result']}")
        print(f"Confidence: {result['confidence']:.3f}")
        print(f"Spam Score: {result['spam_score']:.3f}")
        print(f"Ham Score: {result['ham_score']:.3f}")

# Smart pipeline execution with model checking
def smart_pipeline(force_retrain=False):
    """Reuse the saved model when one exists, otherwise train from scratch.

    With ``force_retrain=True`` the full training pipeline always runs.
    Returns ``(model, tokenizer, max_length)``.
    """
    model_exists = check_model_exists()

    # Fast path: a checkpoint is on disk and retraining was not requested.
    if model_exists and not force_retrain:
        print("🎯 Smart Model Detection Activated!")
        print("📁 Found: 'optimized_roberta_spam_classifier.pth'")
        print("⚡ Skipping training, loading existing model...")
        return run_inference_only()

    if force_retrain:
        print("🔄 Force retraining enabled...")
        if model_exists:
            print("🗑️  Existing model will be overwritten after training")

    # Either no checkpoint exists or the caller asked for a retrain.
    return run_complete_pipeline()

# Run the pipeline

In [8]:
if __name__ == "__main__":

    # Default run: reuse a saved model when present, train otherwise.
    # Pass force_retrain=True instead to retrain even if a model exists.
    model, tokenizer, max_length = smart_pipeline(force_retrain=False)

    print("\n🎉 Pipeline completed successfully!")
    print("🤖 Model ready for inference.")

    # Hand over to the interactive prompt.
    interactive_predict(model, tokenizer, max_length)

🎯 Smart Model Detection Activated!
📁 Found: 'optimized_roberta_spam_classifier.pth'
⚡ Skipping training, loading existing model...
✅ Found existing trained model!
Loading model for inference...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Optimized RoBERTa Email Classification Results:
----------------------------------------------------------------------
 1. [SPAM] 0.904 (Spam: 0.904) - CONGRATULATIONS! You've won $50,000 in our lottery...
 2. [SPAM] 0.914 (Spam: 0.914) - URGENT: Your account will be suspended unless you ...
 3. [SPAM] 0.926 (Spam: 0.926) - You've been pre-approved for a $10,000 loan with 0...
 4. [SPAM] 0.826 (Spam: 0.826) - Your Amazon order #AMZ-7449 for $899.99 has been s...
 5. [SPAM] 0.894 (Spam: 0.894) - Security Alert: Unusual activity detected on your ...
 6. [SPAM] 0.866 (Spam: 0.866) - Your PayPal account has been limited. Click here t...
 7. [HAM ] 0.936 (Spam: 0.064) - Hi Sarah, please find attached the quarterly finan...
 8. [HAM ] 0.929 (Spam: 0.071) - Meeting reminder: Team standup tomorrow at 10 AM i...
 9. [HAM ] 0.838 (Spam: 0.162) - Thanks for helping me move last weekend. I owe you...
10. [HAM ] 0.814 (Spam: 0.186) - Your flight UA 1234 from LAX to JFK departs at 8:4...
11. [SPAM]