<a href="https://colab.research.google.com/github/Avniiii2606/Email-Classification-using-BERT-LDA/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download the latest version of the "wcukierski/enron-email-dataset" Kaggle
# dataset through kagglehub and keep the returned location in `path`.
path = kagglehub.dataset_download("wcukierski/enron-email-dataset")

# Show where the dataset files ended up so later cells can refer to it.
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/wcukierski/enron-email-dataset?dataset_version_number=2...


100%|██████████| 358M/358M [00:04<00:00, 80.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/wcukierski/enron-email-dataset/versions/2


In [None]:
import kagglehub
import pandas as pd
import os
import email
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
# NOTE(review): this cell calls process_enron_dataset, which is defined in a
# later cell of this notebook. On a fresh kernel that cell must be executed
# first, otherwise the call below raises NameError.


# Download latest version (same dataset as the first cell) and keep its location
path = kagglehub.dataset_download("wcukierski/enron-email-dataset")

print("Path to dataset files:", path)

# Parse the downloaded emails into a DataFrame
df = process_enron_dataset(path)  # uses the function's default max_emails

# Preview the first rows of the parsed emails
df.head()  # last expression of the cell, so the notebook renders it

Path to dataset files: /root/.cache/kagglehub/datasets/wcukierski/enron-email-dataset/versions/2
Processing Enron emails...


100%|██████████| 1/1 [00:00<00:00, 4882.78it/s]


In [None]:
import os
import email
import pandas as pd
import numpy as np
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.cuda.amp import autocast, GradScaler


In [None]:
def clean_text(text):
    """Normalize raw email text before tokenization.

    HTML markup is stripped with BeautifulSoup, every character that is not
    a word character, whitespace or ``@`` is removed, and runs of whitespace
    are collapsed to a single space.

    Args:
        text: Raw text, possibly containing HTML. ``None`` or an empty value
            yields an empty string instead of raising.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    if not text:
        return ""
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove special characters first: dropping a stand-alone symbol (for
    # example the dash in "a - b") leaves two adjacent spaces behind ...
    text = re.sub(r'[^\w\s@]', '', text)
    # ... so whitespace is collapsed afterwards, not before.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Columns of the DataFrame returned by process_enron_dataset. They are passed
# explicitly so that even an empty result exposes them and downstream lookups
# such as df['file'] do not fail with KeyError.
_EMAIL_COLUMNS = ['subject', 'from', 'to', 'body', 'file']


def _message_to_record(msg):
    """Turn a parsed email message into a dict with subject, from, to and cleaned body."""
    if msg.is_multipart():
        # Only the plain-text parts contribute to the body.
        payloads = [
            part.get_payload(decode=True)
            for part in msg.walk()
            if part.get_content_type() == "text/plain"
        ]
    else:
        payloads = [msg.get_payload(decode=True)]

    # get_payload(decode=True) may return None; treat that as an empty part
    # instead of discarding the whole email.
    body = "".join(
        payload.decode('utf-8', errors='ignore')
        for payload in payloads
        if payload is not None
    )

    return {
        'subject': msg.get('subject', ''),
        'from': msg.get('from', ''),
        'to': msg.get('to', ''),
        'body': clean_text(body)
    }


def _parse_email_text(raw_message):
    """Parse an email held in a string; return its record or None when it cannot be parsed."""
    try:
        return _message_to_record(email.message_from_string(raw_message))
    except Exception:
        # Best effort, same contract as parse_email_file: the caller reports the failure.
        return None


def parse_email_file(file_path):
    """Parse a single email file and return relevant information.

    Args:
        file_path: Path of the email file. It is read as UTF-8 text and
            undecodable bytes are ignored.

    Returns:
        A dict with the keys 'subject', 'from', 'to' and 'body' (the body is
        passed through clean_text), or None when the file cannot be read or
        parsed.
    """
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            msg = email.message_from_file(f)
        return _message_to_record(msg)
    except Exception:
        # Deliberate best effort: one unreadable file must not abort a long
        # import. The caller prints which file failed.
        return None


def _iter_maildir_emails(root_dir):
    """Yield (file_path, folder, record) for every file of a <user>/<folder>/... directory tree."""
    # Sorted listings make the subset selected by max_emails reproducible.
    for user in tqdm(sorted(os.listdir(root_dir))):
        user_dir = os.path.join(root_dir, user)
        if not os.path.isdir(user_dir):
            continue

        for folder in sorted(os.listdir(user_dir)):
            folder_path = os.path.join(user_dir, folder)
            if not os.path.isdir(folder_path):
                continue

            # Walk nested sub-folders as well; their emails keep the
            # top-level folder name as category.
            for dirpath, dirnames, filenames in os.walk(folder_path):
                dirnames.sort()
                for filename in sorted(filenames):
                    file_path = os.path.join(dirpath, filename)
                    yield file_path, folder, parse_email_file(file_path)


def _iter_csv_emails(csv_path, chunksize=10000):
    """Yield (file, folder, record) for every row of an emails.csv with 'file' and 'message' columns."""
    # Reading in chunks lets the caller stop early without loading the whole file.
    with pd.read_csv(csv_path, usecols=['file', 'message'], chunksize=chunksize) as reader:
        for chunk in reader:
            for rel_path, raw_message in zip(chunk['file'], chunk['message']):
                # 'file' is expected to look like "<user>/<folder>/.../<name>".
                parts = str(rel_path).split('/')
                if len(parts) < 3:
                    # No folder component: mirrors the directory layout,
                    # where files directly below a user are not visited.
                    continue
                yield rel_path, parts[1], _parse_email_text(raw_message)


def process_enron_dataset(root_dir, max_emails=100000):
    """Process the Enron dataset and return a DataFrame.

    Two layouts of ``root_dir`` are supported:

    * a single ``emails.csv`` with the columns ``file`` and ``message``
      (assumed to be how the Kaggle download is packaged -- verify against
      the dataset if loading comes back empty);
    * a directory tree ``<user>/<folder>/<email files>``.

    In both cases the folder name is used as the category of an email.

    Args:
        root_dir: Directory containing the dataset.
        max_emails: Maximum number of successfully parsed emails to return;
            None removes the limit.

    Returns:
        A DataFrame with the columns 'subject', 'from', 'to', 'body' and
        'file' (the category). The columns are present even when no email
        was found.
    """
    print("Processing Enron emails...")

    emails_data = []
    if max_emails is not None and max_emails <= 0:
        return pd.DataFrame(emails_data, columns=_EMAIL_COLUMNS)

    csv_path = os.path.join(root_dir, 'emails.csv')
    if os.path.isfile(csv_path):
        candidates = _iter_csv_emails(csv_path)
    else:
        candidates = _iter_maildir_emails(root_dir)

    try:
        for source, folder, email_data in candidates:
            if not email_data:
                # Print which email failed so the reason can be diagnosed.
                print(f"Error parsing file: {source}")
                continue

            email_data['file'] = folder  # Use folder name as category
            emails_data.append(email_data)

            # Stop scanning altogether once the limit is reached (checked
            # after appending so no extra email is read or parsed).
            if max_emails is not None and len(emails_data) >= max_emails:
                break
    finally:
        # Release the underlying CSV reader / directory walk promptly.
        candidates.close()

    return pd.DataFrame(emails_data, columns=_EMAIL_COLUMNS)


In [None]:
class EmailDataset(Dataset):
    """Dataset that tokenizes one text on access and pairs it with its label.

    Args:
        texts: Indexable collection of texts; each item is converted with str().
        labels: Indexable collection of integer labels aligned with ``texts``.
        tokenizer: Callable invoked with the text and the keyword arguments
            add_special_tokens, max_length, padding, truncation and
            return_tensors; its result must provide 'input_ids' and
            'attention_mask' tensors.
        max_length: Length passed to the tokenizer for padding and truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened 'input_ids' and 'attention_mask' plus a long 'label' tensor."""
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(str(self.texts[idx]), **tokenizer_options)

        # The tokenizer output is flattened to one dimension per sample.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


In [None]:
class EnronClassifier(nn.Module):
    """Pretrained transformer encoder followed by a small MLP classification head.

    Args:
        num_classes: Number of output classes (size of the final layer).
        dropout_rate: Dropout probability used between the head's layers.
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``. The
            encoder must expose ``config.hidden_size`` and return a
            ``pooler_output``; the default matches the previously hard-coded
            'bert-base-uncased'.
    """

    def __init__(self, num_classes, dropout_rate=0.3, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)

        # Size the first layer from the encoder instead of hard-coding 768,
        # so the head always matches the loaded checkpoint.
        hidden_size = self.bert.config.hidden_size

        # Classifier layers
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return the classification head's output for the encoder's pooled output.

        Raises:
            ValueError: If the encoder output has no ``pooler_output``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            raise ValueError(
                "The encoder did not return a pooler_output; "
                "choose a checkpoint whose model provides one."
            )
        return self.classifier(pooled_output)


In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, scaler):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        dataloader: Iterable of batches, each a dict with the keys
            'input_ids', 'attention_mask' and 'label'; must support len().
        optimizer: Optimizer stepped once per batch (through ``scaler``).
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.
        scaler: Gradient scaler providing scale(), step() and update().

    Returns:
        A tuple ``(average_loss, accuracy)``: the mean of the per-batch
        losses as a float, and the fraction of correct predictions as a
        0-dim float tensor.
    """
    model.train()
    # Autocast is only switched on for CUDA; on CPU the forward pass runs
    # in full precision.
    use_amp = torch.device(device).type == 'cuda'
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    progress_bar = tqdm(dataloader, desc="Training")
    # Count batches explicitly: tqdm refreshes its own counter `n` lazily,
    # so it cannot be used as the denominator of the running loss.
    for step, batch in enumerate(progress_bar, start=1):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        with autocast(enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += len(labels)
        total_loss += loss.item()

        # Update progress bar with plain Python numbers (not tensors) so the
        # values render as numbers.
        progress_bar.set_postfix({
            'loss': total_loss / step,
            'accuracy': 100. * correct_predictions.item() / total_predictions
        })

    return total_loss / len(dataloader), correct_predictions.float() / total_predictions

def evaluate(model, dataloader, criterion, device):
    """Measure loss and accuracy of ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``.
        dataloader: Iterable of batches, each a dict with the keys
            'input_ids', 'attention_mask' and 'label'; must support len().
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(average_loss, accuracy)``: the mean of the per-batch
        losses as a float, and the fraction of correct predictions as a
        0-dim float tensor.
    """
    model.eval()
    loss_sum = 0
    num_correct = 0
    num_seen = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'label')
            )

            logits = model(input_ids=ids, attention_mask=mask)
            loss_sum += criterion(logits, targets).item()

            # Index of the largest output per row is the predicted class.
            predicted = logits.max(dim=1).indices
            num_correct += (predicted == targets).sum()
            num_seen += len(targets)

    mean_loss = loss_sum / len(dataloader)
    accuracy = num_correct.float() / num_seen
    return mean_loss, accuracy


In [None]:
def main():
    """Run the whole pipeline: load emails, fine-tune the classifier, keep the best weights.

    The emails are loaded with process_enron_dataset, the folder name in the
    'file' column is label-encoded as the target, the data is split 80/20,
    and EnronClassifier is trained for the configured number of epochs. The
    weights with the best validation accuracy are written to 'best_model.pt'.

    Raises:
        ValueError: If no emails could be loaded from the configured directory.
    """
    # Configuration
    config = {
        # Location printed by the kagglehub download cell of this notebook.
        'data_dir': '/root/.cache/kagglehub/datasets/wcukierski/enron-email-dataset/versions/2',
        'max_emails': 100000,  # Limit number of emails to process
        'max_length': 512,
        'batch_size': 16,
        'epochs': 5,
        'learning_rate': 2e-5,
        'warmup_steps': 0,  # NOTE: currently not read anywhere in this function
        'seed': 42,
        'device': torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    }

    print(f"Using device: {config['device']}")

    # Seed torch so weight initialisation, dropout and shuffling are repeatable.
    torch.manual_seed(config['seed'])

    # Process dataset
    df = process_enron_dataset(config['data_dir'], config['max_emails'])

    # Fail fast with an actionable message instead of an opaque
    # KeyError: 'file' when nothing was loaded.
    if df.empty or 'file' not in df.columns:
        raise ValueError(
            f"No emails were loaded from {config['data_dir']!r}; check that the "
            "dataset has been downloaded and that its layout is one "
            "process_enron_dataset can read."
        )

    # Prepare labels
    label_encoder = LabelEncoder()
    df['encoded_category'] = label_encoder.fit_transform(df['file'])

    # Split data
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df['body'].values,
        df['encoded_category'].values,
        test_size=0.2,
        random_state=config['seed']
    )

    # Initialize tokenizer and datasets
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    train_dataset = EmailDataset(train_texts, train_labels, tokenizer, config['max_length'])
    val_dataset = EmailDataset(val_texts, val_labels, tokenizer, config['max_length'])

    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

    # Initialize model and training components
    model = EnronClassifier(num_classes=len(label_encoder.classes_))
    model = model.to(config['device'])

    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
    criterion = nn.CrossEntropyLoss()
    # Gradient scaling is only enabled on CUDA; on CPU the scaler is
    # constructed disabled.
    scaler = GradScaler(enabled=config['device'].type == 'cuda')

    # The scheduler is stepped once per batch, hence batches * epochs steps.
    total_steps = len(train_loader) * config['epochs']
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=config['learning_rate'],
        total_steps=total_steps
    )

    # Training loop
    best_accuracy = 0

    for epoch in range(config['epochs']):
        print(f'\nEpoch {epoch + 1}/{config["epochs"]}')

        train_loss, train_acc = train_epoch(
            model, train_loader, optimizer, scheduler,
            criterion, config['device'], scaler
        )

        val_loss, val_acc = evaluate(model, val_loader, criterion, config['device'])

        print(f'Training Loss: {train_loss:.3f}, Accuracy: {train_acc:.3f}')
        print(f'Validation Loss: {val_loss:.3f}, Accuracy: {val_acc:.3f}')

        if val_acc > best_accuracy:
            # Keep a plain float so no device tensor outlives the epoch.
            best_accuracy = float(val_acc)
            torch.save(model.state_dict(), 'best_model.pt')

    print(f'\nBest validation accuracy: {best_accuracy:.3f}')

# Start the pipeline only when this code runs as the top-level program
# (which includes executing the cell in a notebook), not when it is imported.
if __name__ == "__main__":
    main()

Using device: cpu
Processing Enron emails...


100%|██████████| 1/1 [00:00<00:00, 1065.63it/s]


KeyError: 'file'