In [1]:
# Import necessary libraries
import mailbox
import email
import logging

from pathlib import Path
from multiprocessing import Pool, cpu_count
from typing import List, Optional, Any

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

# For topic modeling
from gensim import corpora, models
from gensim.utils import simple_preprocess 
from nltk.corpus import stopwords

# Additional imports from your pipeline
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

from torch.utils.tensorboard import SummaryWriter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set up logging: configure the root logger at INFO level and create the named
# logger ("email_generation") that the helper functions in this notebook write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("email_generation")


In [3]:
# Set random seeds for reproducibility: NumPy's legacy global RNG and
# PyTorch's default generator both start from the same fixed seed.
np.random.seed(42)
torch.manual_seed(42)


<torch._C.Generator at 0x7b665ff6f5b0>

In [4]:
# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
# The bare `device` expression on the last line shows the choice as the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
# Checkpoint utilities (from your pipeline).
# Directory that holds the cached artifacts; the path is relative, so it
# resolves against the kernel's current working directory.
CHECKPOINT_DIR = Path("checkpoints")
CHECKPOINT_DIR.mkdir(exist_ok=True)


In [6]:
def save_checkpoint(obj: Any, name: str):
    """Persist ``obj`` with joblib as ``<name>.pkl`` inside ``CHECKPOINT_DIR``.

    Args:
        obj: Any object joblib can serialize.
        name: Checkpoint name without extension.

    An INFO record with the destination path is logged after the dump.
    """
    destination = CHECKPOINT_DIR.joinpath(f"{name}.pkl")
    joblib.dump(obj, destination)
    logger.info(f"Checkpoint saved: {destination}")


def load_checkpoint(name: str) -> Optional[Any]:
    """Restore the object stored as ``<name>.pkl`` inside ``CHECKPOINT_DIR``.

    Args:
        name: Checkpoint name without extension.

    Returns:
        The deserialized object, or ``None`` when the file does not exist.
        Either outcome is logged at INFO level.
    """
    source = CHECKPOINT_DIR.joinpath(f"{name}.pkl")
    # Guard clause: a missing file is a cache miss, not an error.
    if not source.exists():
        logger.info(f"No checkpoint found for: {source}")
        return None
    # joblib relies on pickle: only load checkpoint files you trust.
    restored = joblib.load(source)
    logger.info(f"Checkpoint loaded: {source}")
    return restored


## 1. Data Loading and Preprocessing


In [7]:
# Define functions to parse and preprocess emails (from your pipeline)
# Substring that identifies the category entry among the comma-separated values
# of the "X-Gmail-Labels" header. NOTE(review): the literal is Spanish, so it
# presumably matches a Spanish-locale Gmail export only — confirm for other locales.
_GMAIL_CATEGORY_HEADER_MARKER = "Categoría:"


def decode_mime_str(encoded: str) -> str:
    """Decode a MIME encoded-word header value into a regular string.

    Args:
        encoded: Raw header value; may be empty or ``None``.

    Returns:
        The decoded text, or ``""`` for an empty/missing value. Bytes that are
        invalid in the chosen charset are dropped instead of raising.
    """
    if not encoded:
        return ""
    pieces = []
    for fragment, charset in email.header.decode_header(encoded):
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or "utf-8", errors="ignore")
            except LookupError:
                # The header declared a charset Python has no codec for;
                # fall back to UTF-8 rather than failing the whole message.
                fragment = fragment.decode("utf-8", errors="ignore")
        pieces.append(fragment)
    return "".join(pieces)


def parse_gmail_labels(gmail_labels: str) -> str:
    """Extract the primary category from a comma-separated Gmail label string.

    The first label containing ``_GMAIL_CATEGORY_HEADER_MARKER`` wins; the
    marker is removed and surrounding whitespace stripped.

    Returns:
        The category name, or ``"Uncategorized"`` when no label carries the marker.
    """
    marker = _GMAIL_CATEGORY_HEADER_MARKER
    for candidate in gmail_labels.split(","):
        if marker in candidate:
            return candidate.replace(marker, "").strip()
    return "Uncategorized"


def parse_body(message: mailbox.Message) -> str:
    """Extract and decode the textual body of an email message.

    Only ``text/*`` MIME parts are kept. Binary attachments (PDFs, images, ...)
    are skipped: decoding them as text only injects noise into the corpus.
    Each part is decoded with its declared charset, falling back to UTF-8
    when the charset is missing or unknown to Python.

    Args:
        message: The parsed email message.

    Returns:
        The decoded text parts joined by single spaces; ``""`` when there is
        no text part. Errors are logged and whatever was collected so far is
        returned (best effort).
    """
    body_parts = []
    try:
        # walk() yields the message itself for non-multipart mail, so a single
        # loop covers both the multipart and the single-part case.
        for part in message.walk():
            if part.get_content_maintype() != "text":
                continue  # multipart containers and binary attachments
            payload = part.get_payload(decode=True)
            if not payload:
                continue
            charset = part.get_content_charset() or "utf-8"
            try:
                text = payload.decode(charset, errors="ignore")
            except LookupError:
                # Declared charset has no Python codec
                text = payload.decode("utf-8", errors="ignore")
            body_parts.append(text)
    except Exception as e:
        logger.error(f"Error extracting body: {e}")
    return " ".join(body_parts)


def parse_message(message) -> Optional[List[str]]:
    """Parse an email message into its subject, body and category.

    Args:
        message: The email message to parse.

    Returns:
        ``[subject, body, category]`` on success, or ``None`` when parsing
        raised (the error is logged). The annotation reflects what is actually
        returned: a list, not a tuple, and possibly ``None``.
    """
    try:
        subject = decode_mime_str(message.get("subject", ""))
        body = parse_body(message)
        category = parse_gmail_labels(
            decode_mime_str(message.get("X-Gmail-Labels", ""))
        )
        return [subject, body, category]
    except Exception as e:
        logger.error(f"Failed to process an email: {e}")
        return None


def load_emails(mbox_file_path: str, max_emails: int | None = None) -> pd.DataFrame:
    """Load and parse emails from an MBOX file using a pool of worker processes.

    Args:
        mbox_file_path: Path of the MBOX file.
        max_emails: Stop once this many emails were parsed successfully;
            ``None`` reads the whole mailbox.

    Returns:
        A DataFrame with the columns ``Subject``, ``Body`` and ``Category``.
        Rows arrive in completion order (``imap_unordered``), not mailbox order.
    """
    columns = ["Subject", "Body", "Category"]
    data = []
    mbox = mailbox.mbox(mbox_file_path)
    n_processes = max(cpu_count() - 1, 1)  # leave one core free, but use at least one
    try:
        with Pool(processes=n_processes) as pool:
            for result in tqdm(pool.imap_unordered(parse_message, mbox, chunksize=100)):
                if result:  # parse_message returns None for messages it failed on
                    data.append(result)
                if max_emails is not None and len(data) >= max_emails:
                    break
    finally:
        # Release the mailbox file handle even when parsing is cut short or fails.
        mbox.close()
    return pd.DataFrame(data, columns=columns)


# Preprocessing functions (from your pipeline)
def parse_html(html: str) -> str:
    """Return the text content of ``html`` with the markup removed.

    The document is parsed with BeautifulSoup's ``lxml`` parser; text nodes
    are stripped and joined with single spaces.
    """
    return BeautifulSoup(html, "lxml").get_text(separator=" ", strip=True)


def transform_parse_html(
    df: pd.DataFrame,
    *,
    columns: List[str] | None = None,
):
    """Replace HTML with plain text in the given columns, modifying ``df`` in place.

    Args:
        df: DataFrame to update.
        columns: Columns to convert; defaults to ``["Body"]``.
    """
    target_columns = ["Body"] if columns is None else columns
    for name in target_columns:
        df[name] = df[name].apply(parse_html)


def clean_text(text: str) -> str:
    """Normalize ``text`` by converting it to lowercase."""
    return text.lower()


def transform_clean_text(
    df: pd.DataFrame,
    *,
    columns: List[str] | None = None,
):
    """Apply ``clean_text`` to the given columns, modifying ``df`` in place.

    Args:
        df: DataFrame to update.
        columns: Columns to normalize; defaults to ``["Subject", "Body"]``.
    """
    target_columns = ["Subject", "Body"] if columns is None else columns
    for name in target_columns:
        df[name] = df[name].apply(clean_text)


def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the email data and add a combined ``Text`` column.

    Runs HTML stripping and then lowercasing with their default columns, and
    finally builds ``Text`` as ``Subject + " " + Body``. ``df`` is modified in
    place and also returned.
    """
    # Order matters: markup is removed before the text is lowercased.
    for step in (transform_parse_html, transform_clean_text):
        step(df)
    df["Text"] = df["Subject"] + " " + df["Body"]
    return df


In [8]:
# Load the emails: reuse the cached DataFrame when present, otherwise parse
# the first 100 emails of the mailbox and cache the result.
mbox_path = Path("emails.mbox")
df = load_checkpoint("emails_df")
if df is None:
    df = load_emails(mbox_path, max_emails=100)
    save_checkpoint(df, "emails_df")
else:
    logger.info("Emails DataFrame loaded from checkpoint.")


INFO:email_generation:Checkpoint loaded: checkpoints/emails_df.pkl
INFO:email_generation:Emails DataFrame loaded from checkpoint.


In [9]:
# Apply preprocessing: prefer the cached preprocessed DataFrame, otherwise
# preprocess the loaded emails and cache the result.
preprocessed_df = load_checkpoint("preprocessed_df")
if preprocessed_df is None:
    df = preprocess_data(df)
    save_checkpoint(df, "preprocessed_df")
else:
    df = preprocessed_df
    logger.info("Preprocessed DataFrame loaded from checkpoint.")


INFO:email_generation:No checkpoint found for: checkpoints/preprocessed_df.pkl
INFO:email_generation:Checkpoint saved: checkpoints/preprocessed_df.pkl


## 2. Topic Modeling


In [10]:
# Preprocess texts for topic modeling
# English stop words from NLTK, kept in a set for fast membership tests.
STOP_WORDS = set(stopwords.words("english"))


def preprocess_text_for_topic_modeling(text):
    """Tokenize ``text`` for LDA and drop English stop words.

    Returns:
        The list of tokens produced by gensim's ``simple_preprocess`` that are
        not in ``STOP_WORDS``.
    """
    # deacc=True strips accent marks from the tokens
    return [
        token
        for token in simple_preprocess(text, deacc=True)
        if token not in STOP_WORDS
    ]


# Convert topic distributions to fixed-size vectors
def topic_vector(topic_dist, num_topics):
    """Expand sparse ``(topic_id, probability)`` pairs into a dense vector.

    Args:
        topic_dist: Iterable of ``(topic_id, probability)`` pairs.
        num_topics: Length of the resulting vector.

    Returns:
        A NumPy array of length ``num_topics``; topics absent from
        ``topic_dist`` stay at zero.
    """
    dense = np.zeros(num_topics)
    for position, weight in topic_dist:
        dense[position] = weight
    return dense


In [11]:
# Extract the texts for topic modeling
texts = df["Text"].tolist()

# Tokenize every text once and cache the token lists.
processed_texts = load_checkpoint("processed_texts")
if processed_texts is None:
    processed_texts = list(
        map(preprocess_text_for_topic_modeling, tqdm(texts, desc="Processing texts"))
    )
    save_checkpoint(processed_texts, "processed_texts")


INFO:email_generation:No checkpoint found for: checkpoints/processed_texts.pkl
Processing texts: 100%|██████████| 100/100 [00:00<00:00, 731.33it/s]
INFO:email_generation:Checkpoint saved: checkpoints/processed_texts.pkl


In [12]:
# Create dictionary and corpus for LDA

# Token <-> id mapping built from the tokenized texts (cached).
dictionary = load_checkpoint("dictionary")
if dictionary is None:
    dictionary = corpora.Dictionary(processed_texts)
    save_checkpoint(dictionary, "dictionary")

# Bag-of-words representation of every tokenized text (cached).
corpus = load_checkpoint("corpus")
if corpus is None:
    corpus = list(map(dictionary.doc2bow, tqdm(processed_texts, desc="Creating corpus")))
    save_checkpoint(corpus, "corpus")


INFO:email_generation:No checkpoint found for: checkpoints/dictionary.pkl
INFO:gensim.corpora.dictionary:adding document #0 to Dictionary<0 unique tokens: []>
INFO:gensim.corpora.dictionary:built Dictionary<14975 unique tokens: ['ae', 'afterhours', 'aires', 'argentino', 'auto']...> from 100 documents (total 63329 corpus positions)
INFO:gensim.utils:Dictionary lifecycle event {'msg': "built Dictionary<14975 unique tokens: ['ae', 'afterhours', 'aires', 'argentino', 'auto']...> from 100 documents (total 63329 corpus positions)", 'datetime': '2024-12-03T12:29:33.466670', 'gensim': '4.3.3', 'python': '3.11.10 (main, Oct  3 2024, 07:29:13) [GCC 11.2.0]', 'platform': 'Linux-6.8.0-49-generic-x86_64-with-glibc2.39', 'event': 'created'}
INFO:email_generation:Checkpoint saved: checkpoints/dictionary.pkl
INFO:email_generation:No checkpoint found for: checkpoints/corpus.pkl
Creating corpus: 100%|██████████| 100/100 [00:00<00:00, 6747.16it/s]
INFO:email_generation:Checkpoint saved: checkpoints/corpu

In [13]:
# Train LDA model (or reuse the cached one)
num_topics = 5  # Adjust based on your data
lda_model = load_checkpoint("lda_model")
if lda_model is None:
    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=42,
    )
    save_checkpoint(lda_model, "lda_model")

# Sparse (topic_id, probability) pairs for each document
topic_distributions = load_checkpoint("topic_distributions")
if topic_distributions is None:
    topic_distributions = [
        lda_model.get_document_topics(document_bow)
        for document_bow in tqdm(corpus, desc="Getting topic distributions")
    ]
    save_checkpoint(topic_distributions, "topic_distributions")

# Dense, fixed-size version of each distribution
topic_vectors = load_checkpoint("topic_vectors")
if topic_vectors is None:
    topic_vectors = [
        topic_vector(distribution, num_topics)
        for distribution in tqdm(topic_distributions, desc="Creating topic vectors")
    ]
    save_checkpoint(topic_vectors, "topic_vectors")

# Add topic vectors to the DataFrame (one vector per row)
df["TopicVector"] = topic_vectors


INFO:email_generation:No checkpoint found for: checkpoints/lda_model.pkl
INFO:gensim.models.ldamodel:using symmetric alpha at 0.2
INFO:gensim.models.ldamodel:using symmetric eta at 0.2
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (multi-pass) LDA training, 5 topics, 15 passes over the supplied corpus of 100 documents, updating model once every 100 documents, evaluating perplexity every 100 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:-10.900 per-word bound, 1910.2 perplexity estimate based on a held-out corpus of 100 documents with 63329 words
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #100/100
INFO:gensim.models.ldamodel:topic #0 (0.200): 0.013*"https" + 0.011*"com" + 0.007*"click" + 0.006*"email" + 0.005*"utm_source" + 0.005*"read" + 0.003*"jamanetwork" + 0.003*"ks" + 0.003*"alerts" + 0.003*"app"
INFO:gensim.models.ldamodel:topic #1 (0.200): 0.009*"https" + 

## 3. Character Mapping and Dataset Preparation


In [14]:
class EmailDataset(Dataset):
    """Character-level sliding-window dataset over email texts.

    Every text with more than ``seq_length`` characters contributes one sample
    per start offset: an input window of ``seq_length`` character ids, the same
    window shifted right by one character as the target, and the text's topic
    vector. Shorter texts are skipped. Characters missing from ``char2idx``
    are encoded as id 0.

    Each text is encoded once and windows are sliced on demand, so memory
    grows with the corpus size instead of ``corpus size * seq_length``.

    Args:
        texts: Sequence of strings.
        topic_vectors: One topic vector per text, aligned with ``texts``.
        char2idx: Mapping from character to integer id.
        seq_length: Number of characters per input/target window.
    """

    def __init__(self, texts, topic_vectors, char2idx, seq_length):
        self.texts = texts
        self.topic_vectors = topic_vectors
        self.char2idx = char2idx
        self.seq_length = seq_length
        # (document index, start offset) of every window
        self.data = self.process_texts()

    def process_texts(self):
        """Encode the usable texts and return the ``(doc, start)`` window index."""
        self._encoded = []  # one LongTensor of character ids per kept text
        self._topics = []  # matching float32 topic tensors
        windows = []
        for text, topic_vec in zip(self.texts, self.topic_vectors):
            if len(text) < self.seq_length + 1:
                continue  # Skip sequences that are too short
            doc = len(self._encoded)
            self._encoded.append(
                torch.tensor(
                    [self.char2idx.get(char, 0) for char in text], dtype=torch.long
                )
            )
            self._topics.append(torch.as_tensor(topic_vec, dtype=torch.float32))
            windows.extend(
                (doc, start) for start in range(len(text) - self.seq_length)
            )
        return windows

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        doc, start = self.data[idx]
        encoded = self._encoded[doc]
        stop = start + self.seq_length
        # clone() hands out independent tensors, so callers cannot corrupt the
        # shared per-document storage through in-place edits.
        return (
            encoded[start:stop].clone(),
            encoded[start + 1 : stop + 1].clone(),
            self._topics[doc].clone(),
        )


In [15]:
# Create character mapping: every distinct character of the corpus gets an id
# according to its sorted position.
all_text = " ".join(df["Text"])
chars = sorted(set(all_text))
char2idx = {symbol: position for position, symbol in enumerate(chars)}
idx2char = dict(enumerate(chars))
vocab_size = len(char2idx)
seq_length = 100  # Adjust based on your data

# Prepare the dataset and dataloader
dataset = EmailDataset(
    df["Text"].tolist(),
    df["TopicVector"].tolist(),
    char2idx,
    seq_length,
)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


## 4. Defining the Topic-Guided VAE Model


In [16]:
# Encoder
class Encoder(nn.Module):
    """LSTM encoder producing the mean and log-variance of the latent code.

    The topic vector is appended to the character embedding at every time
    step; the final LSTM hidden state is projected to ``mu`` and ``logvar``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, latent_dim, num_topics):
        super().__init__()
        lstm_input_dim = embedding_dim + num_topics
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(lstm_input_dim, hidden_dim, batch_first=True)
        self.hidden_to_mu = nn.Linear(hidden_dim, latent_dim)
        self.hidden_to_logvar = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x, topic_vec):
        """Return ``(mu, logvar)`` for character ids ``x`` and ``topic_vec``."""
        embedded = self.embedding(x)
        n_steps = embedded.size(1)
        # Broadcast the per-sample topic vector along the time axis.
        topic_per_step = topic_vec.unsqueeze(1).expand(-1, n_steps, -1)
        lstm_input = torch.cat((embedded, topic_per_step), dim=2)
        _, (final_hidden, _) = self.lstm(lstm_input)
        final_hidden = final_hidden.squeeze(0)  # drop the single-layer axis
        return self.hidden_to_mu(final_hidden), self.hidden_to_logvar(final_hidden)


# Decoder
class Decoder(nn.Module):
    """LSTM decoder conditioned on a latent code and a topic vector.

    The concatenated ``[z, topic_vec]`` is projected (with ``tanh``) to the
    initial hidden state; the cell state starts at zero. The input characters
    are embedded and run through the LSTM, and every step is mapped to
    vocabulary logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, latent_dim, num_topics):
        super(Decoder, self).__init__()
        self.latent_to_hidden = nn.Linear(latent_dim + num_topics, hidden_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.outputs2vocab = nn.Linear(hidden_dim, vocab_size)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x, z, topic_vec):
        """Return per-step vocabulary logits for the input character ids ``x``."""
        z = torch.cat([z, topic_vec], dim=1)
        h_0 = torch.tanh(self.latent_to_hidden(z)).unsqueeze(0)
        # zeros_like already matches h_0's device and dtype, so the module does
        # not depend on the notebook-global `device`.
        c_0 = torch.zeros_like(h_0)
        x = self.embedding(x)
        outputs, _ = self.lstm(x, (h_0, c_0))
        logits = self.outputs2vocab(outputs)
        return logits


# VAE Model
class TopicGuidedVAE(nn.Module):
    """Variational autoencoder whose encoder and decoder both see the topic vector."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, latent_dim, num_topics):
        super().__init__()
        shared_args = (vocab_size, embedding_dim, hidden_dim, latent_dim, num_topics)
        self.encoder = Encoder(*shared_args)
        self.decoder = Decoder(*shared_args)

    @staticmethod
    def _reparameterize(mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps`` drawn from a standard normal."""
        std = torch.exp(0.5 * logvar)
        noise = torch.randn_like(std)
        return mu + noise * std

    def forward(self, x, topic_vec):
        """Return ``(logits, mu, logvar)`` for input ids ``x`` and ``topic_vec``."""
        mu, logvar = self.encoder(x, topic_vec)
        latent = self._reparameterize(mu, logvar)
        return self.decoder(x, latent, topic_vec), mu, logvar



## 5. Define the Loss Function


In [17]:
def loss_function(logits, targets, mu, logvar):
    """Return the VAE loss: mean token cross-entropy plus the KL term.

    The KL term is ``-0.5 * mean(1 + logvar - mu^2 - exp(logvar))``, averaged
    over all elements of ``mu``/``logvar``.
    """
    vocab = logits.size(-1)
    reconstruction = F.cross_entropy(
        logits.view(-1, vocab), targets.view(-1), reduction="mean"
    )
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * kl_elements.mean()
    return reconstruction + kl_divergence



## 6. Training the VAE Model


In [18]:
# Model hyperparameters
embedding_dim = 256
hidden_dim = 512
latent_dim = 64

# The per-epoch averages below divide by the number of batches, so fail early
# with a clear message instead of a ZeroDivisionError after the first epoch.
steps_per_epoch = len(dataloader)
if steps_per_epoch == 0:
    raise ValueError("dataloader yields no batches; nothing to train on")

# Initialize model, optimizer and tensorboard writer
model = TopicGuidedVAE(
    vocab_size, embedding_dim, hidden_dim, latent_dim, num_topics
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
writer = SummaryWriter('runs/topic_guided_vae')

# Training loop
num_epochs = 10  # Adjust based on your data
best_loss = float('inf')
model.train()

# Initialize early stopping parameters
patience = 3  # Number of epochs to wait for improvement
patience_counter = 0

try:
    for epoch in range(num_epochs):
        total_loss = 0
        total_ce_loss = 0
        total_kld_loss = 0

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch_idx, (input_seq, target_seq, topic_vec) in enumerate(progress_bar):
            # Move data to device
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)
            topic_vec = topic_vec.to(device)

            # Forward pass
            optimizer.zero_grad()
            logits, mu, logvar = model(input_seq, topic_vec)

            # Calculate losses (kept separate so each term can be logged)
            ce_loss = F.cross_entropy(logits.view(-1, logits.size(-1)),
                                    target_seq.view(-1), reduction="mean")
            kld_loss = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
            loss = ce_loss + kld_loss

            # Backward pass (gradient norm clipped to 1.0)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            # Update metrics
            total_loss += loss.item()
            total_ce_loss += ce_loss.item()
            total_kld_loss += kld_loss.item()

            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'ce_loss': f'{ce_loss.item():.4f}',
                'kld_loss': f'{kld_loss.item():.4f}'
            })

            # Log losses every 1000 iterations
            if batch_idx % 1000 == 0:
                global_step = epoch * steps_per_epoch + batch_idx
                writer.add_scalar('Loss/total_step', loss.item(), global_step)
                writer.add_scalar('Loss/ce_step', ce_loss.item(), global_step)
                writer.add_scalar('Loss/kld_step', kld_loss.item(), global_step)

            # Save checkpoint every 5000 batches (this also fires at batch 0 of each epoch)
            if batch_idx % 5000 == 0:
                checkpoint_path = CHECKPOINT_DIR / f'model_epoch{epoch}_batch{batch_idx}.pt'
                torch.save({
                    'epoch': epoch,
                    'batch_idx': batch_idx,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss.item(),
                    'ce_loss': ce_loss.item(),
                    'kld_loss': kld_loss.item()
                }, checkpoint_path)

        # Calculate average losses
        avg_loss = total_loss / steps_per_epoch
        avg_ce_loss = total_ce_loss / steps_per_epoch
        avg_kld_loss = total_kld_loss / steps_per_epoch

        # Log to tensorboard
        writer.add_scalar('Loss/total', avg_loss, epoch)
        writer.add_scalar('Loss/cross_entropy', avg_ce_loss, epoch)
        writer.add_scalar('Loss/kld', avg_kld_loss, epoch)

        # Save best model and check for early stopping
        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': best_loss,
            }, CHECKPOINT_DIR / 'best_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:  # Stop if no improvement for patience epochs
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

        # Save epoch checkpoint
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
            'ce_loss': avg_ce_loss,
            'kld_loss': avg_kld_loss
        }, CHECKPOINT_DIR / f'model_epoch{epoch}.pt')

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Average Loss: {avg_loss:.4f}")
        print(f"Cross Entropy Loss: {avg_ce_loss:.4f}")
        print(f"KLD Loss: {avg_kld_loss:.4f}")
        print("-" * 50)
finally:
    # Flush and close the TensorBoard writer even when training is interrupted
    # or fails, so the scalars logged so far are not lost.
    writer.close()


Epoch 1/10:  84%|████████▍ | 7801/9310 [08:22<01:37, 15.52it/s, loss=0.7521, ce_loss=0.7478, kld_loss=0.0043]


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## 7. Generating New Emails Conditioned on Topics


In [19]:
def generate_email(model, idx2char, char2idx, topic_vec, start_text="", length=500):
    """Sample ``length`` characters from the model, conditioned on a topic vector.

    The prompt is encoded, truncated to its last ``seq_length`` characters
    (the notebook-level window size) and left-padded with id 0. The latent
    code is the encoder mean for that prompt and stays fixed. At every step
    the decoder is run over the current window, the next character is sampled
    from the softmax of the last position, and the window slides by one.

    Args:
        model: Model exposing ``encoder`` and ``decoder`` as ``TopicGuidedVAE`` does.
        idx2char: Mapping from character id to character.
        char2idx: Mapping from character to id; unknown characters map to id 0.
        topic_vec: Topic vector with one entry per topic.
        start_text: Prompt the generated text starts with.
        length: Number of characters to generate.

    Returns:
        ``start_text`` followed by the generated characters.
    """
    # Follow the model's own device rather than the notebook-global `device`,
    # so generation also works after the model has been moved.
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            prompt_ids = [char2idx.get(c, 0) for c in start_text]
            input_seq = torch.tensor(
                prompt_ids[-seq_length:], dtype=torch.long, device=device
            ).unsqueeze(0)
            topic_vec = torch.as_tensor(
                topic_vec, dtype=torch.float32, device=device
            ).unsqueeze(0)

            if input_seq.size(1) < seq_length:
                pad_size = seq_length - input_seq.size(1)
                input_seq = F.pad(input_seq, (pad_size, 0), "constant", 0)

            mu, logvar = model.encoder(input_seq, topic_vec)
            z = mu  # Use the mean so the latent code itself is deterministic
            pieces = [start_text]  # joined once at the end
            for _ in range(length):
                logits = model.decoder(input_seq, z, topic_vec)
                probs = F.softmax(logits[:, -1, :], dim=-1)
                next_char_idx = torch.multinomial(probs, num_samples=1).item()
                pieces.append(idx2char[next_char_idx])
                input_seq = torch.cat(
                    [input_seq[:, 1:], torch.tensor([[next_char_idx]], device=device)],
                    dim=1,
                )
            return "".join(pieces)
    finally:
        # Restore the mode the caller had, so generating mid-training does not
        # silently leave the model in eval mode.
        model.train(was_training)


In [20]:
# Display the topics
print("\nTopics discovered by LDA:")
for topic_id, description in lda_model.print_topics(-1):
    print(f"Topic {topic_id}: {description}")

# Select a topic index (e.g., topic 0)
selected_topic = 0

# Create a one-hot topic vector (row `selected_topic` of the identity matrix)
topic_vec = np.eye(num_topics)[selected_topic]

# Generate a new email
start_text = "Dear "
generated_email = generate_email(
    model,
    idx2char,
    char2idx,
    topic_vec,
    start_text=start_text,
    length=500,
)
print("\nGenerated Email:")
print(generated_email)


INFO:gensim.models.ldamodel:topic #0 (0.200): 0.021*"com" + 0.020*"https" + 0.016*"click" + 0.009*"jamanetwork" + 0.009*"alerts" + 0.008*"ks" + 0.008*"axac" + 0.008*"objz" + 0.007*"heygen" + 0.006*"aws"
INFO:gensim.models.ldamodel:topic #1 (0.200): 0.011*"read" + 0.008*"nvidia" + 0.007*"https" + 0.006*"tldr" + 0.006*"minute" + 0.005*"com" + 0.005*"data" + 0.005*"new" + 0.004*"utm_source" + 0.004*"ai"
INFO:gensim.models.ldamodel:topic #2 (0.200): 0.004*"obj" + 0.004*"endobj" + 0.003*"type" + 0.003*"null" + 0.003*"length" + 0.002*"stream" + 0.002*"copilot" + 0.002*"filter" + 0.002*"flatedecode" + 0.002*"xz"
INFO:gensim.models.ldamodel:topic #3 (0.200): 0.015*"github" + 0.014*"https" + 0.013*"build" + 0.013*"com" + 0.012*"data" + 0.011*"eigen" + 0.009*"root" + 0.009*"failed" + 0.007*"run" + 0.007*"holafly"
INFO:gensim.models.ldamodel:topic #4 (0.200): 0.020*"de" + 0.014*"https" + 0.012*"com" + 0.008*"email" + 0.007*"utm_source" + 0.007*"en" + 0.007*"la" + 0.005*"el" + 0.005*"read" + 0.005


Topics discovered by LDA:
Topic 0: 0.021*"com" + 0.020*"https" + 0.016*"click" + 0.009*"jamanetwork" + 0.009*"alerts" + 0.008*"ks" + 0.008*"axac" + 0.008*"objz" + 0.007*"heygen" + 0.006*"aws"
Topic 1: 0.011*"read" + 0.008*"nvidia" + 0.007*"https" + 0.006*"tldr" + 0.006*"minute" + 0.005*"com" + 0.005*"data" + 0.005*"new" + 0.004*"utm_source" + 0.004*"ai"
Topic 2: 0.004*"obj" + 0.004*"endobj" + 0.003*"type" + 0.003*"null" + 0.003*"length" + 0.002*"stream" + 0.002*"copilot" + 0.002*"filter" + 0.002*"flatedecode" + 0.002*"xz"
Topic 3: 0.015*"github" + 0.014*"https" + 0.013*"build" + 0.013*"com" + 0.012*"data" + 0.011*"eigen" + 0.009*"root" + 0.009*"failed" + 0.007*"run" + 0.007*"holafly"
Topic 4: 0.020*"de" + 0.014*"https" + 0.012*"com" + 0.008*"email" + 0.007*"utm_source" + 0.007*"en" + 0.007*"la" + 0.005*"el" + 0.005*"read" + 0.005*"tldr"


RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## 8. Evaluation and Visualization


In [21]:
# The generated emails can be evaluated further by inspecting them manually.
# The word clouds below show the terms of each topic, sized by their weight,
# to make the topics easier to interpret.

# Visualize topics
import matplotlib.pyplot as plt
from wordcloud import WordCloud

for topic_id in range(num_topics):
    # Top 50 terms of the topic as a {term: weight} mapping
    term_weights = dict(lda_model.show_topic(topic_id, 50))
    cloud = WordCloud(width=800, height=600, background_color="white")
    cloud_image = cloud.generate_from_frequencies(term_weights)

    plt.figure(figsize=(8, 6))
    plt.title(f"Word Cloud for Topic {topic_id}")
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")
    plt.show()


ModuleNotFoundError: No module named 'matplotlib'