In [13]:
# %pip install torch torchvision diffusers transformers accelerate

In [14]:
import os

# Folder containing captcha images
folder_path = "samples"

# Lists to store image paths and labels
image_paths = []
labels = []

# Iterate over files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".png"):
        label = filename.split(".")[0]  # Extract label from filename
        image_paths.append(os.path.join(folder_path, filename))
        labels.append(label)

print("Image paths:", image_paths)
print("Labels:", labels)

Image paths: ['samples\\226md.png', 'samples\\22d5n.png', 'samples\\2356g.png', 'samples\\23mdg.png', 'samples\\23n88.png', 'samples\\243mm.png', 'samples\\244e2.png', 'samples\\245y5.png', 'samples\\24f6w.png', 'samples\\24pew.png', 'samples\\25257.png', 'samples\\253dc.png', 'samples\\25egp.png', 'samples\\25m6p.png', 'samples\\25p2m.png', 'samples\\25w53.png', 'samples\\264m5.png', 'samples\\268g2.png', 'samples\\28348.png', 'samples\\28x47.png', 'samples\\2b827.png', 'samples\\2bg48.png', 'samples\\2cegf.png', 'samples\\2cg58.png', 'samples\\2cgyx.png', 'samples\\2en7g.png', 'samples\\2enf4.png', 'samples\\2fxgd.png', 'samples\\2g783.png', 'samples\\2g7nm.png', 'samples\\2gyb6.png', 'samples\\2mg87.png', 'samples\\2mpnn.png', 'samples\\2n73f.png', 'samples\\2nbcx.png', 'samples\\2nf26.png', 'samples\\2npg6.png', 'samples\\2nx38.png', 'samples\\2p2y8.png', 'samples\\2pfpn.png', 'samples\\2w4y7.png', 'samples\\2wc38.png', 'samples\\2wx73.png', 'samples\\2x7bm.png', 'samples\\2xc2n.pn

In [15]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

class CaptchaDataset(Dataset):
    def __init__(self, image_paths, labels, image_size=64):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            # Normalize to [-1, 1] if needed (depending on your model)
            transforms.Normalize([0.5], [0.5])
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        image = self.transform(image)
        label = self.labels[idx]  # assume label is a string, e.g., "A3B9Z"
        return image, label


In [16]:
import torch.nn as nn

class LabelEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_length=5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.max_length = max_length
        # Optionally, add an LSTM or transformer encoder for sequence encoding.
        self.encoder = nn.LSTM(embedding_dim, embedding_dim, batch_first=True)

    def forward(self, labels):
        # labels: (batch_size, max_length) of token indices
        x = self.embedding(labels)  # (batch_size, max_length, embedding_dim)
        # Encode sequence information. We take the last hidden state.
        _, (hidden, _) = self.encoder(x)
        return hidden.squeeze(0)  # (batch_size, embedding_dim)


In [17]:
import torch.nn.functional as F

class ConditionalUNet(nn.Module):
    def __init__(self, in_channels=3, base_channels=64, cond_dim=128):
        super().__init__()
        # Downsample layers
        self.down1 = nn.Conv2d(in_channels, base_channels, kernel_size=3, padding=1)
        self.down2 = nn.Conv2d(base_channels, base_channels*2, kernel_size=3, padding=1)
        # Up-sample layers
        self.up1 = nn.ConvTranspose2d(base_channels*2, base_channels, kernel_size=3, padding=1)
        self.up2 = nn.ConvTranspose2d(base_channels, in_channels, kernel_size=3, padding=1)
        # Conditioning projection
        self.cond_proj = nn.Linear(cond_dim, base_channels*2)

    def forward(self, x, t, cond_embedding):
        """
        x: noisy image (B, C, H, W)
        t: timestep (for positional embedding if needed)
        cond_embedding: conditioning vector (B, cond_dim)
        """
        # Downsampling
        h1 = F.relu(self.down1(x))
        h2 = F.relu(self.down2(h1))
        
        # Inject conditioning: for example, add a broadcasted conditioning term.
        cond = self.cond_proj(cond_embedding).unsqueeze(-1).unsqueeze(-1)
        h2 = h2 + cond
        
        # Upsampling
        h = F.relu(self.up1(h2))
        # Optionally concatenate with h1 here for skip connections
        h = h + h1
        out = self.up2(h)
        return out


In [18]:
import numpy as np

# Define a simple noise schedule
T = 1000  # total timesteps
betas = torch.linspace(1e-4, 0.02, T)  # or use a cosine schedule
alphas = 1.0 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)

def q_sample(x0, t, noise=None):
    """
    Forward diffusion: sample x_t from x0 using noise.
    x0: original image (B, C, H, W)
    t: timestep (B,)
    noise: optional noise sample
    """
    if noise is None:
        noise = torch.randn_like(x0)
    # Get appropriate alpha values
    alphas_t = alphas_cumprod[t].view(-1, 1, 1, 1).to(x0.device)
    return torch.sqrt(alphas_t) * x0 + torch.sqrt(1 - alphas_t) * noise

# Suppose we have:
cond_dim = 128  # dimension of text conditioning
unet = ConditionalUNet(in_channels=3, base_channels=64, cond_dim=cond_dim)
label_embedder = LabelEmbedding(vocab_size=36, embedding_dim=cond_dim, max_length=5)  
# For 26 letters + 10 digits = 36 tokens.

optimizer = torch.optim.Adam(list(unet.parameters()) + list(label_embedder.parameters()), lr=1e-4)

# Training loop skeleton
num_epochs = 100
dataloader = DataLoader(CaptchaDataset(image_paths, labels), batch_size=16, shuffle=True)


def labels_to_indices(labels):
    """
    Convert a list of string labels to a tensor of indices.
    Assumes labels are strings of length 5 with characters in [0-9A-Z].
    """
    char_to_idx = {char: idx for idx, char in enumerate("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")}
    indices = [[char_to_idx[char] for char in label] for label in labels]
    return torch.tensor(indices, dtype=torch.long)


for epoch in range(num_epochs):
    for images, text_labels in dataloader:
        # images: (B, 3, H, W)
        # Convert text_labels to indices, e.g., using your mapping.
        # Assume `labels_to_indices` converts list of 5-char strings to a tensor of shape (B, 5)
        label_indices = labels_to_indices(text_labels)  # Implement this function.
        cond_embedding = label_embedder(label_indices)  # (B, cond_dim)
        
        B = images.size(0)
        t = torch.randint(0, T, (B,), device=images.device).long()
        
        noise = torch.randn_like(images)
        x_t = q_sample(images, t, noise=noise)
        
        # The network tries to predict the noise added (this is one common formulation).
        predicted_noise = unet(x_t, t, cond_embedding)
        
        loss = F.mse_loss(predicted_noise, noise)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch}: Loss = {loss.item()}")


KeyError: 'n'

In [None]:
@torch.no_grad()
def p_sample(model, x, t, cond_embedding):
    # In practice, you would use your learned model to predict the noise and then perform one reverse step.
    # Here, we use a simplified approach.
    predicted_noise = model(x, t, cond_embedding)
    beta_t = betas[t].view(-1, 1, 1, 1).to(x.device)
    alpha_t = alphas[t].view(-1, 1, 1, 1).to(x.device)
    alpha_cumprod_t = alphas_cumprod[t].view(-1, 1, 1, 1).to(x.device)
    
    # Compute the mean of the posterior (simplified)
    mean = (1 / torch.sqrt(alpha_t)) * (x - beta_t / torch.sqrt(1 - alpha_cumprod_t) * predicted_noise)
    # Sample noise if t > 0
    if t > 0:
        noise = torch.randn_like(x)
    else:
        noise = 0
    return mean + torch.sqrt(beta_t) * noise

@torch.no_grad()
def sample_captcha(model, label, label_embedder, shape=(1, 3, 64, 64)):
    # label: string (5 characters)
    # Convert label to indices and get conditioning embedding.
    label_tensor = labels_to_indices([label])  # shape (1, 5)
    cond_embedding = label_embedder(label_tensor)
    
    x = torch.randn(shape).to(next(model.parameters()).device)  # start from pure noise
    
    for i in reversed(range(T)):
        t = torch.full((shape[0],), i, device=x.device, dtype=torch.long)
        x = p_sample(model, x, t, cond_embedding)
    return x

# Example usage after training:
generated = sample_captcha(unet, "A3B9Z", label_embedder)
# Post-process generated image: denormalize if needed, convert to PIL, etc.