    DATA_ROOT: str = "C:\\Users\\muhid\\Downloads\\UTN-CV25-Captcha-Dataset"


In [4]:
import json, os, torch, torchvision as tv
from PIL import Image

# Read the training annotations; each entry stores its text under 'captcha_string'.
with open('C:\\Users\\muhid\\Downloads\\UTN-CV25-Captcha-Dataset\\part2\\train\\labels.json') as f:
    ann = json.load(f)

# Alphabet: every distinct character that occurs in the training labels, sorted.
charset = sorted(set().union(*(e['captcha_string'] for e in ann)))

# Index 0 is reserved for the CTC blank, so real characters are numbered from 1.
BLANK = 0
idx2char = dict(enumerate(charset, start=1))
char2idx = {ch: idx for idx, ch in idx2char.items()}

def text_to_targets(s):
    """Encode the string ``s`` as a 1-D ``torch.long`` tensor of class indices.

    Each character is looked up in ``char2idx``; a character missing from that
    table raises ``KeyError``.
    """
    indices = []
    for ch in s:
        indices.append(char2idx[ch])
    return torch.tensor(indices, dtype=torch.long)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import numpy as np
from torchvision.transforms.functional import to_tensor

from PIL import Image
# Pick the bilinear resampling filter in a way that works across Pillow
# versions: prefer the Image.Resampling enum (introduced in Pillow 9.1 per the
# Pillow release notes) and fall back to the module-level constant when the
# enum is not available.
try:
    RESAMP_BILINEAR = Image.Resampling.BILINEAR  # Pillow with the Image.Resampling enum
except AttributeError:
    RESAMP_BILINEAR = Image.BILINEAR             # older Pillow without Image.Resampling

# Target height in pixels; preprocess() rescales every image to this height.
MODEL_H = 32
def preprocess(png_path):
    """Load an image as a grayscale tensor rescaled to height ``MODEL_H``.

    The image is converted to 8-bit grayscale ('L'), resized with bilinear
    resampling so that its height equals ``MODEL_H`` while the aspect ratio is
    kept (width is truncated to an integer), and converted with ``to_tensor``.

    Args:
        png_path: Path of the image file to load.

    Returns:
        Tensor of shape (1, MODEL_H, new_w) with values in [0, 1].
    """
    # The context manager releases the file handle deterministically, even if
    # decoding fails; convert() returns an independent in-memory image.
    with Image.open(png_path) as src:
        img = src.convert('L')
    w, h = img.size
    # Clamp to one pixel: for a very narrow image the truncated width would be
    # 0, which resize() cannot handle.
    new_w = max(1, int(w * (MODEL_H / h)))
    img = img.resize((new_w, MODEL_H), RESAMP_BILINEAR)
    return to_tensor(img)  # (1,H,W) in [0,1]


In [6]:
class Captcha2(torch.utils.data.Dataset):
    """Dataset of captcha images with their text labels.

    ``root`` must contain a ``labels.json`` file and an ``images`` folder.
    The JSON content is kept in ``self.meta`` and indexed by position; each
    entry is read through its 'image_id' and 'captcha_string' keys.
    """

    def __init__(self, root):
        self.root = root
        labels_path = os.path.join(root, 'labels.json')
        with open(labels_path) as fh:
            self.meta = json.load(fh)

    def __len__(self):
        """Number of annotated samples."""
        return len(self.meta)

    def __getitem__(self, i):
        """Return ``(image_tensor, target_indices)`` for sample ``i``."""
        entry = self.meta[i]
        image_file = "{}.png".format(entry['image_id'])
        image = preprocess(os.path.join(self.root, 'images', image_file))
        target = text_to_targets(entry['captcha_string'])
        return image, target

def collate(batch, stride=4):
    """Batch variable-width samples for CTC training.

    Args:
        batch: Sequence of ``(image, target)`` pairs; images are indexed as
            (C, H, W) tensors and targets are 1-D index tensors.
        stride: Horizontal downsampling factor of the model; each sample's
            number of valid time steps is ``width // stride``.

    Returns:
        Tuple ``(images, targets, input_lengths, target_lengths)``:
        images zero-padded on the right to the widest sample and stacked,
        all targets concatenated into one 1-D tensor, and the per-sample
        valid time steps and target lengths.
    """
    images, labels = zip(*batch)
    widths = [img.shape[2] for img in images]   # widths before padding
    widest = max(widths)

    pad = torch.nn.functional.pad
    padded = [
        img if w == widest else pad(img, (0, widest - w, 0, 0))  # pad right only
        for img, w in zip(images, widths)
    ]

    flat_targets = torch.cat(labels)
    input_lengths = torch.tensor([w // stride for w in widths])
    target_lengths = torch.tensor([len(lbl) for lbl in labels])
    return torch.stack(padded), flat_targets, input_lengths, target_lengths


In [7]:
import torch.nn as nn
class CRNN(nn.Module):
    """Convolutional-recurrent network emitting per-time-step log-probabilities.

    Three 3x3 conv+ReLU stages (the first two followed by 2x2 max-pooling)
    extract features, the height axis is averaged away, and a 2-layer
    bidirectional LSTM plus a linear layer score ``num_classes`` classes at
    each horizontal position.
    """

    def __init__(self, num_classes):
        super().__init__()
        # (out_channels, followed_by_pool); pooling halves both H and W.
        stages = ((64, True), (128, True), (256, False))
        layers = []
        in_ch = 1
        for out_ch, pooled in stages:
            layers.append(nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1))
            layers.append(nn.ReLU())
            if pooled:
                layers.append(nn.MaxPool2d((2, 2)))
            in_ch = out_ch
        self.cnn = nn.Sequential(*layers)
        self.proj_h = nn.AdaptiveAvgPool2d((1, None))  # collapse height, keep width
        self.rnn = nn.LSTM(
            input_size=256,
            hidden_size=256,
            num_layers=2,
            bidirectional=True,
            batch_first=False,
        )
        self.fc = nn.Linear(512, num_classes)   # 512 = 2 directions * 256 hidden
        self.log_sm = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Map images (B,1,H,W) to log-probabilities of shape (T=W', B, C)."""
        feats = self.proj_h(self.cnn(x))          # (B,256,1,W')
        seq = feats.squeeze(2).permute(2, 0, 1)   # (T=W', B, 256)
        seq, _ = self.rnn(seq)                    # (T,B,512)
        return self.log_sm(self.fc(seq))          # (T,B,C)


In [8]:
def decode_greedy(logp, blank=None, idx_to_char=None):  # (T,B,C)
    """Greedy CTC decoding of a batch of per-time-step scores.

    For every sample the highest-scoring class is taken at each time step;
    consecutive repeats are collapsed and blank steps are dropped.

    Args:
        logp: Tensor of shape (T, B, C) with per-class scores.
        blank: Index of the CTC blank class. Defaults to the module-level
            ``BLANK``.
        idx_to_char: Mapping from class index to character. Defaults to the
            module-level ``idx2char``.

    Returns:
        List of B decoded strings.
    """
    if blank is None:
        blank = BLANK
    if idx_to_char is None:
        idx_to_char = idx2char

    # Convert the whole argmax path to Python ints in one go instead of
    # indexing the tensor element by element (one host transfer, not T*B).
    paths = logp.argmax(-1).transpose(0, 1).tolist()  # B lists of T ints

    seqs = []
    for path in paths:
        prev = blank
        chars = []
        for p in path:
            # Emit only on a change to a non-blank class.
            if p != blank and p != prev:
                chars.append(idx_to_char[p])
            prev = p
        seqs.append(''.join(chars))
    return seqs

In [None]:
# # --- Imports & progress bar ---
from tqdm.auto import tqdm  # auto-chooses notebook/terminal-friendly tqdm
import torch
from torch.utils.data import Subset, DataLoader
import numpy as np

# --- Dataset locations ---
train_root = r'C:\\Users\\muhid\\Downloads\\UTN-CV25-Captcha-Dataset\\part2\\train'
val_root   = r'C:\\Users\\muhid\\Downloads\\UTN-CV25-Captcha-Dataset\\part2\\val'

train_ds_full, val_ds_full = Captcha2(train_root), Captcha2(val_root)

# --- Reproducible random subsample of both splits ---
FRACTION = 0.5   # <- set 0.25 for quarter dataset, etc.
rng = np.random.default_rng(seed=42)
n_train = max(1, int(len(train_ds_full) * FRACTION))   # keep at least one sample
n_val = max(1, int(len(val_ds_full) * FRACTION))
train_idx = rng.choice(len(train_ds_full), size=n_train, replace=False)
val_idx = rng.choice(len(val_ds_full), size=n_val, replace=False)

train_ds, val_ds = Subset(train_ds_full, train_idx), Subset(val_ds_full, val_idx)

BATCH_SIZE = 16  # smaller batch to reduce memory and speed up iteration starts

# --- Device & model setup ---
num_epochs = 10
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One output class per character plus the CTC blank.
model = CRNN(num_classes=len(charset) + 1).to(device)
crit = nn.CTCLoss(blank=BLANK, zero_infinity=True)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Data loaders: identical settings, only the training loader shuffles ---
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    collate_fn=collate,
    num_workers=0,
    persistent_workers=False,
    pin_memory=(device == 'cuda'),
)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

def evaluate_seq_acc(model, loader):
    """Return the fraction of samples whose decoded string equals its label.

    Puts ``model`` in eval mode, runs it over ``loader`` without gradient
    tracking, decodes with ``decode_greedy`` and compares each prediction with
    the ground-truth string rebuilt from the flat target tensor. Returns 0.0
    when the loader yields nothing.
    """
    model.eval()
    total = 0
    hits = 0
    with torch.no_grad():
        for X, targets, in_lens, tar_lens in loader:
            logp = model(X.to(device, non_blocking=True))   # (T,B,C)
            preds = decode_greedy(logp)                      # list[str] length B

            # Split the concatenated targets back into one string per sample.
            flat = targets.tolist()
            gts = []
            start = 0
            for length in tar_lens.tolist():
                stop = start + length
                gts.append(''.join(idx2char[k] for k in flat[start:stop]))
                start = stop

            pairs = list(zip(preds, gts))
            total += len(pairs)
            hits += sum(1 for p, g in pairs if p == g)
    return hits / max(total, 1)

# --- Training loop: one optimisation pass per epoch, then validation ---
for epoch in range(num_epochs):
    model.train()
    total_loss, steps = 0.0, 0

    # Pass the length explicitly so the bar always has a known total.
    pbar = tqdm(train_loader, total=len(train_loader),
                desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)

    for batch in pbar:
        # Move images, flat targets and both length tensors to the device.
        X, targets, in_lens, tar_lens = (
            t.to(device, non_blocking=True) for t in batch
        )

        opt.zero_grad(set_to_none=True)
        loss = crit(model(X), targets, in_lens, tar_lens)   # model output: (T,B,C)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        opt.step()

        total_loss += loss.item()
        steps += 1
        pbar.set_postfix(loss=f"{loss.item():.4f}")

    # Sequence-level accuracy on the validation loader after each epoch.
    val_acc = evaluate_seq_acc(model, val_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - "
          f"train_loss: {total_loss/max(steps,1):.4f}  val_seq_acc: {val_acc:.4f}")

# --- Persist the trained weights ---
torch.save(model.state_dict(), "crnn_captcha.pth")
print("Model saved to crnn_captcha.pth")


                                                                            

Epoch 1/10 - train_loss: 3.9301  val_seq_acc: 0.0008


                                                                            

Epoch 2/10 - train_loss: 3.5945  val_seq_acc: 0.0015


                                                                            

Epoch 3/10 - train_loss: 3.0796  val_seq_acc: 0.0040


                                                                            

Epoch 4/10 - train_loss: 2.3444  val_seq_acc: 0.0421


                                                                            

Epoch 5/10 - train_loss: 1.5440  val_seq_acc: 0.1043


                                                                            

Epoch 6/10 - train_loss: 1.0116  val_seq_acc: 0.1988


                                                                            

Epoch 7/10 - train_loss: 0.7037  val_seq_acc: 0.2644


                                                                            

Epoch 8/10 - train_loss: 0.5339  val_seq_acc: 0.2976


                                                                            

Epoch 9/10 - train_loss: 0.4257  val_seq_acc: 0.3229


                                                                             

Epoch 10/10 - train_loss: 0.3494  val_seq_acc: 0.3571
Model saved to crnn_captcha.pth


In [10]:
# Recreate the model with the same architecture & charset
# model = CRNN(num_classes=1+len(charset)).to(device)
# model.load_state_dict(torch.load("crnn_captcha.pth", map_location=device))
# model.eval()  # set to evaluation mode

In [11]:
import os, json, glob
from PIL import Image

def predict_image_string(img_path):
    """Return the string the global ``model`` predicts for one image file.

    The image goes through ``preprocess``, is run as a batch of one on
    ``device`` without gradient tracking, and is decoded with
    ``decode_greedy``.
    """
    batch = preprocess(img_path)[None].to(device)   # add batch axis: (1,1,H,W)
    with torch.no_grad():
        log_probs = model(batch)                    # (T,1,C)
    return decode_greedy(log_probs)[0]

def make_part2_predictions_json(test_root, out_json_path):
    """
    Predict every PNG under ``<test_root>/images`` and write the results as JSON.

    test_root: folder containing an 'images' subfolder, e.g. '/path/to/part2/test'.
    out_json_path: file to write; it receives a list with one record per image
        (in sorted path order) holding the original height and width, the image
        id (file name without extension), the predicted string and an empty
        'annotations' list.
    """
    pattern = os.path.join(test_root, 'images', '*.png')

    results = []
    for path in sorted(glob.glob(pattern)):
        image_id, _ext = os.path.splitext(os.path.basename(path))

        # Size is taken from the file as stored, before any resizing.
        with Image.open(path) as im:
            width, height = im.size

        record = {
            "height": int(height),
            "width": int(width),
            "image_id": image_id,
            "captcha_string": predict_image_string(path),
            "annotations": [],         # CRNN text-only; leave empty
        }
        results.append(record)

    with open(out_json_path, 'w') as f:
        json.dump(results, f, indent=2)


In [12]:
# Example usage: predict every image of the part-2 test split and write the
# predictions JSON. Both paths are machine-specific; adjust them before running.
make_part2_predictions_json('C:\\Users\\muhid\\Downloads\\UTN-CV25-Captcha-Dataset\\part2\\test', 'C:\\Users\\muhid\\OneDrive\\Desktop\\Work\\Temp git work\\Captcha-Cracker\\predictions_part2.json')