# Brazilian Document Classification (Kaggle)

**Plan:** 60% training, 40% held-out testing. Predict document type (8–9 classes).

Logic adapted from [identity-document-image-classification](https://github.com/leomaurodesenv/kaggle) (BID dataset).

1. Add your dataset as a Kaggle Input, with one folder per class (e.g. CNH_Frente, CPF_Frente, RG_Frente).
2. Set `DATA_PATH` below to your dataset path
3. Run all cells (enable GPU in Settings → Accelerator)

In [None]:
# Config - EfficientNet-B7, 60/40 split, Kaggle-ready

# --- Data ---
# Kaggle: Bid_dataset. If zip, may need /kaggle/input/bid-dataset/BID Sample Dataset
DATA_PATH = "/kaggle/input/bid-dataset"
TRAIN_RATIO = 0.6  # fraction of each class used for training; the rest is held out
RANDOM_SEED = 42   # seed passed to the train/test split
IMG_SIZE = (224, 224)  # size every image is resized to before entering the network

# --- Optimisation ---
BATCH_SIZE = 16  # B7 is memory-heavy; reduce to 8 if OOM on Kaggle
EPOCHS = 5
LR = 0.001

# --- Early stopping (on the per-epoch training loss) ---
# NOTE: patience is larger than EPOCHS, so with these defaults early stopping
# cannot trigger; lower the patience or raise EPOCHS to make it effective.
EARLY_STOP_PATIENCE = 20
EARLY_STOP_MIN_DELTA = 1e-3  # minimum loss decrease that counts as an improvement

In [None]:
from pathlib import Path

import torch
import torch.nn as nn
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

In [None]:
# 60/40 split - BID dataset compatible (excludes _ocr.txt, _segmentation.jpg)
def load_and_split(data_path, train_ratio=0.60, seed=42, exclude_patterns=("_ocr", "_segmentation")):
    """Scan a folder-per-class image dataset and split every class into train/test.

    Each immediate subdirectory of ``data_path`` is treated as one class. Class
    indices follow the alphabetical order of the directory names; a directory
    that contains no usable image still receives an index but contributes no
    samples.

    Args:
        data_path: Root folder holding one subdirectory per class.
        train_ratio: Fraction of each class's images assigned to the train set.
        seed: Random state forwarded to ``train_test_split``.
        exclude_patterns: Substrings; a file whose stem contains any of them
            is ignored (e.g. OCR / segmentation companions of an image).

    Returns:
        ``(train_samples, test_samples, class_to_idx)`` where the sample lists
        hold ``(path_string, class_index)`` tuples and ``class_to_idx`` maps
        class directory name to its integer index.
    """
    image_suffixes = {".jpg", ".jpeg", ".png", ".bmp"}
    data_path = Path(data_path)
    classes = sorted(d.name for d in data_path.iterdir() if d.is_dir())
    class_to_idx = {c: i for i, c in enumerate(classes)}
    train_samples, test_samples = [], []
    for cls in classes:
        label = class_to_idx[cls]
        # glob() yields entries in filesystem order, which differs between
        # machines; sorting makes the seeded split reproducible everywhere.
        imgs = sorted(
            p for p in (data_path / cls).glob("*")
            if p.is_file()
            and p.suffix.lower() in image_suffixes
            and not any(ex in p.stem for ex in exclude_patterns)
        )
        if not imgs:
            continue
        if len(imgs) == 1:
            # A single image cannot be split (train_test_split would raise);
            # keep it for training rather than aborting the whole load.
            tr, te = imgs, []
        else:
            tr, te = train_test_split(imgs, train_size=train_ratio, random_state=seed)
        train_samples += [(str(p), label) for p in tr]
        test_samples += [(str(p), label) for p in te]
    return train_samples, test_samples, class_to_idx

In [None]:
class DocumentDataset(Dataset):
    """Dataset over ``(image_path, label)`` pairs that loads images on access.

    Args:
        samples: Sequence of ``(path, label)`` tuples.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, samples, transform=None):
        self.samples = samples
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` as RGB and return ``(image, label)``."""
        image_path, target = self.samples[idx]
        image = Image.open(image_path).convert("RGB")
        # The transform is applied only when one was supplied.
        return (self.transform(image) if self.transform else image), target

In [None]:
# Transforms - ImageNet normalisation for the pretrained backbone; augmentation for robustness
def get_transforms(img_size, is_train=True):
    """Build the torchvision preprocessing pipeline.

    Args:
        img_size: Target size handed to ``transforms.Resize``.
        is_train: When true, random flips, rotation and a small random
            rescale are inserted between the resize and tensor conversion.

    Returns:
        A ``transforms.Compose`` ending in ``ToTensor`` and normalisation.
    """
    steps = [transforms.Resize(img_size)]
    if is_train:
        # Augmentations run on the PIL image, before tensor conversion.
        steps.extend([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            transforms.RandomRotation(15),
            transforms.RandomAffine(0, scale=(0.95, 1.05)),
        ])
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]))
    return transforms.Compose(steps)

In [None]:
# Resolve path (Kaggle zip may extract to subfolder e.g. "BID Sample Dataset")
_data_path = Path(DATA_PATH)
if _data_path.is_dir():
    subdirs = [d for d in _data_path.iterdir() if d.is_dir()]
    class_names = [d.name for d in subdirs]
    # A lone subfolder whose name does not look like a class folder is taken
    # to be the wrapper directory created when the archive was extracted.
    if len(subdirs) == 1 and not any(c.startswith(("CNH", "CPF", "RG")) for c in class_names):
        DATA_PATH = str(subdirs[0])
        print(f"Using subfolder: {DATA_PATH}")
    train_samples, test_samples, class_to_idx = load_and_split(DATA_PATH, TRAIN_RATIO, RANDOM_SEED)
else:
    # Missing path: skip the scan (it would fail inside iterdir()) so that the
    # descriptive error below is what the user actually sees.
    train_samples, test_samples, class_to_idx = [], [], {}

if not train_samples:
    _list = list(Path(DATA_PATH).iterdir()) if Path(DATA_PATH).is_dir() else "path not found"
    raise ValueError(f"No images at {DATA_PATH}. Expected CNH_Frente, CPF_Frente, RG_Frente folders. Contents: {_list}")
num_classes = len(class_to_idx)
print(f"Classes: {list(class_to_idx.keys())}")
print(f"Train: {len(train_samples)} | Test: {len(test_samples)} (40% held-out)")

In [None]:
# Resolve the device first: whether to pin host memory depends on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Training data gets augmentation; the held-out data only gets resize + normalisation.
train_ds = DocumentDataset(train_samples, get_transforms(IMG_SIZE, is_train=True))
test_ds = DocumentDataset(test_samples, get_transforms(IMG_SIZE, is_train=False))

# Both loaders share the same settings so evaluation also loads images in
# worker processes. Pinned memory is requested only when a GPU is in use,
# since it has no effect (and triggers a warning) on CPU-only runs.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=device.type == "cuda")
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_kwargs)

In [None]:
# EfficientNet-B7 with ImageNet-pretrained weights and a new classification head
model = models.efficientnet_b7(weights=models.EfficientNet_B7_Weights.IMAGENET1K_V1)
# Swap the final linear layer so the output width matches the number of classes.
model.classifier[1] = nn.Linear(
    in_features=model.classifier[1].in_features,
    out_features=num_classes,
)
model = model.to(device)

# Cross-entropy loss; Adam updates every parameter of the network.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

In [None]:
# Train on 60% only (with early stopping from identity-document notebook)
# Early stopping monitors the mean training loss per epoch: an epoch counts as
# an improvement only if the loss drops by more than EARLY_STOP_MIN_DELTA.
best_loss = float("inf")
patience_counter = 0
best_state = None  # CPU copy of the weights from the best epoch so far

for epoch in range(EPOCHS):
    model.train()
    total, correct, total_loss = 0, 0, 0.0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (out.argmax(1) == labels).sum().item()
        total += labels.size(0)
    avg_loss = total_loss / len(train_loader)  # mean of the per-batch losses
    acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {avg_loss:.4f} | Train Acc: {acc:.2f}%")

    if avg_loss < best_loss - EARLY_STOP_MIN_DELTA:
        best_loss = avg_loss
        patience_counter = 0
        # Snapshot on the CPU so the copy does not occupy accelerator memory.
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
    else:
        patience_counter += 1
        if patience_counter >= EARLY_STOP_PATIENCE:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Restore the best snapshot on every exit path. Previously this happened only
# when early stopping fired, so a run that used all its epochs kept the
# last-epoch weights and the saved snapshot was never used.
if best_state is not None:
    model.load_state_dict(best_state)

In [None]:
# Evaluate on 40% held-out test set
model.eval()
idx_to_class = {index: name for name, index in class_to_idx.items()}
# Per-class tallies, keyed by class index.
class_correct = dict.fromkeys(range(num_classes), 0)
class_total = dict.fromkeys(range(num_classes), 0)
correct, total = 0, 0

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        out = model(imgs)
        pred = out.argmax(1)
        correct += (pred == labels).sum().item()
        total += labels.size(0)
        # Convert each batch to plain ints once, then update the tallies.
        for true_idx, pred_idx in zip(labels.tolist(), pred.tolist()):
            class_total[true_idx] += 1
            if true_idx == pred_idx:
                class_correct[true_idx] += 1

acc = 100 * correct / total
print("="*50)
print("Test Set Accuracy (40% held-out)")
print("="*50)
print(f"Overall: {acc:.2f}% ({correct}/{total})")
for i in range(num_classes):
    # Classes with no test samples are reported as 0 rather than dividing by zero.
    if class_total[i]:
        c_acc = 100 * class_correct[i] / class_total[i]
    else:
        c_acc = 0
    print(f"  {idx_to_class[i]}: {c_acc:.2f}%")

In [None]:
# Save model: weights plus the metadata needed to rebuild the classifier head
torch.save(
    dict(
        model_state=model.state_dict(),
        class_to_idx=class_to_idx,
        num_classes=num_classes,
        arch="efficientnet_b7",
    ),
    "/kaggle/working/document_classifier.pt",
)
print("Model saved to /kaggle/working/document_classifier.pt")