<center><h1> CAPTCHA Recognition Based on Convolutional Neural Network </h1></center>

**Author**: Akira37

**Correspondence**: Email: hyperplasma@qq.com

**Abstract**: When a crawler encounters an image CAPTCHA system, a CAPTCHA recognition program is needed. This project uses PyTorch to build a deep learning model based on a Convolutional Neural Network (CNN) that recognizes complex CAPTCHA images composed of digits and letters. We use the captcha library's built-in generator to produce tens of thousands of images and divide them into training, validation, and test sets. A CNN model is then built with the PyTorch framework and trained for a fixed number of epochs, and the resulting model achieves a relatively accurate recognition rate.

**Keywords**: CAPTCHA Recognition; Convolutional Neural Network; Deep Learning; PyTorch



# 1. Preprocessing

## Import Packages

In [None]:
!pip install captcha

Collecting captcha
  Downloading captcha-0.5.0-py3-none-any.whl (102 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/102.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: captcha
Successfully installed captcha-0.5.0


In [None]:
import concurrent.futures
import os
import random
import shutil
from pathlib import Path

import PIL
import captcha
import matplotlib
import matplotlib.pyplot as plt
import torch
import torchvision
from PIL import Image
from captcha.image import ImageCaptcha
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

%matplotlib inline


print("---Versions of Required Packages---")
# Report the installed version of every third-party package this notebook uses,
# so that results can be reproduced with the same environment.
for package_label, package_module in (
    ("torch", torch),
    ("torchvision", torchvision),
    ("pillow", PIL),
    ("captcha", captcha),
    ("matplotlib", matplotlib),
):
    print(f"{package_label}:", package_module.__version__)

---Versions of Required Packages---
torch: 2.2.1+cu121
torchvision: 0.17.1+cu121
pillow: 9.4.0
captcha: 0.5.0
matplotlib: 3.7.1


## Define Hyper-parameters

In [None]:
# --- CAPTCHA image properties ---
CHAR_NUMBER = 4                                 # Number of characters in each CAPTCHA image
IMG_WIDTH = 160                                 # Width passed to the CAPTCHA generator
IMG_HEIGHT = 60                                 # Height passed to the CAPTCHA generator
# Character pool (58 symbols): the ten digits plus lower- and upper-case letters,
# leaving out 'l', 'o', 'I' and 'O' (presumably because they are easily confused
# with '1' and '0').
SEED = "0123456789abcdefghijkmnpqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ"   # Character pool

# --- Number of images generated for each data set ---
TRAIN_SIZE = 30000      # Size of train set
VALIDATION_SIZE = 10000   # Size of validation set
TEST_SIZE = 10000       # Size of test set

# --- Training configuration ---
BATCH_SIZE = 60             # Number of images in a mini-batch (passed to the DataLoader)
TOTAL_EPOCH = 30             # Number of passes over the train set
LEARNING_RATE = 1e-3        # Learning rate handed to the Adam optimizer

In [None]:
# Device name used for the model and for every batch moved with `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"     # Run this model on GPU if possible

# Bare expression: as the last statement of a notebook cell its value is displayed,
# showing whether a CUDA device was detected.
torch.cuda.is_available()

True

In [None]:
# Locations of the generated data sets and of the saved model weights.
# Path format: "./{folder}" (run locally) or "/content/{folder}" (run on Google Colab)

# Local alternative (uncomment these and comment out the Colab block below):
# train_set_path = "./data/train"
# validation_set_path = './data/validation'
# test_set_path = "./data/test"
# save_file_path = "./result/model.pth"

# Google Colab paths (active):
train_set_path = "/content/data/train"
validation_set_path = '/content/data/validation'
test_set_path = "/content/data/test"
save_file_path = "/content/result/model.pth"    # File the trained state_dict is saved to and later loaded from

In [None]:
# Visualization: per-epoch history, appended to by the training loop and plotted afterwards.
epoch_list = []         # 1-based epoch numbers
train_loss_list = []    # Summed batch losses on the train set, one entry per epoch
valid_loss_list = []    # Summed batch losses on the validation set, one entry per epoch

# 2. Generate Data

In [None]:
def captcha_generator(num, output_dir, thread_name=0):
    """Generate `num` random CAPTCHA images into a freshly created directory.

    An existing `output_dir` is deleted first, so afterwards the directory
    holds only the images of this run. Each file is named
    "{index}-{chars}.png", where `index` counts from 1 and `chars` is the text
    rendered in the image; the label is later recovered from the file name.

    Args:
        num: Number of images to generate.
        output_dir: Directory to (re)create and fill with images.
        thread_name: Identifier that only appears in the completion message.
    """
    output_path = Path(output_dir)
    if output_path.exists():
        shutil.rmtree(output_path)   # Start from an empty directory.
    # parents=True: the enclosing folder (e.g. "data") may not exist on a first run.
    # exist_ok=True: harmless if the directory appears between rmtree and mkdir.
    output_path.mkdir(parents=True, exist_ok=True)

    # One generator instance is enough; it is reused for every image.
    image_captcha = ImageCaptcha(width=IMG_WIDTH, height=IMG_HEIGHT)

    for i in range(num):
        # Draw CHAR_NUMBER characters independently from the character pool.
        chars = "".join(random.choice(SEED) for _ in range(CHAR_NUMBER))
        save_path = f"{output_dir}/{i + 1}-{chars}.png"     # The default output format is png.
        image_captcha.write(chars, save_path)

    # Singular wording only when exactly one image was produced.
    plural = num != 1
    print(f"Thread {thread_name}: Congrats! All {num} CAPTCHA code{'s' if plural else ''} ha{'ve' if plural else 's'} been generated at {output_dir} ")

In [None]:
# Generate the three data sets concurrently, one worker thread per data set.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    generation_futures = [
        executor.submit(captcha_generator, TRAIN_SIZE, train_set_path, 0),
        executor.submit(captcha_generator, TEST_SIZE, test_set_path, 1),
        executor.submit(captcha_generator, VALIDATION_SIZE, validation_set_path, 2),
    ]
    # An exception raised inside a submitted task is stored on its Future and
    # would otherwise go unnoticed; result() re-raises it here so that a failed
    # generation stops the notebook instead of leaving a data set missing.
    for generation_future in concurrent.futures.as_completed(generation_futures):
        generation_future.result()

Thread 2: Congrats! All 10000 CAPTCHA codes have been generated at /content/data/validation 
Thread 1: Congrats! All 10000 CAPTCHA codes have been generated at /content/data/test 


In [None]:
# Preview a single generated training image: open whichever file the directory
# listing returns first, show it, and print its PIL description.
train_filenames = os.listdir(train_set_path)
if train_filenames:
    filename = train_filenames[0]
    image = Image.open(train_set_path + "/" + filename)
    image.show()
    print(image)

# 3. Load Data

## One-hot Code

In [None]:
def one_hot_encode(chars):
    """Encode a CAPTCHA string as a flat one-hot vector.

    A (CHAR_NUMBER x len(SEED)) float32 grid of zeros is built, where row i
    stands for the i-th character and column j for the j-th symbol of the
    character pool. The cell matching each character is set to 1 and the grid
    is returned flattened into a single row vector.
    """
    encoded = torch.zeros((CHAR_NUMBER, len(SEED)), dtype=torch.float32)

    for row, symbol in enumerate(chars):
        # Column = position of the symbol inside the character pool.
        encoded[row, SEED.index(symbol)] = 1.0

    return encoded.flatten()

In [None]:
def one_hot_decode(code):
    """Translate a flat one-hot (or score) vector back into its characters.

    The vector is viewed as rows of len(SEED) entries, one row per character.
    Within each row the position of the largest entry selects the symbol of
    the character pool, so raw model scores can be decoded the same way as
    exact one-hot codes.
    """
    per_char_scores = code.view(-1, len(SEED))
    best_columns = per_char_scores.argmax(dim=1).tolist()
    return "".join(SEED[column] for column in best_columns)

In [None]:
# Sanity check: encode a sample string, inspect the vector and its shape,
# then decode it again, which should print the original string "TEST".
temp_code = one_hot_encode("TEST")
print(temp_code)
print(temp_code.shape)  # A flat vector with CHAR_NUMBER * len(SEED) entries
print(one_hot_decode(temp_code))

## Define Data Loader

In [None]:
class ImageDataset(Dataset):
    """Dataset of CAPTCHA images whose labels are stored in the file names.

    Every image file is expected to be named "{index}-{chars}.png". An item is
    the pair (image, label): the image as a single-channel float tensor and
    the label as the flat one-hot encoding of `chars`.

    Args:
        dir_path: Directory that contains the PNG images.
    """

    def __init__(self, dir_path):
        super().__init__()
        # Keep PNG files only: any other directory entry (hidden files,
        # checkpoint folders, ...) would break opening or label parsing.
        # Sorting makes the item order independent of the file system.
        self.img_path_list = sorted(
            f"{dir_path}/{filename}"
            for filename in os.listdir(dir_path)
            if filename.lower().endswith(".png")
        )
        self.transform = transforms.Compose([
            transforms.ToTensor(),      # PIL image -> float tensor
            transforms.Grayscale(),     # Collapse the colour channels into one
        ])

    def __getitem__(self, index):
        img_path = self.img_path_list[index]
        # The context manager closes the file handle as soon as the pixels
        # have been converted, instead of leaving it to the garbage collector.
        with Image.open(img_path) as img:
            image = self.transform(img)
        # Parse the label from the file name only (not the whole path):
        # drop the extension, then take the part after the last "-".
        label = Path(img_path).stem.split("-")[-1]
        return image, one_hot_encode(label)

    def __len__(self):
        return len(self.img_path_list)

In [None]:
def get_dataloader(path, batch_size=None, shuffle=True):
    """Build a DataLoader over the CAPTCHA images stored in `path`.

    Args:
        path: Directory containing the images of one data set.
        batch_size: Number of images per mini-batch; defaults to BATCH_SIZE.
        shuffle: Whether the sample order is reshuffled on every pass.
            Useful for training; evaluation loaders may pass False.

    Returns:
        A DataLoader yielding (images, one-hot labels) batches.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    return DataLoader(ImageDataset(path), batch_size=batch_size, shuffle=shuffle)

In [None]:
# View the shape of tensors.
# Only the first mini-batch is fetched; the loop exits right after printing.
train_dataloader = get_dataloader(train_set_path)
test_dataloader = get_dataloader(test_set_path)
for inputs, targets in train_dataloader:
    # print(inputs)
    print(inputs.shape)     # Expected: (BATCH_SIZE, 1, IMG_HEIGHT, IMG_WIDTH) for images of the generated size
    # print(targets)
    print(targets.shape)    # (BATCH_SIZE, CHAR_NUMBER * len(SEED)): one flat one-hot label per image
    break

# 4. Design Model

In [None]:
class NeuralNetWork(nn.Module):
    """VGG-style convolutional network for fixed-length CAPTCHA recognition.

    Input is a batch of single-channel images of size IMG_HEIGHT x IMG_WIDTH;
    output is one raw score per (character position, pool symbol) pair, i.e.
    CHAR_NUMBER * len(SEED) values per image.

    Layout:
        1. Conv_1x64 -> ReLU -> MaxPool_2x2
        2. Conv_64x128 -> ReLU -> MaxPool_2x2
        3. Conv_128x256 -> ReLU -> MaxPool_2x2
        4. Conv_256x512 -> ReLU -> MaxPool_2x2
        5. Flatten -> FC -(drop out)-> ReLU -> FC
    """

    def __init__(self):
        super().__init__()
        self.layer1 = self._conv_block(1, 64)
        self.layer2 = self._conv_block(64, 128)
        self.layer3 = self._conv_block(128, 256)
        self.layer4 = self._conv_block(256, 512)

        # The 3x3 convolutions with padding 1 keep the spatial size; each of
        # the four 2x2 poolings halves both sides (rounding down), which is
        # the same as an integer division by 16. For 60x160 images this gives
        # 512 * 3 * 10 = 15360 features.
        flat_features = 512 * (IMG_HEIGHT // 16) * (IMG_WIDTH // 16)
        self.layer5 = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=flat_features, out_features=4096),
            nn.Dropout(0.5),
            nn.ReLU(),
            # The number of predictions must be the CAPTCHA character number
            # times the length of the character pool.
            nn.Linear(in_features=4096, out_features=CHAR_NUMBER * len(SEED)),
        )

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Return one Conv(3x3, padding 1) -> ReLU -> MaxPool(2x2) stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def forward(self, x):
        """Pass `x` through the four convolutional stages and the classifier."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        return x

# 5. Train Model

## Training Cycles

In [None]:
def train(dataloader, model, loss_func, optimizer):
    """Run one optimisation pass over `dataloader` and return the summed loss.

    The model is switched to training mode. For every mini-batch the loss is
    computed, backpropagated, and the optimizer takes one step. The returned
    value is the sum (not the mean) of the per-batch loss values.
    """
    model.train()

    epoch_loss = 0.0
    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()                   # Drop gradients left over from the previous step
        batch_loss = loss_func(model(batch_inputs), batch_targets)  # Forward propagation
        batch_loss.backward()                   # Backpropagation
        optimizer.step()                        # Optimization

        epoch_loss += batch_loss.item()

    return epoch_loss

In [None]:
def validate(dataloader, model, loss_func):
    """Evaluate the model on `dataloader` and return the summed loss.

    The model is switched to evaluation mode and gradient tracking is turned
    off, so no parameters change. The returned value is the sum (not the
    mean) of the per-batch loss values.
    """
    model.eval()

    total_loss = 0.0
    with torch.no_grad():   # Evaluation only: no gradients are needed
        for batch_inputs, batch_targets in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            predictions = model(batch_inputs)
            total_loss += loss_func(predictions, batch_targets).item()

    return total_loss

In [None]:
# Objects shared by the training and validation loops.
model = NeuralNetWork().to(device)          # Freshly initialised network, moved to the selected device
loss_func = nn.MultiLabelSoftMarginLoss()   # Multi-label loss over the CHAR_NUMBER * len(SEED) output scores
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)  # Adam over all model parameters

Loss Function: MultiLabelSoftMarginLoss (Multi-Label Cross-Entropy)
$$
loss(x,y)=-\frac{1}{C}\sum\limits_{i}\left[y^{(i)}\log\frac{1}{1+\exp(-x^{(i)})}+(1-y^{(i)})\log\frac{\exp(-x^{(i)})}{1+\exp(-x^{(i)})}\right]
$$
where $x$ is the input tensor of shape $(N,C)$ ($N$ is the batch size and $C$ is the number of classes), and $y$ is the ground-truth label tensor of the same shape.

In [None]:
train_dataloader = get_dataloader(train_set_path)
validation_dataloader = get_dataloader(validation_set_path)

# torch.save does not create missing folders. Make sure the target directory
# exists before training starts, so that a long run cannot end in a failed save.
Path(save_file_path).parent.mkdir(parents=True, exist_ok=True)

for epoch in range(TOTAL_EPOCH):
    epoch_list.append(epoch + 1)    # Epochs are recorded 1-based for plotting

    # One optimisation pass over the train set; keep its summed loss.
    train_loss = train(train_dataloader, model, loss_func, optimizer)
    train_loss_list.append(train_loss)

    # Measure the summed loss on the validation set with the updated weights.
    valid_loss = validate(validation_dataloader, model, loss_func)
    valid_loss_list.append(valid_loss)
    print()

# Only the parameters (state_dict) are stored, not the model object itself.
torch.save(model.state_dict(), save_file_path)
print(f"The {TOTAL_EPOCH}-epoch training is complete and the model is saved at \"{save_file_path}\"")

## Visualization

In [None]:
# Training loss per epoch, drawn through the object-oriented matplotlib API.
train_fig, train_ax = plt.subplots()
train_ax.plot(epoch_list, train_loss_list)
train_ax.set_xlabel('Epoch')
train_ax.set_ylabel('Loss on Training Set')
train_ax.grid()
plt.show()

In [None]:
# Validation loss per epoch, drawn through the object-oriented matplotlib API.
valid_fig, valid_ax = plt.subplots()
valid_ax.plot(epoch_list, valid_loss_list)
valid_ax.set_xlabel('Epoch')
valid_ax.set_ylabel('Loss on Validation Set')
valid_ax.grid()
plt.show()

# 6. Test Model

In [None]:
def predict(model, file_path):
    """Return the characters the model reads from the image at `file_path`.

    The image is converted the same way as in the dataset (tensor conversion
    followed by grayscale), given a batch dimension and passed through the
    model; the scores are decoded into a string. The model's train/eval mode
    is not changed here, so callers should put it in evaluation mode first.

    Args:
        model: Network producing CHAR_NUMBER * len(SEED) scores per image.
        file_path: Path of the CAPTCHA image file.

    Returns:
        The predicted CAPTCHA string.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Grayscale(),
    ])

    # Close the file handle as soon as the pixels have been converted.
    with Image.open(file_path) as img:
        image = transform(img)

    # unsqueeze(0) adds the batch dimension without hard-coding the image size.
    inputs = image.unsqueeze(0).to(device)     # All tensors(operators) should be on the same device.

    with torch.no_grad():   # Inference only: no gradients are needed
        outputs = model(inputs)

    return one_hot_decode(outputs)


def recognize(model, file_path):
    """Return 1 if the model reads the CAPTCHA at `file_path` correctly, else 0.

    The expected text is taken from the file name, which follows the pattern
    {index}-{characters}.{file format}; the model is put into evaluation mode
    before predicting.
    """
    model.eval()
    expected_text = file_path.split("-")[-1].replace(".png", "")
    predicted_text = predict(model, file_path)
    return int(predicted_text == expected_text)

In [None]:
def model_test(model):
    """Print the share of test-set CAPTCHAs the model recognises correctly.

    Every PNG image in `test_set_path` is run through `recognize`; an image
    counts as correct only if all of its characters are predicted correctly.

    Args:
        model: Trained network to evaluate.
    """
    # List the directory once and ignore entries that are not PNG images, so
    # stray files neither crash the run nor distort the sample count.
    filenames = [
        filename
        for filename in os.listdir(test_set_path)
        if filename.lower().endswith(".png")
    ]
    if not filenames:
        # Avoid a division by zero when the test set is empty or missing images.
        print(f"\nNo test images found in \"{test_set_path}\"; the accuracy cannot be computed.")
        return

    correct = sum(recognize(model, f"{test_set_path}/{filename}") for filename in filenames)
    accuracy = f"{correct / len(filenames) * 100:.7f}%"
    print("\nThe accuracy of the model is", accuracy)

In [None]:
# Rebuild the architecture and load the trained parameters from disk.
# map_location remaps the stored tensors onto the device in use, so weights
# saved on a GPU can also be loaded when only a CPU is available.
model = NeuralNetWork().to(device)
model.load_state_dict(torch.load(save_file_path, map_location=torch.device(device)))

# Evaluate the reloaded model on the test set and print its accuracy.
model_test(model)