<center><h1> CAPTCHA Identifier </h1></center>

Author: Akira37

Abstract: When a crawler encounters a CAPTCHA system, a CAPTCHA recognition program is needed. This project uses PyTorch to build a Convolutional Neural Network (CNN) that recognizes complex image CAPTCHAs composed of digits and letters. We use the captcha library's built-in generator to produce tens of thousands of images and divide them into training, validation, and test sets. A CNN model is then built with the PyTorch framework and trained for a fixed number of epochs, and the resulting model achieves a relatively accurate recognition rate.

Keywords: Image CAPTCHA; Convolutional Neural Network; Deep Learning; PyTorch

This is an image CAPTCHA identifier based on a Convolutional Neural Network model. My FIRST-EVER PyTorch project!

CAPTCHA: Completely Automated Public Turing test to tell Computers and Humans Apart (or simply Verification Code)

# Preprocessing

## Import Packages

In [None]:
!pip install captcha

In [None]:
import concurrent.futures
import os
import random
import shutil
from pathlib import Path

import PIL
import captcha
import matplotlib
import matplotlib.pyplot as plt
import torch
import torchvision
from PIL import Image
from captcha.image import ImageCaptcha
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

%matplotlib inline


# Report the version of every third-party package this notebook relies on,
# so that results can be reproduced with the same environment.
print("---Versions of Required Packages---")
for label, package in (
    ("torch:", torch),
    ("torchvision:", torchvision),
    ("pillow:", PIL),
    ("captcha:", captcha),
    ("matplotlib:", matplotlib),
):
    print(label, package.__version__)

## Hyperparameters

In [None]:
# ----- CAPTCHA image settings -----
# Number of characters in each image CAPTCHA.
CHAR_NUMBER = 4
# Image width and height handed to the CAPTCHA generator.
IMG_WIDTH = 160
IMG_HEIGHT = 60
# Character pool the CAPTCHA text is drawn from (digits, then uppercase letters).
SEED = "0123456789" "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# ----- Dataset sizes (number of images generated per split) -----
TRAIN_SIZE = 300
VALIDATION_SIZE = 100
TEST_SIZE = 100

# ----- Training settings -----
# Number of images in a mini-batch.
BATCH_SIZE = 60
# Number of passes over the training set.
TOTAL_EPOCH = 25
# Step size given to the optimizer.
LEARNING_RATE = 1e-3

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# ----- File locations -----
# On Google Colab, prefix these paths with "/content/".
train_set_path = "./data/train"
validation_set_path = "./data/validation"
test_set_path = "./data/test"
save_file_path = "./result/model.pth"

In [None]:
# Per-epoch history filled in by the training loop and read by the plots:
# epoch numbers, accuracy on the train set, accuracy on the validation set.
epoch_list, train_acc_list, valid_acc_list = [], [], []

# Prepare Data

## Define Image CAPTCHA Generator

In [None]:
def captcha_generator(num, output_dir, thread_name=0):
    """Generate `num` random CAPTCHA images into a fresh `output_dir`.

    Any existing `output_dir` is deleted first. Each image is saved as
    "<index>-<chars>.png", where <index> starts at 1 and <chars> is the text
    shown in the image, so the label can later be recovered from the filename.

    Args:
        num: Number of images to generate.
        output_dir: Directory to (re)create and fill with images.
        thread_name: Identifier used only in the progress messages.
    """
    output_path = Path(output_dir)
    if output_path.exists():
        shutil.rmtree(output_path)  # Start from an empty directory.
    # parents=True: a nested target such as "./data/train" must also work
    # when the intermediate "./data" directory does not exist yet.
    output_path.mkdir(parents=True)

    # The generator does not depend on the loop variable, so build it once.
    image_captcha = ImageCaptcha(width=IMG_WIDTH, height=IMG_HEIGHT)

    for i in range(num):
        chars = "".join(random.choice(SEED) for _ in range(CHAR_NUMBER))  # Random CAPTCHA text
        save_path = f"{output_dir}/{i + 1}-{chars}.png"

        image_captcha.write(chars, save_path)
        print(f"Thread {thread_name}: {i + 1} CAPTCHA code{'s' if i > 0 else ''} ha{'ve' if i > 0 else 's'} been "
              f"generated. ")

    print(f"Thread {thread_name}: Congrats! All CAPTCHA codes have been generated! ")

## Multithreaded Generation

In [None]:
# Generate the three dataset splits concurrently, one worker per split.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    generation_jobs = [
        executor.submit(captcha_generator, TRAIN_SIZE, train_set_path, 0),
        executor.submit(captcha_generator, TEST_SIZE, test_set_path, 1),
        executor.submit(captcha_generator, VALIDATION_SIZE, validation_set_path, 2),
    ]
    # An exception raised inside a worker is stored on its future and is only
    # re-raised by result(); without this call a failed generation would go
    # unnoticed until a later cell cannot find its data.
    for job in concurrent.futures.as_completed(generation_jobs):
        job.result()

# Process and Load Data

## One-hot Encoding

In [1]:
def one_hot_encode(chars):
    """Encode CAPTCHA text as a flat one-hot float vector.

    The result has CHAR_NUMBER * len(SEED) entries: one consecutive group of
    len(SEED) slots per character position, with a single 1.0 in each group
    at the character's index in SEED.
    """
    num_classes = len(SEED)
    encoded = torch.zeros(CHAR_NUMBER * num_classes, dtype=torch.float32)

    for position, char in enumerate(chars):
        # Offset of this position's group plus the character's index in SEED.
        encoded[position * num_classes + SEED.index(char)] = 1.0

    return encoded

def one_hot_decode(code):
    """Turn a one-hot (or score) tensor back into CAPTCHA text.

    The tensor is viewed as rows of len(SEED) entries; the position of the
    largest entry in each row selects one character from SEED.
    """
    per_char_scores = code.view(-1, len(SEED))
    best_indices = per_char_scores.argmax(dim=1).tolist()
    return "".join(SEED[k] for k in best_indices)

## Define Data Loader

In [None]:
class ImageDataset(Dataset):
    """Dataset of CAPTCHA images whose label is embedded in the filename.

    Files are expected to be named "<index>-<chars>.png"; the part after the
    last "-" (without the extension) is the CAPTCHA text.
    """

    def __init__(self, dir_path):
        """Collect the PNG files found directly inside `dir_path`."""
        super().__init__()
        # Keep only PNG files: a directory listing may also contain unrelated
        # entries (e.g. hidden folders created by notebook tooling) which can
        # neither be opened as images nor parsed for a label.
        # Sorting makes the index -> file mapping deterministic.
        self.img_path_list = [
            f"{dir_path}/{filename}"
            for filename in sorted(os.listdir(dir_path))
            if filename.endswith(".png")
        ]
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Grayscale(),
        ])

    def __getitem__(self, index):
        """Return the (image tensor, one-hot label tensor) pair at `index`."""
        img_path = self.img_path_list[index]
        # The context manager releases the file handle as soon as the pixel
        # data has been converted to a tensor.
        with Image.open(img_path) as img:
            image = self.transform(img)
        label = img_path.split("-")[-1].replace(".png", "")
        return image, one_hot_encode(label)

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.img_path_list)

In [None]:
def get_dataloader(path):
    """Build a shuffling DataLoader over the images stored in `path`.

    Batches contain BATCH_SIZE samples.
    """
    return DataLoader(ImageDataset(path), batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Sanity check: build the loaders and print the tensor shapes of one batch.
train_dataloader = get_dataloader(train_set_path)
test_dataloader = get_dataloader(test_set_path)

# Only the first batch is needed; nothing is printed if the loader is empty.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    inputs, targets = first_batch
    print(inputs.shape)
    print(targets.shape)

# Design Model

In [None]:
class NeuralNetWork(nn.Module):
    """Convolutional network mapping a 1-channel image to CAPTCHA scores.

    Four conv/ReLU/max-pool stages are followed by a two-layer classifier
    that outputs CHAR_NUMBER * len(SEED) values.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return one 3x3 conv -> ReLU -> 2x2 max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def __init__(self):
        super().__init__()
        # Each stage widens the channels and halves both spatial dimensions.
        self.layer1 = self._conv_stage(1, 64)
        self.layer2 = self._conv_stage(64, 128)
        self.layer3 = self._conv_stage(128, 256)
        self.layer4 = self._conv_stage(256, 512)
        self.layer5 = nn.Sequential(
            nn.Flatten(),
            # 15360 = 512 channels * 3 * 10, i.e. what remains of a 60x160
            # input after four 2x2 poolings.
            nn.Linear(in_features=15360, out_features=4096),
            nn.Dropout(0.5),
            nn.ReLU(),
            nn.Linear(in_features=4096, out_features=CHAR_NUMBER * len(SEED)),
        )

    def forward(self, x):
        """Run `x` through the five stages in order and return the scores."""
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
            x = stage(x)
        return x

# Train Model

In [None]:
def train(dataloader, model, loss_func, optimizer):
    """Train `model` for one epoch and return its accuracy on the train set.

    A sample counts as correct only when every character of the CAPTCHA is
    predicted correctly.

    Args:
        dataloader: Yields (inputs, targets) batches; targets are assumed to
            be flattened one-hot labels as produced by one_hot_encode.
        model: Network to optimize.
        loss_func: Callable computing the loss from (outputs, targets).
        optimizer: Optimizer updating the parameters of `model`.

    Returns:
        Fraction of fully correct samples seen during the epoch (0.0 when the
        dataloader yields nothing).
    """
    model.train()

    num_classes = len(SEED)
    correct = 0
    total = 0
    for batch, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)

        outputs = model(inputs)
        loss = loss_func(outputs, targets)

        # Raw scores never equal the 0/1 labels element-wise, so decode the
        # most likely character at each position and compare whole CAPTCHAs.
        with torch.no_grad():
            batch_size = targets.size(0)
            predicted = outputs.reshape(batch_size, -1, num_classes).argmax(dim=2)
            expected = targets.reshape(batch_size, -1, num_classes).argmax(dim=2)
            correct += (predicted == expected).all(dim=1).sum().item()
        total += batch_size

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % (BATCH_SIZE / 10) == 0:
            print(f"Batch {batch + 1}: Loss = {loss.item():>8f}")

    # Guard against an empty dataloader instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy on Train Set is {accuracy:>7f}")
    return accuracy

In [None]:
def validate(dataloader, model, loss_func):
    """Evaluate `model` on a validation set and return its accuracy.

    A sample counts as correct only when every character of the CAPTCHA is
    predicted correctly.

    Args:
        dataloader: Yields (inputs, targets) batches; targets are assumed to
            be flattened one-hot labels as produced by one_hot_encode.
        model: Network to evaluate.
        loss_func: Unused; kept so existing callers keep working.

    Returns:
        Fraction of fully correct samples (0.0 when the dataloader yields
        nothing).
    """
    model.eval()

    num_classes = len(SEED)
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            # Raw scores never equal the 0/1 labels element-wise, so decode
            # the most likely character at each position and compare whole
            # CAPTCHAs.
            batch_size = targets.size(0)
            predicted = outputs.reshape(batch_size, -1, num_classes).argmax(dim=2)
            expected = targets.reshape(batch_size, -1, num_classes).argmax(dim=2)
            correct += (predicted == expected).all(dim=1).sum().item()
            total += batch_size

    # Guard against an empty dataloader instead of dividing by zero.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy on Validation Set is {accuracy:>8f}")
    return accuracy

In [None]:
# Build the model, loss and optimizer, train for TOTAL_EPOCH epochs while
# recording per-epoch accuracies, then save the learned weights.
model = NeuralNetWork().to(device)
loss_func = nn.MultiLabelSoftMarginLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_dataloader = get_dataloader(train_set_path)
validation_dataloader = get_dataloader(validation_set_path)

for epoch in range(TOTAL_EPOCH):
    print(f"--------------- Training Epoch {epoch + 1} ---------------")
    epoch_list.append(epoch + 1)

    train_acc = train(train_dataloader, model, loss_func, optimizer)
    train_acc_list.append(train_acc)

    valid_acc = validate(validation_dataloader, model, loss_func)
    valid_acc_list.append(valid_acc)
    print()

# torch.save does not create missing directories; make sure the target folder
# exists so a finished training run is not lost at the very last step.
Path(save_file_path).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), save_file_path)
print(f"The training is complete and the model is saved at \"{save_file_path}\"")

### Visualization

In [None]:
# Plot the accuracy on the training set against the epoch number.
figure, axes = plt.subplots()
axes.plot(epoch_list, train_acc_list)
axes.set_xlabel('Epoch')
axes.set_ylabel('Accuracy on Training Set')
axes.grid()
plt.show()

In [None]:
# Plot the accuracy on the validation set against the epoch number.
figure, axes = plt.subplots()
axes.plot(epoch_list, valid_acc_list)
axes.set_xlabel('Epoch')
axes.set_ylabel('Accuracy on Validation Set')
axes.grid()
plt.show()

# Test Model

In [None]:
def predict(model, file_path):
    """Return the CAPTCHA text `model` predicts for the image at `file_path`.

    Args:
        model: Network producing CAPTCHA scores; the caller is expected to
            have put it in evaluation mode.
        file_path: Path of the image file to recognize.

    Returns:
        The decoded characters as a string.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Grayscale(),
    ])

    # Close the file as soon as the pixels have been read.
    with Image.open(file_path) as image:
        # unsqueeze(0) adds the batch dimension without hard-coding the image
        # size, so the call keeps working if the configured size changes.
        # All tensors must be on the same device as the model.
        inputs = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(inputs)

    return one_hot_decode(outputs)

In [None]:
# Reload the saved weights and measure whole-CAPTCHA accuracy on the test set.
model = NeuralNetWork().to(device)
# Deserialize onto the CPU first so weights saved on a GPU also load on a
# CPU-only machine; load_state_dict then copies them into the model.
model.load_state_dict(torch.load(save_file_path, map_location=torch.device("cpu")))
model.eval()

correct = 0
test_dir = test_set_path
# List the directory once and keep only PNG files, so unrelated entries are
# neither fed to the model nor counted in the total.
test_files = sorted(name for name in os.listdir(test_dir) if name.endswith(".png"))
total = len(test_files)
for filename in test_files:
    file_path = f"{test_dir}/{filename}"
    # The expected text is the part of the filename after the last "-".
    real_captcha = file_path.split("-")[-1].replace(".png", "")
    pred_captcha = predict(model, file_path)

    if pred_captcha == real_captcha:
        correct += 1
        print(f"The prediction result of \"{file_path}\" is {pred_captcha}. The prediction is CORRECT!")
    else:
        print(f"The prediction result of \"{file_path}\" is {pred_captcha}. The prediction is WRONG!")

# Avoid a ZeroDivisionError when the test directory holds no images.
accuracy = f"{(correct / total * 100) if total else 0.0:.8f}%"
print("\nThe accuracy of the model is", accuracy)