In [157]:
from glob import glob
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Tuple

from sklearn.model_selection import train_test_split

import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset

import torch.nn as nn
import torch.nn.functional as F
import torch

import torch.optim as optim

from torch.autograd import Variable

In [158]:
file_locations = glob('./Captchas/*')
captcha_names = [file.split('/')[-1].split('.')[0] for file in file_locations]
print( f'identified {len(file_locations)} images' )

identified 113062 images


In [159]:
# Unique characters is global -- expect only 60 out
unique_characters = [*set(char for name in captcha_names for char in name)]

print( f'{len(unique_characters)} unique characters: {unique_characters}' )

60 unique characters: ['Z', 'U', '7', 'I', 'K', '6', 'O', 'Y', 'J', 'A', 'R', '9', 'V', 'N', 'x', 'b', '3', 'm', 's', 'r', 'z', 'G', 'p', 'W', 'M', 'f', 'T', '5', 'h', 'e', 'E', 'k', 'y', 'c', 'X', '2', 'B', 't', 'w', 'j', '8', 'l', 'F', 'u', 'q', 'H', 'g', 'a', 'C', '1', '4', 'L', 'i', 'D', 'P', 'v', 'd', 'S', 'n', 'Q']


In [160]:
class Captcha_Dataset(Dataset):
    def __init__(self, data, labels):
        self.X = data
        self.y = labels
        return 

    @classmethod
    def import_image(cls, location:str) -> np.ndarray:
        """
        Import a single image.

        Parameters: location (str) Location of image
        Returns: (np.ndarray) Image dimensions = Captchas 40 x 150 x 3 RGB channels
        """
        image = Image.open(location)
        image.load()
        #image.show()
        data = np.asarray(image, dtype='float32')
        return data
    @classmethod
    def stack_images(cls, file_locations:List[str]) -> np.ndarray:
        """
        Stack imageset from directory.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (np.ndarray) len(file_locations) x image dimensions
        """
        return np.array([cls.import_image(location) for location in file_locations ])
    
    @classmethod
    def read_label_names(cls, file_locations:List[str]) -> List[str]:
        """
        Simply extracts labels from filenames.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (List[str]) List of label names
        """
        labels = [file.split('/')[-1].split('.')[0] for file in file_locations]
        
        return labels

    @classmethod
    def from_dir(cls, file_locations:List[str]):
        """
        Instantiate from only a list of files.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (Captcha_Dataset) object
        """
        return cls(
            cls.stack_images(file_locations),
            cls.read_label_names(file_locations)
        )

    def transform(self, image:np.ndarray) -> torch.Tensor:
        """Apply dataset transform."""
        return T.ToTensor()(image) # This is a hack for now.
        # Not sure why, but this transforming doesn't work. It's weird. Idk.
        # I originally tried using only PIL images and then resizing from there, but it didn't work.
        # Tried now going from PIL --> ndarray --> PIL --> Tensor; also doesn't work. 
        # Bit lost.
        # return  T.Compose([
        #     T.ToPILImage(),
        #     T.Resize([40, 150]),
        #     T.ToTensor()
        #     ])(image)

    def encode_label(self, label:str) -> np.ndarray:
        """
        
        """
        label_array = []
        for char in label:
            node_array = [0]*len(unique_characters)
            node_array[unique_characters.index(char)] += 1
            label_array.append(node_array)
        return np.array(label_array)

    def __getitem__(self, index:int) -> Tuple[torch.Tensor, str]:
        """Select one sample. DataLoader accesses samples through this function."""
        return self.transform(self.X[index]), self.encode_label(self.y[index]), self.y[index]
    
    def __len__(self) -> int:
        """Also needed for DataLoader."""
        return len(self.X)

In [164]:
# Split training/test data
train_files, test_files = train_test_split(file_locations[0:10_000], test_size = .2)
print(f'Split dataset into 80:20 train/test of sizes {len(train_files)},{len(test_files)}.')

Split dataset into 80:20 train/test of sizes 8000,2000.


In [165]:
# Load in training files.
train = Captcha_Dataset.from_dir(train_files)#[0:128])

print(f'{train.X.shape}\nSample of images in format 40px x 150px x 3 RGB channels, of type {type(train.X[0][0][0][0])}')

# Instantiate dataloader (the iterable that provides batches for gradient descent.)
dl = DataLoader(train, \
    64, # Fetch 4 samples per batch
    shuffle=True, num_workers=2)

(8000, 40, 150, 3)
Sample of images in format 40px x 150px x 3 RGB channels, of type <class 'numpy.float32'>


In [166]:
# Test out the iterable.
dataiter = iter(dl)
images, label_array, labels = next(dataiter)
print(f'Each batch has a dataset of shape {images.shape} and a corresponding set of {label_array.shape} labels.')

Each batch has a dataset of shape torch.Size([64, 3, 40, 150]) and a corresponding set of torch.Size([64, 5, 60]) labels.


In [167]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(3, # 3 input channels
            6, # 6 output channels
            5, bias = False) # kernel of size 5x5

        self.pool = nn.MaxPool2d(2, # kernel size of 2
            2) # stride of 2

        self.conv2 = nn.Conv2d(6, 16, 5) # 6 in / 16 out / 5x5 kernel

        self.fc1 = nn.Linear(3808, # features in
            1404) # features out

        self.fc2 = nn.Linear(1404, 702)
        self.fc3 = nn.Linear(702, 
                            len(unique_characters) * 5 # 300 output nodes: 5-chars 60 nodes
                            , bias = True)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)#nn.Softmax(self.fc3(x))
        return x

In [168]:
# Instantiate all our stuff
net = Net()
criterion = nn.MultiLabelSoftMarginLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

In [173]:
for epoch in range(20):
    print(f'Starting epoch {epoch}')

    running_loss = 0.0
    for i,(images, label_array, labels) in enumerate(dl,0):
        
        # Zero param grads
        optimizer.zero_grad()

        # Forward
        prediction = net(images)
        
        # Calculate loss
        loss = criterion(prediction.reshape(prediction.shape[0],5,60), label_array)
        # Backpropagate
        loss.backward()
        # Step optimizer
        optimizer.step()

        # print stats
        running_loss += loss.item()
        if i%( len(train_files) // 64 // 10 ) == 0: print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / ( len(train_files) / 64 / 10 ):.3e}')

print('Finished Training')

Starting epoch 0
[1,     1] loss: 3.965e-03
[1,    26] loss: 1.007e-01


KeyboardInterrupt: 

In [None]:
PATH = './trained_net.pth'
if False:
    torch.save(net.state_dict(), PATH)

In [None]:
tl = DataLoader(Captcha_Dataset.from_dir(test_files), \
    4, # Fetch 4 samples per batch
    shuffle=True, num_workers=2)

total = 0
correct = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for (images, label_array, labels) in tl:

        # calculate outputs by running images through the network
        prediction = net(images)

        pred_labels = []
        for i,single_pred in enumerate(prediction.reshape(prediction.shape[0], 5, 60)):
            captcha = ''
            for char in single_pred:
                outchar = unique_characters[np.argmax(char.detach())]
                captcha += outchar
            pred_labels.append(captcha)

            if False: print('Predicted: %s Ground Truth: %s'%(captcha, labels[i]))
            if captcha == labels[i]: correct+=1
            total+=1

print(f'Accuracy of the network on the 10000 test images: {100 * correct / total} %')