In [35]:
from glob import glob
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Tuple
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset

import torch.nn as nn
import torch.nn.functional as F
import torch

import torch.optim as optim

from torch.autograd import Variable

In [2]:
file_locations = glob('./Captchas/*')
captcha_names = [file.split('/')[-1].split('.')[0] for file in file_locations]
print( f'identified {len(file_locations)} images' )

identified 113062 images


In [3]:
# Unique characters is global -- expect only 60 out
unique_characters = set(char for name in captcha_names for char in name)

character_encode = {char:i for i,char in enumerate(unique_characters)}

print( f'{len(unique_characters)} unique characters: {character_encode}' )

60 unique characters: {'6': 0, 'F': 1, '8': 2, 'w': 3, 'n': 4, 'C': 5, 's': 6, '2': 7, 'P': 8, 't': 9, 'S': 10, '7': 11, 'N': 12, 'T': 13, 'a': 14, 'u': 15, 'y': 16, 'e': 17, 'g': 18, 'b': 19, 'R': 20, 'Y': 21, 'v': 22, 'O': 23, 'B': 24, 'Q': 25, 'D': 26, 'E': 27, '5': 28, 'm': 29, 'd': 30, 'W': 31, 'j': 32, 'r': 33, 'x': 34, 'q': 35, 'K': 36, '3': 37, 'L': 38, 'A': 39, 'Z': 40, 'H': 41, '9': 42, 'i': 43, '1': 44, '4': 45, 'h': 46, 'U': 47, 'z': 48, 'M': 49, 'I': 50, 'J': 51, 'p': 52, 'f': 53, 'l': 54, 'X': 55, 'c': 56, 'G': 57, 'V': 58, 'k': 59}


In [24]:
class Captcha_Dataset(Dataset):
    def __init__(self, data, labels):
        self.X = data
        self.y = torch.tensor(labels)

    @classmethod
    def import_image(cls, location:str) -> np.ndarray:
        """
        Import a single image.

        Parameters: location (str) Location of image
        Returns: (np.ndarray) Image dimensions = Captchas 40 x 150 x 3 RGB channels
        """
        image = Image.open(location)
        image.load()
        #image.show()
        data = np.asarray(image, dtype='float32')
        return data
    @classmethod
    def stack_images(cls, file_locations:List[str]) -> np.ndarray:
        """
        Stack imageset from directory.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (np.ndarray) len(file_locations) x image dimensions
        """
        return np.array([cls.import_image(location) for location in file_locations ])
    
    @classmethod
    def read_label_names(cls, file_locations:List[str]) -> List[str]:
        """
        Simply extracts labels from filenames.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (List[str]) List of label names
        """
        labels = [file.split('/')[-1].split('.')[0] for file in file_locations]
        
        return [[character_encode[char] for char in name] for name in labels]


        # print(labels)
        # return [[character_encode[char] for char in name] for name in labels]


    @classmethod
    def from_dir(cls, file_locations:List[str]):
        """
        Instantiate from only a list of files.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (Captcha_Dataset) object
        """
        return cls(
            cls.stack_images(file_locations),
            cls.read_label_names(file_locations)
        )

    def transform(self, image:np.ndarray) -> torch.Tensor:
        """Dataset transform for loading."""
        return T.ToTensor()(image) # This is a hack for now.
        # Not sure why, but this transforming doesn't work. It's weird. Idk.
        # I originally tried using only PIL images and then resizing from there, but it didn't work.
        # Tried now going from PIL --> ndarray --> PIL --> Tensor; also doesn't work. 
        # Bit lost.
        # return  T.Compose([
        #     T.ToPILImage(),
        #     T.Resize([40, 150]),
        #     T.ToTensor()
        #     ])(image)

    def __getitem__(self, index:int) -> Tuple[torch.Tensor, str]:
        """Select one sample. DataLoader accesses samples through this function."""
        return self.transform(self.X[index]), self.y[index]
    
    def __len__(self) -> int:
        """Also needed for DataLoader."""
        return len(self.X)

In [25]:
sample = Captcha_Dataset.from_dir(file_locations[0:128])

print(f'{sample.X.shape}\nSample of first 128 images in format 40px x 150px x 3 RGB channels, of type {type(sample.X[0][0][0][0])}')

(128, 40, 150, 3)
Sample of first 128 images in format 40px x 150px x 3 RGB channels, of type <class 'numpy.float32'>


In [26]:
dl = DataLoader(sample, \
    4, # Fetch 4 samples per batch
    shuffle=True, num_workers=2)

In [27]:
dataiter = iter(dl)
images, labels = next(dataiter)
print(f'Each batch has a dataset of shape {images.shape} and a corresponding set of {len(labels)} labels.')

Each batch has a dataset of shape torch.Size([4, 3, 40, 150]) and a corresponding set of 4 labels.


In [39]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(3, # 3 input channels
            6, # 6 output channels
            5) # kernel of size 5x5

        self.pool = nn.MaxPool2d(2, # kernel size of 2
            2) # stride of 2

        self.conv2 = nn.Conv2d(6, 16, 5) # 6 in / 16 out / 5x5 kernel

        self.fc1 = nn.Linear(3808, # features in
            1404) # features out

        self.fc2 = nn.Linear(1404, 702)
        self.fc3 = nn.Linear(702, 5)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)#nn.Softmax(self.fc3(x))
        return x


net = Net()

In [40]:
criterion = nn.MultiLabelSoftMarginLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

In [52]:
for epoch in range(2):  # loop over the dataset multiple times
    print(f'Starting epoch {epoch}')

    running_loss = 0.0
    for i, data in enumerate(dl, 0):
        # print(f'Iteration {i}')
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # print('Read data.')
        # zero the parameter gradients
        optimizer.zero_grad()
        # print('Grads zeroed.')
        
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')

print('Finished Training')

Starting epoch 0
[1,     1] loss: -462624248561.664
[1,     2] loss: -1137976516018.176
[1,     3] loss: -1615332871503.872
[1,     4] loss: -2139805118889.984
[1,     5] loss: -3301313928495.104
[1,     6] loss: -3859321246973.952
[1,     7] loss: -4524389013913.600
[1,     8] loss: -5234046057578.496
[1,     9] loss: -5743536687284.224
[1,    10] loss: -6573962641276.928
[1,    11] loss: -7125715984056.320
[1,    12] loss: -7687504283041.792
[1,    13] loss: -8154613517647.872
[1,    14] loss: -8976243915489.279
[1,    15] loss: -9761567277056.000
[1,    16] loss: -10606484182269.951
[1,    17] loss: -11599641816596.480
[1,    18] loss: -12349139647987.713
[1,    19] loss: -12954745203326.977
[1,    20] loss: -14022971752448.000
[1,    21] loss: -14917231657877.504
[1,    22] loss: -15356013637009.408
[1,    23] loss: -15982385157898.240
[1,    24] loss: -16685439024889.855
[1,    25] loss: -17763585337851.902
[1,    26] loss: -18658125825441.793
[1,    27] loss: -19673133434798.078


In [48]:
tl = DataLoader(Captcha_Dataset.from_dir(file_locations[-128:-1]), \
    4, # Fetch 4 samples per batch
    shuffle=True, num_workers=2)
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in tl:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = net(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        predicted = outputs.data # np.array([max(i) for i in outputs.data])
        total += labels.size(0)
        print(predicted)
        print(labels)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

tensor([[8.1803e+09, 7.8717e+09, 8.2863e+09, 7.6505e+09, 8.4072e+09],
        [5.5434e+09, 5.3342e+09, 5.6152e+09, 5.1844e+09, 5.6971e+09],
        [8.2874e+09, 7.9747e+09, 8.3947e+09, 7.7507e+09, 8.5172e+09],
        [1.3922e+09, 1.3397e+09, 1.4103e+09, 1.3021e+09, 1.4309e+09]])
tensor([[32, 55,  6, 38,  5],
        [22, 37, 40, 31, 41],
        [14,  8, 42, 56, 22],
        [56, 46, 55, 45, 45]])
tensor([[8.4344e+09, 8.1162e+09, 8.5436e+09, 7.8881e+09, 8.6683e+09],
        [4.3255e+09, 4.1623e+09, 4.3815e+09, 4.0453e+09, 4.4454e+09],
        [7.1948e+09, 6.9234e+09, 7.2880e+09, 6.7289e+09, 7.3943e+09],
        [2.7138e+09, 2.6114e+09, 2.7489e+09, 2.5380e+09, 2.7891e+09]])
tensor([[25,  7, 33, 15, 43],
        [36, 20,  9, 54, 22],
        [51, 19,  4, 24, 51],
        [54, 55, 46, 52,  3]])
tensor([[8.5256e+09, 8.2039e+09, 8.6360e+09, 7.9734e+09, 8.7620e+09],
        [2.7425e+09, 2.6391e+09, 2.7781e+09, 2.5649e+09, 2.8186e+09],
        [3.7386e+09, 3.5975e+09, 3.7870e+09, 3.4964e+09,