In [1]:
from glob import glob
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Tuple
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset

import torch.nn as nn
import torch.nn.functional as F
import torch

import torch.optim as optim


In [14]:
file_locations = glob('./Captchas/*')
captcha_names = [file.split('/')[-1].split('.')[0] for file in file_locations]
unique_characters = set(char for name in captcha_names for char in name)
print( f'identified {len(file_locations)} images' )
print( f'{len(unique_characters)} unique characters: {unique_characters}' )

identified 113062 images
60 unique characters: {'i', '8', 'G', 'z', 'm', 'D', 'l', 'w', '2', 'R', 'k', 'p', 'X', 'J', 'n', 'C', 'd', 'V', 'j', 't', 's', '3', 'g', '6', '7', 'M', 'e', 'A', 'r', 'K', 'a', 'F', 'E', '4', 'U', 'T', 'N', 'W', '9', 'f', 'Q', 'O', 'c', 'I', '1', 'x', 'b', 'P', 'u', 'Y', 'H', 'h', 'q', 'B', '5', 'L', 'Z', 'S', 'y', 'v'}


In [3]:
class Captcha_Dataset(Dataset):
    def __init__(self, data, labels):
        self.X = data
        self.y = labels

    @classmethod
    def import_image(cls, location:str) -> np.ndarray:
        """
        Import a single image.

        Parameters: location (str) Location of image
        Returns: (np.ndarray) Image dimensions = Captchas 40 x 150 x 3 RGB channels
        """
        image = Image.open(location)
        image.load()
        #image.show()
        data = np.asarray(image, dtype='float32')
        return data
    @classmethod
    def stack_images(cls, file_locations:List[str]) -> np.ndarray:
        """
        Stack imageset from directory.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (np.ndarray) len(file_locations) x image dimensions
        """
        return np.array([cls.import_image(location) for location in file_locations ])
    
    @classmethod
    def read_label_names(cls, file_locations:List[str]) -> List[str]:
        """
        Simply extracts labels from filenames.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (List[str]) List of label names
        """
        return [file.split('/')[-1].split('.')[0] for file in file_locations]


    @classmethod
    def from_dir(cls, file_locations:List[str]):
        """
        Instantiate from only a list of files.

        Parameters: file_locations (List[str]) List of image locations
        Returns: (Captcha_Dataset) object
        """
        return cls(
            cls.stack_images(file_locations),
            cls.read_label_names(file_locations)
        )

    def transform(self, image:np.ndarray) -> torch.Tensor:
        """Dataset transform for loading."""
        return T.ToTensor()(image) # This is a hack for now.
        # Not sure why, but this transforming doesn't work. It's weird. Idk.
        # I originally tried using only PIL images and then resizing from there, but it didn't work.
        # Tried now going from PIL --> ndarray --> PIL --> Tensor; also doesn't work. 
        # Bit lost.
        # return  T.Compose([
        #     T.ToPILImage(),
        #     T.Resize([40, 150]),
        #     T.ToTensor()
        #     ])(image)

    def __getitem__(self, index:int) -> Tuple[torch.Tensor, str]:
        """Select one sample. DataLoader accesses samples through this function."""
        return self.transform(self.X[index]), self.y[index]
    
    def __len__(self) -> int:
        """Also needed for DataLoader."""
        return len(self.X)

In [4]:
sample = Captcha_Dataset.from_dir(file_locations[0:128])

print(f'{sample.X.shape}\nSample of first 128 images in format 40px x 150px x 3 RGB channels, of type {type(sample.X[0][0][0][0])}')

(128, 40, 150, 3)
Sample of first 128 images in format 40px x 150px x 3 RGB channels, of type <class 'numpy.float32'>


In [5]:
dl = DataLoader(sample, \
    4, # Fetch 4 samples per batch
    shuffle=True, num_workers=2)

In [6]:
dataiter = iter(dl)
images, labels = next(dataiter)
print(f'Each batch has a dataset of shape {images.shape} and a corresponding set of {len(labels)} labels.')

Each batch has a dataset of shape torch.Size([4, 3, 40, 150]) and a corresponding set of 4 labels.


In [7]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = nn.Conv2d(3, # 3 input channels
            6, # 6 output channels
            5) # kernel of size 5x5

        self.pool = nn.MaxPool2d(2, # kernel size of 2
            2) # stride of 2

        self.conv2 = nn.Conv2d(6, 16, 5) # 6 in / 16 out / 5x5 kernel

        self.fc1 = nn.Linear(3808, # features in
            120) # features out

        self.fc2 = nn.Linear(1404, 702)
        self.fc3 = nn.Linear(702, 60)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


net = Net()

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [10]:
for epoch in range(2):  # loop over the dataset multiple times
    print(f'Starting epoch {epoch}')

    running_loss = 0.0
    for i, data in enumerate(dl, 0):
        print(f'Iteration {i}')
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        print('Read data.')
        # zero the parameter gradients
        optimizer.zero_grad()
        print('Grads zeroed.')
        
        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        # running_loss += loss.item()
        # if i % 2000 == 1999:    # print every 2000 mini-batches
        #     print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
        #     running_loss = 0.0

print('Finished Training')

Starting epoch 0
Iteration 0
Read data.
Grads zeroed.


TypeError: cross_entropy_loss(): argument 'target' (position 2) must be Tensor, not tuple

In [9]:

net(images)

tensor([[-0.8046,  2.0783,  0.9522,  1.4867, -0.2791, -0.7392,  1.1491,  0.1646,
         -0.0837, -1.6451],
        [-0.1766,  5.0565,  2.4338,  0.9848, -3.1592, -0.4532,  1.3642,  0.5081,
         -0.7839, -1.9731],
        [ 1.5589,  4.8706,  1.6537,  1.1962, -2.1339,  0.2627, -0.2543,  1.7555,
         -2.0709, -3.4647],
        [ 0.9590,  3.3156,  1.2383,  1.6371, -1.6653,  1.2395,  1.1717,  1.2776,
          0.0771, -3.3323]], grad_fn=<AddmmBackward0>)