In [None]:
#Import libraries and initial configuration
import os
import random

import matplotlib.pyplot as plt
import numpy as np

import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader,Dataset
from torch.autograd import Variable
from PIL import Image

# Restrict which CUDA devices this process may see. Devices are numbered in
# PCI bus order and only the device with index 1 is exposed.
# NOTE(review): on a host with a single GPU (index 0) this setting would
# presumably hide that GPU and force CPU execution — confirm it matches the
# machine the notebook runs on.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

## DATA LOAD

In [None]:
# Captcha vocabulary: ten digits, 26 lowercase letters and one padding symbol.
NUMBER = list('0123456789')
ALPHABET = list('abcdefghijklmnopqrstuvwxyz')
NONE = ['NONE']  # Placeholder used to pad labels shorter than MAX_CAPTCHA

# Full vocabulary in index order: digits, then letters, then the padding symbol.
ALL_CHAR_SET = [*NUMBER, *ALPHABET, *NONE]
ALL_CHAR_SET_LEN = len(ALL_CHAR_SET)  # Number of classes per character slot
MAX_CAPTCHA = 7  # Maximum number of characters in one captcha

# Show the index assigned to the padding symbol.
print(ALL_CHAR_SET.index('NONE'))

#One-hot encoding
def encode(a):
    """Return the one-hot encoding of a single vocabulary symbol.

    Args:
        a: A symbol contained in ALL_CHAR_SET (a digit, a lowercase letter
            or the string 'NONE').

    Returns:
        A list of ALL_CHAR_SET_LEN integers that is 1 at the symbol's index
        in ALL_CHAR_SET and 0 everywhere else.

    Raises:
        ValueError: If ``a`` is not in ALL_CHAR_SET.
    """
    hot_index = ALL_CHAR_SET.index(a)
    return [1 if position == hot_index else 0 for position in range(ALL_CHAR_SET_LEN)]

36


In [None]:
# Upper bounds on how many samples each dataset split keeps.
TRAIN_SIZE, TEST_SIZE = 10_000, 1_000

In [None]:
#Modified dataset class
class Mydataset(Dataset):
    """Captcha dataset that pairs image files with text labels by position.

    The i-th entry of the directory listing is paired with the i-th line of
    the label file.

    Args:
        img_path: Directory that contains the captcha images.
        label_path: Text file with one label per line.
        is_train: When True keep at most TRAIN_SIZE samples, otherwise at
            most TEST_SIZE samples.
        transform: Optional callable applied to the grayscale PIL image.
    """

    def __init__(self, img_path, label_path, is_train=True, transform=None):
        self.path = img_path
        self.label_path = label_path
        limit = TRAIN_SIZE if is_train else TEST_SIZE

        # NOTE(review): os.listdir returns entries in arbitrary order, so the
        # positional image/label pairing relies on that order matching the
        # label file — confirm against the file naming scheme.
        self.img = os.listdir(self.path)[:limit]

        # Close the label file deterministically. splitlines() keeps the last
        # label even when the file does not end with a newline.
        with open(self.label_path, 'r') as label_file:
            self.labels = label_file.read().splitlines()[:limit]

        self.transform = transform  # Transformation to apply to the images
        self.max_length = MAX_CAPTCHA  # Number of character slots per label

    def __getitem__(self, idx):
        """Return ``(image, one_hot_label, label_text)`` for sample ``idx``.

        The one-hot label is a flat numpy array of length
        ``max_length * ALL_CHAR_SET_LEN``; slots beyond the label's length are
        encoded as 'NONE'. Characters beyond ``max_length`` are not encoded,
        but the returned ``label_text`` is the untruncated label.
        """
        # convert() returns a new image, so the file handle can be released.
        with Image.open(os.path.join(self.path, self.img[idx])) as raw_img:
            img = raw_img.convert('L')  # Grayscale

        label = self.labels[idx]

        # Truncate to max_length, then pad with 'NONE' up to max_length.
        symbols = list(label[:self.max_length])
        symbols += ['NONE'] * (self.max_length - len(symbols))
        label_oh = [bit for symbol in symbols for bit in encode(symbol)]

        if self.transform is not None:
            img = self.transform(img)
        return img, np.array(label_oh), label

    def __len__(self):
        """Return the number of image files kept for this split."""
        return len(self.img)

In [None]:
# Preprocessing applied to every grayscale captcha image.
# NOTE(review): torchvision's Resize interprets a two-element size as
# (height, width), so this yields images 160 px tall and 60 px wide — confirm
# this is the intended orientation.
transform = transforms.Compose([
    transforms.Resize(size=[160, 60]),                # Fixed spatial size
    transforms.ToTensor(),                            # PIL image -> float tensor in [0, 1]
    transforms.Normalize(mean=(0.5,), std=(0.5,)),    # Rescale the single channel to [-1, 1]
])

In [None]:
# Mount Google Drive (Colab only) so the dataset files are reachable.
from google.colab import drive
drive.mount('/content/drive')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Single place to change where the dataset lives.
DATA_ROOT = '/content/drive/MyDrive/Data'

train_ds = Mydataset(f'{DATA_ROOT}/train', f'{DATA_ROOT}/train.txt', transform=transform)
test_ds = Mydataset(f'{DATA_ROOT}/test', f'{DATA_ROOT}/test.txt', is_train=False, transform=transform)

# Shuffle the training data so every epoch sees the samples in a new order;
# without it each epoch replays identical mini-batches.
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=4)
# The evaluation code reshapes with a leading dimension of 1, so keep batch_size=1.
test_dl = DataLoader(test_ds, batch_size=1, num_workers=4)

Mounted at /content/drive




## Models

In [None]:
# Run on the first visible GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ResNetWithFeatures(nn.Module):
    """ResNet-18 backbone for single-channel images with a captcha head.

    ``forward`` returns both the per-slot class scores and the 512-dimensional
    backbone features those scores were computed from.
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet18(pretrained=False)  # Randomly initialised ResNet-18
        # Replace the stem so it accepts 1 input channel (grayscale) instead of 3.
        backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Drop the built-in classifier so the backbone emits its pooled features.
        backbone.fc = nn.Identity()
        self.resnet = backbone
        # One score per (character slot, vocabulary symbol) pair.
        self.fc = nn.Linear(512, ALL_CHAR_SET_LEN * MAX_CAPTCHA)

    def forward(self, x):
        """Return ``(scores, features)`` for a batch of images ``x``."""
        embedding = self.resnet(x)
        return self.fc(embedding), embedding


Net = ResNetWithFeatures().to(device)



In [None]:
#LSTM
class LSTM(nn.Module):
    """Single-cell LSTM decoder that predicts one captcha character per step.

    At step 0 the cell input is the CNN feature vector projected to the cell's
    input size. At step t >= 1 the input is the embedding of the ground-truth
    character at step t-1 (teacher forcing).

    Args:
        cnn_dim: Size of the CNN feature vector.
        hidden_size: Size of the LSTM hidden and cell states.
        vocab_size: Number of character classes; also used as the LSTM input
            size and the embedding dimension.
        num_layers: Accepted for interface compatibility; the model always
            uses a single LSTMCell.
    """

    def __init__(self, cnn_dim, hidden_size, vocab_size, num_layers=1):
        super(LSTM, self).__init__()

        self.cnn_dim = cnn_dim  # Dimensionality of CNN features
        self.hidden_size = hidden_size  # Hidden size of LSTM
        self.vocab_size = vocab_size  # Vocabulary size

        # Recurrent cell; its inputs have size vocab_size.
        self.lstm_cell = nn.LSTMCell(input_size=self.vocab_size, hidden_size=hidden_size)

        # Projects CNN features to the cell's input size so they can be fed as
        # the first-step input (projecting to hidden_size would not match the
        # cell's input_size).
        self.fc_in = nn.Linear(self.cnn_dim, self.vocab_size)
        # Maps the hidden state to per-class scores.
        self.fc_out = nn.Linear(self.hidden_size, self.vocab_size)

        # Character embedding whose width equals the cell's input size.
        self.embed = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.vocab_size)

        # Normalises the class scores of each step into probabilities.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, features, captions):
        """Decode a batch of captchas with teacher forcing.

        Args:
            features: Float tensor of shape (batch, cnn_dim).
            captions: Integer tensor of shape (batch, steps) holding the
                ground-truth class index of every character slot.

        Returns:
            Tensor of shape (batch, steps, vocab_size) with softmax
            probabilities for every step.
        """
        batch_size = features.size(0)

        # Allocate the states on the same device/dtype as the inputs instead of
        # hard-coding CUDA, so the model also runs on CPU.
        hidden_state = features.new_zeros((batch_size, self.hidden_size))
        cell_state = features.new_zeros((batch_size, self.hidden_size))

        captions_embed = self.embed(captions)

        step_outputs = []
        for t in range(captions.size(1)):
            if t == 0:
                # First step: the input is the projected image feature vector.
                step_input = self.fc_in(features)
            else:
                # Later steps: feed the PREVIOUS character. Feeding the
                # character at t would hand the model the answer it is asked
                # to predict.
                step_input = captions_embed[:, t - 1, :]
            hidden_state, cell_state = self.lstm_cell(step_input, (hidden_state, cell_state))
            step_outputs.append(self.softmax(self.fc_out(hidden_state)))

        if not step_outputs:
            # Zero-length captions: keep the (batch, 0, vocab) result shape.
            return features.new_empty((batch_size, 0, self.vocab_size))
        return torch.stack(step_outputs, dim=1)  # Predicted class probabilities

In [None]:
# Decoder hyper-parameters.
cnn_dim = 512                  # Feature size fed to the decoder (ResNet-18 features)
hidden_size = 128              # LSTM hidden/cell state size
vocab_size = ALL_CHAR_SET_LEN  # Derived from the vocabulary instead of a hard-coded 37

lstm = LSTM(cnn_dim=cnn_dim, hidden_size=hidden_size, vocab_size=vocab_size)
lstm = lstm.to(device)

In [None]:
# Loss: every entry of the flattened one-hot target is scored as an
# independent binary label.
loss_func = nn.MultiLabelSoftMarginLoss()

# Separate Adam optimizers for the CNN and the LSTM decoder.
cnn_optim = optim.Adam(Net.parameters(), lr=1e-3)
lstm_optim = optim.Adam(lstm.parameters(), lr=1e-3)

## TRAIN

In [None]:
print_interval = 15  # Print the loss every this many steps
max_epoch = 30  # Number of passes over the training data
use_lstm = False  # When True, predictions come from the LSTM decoder instead of the CNN head
tf_rate = 1  # Presumably the teacher-forcing rate; it is not read by the loop below

# Make sure both models are in training mode, e.g. after the evaluation cell
# has switched them to eval mode.
Net.train()
lstm.train()

for epoch in range(max_epoch):
    for step, batch in enumerate(train_dl):
        img, label_oh, _ = batch  # The raw label strings are not needed for training
        img = img.to(device)
        label_oh = label_oh.float().to(device)
        batch_size = label_oh.size(0)

        pred, feature = Net(img)  # CNN scores and backbone features

        if use_lstm:
            # Recover per-slot class indices from the flat one-hot labels.
            lstm_input = label_oh.view(batch_size, MAX_CAPTCHA, ALL_CHAR_SET_LEN).argmax(dim=2)
            outputs = lstm(feature, lstm_input)
            # Flatten to (batch, MAX_CAPTCHA * ALL_CHAR_SET_LEN) to match label_oh.
            pred = outputs.reshape(batch_size, -1)

        loss = loss_func(pred, label_oh)

        # Single backward pass. The LSTM output is built on the CNN features,
        # so the gradients for both models come from the same graph; calling
        # backward() a second time on that graph raises a RuntimeError.
        cnn_optim.zero_grad()
        if use_lstm:
            lstm_optim.zero_grad()
        loss.backward()
        cnn_optim.step()
        if use_lstm:
            lstm_optim.step()

        if (step+1)%print_interval == 0:
            print('epoch:', epoch+1, 'step:', step+1, 'loss:', loss.item())

epoch: 1 step: 15 loss: 0.08794360607862473
epoch: 1 step: 30 loss: 0.07887865602970123
epoch: 1 step: 45 loss: 0.07343178987503052
epoch: 1 step: 60 loss: 0.07322997599840164
epoch: 1 step: 75 loss: 0.07137478142976761
epoch: 2 step: 15 loss: 0.0740855485200882
epoch: 2 step: 30 loss: 0.07332928478717804
epoch: 2 step: 45 loss: 0.07229837030172348
epoch: 2 step: 60 loss: 0.07299891114234924
epoch: 2 step: 75 loss: 0.07120326906442642
epoch: 3 step: 15 loss: 0.07419353723526001
epoch: 3 step: 30 loss: 0.07337061315774918
epoch: 3 step: 45 loss: 0.07239318639039993
epoch: 3 step: 60 loss: 0.07298305630683899
epoch: 3 step: 75 loss: 0.07120394706726074
epoch: 4 step: 15 loss: 0.07427690178155899
epoch: 4 step: 30 loss: 0.07337729632854462
epoch: 4 step: 45 loss: 0.07243189960718155
epoch: 4 step: 60 loss: 0.0729156956076622
epoch: 4 step: 75 loss: 0.07119856774806976
epoch: 5 step: 15 loss: 0.07430651783943176
epoch: 5 step: 30 loss: 0.07340433448553085
epoch: 5 step: 45 loss: 0.07242332

## TEST

In [None]:
use_lstm = False  # Decode with the LSTM (True) or with the CNN head only (False)
tf_rate = 0  # Presumably the teacher-forcing rate; not read by the visible evaluation code

# Running counters for the evaluation below.
char_correct = word_correct = total = 0

# Switch both models to evaluation mode.
Net.eval()
lstm.eval()

def get_str(ch_arr):
    """Join a sequence of decoded symbols into a printable string.

    Args:
        ch_arr: Iterable of vocabulary symbols (single characters or 'NONE').

    Returns:
        The concatenated symbols with every 'NONE' rendered as '_'.
    """
    return ''.join('_' if ch == 'NONE' else ch for ch in ch_arr)

def _decode_chars(flat_scores):
    """Return the highest-scoring symbol of every character slot.

    Args:
        flat_scores: 1-D tensor of length MAX_CAPTCHA * ALL_CHAR_SET_LEN,
            laid out slot after slot.

    Returns:
        Tuple of MAX_CAPTCHA symbols taken from ALL_CHAR_SET.
    """
    per_slot = flat_scores.cpu().numpy().reshape(MAX_CAPTCHA, ALL_CHAR_SET_LEN)
    return tuple(ALL_CHAR_SET[idx] for idx in np.argmax(per_slot, axis=1))


# Evaluate the model on the test dataset (the reshapes below assume batch_size=1).
with torch.no_grad():
    for img, label_oh, label in test_dl:
        img = img.to(device)
        label_oh = label_oh.float().to(device)
        pred, feature = Net(img)  # CNN scores and backbone features

        if use_lstm:
            # NOTE(review): the decoder is fed the ground-truth characters
            # here, so accuracy measured with use_lstm=True benefits from
            # information unavailable at inference time — confirm intent.
            outputs = lstm(feature, label_oh.view(1, MAX_CAPTCHA, ALL_CHAR_SET_LEN).argmax(dim=2))
            pred = outputs.reshape(1, -1)

        # Decode the predicted and the true symbols slot by slot.
        c_arr = _decode_chars(pred.squeeze(0))
        d_arr = _decode_chars(label_oh.squeeze(0))

        # Printable forms, with 'NONE' shown as '_'.
        c = get_str(c_arr)
        d = get_str(d_arr)

        char_correct += sum(pred_ch == true_ch for pred_ch, true_ch in zip(c_arr, d_arr))
        word_correct += (c == d)
        total += 1

        print('label:', d, ', pred:', c)

print(100/MAX_CAPTCHA*char_correct/total) #Character level accuracy
print(100*word_correct/total) #Word level accuracy

"""END TEST"""



label: b9x____ , pred: 7r_____
label: mb_____ , pred: c0h____
label: d5q7qh_ , pred: vr02___
label: 6tl0kqv , pred: oyx____
label: t1_____ , pred: mck____
label: avhjn3z , pred: dc_____
label: 74z0z__ , pred: 7cx____
label: f1kfa__ , pred: 7egx___
label: sripns_ , pred: mcd2v__
label: bg4____ , pred: 56rq___
label: gmb45tz , pred: 7y9b___
label: sr5____ , pred: fu_____
label: 0nt____ , pred: ccx____
label: lxfg98_ , pred: 9w2____
label: 2b8o___ , pred: 7dt____
label: kr25___ , pred: 5cd____
label: fl_____ , pred: 7c8____
label: 0tiwrd_ , pred: el_____
label: k5_____ , pred: 7sd____
label: 8k_____ , pred: s42ed__
label: ggin___ , pred: 7b_____
label: qc6e___ , pred: 7yud___
label: giz6rv_ , pred: x0x____
label: tf15___ , pred: qca____
label: 7jz____ , pred: 7x_____
label: v3zl9__ , pred: 744____
label: p78ec__ , pred: qpw____
label: 7rh____ , pred: ti2____
label: exqo___ , pred: 5u9____
label: yrs____ , pred: 74xa___
label: si_____ , pred: 0e7s___
label: n4tikm_ , pred: f8_____
label: l

'END TEST'