<a href="https://colab.research.google.com/github/ArjunNarendra/455finalproject/blob/main/Simple_Captcha_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final Project: Breaking CAPTCHAs

Our final project will involve solving CAPTCHAs, which are images with sequences of letters and numbers used to "verify" that a user is a human.

## Step 1: Neural Network Modeling

Since CAPTCHAs use uppercase letters, lowercase letters, and digits, we will use the EMNIST dataset to train a classifier. Once it is trained, we can reuse the saved weights, so this section of code only needs to be run once.

In [None]:
import torch
import torchvision

def get_emnist_data():
  """Create the EMNIST 'byclass' data loaders and the class-label list.

  The dataset is downloaded into ./data if it is not already there.

  Returns:
    Dict with keys 'train' and 'test' (DataLoaders with batch size 32; only
    the training loader shuffles) and 'classes' (list mapping a class index
    to its symbol).
  """
  tv_functional = torchvision.transforms.functional

  def make_loader(is_train):
    # Need to perform a rotate and flip to properly orient the images
    pipeline = torchvision.transforms.Compose([
        lambda img: tv_functional.rotate(img, -90),
        lambda img: tv_functional.hflip(img),
        torchvision.transforms.ToTensor()
    ])
    dataset = torchvision.datasets.EMNIST(root='./data', split='byclass', train=is_train,
                                          download=True, transform=pipeline)
    return torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=is_train, num_workers=8)

  # 0-9 are for digits, 10-35 are for uppercase letters, 36-61 are for lowercase letters
  classes = [*range(0, 10), *"ABCDEFGHIJKLMNOPQRSTUVWXYZ", *"abcdefghijklmnopqrstuvwxyz"]
  return {'train': make_loader(True), 'test': make_loader(False), 'classes': classes}

In [None]:
# Build the EMNIST train/test loaders and the class-label list
# (the dataset is downloaded into ./data when it is not already present)
data = get_emnist_data()

In [None]:
# Print the instance attributes of the training DataLoader for inspection
print(data['train'].__dict__)

In [None]:
# Print the instance attributes of the testing DataLoader for inspection
print(data['test'].__dict__)

In [None]:
# Get images and labels for one batch of the training data
# (`images` and `labels` are reused by the visualization cell that follows)
dataiter = iter(data['train'])
images, labels = next(dataiter)
# Print the shape of the image batch
print(images.size())

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torchvision

def imshow(img):
  """Display an image tensor with matplotlib.

  Args:
    img: tensor with three axes; axis 0 is moved to the last position
      before plotting.
  """
  # Reorder the axes (0, 1, 2) -> (1, 2, 0) so that axis 0 ends up last
  channels_last = img.numpy().transpose(1, 2, 0)
  plt.imshow(channels_last)
  plt.show()

# Show first batch of images, tiled into a single grid image
imshow(torchvision.utils.make_grid(images))
# Print labels for the first 8 images of the batch (digits or letters)
print("Labels:" + ' '.join('%9s' % data['classes'][labels[j]] for j in range(8)))

# Compare the batch shape before and after flattening: flattening from
# dim 1 keeps the batch dimension and merges all remaining dimensions
print(images.size())
flat = torch.flatten(images, 1)
print(flat.size())

In [None]:
import torch

# Use the first CUDA GPU if one is available; otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
import torch.optim as optim

def train(net, dataloader, epochs=1, lr=0.01, momentum=0.9, decay=0.0005, verbose=1):
  """Fit `net` on `dataloader` using SGD with cross-entropy loss.

  Args:
    net: model to optimize; it is moved to the global `device` first.
    dataloader: iterable of batches where batch[0] holds the inputs and
      batch[1] the labels.
    epochs: number of full passes over `dataloader`.
    lr: learning rate handed to optim.SGD.
    momentum: momentum handed to optim.SGD.
    decay: weight decay handed to optim.SGD.
    verbose: when truthy, print the mean loss of every 100 batches.

  Returns:
    List with the loss value of every processed batch, in order.
  """
  net.to(device)
  loss_fn = nn.CrossEntropyLoss()
  sgd = optim.SGD(net.parameters(), lr=lr, momentum=momentum, weight_decay=decay)

  history = []
  for epoch in range(epochs):
    window_loss = 0.0
    for step, batch in enumerate(dataloader, start=1):
      # Move the inputs and their labels for this batch onto the device
      inputs = batch[0].to(device)
      targets = batch[1].to(device)

      # Clear old gradients, then forward pass, backward pass, weight update
      sgd.zero_grad()
      batch_loss = loss_fn(net(inputs), targets)
      batch_loss.backward()
      sgd.step()

      loss_value = batch_loss.item()
      history.append(loss_value)
      window_loss += loss_value

      # Report and reset the running loss once every 100 batches
      if step % 100 != 0:
        continue
      if verbose:
        print('[%d, %5d] loss: %.3f' %
          (epoch + 1, step, window_loss / 100))
      window_loss = 0.0
  return history

In [None]:
import torch.nn as nn
import torch.nn.functional as F

# Convolutional neural network with two convolutional layers
class ConvNet(nn.Module):
  """Two strided convolutions followed by one linear layer (62 outputs)."""

  def __init__(self):
    super().__init__()
    # 28x28x1 input -> 14x14x16 output:
    # 16 filters of size 3x3 (1 input channel each), stride 2 halves
    # each spatial dimension
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)

    # 14x14x16 input -> 7x7x32 output:
    # 32 filters of size 3x3 (16 input channels each), stride 2 again
    self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1)

    # Fully connected layer: 7 * 7 * 32 = 1568 features -> 62 class scores
    self.fc1 = nn.Linear(in_features=7 * 7 * 32, out_features=62)

  def forward(self, x):
    hidden = F.relu(self.conv1(x))
    hidden = F.relu(self.conv2(hidden))
    # Keep the batch dimension and flatten everything else for the linear layer
    return self.fc1(torch.flatten(hidden, start_dim=1))

In [None]:
# Train ConvNet with a stepped (annealed) learning rate: two epochs each at
# lr=0.1, 0.01 and 0.001. Every train() call builds a fresh SGD optimizer,
# so momentum state is not carried over between the three stages.
anneal_net = ConvNet()

anneal_losses =  train(anneal_net, data['train'], epochs=2, lr=.1)
anneal_losses += train(anneal_net, data['train'], epochs=2, lr=.01)
anneal_losses += train(anneal_net, data['train'], epochs=2, lr=.001)

# Plot the per-batch loss across all three stages
plt.plot(anneal_losses)

In [None]:
from google.colab import files

# Save the model weights (state dict only, not the model object) into a
# file, then download that file through the Colab helper
torch.save(anneal_net.state_dict(), 'model_state.pth')
files.download('model_state.pth')

In [None]:
def accuracy(net, dataloader):
  """Return the fraction of samples in `dataloader` that `net` labels correctly.

  Args:
    net: model mapping a batch of inputs to one score per class.
    dataloader: iterable of batches where batch[0] holds the inputs and
      batch[1] the integer class labels.

  Returns:
    correct / total as a float, or 0.0 when `dataloader` yields no samples.
  """
  correct = 0
  total = 0
  # Gradients are not needed for evaluation
  with torch.no_grad():
    for batch in dataloader:
      # Get images and labels for the current batch
      images, labels = batch[0].to(device), batch[1].to(device)
      # The predicted label is the index of the highest class score
      predicted = net(images).argmax(dim=1)
      # Tally up the number of correct predictions that our network made
      total += labels.size(0)
      correct += (predicted == labels).sum().item()
  # An empty dataloader has nothing to score; avoid dividing by zero
  return correct / total if total else 0.0

In [None]:
# Accuracy of the freshly trained network on the training split
print("AnnealNet train accuracy: %f" % accuracy(anneal_net, data['train']))

In [None]:
# Accuracy of the freshly trained network on the held-out test split
print("AnnealNet test accuracy: %f" % accuracy(anneal_net, data['test']))


## Step 2: Load images

We load images of CAPTCHAs from a Kaggle dataset.

In [None]:
# NOTE: First, must upload kaggle.json with credentials
# Install the kaggle library
! pip install kaggle
# Make a directory named .kaggle (-p: do not fail if it already exists,
# e.g. when this cell is run a second time)
! mkdir -p ~/.kaggle
# Copy the kaggle.json into this new directory
! cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the simple CAPTCHA dataset (requires the Kaggle credentials
# installed under ~/.kaggle)
! kaggle datasets download fanbyprinciple/captcha-images

In [None]:
# Unzip the downloaded archive
# NOTE(review): the loading code reads captcha_images/*.png, so the archive
# presumably extracts into captcha_images/ - confirm
! unzip captcha-images.zip

In [None]:
import glob
import torchvision
import re
from PIL import Image
import numpy as np

# Dictionary for images. Key is the 4-character label parsed from the file
# path. Value is the grayscale pixel data stored as a uint8 NumPy array.
imagesDict = {}

for im_path in glob.glob("captcha_images/*.png"):
  # Extract relevant file name information (four characters from A-Z / 1-9)
  labelMatch = re.search(r"[A-Z1-9]{4}", im_path)
  if labelMatch is None:
    # Skip unlabeled files instead of crashing on None.group()
    print("Skipping %s: no CAPTCHA label found in file name" % im_path)
    continue
  filename = labelMatch.group()
  # Convert the image to one channel; the context manager closes the file
  # handle that Image.open would otherwise leave open
  with Image.open(im_path) as rawImage:
    oneChannelImage = rawImage.convert("L")
  # Transform the image to a numpy array
  imageData = np.array(oneChannelImage, dtype=np.uint8)
  # Add the key-value mapping to dictionary
  imagesDict[filename] = imageData

In [None]:
import matplotlib.pyplot as plt

# Get all the tags of the images in this dataset
imagesTags = list(imagesDict.keys())
# Get the tag corresponding to the first image
firstImageTag = imagesTags[0]
# Get the image data corresponding to the tag
# (`imageData` is reused by the segmentation demo cell later on)
imageData = imagesDict.get(firstImageTag)

# Show this image using the gray colormap
# NOTE(review): the original note gives the size as 24 x 72 - the shape
# printed at the end of this cell should confirm it
plt.gray()
plt.imshow(imageData)
plt.show()

# Print out its dimensions
print(imageData.shape)

## Step 3: Character Segmentation

We find bounding boxes around each character in a CAPTCHA, extract the individual characters, and normalize them.

In [None]:
import cv2

def segment_nonnoise(im, min_char_area=50):
  """Split a grayscale CAPTCHA image into individual 28x28 character images.

  The image is blurred and adaptively thresholded, contours are extracted,
  and every contour whose area reaches `min_char_area` is cropped from the
  original image, padded to a square with white, and resized to 28x28.

  Args:
    im: 2-D uint8 grayscale image (NumPy array).
    min_char_area: smallest contour area, in pixels, that is treated as a
      character; smaller contours are discarded as noise.

  Returns:
    List of 28x28 uint8 arrays ordered from left to right. The pixels keep
    the polarity of `im`; only the padding is forced to white (255).
  """
  blurred = cv2.blur(im, (5, 5))
  img_thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

  # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
  # (image, contours, hierarchy) on 3.x; the contours are always second to last
  contours = cv2.findContours(img_thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]

  # cv2.boundingRect returns X, Y, and width and height of the bounding
  # box. Keep the boxes of sufficiently large contours and sort them by X
  # so the characters come out from left to right (the sort is stable, so
  # boxes with equal X keep their contour order)
  boxes = [cv2.boundingRect(contour) for contour in contours
           if cv2.contourArea(contour) >= min_char_area]
  boxes.sort(key=lambda box: box[0])

  ind_chars = []
  for x, y, w, h in boxes:
    char = im[y:y+h, x:x+w]

    # EMNIST uses square images
    square = max(w, h)

    # Set background to white
    square_char = np.full((square, square), 255, dtype=np.uint8)

    # center the image
    x_off = (square - w) // 2
    y_off = (square - h) // 2
    square_char[y_off:y_off+h, x_off:x_off+w] = char

    # resize to 28x28 pixels, since this is the size EMNIST operates on.
    # Use INTER_AREA since according to the docs, this is what works best
    # for shrinking and for zooming it is approximately nearest-neighbors
    # interpolation
    ind_chars.append(cv2.resize(square_char, (28, 28), interpolation=cv2.INTER_AREA))

  return ind_chars

In [None]:
# Segment the sample CAPTCHA into per-character images and show each one
# (`characters` is reused by the prediction cell in Step 4)
characters = segment_nonnoise(imageData)
# Number of character candidates that were found
print(len(characters))
for character in characters:
  plt.imshow(character)
  plt.show()

## Step 4: CAPTCHA Processing

Once we have trained our neural network model on the EMNIST dataset, we can proceed to test our model on a CAPTCHA. 

In [None]:
import torch.nn as nn
import torch.nn.functional as F

# Convolutional neural network with two convolutional layers
class ConvNet(nn.Module):
  """Two strided convolutions followed by one linear layer (62 outputs)."""

  def __init__(self):
    super().__init__()
    # 28x28x1 input -> 14x14x16 output:
    # 16 filters of size 3x3 (1 input channel each), stride 2 halves
    # each spatial dimension
    self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)

    # 14x14x16 input -> 7x7x32 output:
    # 32 filters of size 3x3 (16 input channels each), stride 2 again
    self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=1)

    # Fully connected layer: 7 * 7 * 32 = 1568 features -> 62 class scores
    self.fc1 = nn.Linear(in_features=7 * 7 * 32, out_features=62)

  def forward(self, x):
    hidden = F.relu(self.conv1(x))
    hidden = F.relu(self.conv2(hidden))
    # Keep the batch dimension and flatten everything else for the linear layer
    return self.fc1(torch.flatten(hidden, start_dim=1))

In [None]:
import torch

# Use the first CUDA GPU if one is available; otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
import torch

# Reload weights from previously trained model
# NOTE: Must upload file that contains model's pre-trained weights
anneal_net = ConvNet()
anneal_net.to(device)
# map_location remaps the saved tensors onto the current device, so weights
# saved in a GPU session can also be loaded on a CPU-only runtime
state_dict = torch.load('model_state.pth', map_location=device)
anneal_net.load_state_dict(state_dict)

In [None]:
def accuracy(net, dataloader):
  """Return the fraction of samples in `dataloader` that `net` labels correctly.

  Args:
    net: model mapping a batch of inputs to one score per class.
    dataloader: iterable of batches where batch[0] holds the inputs and
      batch[1] the integer class labels.

  Returns:
    correct / total as a float, or 0.0 when `dataloader` yields no samples.
  """
  correct = 0
  total = 0
  # Gradients are not needed for evaluation
  with torch.no_grad():
    for batch in dataloader:
      # Get images and labels for the current batch
      images, labels = batch[0].to(device), batch[1].to(device)
      # The predicted label is the index of the highest class score
      predicted = net(images).argmax(dim=1)
      # Tally up the number of correct predictions that our network made
      total += labels.size(0)
      correct += (predicted == labels).sum().item()
  # An empty dataloader has nothing to score; avoid dividing by zero
  return correct / total if total else 0.0

In [None]:
import torch
import torchvision

def get_emnist_data():
  """Create the EMNIST 'byclass' data loaders and the class-label list.

  The dataset is downloaded into ./data if it is not already there.

  Returns:
    Dict with keys 'train' and 'test' (DataLoaders with batch size 32; only
    the training loader shuffles) and 'classes' (list mapping a class index
    to its symbol).
  """
  tv_functional = torchvision.transforms.functional

  def make_loader(is_train):
    # Need to perform a rotate and flip to properly orient the images
    pipeline = torchvision.transforms.Compose([
        lambda img: tv_functional.rotate(img, -90),
        lambda img: tv_functional.hflip(img),
        torchvision.transforms.ToTensor()
    ])
    dataset = torchvision.datasets.EMNIST(root='./data', split='byclass', train=is_train,
                                          download=True, transform=pipeline)
    return torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=is_train, num_workers=8)

  # 0-9 are for digits, 10-35 are for uppercase letters, 36-61 are for lowercase letters
  classes = [*range(0, 10), *"ABCDEFGHIJKLMNOPQRSTUVWXYZ", *"abcdefghijklmnopqrstuvwxyz"]
  return {'train': make_loader(True), 'test': make_loader(False), 'classes': classes}

In [None]:
# Rebuild the EMNIST loaders so the reloaded model can be sanity-checked
data = get_emnist_data()

In [None]:
# Ensure that the pre-trained model gives reasonable accuracy on the EMNIST dataset
# (both the training and the testing split are evaluated)
print("AnnealNet train accuracy: %f" % accuracy(anneal_net, data['train']))
print("AnnealNet test accuracy: %f" % accuracy(anneal_net, data['test']))

In [None]:
def flipColor(character):
  """Binarize and invert a grayscale character image in place.

  Pixels strictly brighter than 150 become 0 and all remaining pixels
  (150 and below) become 255.

  Args:
    character: NumPy array of pixel intensities; it is modified in place.

  Returns:
    The same array object that was passed in.
  """
  # Build the mask first so both assignments are based on the original values
  bright = character > 150
  character[bright] = 0
  character[~bright] = 255
  return character

In [None]:
def createIndexToCharacterMap():
  """Build the lookup list from a class index to the symbol it stands for.

  Returns:
    List of 62 entries: the integers 0-9 at indices 0-9, the uppercase
    letters A-Z at indices 10-35 and the lowercase letters a-z at 36-61.
  """
  digits = list(range(0, 10))
  uppercase = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
  lowercase = list("abcdefghijklmnopqrstuvwxyz")
  return digits + uppercase + lowercase

In [None]:
import torch
import numpy as np
from PIL import Image
from torchvision import transforms

indexToCharacterMap = createIndexToCharacterMap()
transform = transforms.Compose([transforms.ToTensor()])

# Feed each character of the CAPTCHA into the neural network and make a prediction
for character in characters:
  # Flip color schema of character to match that of the EMNIST dataset.
  # flipColor modifies its argument in place, so work on a copy; otherwise
  # re-running this cell would invert the entries of `characters` again.
  character = flipColor(character.copy())
  plt.imshow(character)
  plt.show()
  # Transform ndarray to tensor before feeding the character into the neural network
  character = transform(character)
  with torch.no_grad():
    # unsqueeze(0) adds the batch dimension that the network expects
    output = anneal_net(character.unsqueeze(0).to(device))
    modelPrediction = output.argmax(dim=1)
    print("Prediction: " + str(indexToCharacterMap[modelPrediction.item()]))


## Step 5: Putting it all Together

Let us see how well our model performs on all the CAPTCHA images in the dataset.

In [None]:
from torchvision import transforms

def accuracy(net, imagesDict):
  """Measure how well `net` solves the CAPTCHAs in `imagesDict`.

  Each image is segmented into characters, every character is classified
  by `net`, and the predictions are compared position by position with
  the characters of the image's tag.

  Args:
    net: classifier used for the predictions.
    imagesDict: dict mapping a CAPTCHA's text (tag) to its grayscale image.

  Returns:
    [captcha_accuracy, character_accuracy]: the fraction of CAPTCHAs whose
    tag characters were all predicted correctly, and the fraction of tag
    characters predicted correctly. Both are 0.0 for an empty `imagesDict`.
  """
  # Nothing to evaluate: report zero accuracy instead of dividing by zero
  if not imagesDict:
    return [0.0, 0.0]

  # Initial set up
  numCorrectCAPTCHAs = 0
  numCorrectCharacters = 0
  numTotalCharacters = 0
  indexToCharacterMap = createIndexToCharacterMap()
  transform = transforms.Compose([transforms.ToTensor()])

  # Go through each CAPTCHA image in the dataset and calculate relevant statistics
  for imageTag, imageData in imagesDict.items():
    characters = segment_nonnoise(imageData)
    numCorrectCharactersInCAPTCHA = 0
    # zip stops at the shorter sequence, so segmented regions beyond the
    # tag length are ignored and missing regions simply count as wrong
    for expected, character in zip(imageTag, characters):
      character = transform(flipColor(character))
      with torch.no_grad():
        # Use the network that was passed in, not a notebook global
        output = net(character.unsqueeze(0).to(device))
      prediction = str(indexToCharacterMap[output.argmax(dim=1).item()])
      if prediction == expected:
        numCorrectCharactersInCAPTCHA += 1

    # Perform updates
    if numCorrectCharactersInCAPTCHA == len(imageTag):
      numCorrectCAPTCHAs += 1
    numCorrectCharacters += numCorrectCharactersInCAPTCHA
    numTotalCharacters += len(imageTag)

  characterAccuracy = numCorrectCharacters / numTotalCharacters if numTotalCharacters else 0.0
  return [numCorrectCAPTCHAs / len(imagesDict), characterAccuracy]

In [None]:
# Evaluate the classifier on every CAPTCHA in the dataset: index 0 is the
# whole-CAPTCHA accuracy, index 1 the per-character accuracy
accuracies = accuracy(anneal_net, imagesDict)
print("CAPTCHA accuracy: %f" % accuracies[0])
print("Character accuracy: %f" % accuracies[1])