# Captcha Bypass

**Team 99**

Pranavbhai Patel

Nicholas Leung

Coden Mercurius

Ravi Singh

**Description**

CAPTCHA (Completely Automated Public Turing test to tell Computers and Humans Apart) is a challenge-response test that determines whether a user is authentic (human) or inauthentic (machine). A CAPTCHA requires users to authenticate themselves by retyping a character sequence before a request is completed. This notebook implements a CAPTCHA bypass using deep learning. The team aims to investigate weaknesses and vulnerabilities of the CAPTCHA system.

In [None]:
import torch
import torch.nn as nn
import os
from skimage import io
from torch.utils.data import Dataset, DataLoader
import torch.utils.data
import torchvision 
from torchvision import transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import random

torch.manual_seed(0)
random.seed(0)

## Part 1. Data Processing

The dataset for this model is generated using the following library: https://github.com/lepture/captcha. No additional data augmentation is performed at this time. The code used to generate the dataset can be found in the team's private repo under the name `dataset_generator.py`.

**Primary Dataset Characteristics:**
- Uniform distribution of characters used
- 36 characters, 0-9 and A-Z
- 3000 captchas

**Secondary Dataset:**
- A dataset with a smaller character space is also available
- 10 numerical characters, 0-9
- 1981 captchas


The generated datasets are available through the team's private repo. Upload `alphanumeric_dataset.zip` or `numeric_dataset.zip` into the root session storage and run the cells below to unzip it. Update the path as needed.

In [None]:
# Folder containing the labelled sample images; it is handed to
# SegmentationDataset when the dataset is instantiated further down.
segmentation_dataset_path = '/content/2Chars Labeled'

In [None]:
# IPython shell escape: extract the uploaded archive into /content.
!unzip /content/labelled_2Char.zip -d /content

In [None]:
#To delete a folder
#!rm -rf '/content/3Chars_Partialy_Labeled'

In [None]:
class SegmentationDataset(Dataset):
    """Image dataset whose regression target is encoded in each file name.

    Every entry of ``directory`` is treated as one sample. The file name,
    minus its extension, is split on single spaces and the first token is
    parsed as a float and used as the label, e.g. ``"40 foo.png"`` -> ``40.0``.
    """

    def __init__(self, directory):
        """Index the files found in ``directory``.

        Args:
            directory: Path of the folder holding the image files.
        """
        self.directory = directory
        # os.listdir returns entries in arbitrary, platform-dependent order.
        # Sorting keeps the index -> sample mapping stable, so a seeded
        # random_split selects the same files on every machine.
        self.images = sorted(os.listdir(directory))

        # Kept for backward compatibility; __getitem__ does not apply it.
        self.transform = transforms.Compose(
            [transforms.ToTensor()])

    def __len__(self):
        # Assumes each file in the dataset directory represents a data sample
        return len(self.images)

    @staticmethod
    def _parse_label(sample_name):
        """Return the float encoded as the first space-separated name token."""
        # splitext copes with any extension length (".png", ".jpeg", none),
        # unlike slicing off a fixed four characters.
        stem, _extension = os.path.splitext(sample_name)
        return float(stem.split(" ")[0])

    def __getitem__(self, index):
        """Load sample ``index``.

        Returns:
            Tuple ``(image, label)``: ``image`` is the file content as a float
            tensor with one extra leading dimension, ``label`` is a 0-d float
            tensor parsed from the file name.

        Raises:
            ValueError: If the first token of the file name is not a number.
        """
        sample_name = self.images[index]

        # Read the raw pixel values; no rescaling is applied here.
        # NOTE(review): presumably the files are single-channel images, so the
        # unsqueeze adds the channel dimension - confirm against the data.
        pixels = io.imread(os.path.join(self.directory, sample_name))
        image = torch.tensor(pixels).float().unsqueeze(0)

        label = self._parse_label(sample_name)
        return (image, torch.tensor(label))

In [None]:
def get_data_loaders(dataset, batch_size, shuffle_train=False):
    """Split ``dataset`` 70/15/15 and wrap each part in a DataLoader.

    The split is drawn with a generator seeded with 10, so it is the same on
    every call for a dataset of a given length.

    Args:
        dataset: Any sized, indexable dataset.
        batch_size: Batch size used by all three loaders.
        shuffle_train: When True the training loader reshuffles its samples
            every epoch. Defaults to False, which keeps a fixed order.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.
    """
    training_ratio = 0.7
    validation_ratio = 0.15
    # test_ratio implied

    # 0.15 / (1 - 0.7) evaluates to 0.4999999999999999 in binary floating
    # point, so a plain int() truncation loses one validation sample whenever
    # the exact result is a whole number (e.g. 6 * 0.5 -> 2). A tiny tolerance
    # restores the intended floor.
    tolerance = 1e-9
    total = len(dataset)

    train_length = int(total * training_ratio + tolerance)
    validation_share = validation_ratio / (1 - training_ratio)
    validation_length = int((total - train_length) * validation_share + tolerance)
    test_length = total - train_length - validation_length

    train_set, valid_set, test_set = torch.utils.data.random_split(
        dataset,
        [train_length, validation_length, test_length],
        torch.Generator().manual_seed(10),
    )

    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, shuffle=shuffle_train, num_workers=1)
    valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=batch_size, num_workers=1)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=1)

    return train_loader, valid_loader, test_loader

In [None]:
class Chars2(nn.Module):
    """CNN regressor: six conv + max-pool stages, then two linear layers.

    Produces one scalar per input image. The flattening step hard-codes 2000
    features per sample, i.e. 80 channels times 25 spatial positions; an
    80x80 single-channel input, for example, ends up as 80 x 5 x 5.
    """

    def __init__(self):
        super().__init__()
        # 5x5 kernel, stride 1, padding 4: every convolution enlarges height
        # and width by 4 before the following 2x2 pooling halves them.
        conv_kwargs = dict(kernel_size=5, stride=1, padding=4)
        pool_kwargs = dict(kernel_size=2, stride=2)

        self.conv1 = nn.Conv2d(1, 7, **conv_kwargs)
        self.pool1 = nn.MaxPool2d(**pool_kwargs)

        self.conv2 = nn.Conv2d(7, 14, **conv_kwargs)
        self.pool2 = nn.MaxPool2d(**pool_kwargs)

        self.conv3 = nn.Conv2d(14, 28, **conv_kwargs)
        self.pool3 = nn.MaxPool2d(**pool_kwargs)

        self.conv4 = nn.Conv2d(28, 56, **conv_kwargs)
        self.pool4 = nn.MaxPool2d(**pool_kwargs)

        self.conv5 = nn.Conv2d(56, 70, **conv_kwargs)
        self.pool5 = nn.MaxPool2d(**pool_kwargs)

        self.conv6 = nn.Conv2d(70, 80, **conv_kwargs)
        self.pool6 = nn.MaxPool2d(**pool_kwargs)

        self.fc1 = nn.Linear(in_features=2000, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=1)

        # NOTE(review): the slope is negative, so negative pre-activations are
        # mapped to small positive values. Confirm this is intended before
        # changing it - existing checkpoints were presumably trained with it.
        self.lrelu = nn.LeakyReLU(negative_slope=-0.001)

    def forward(self, img):
        """Map a batch of images to a ``(batch, 1)`` tensor of predictions."""
        stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
            (self.conv4, self.pool4),
            (self.conv5, self.pool5),
            (self.conv6, self.pool6),
        )
        features = img
        for conv, pool in stages:
            features = pool(self.lrelu(conv(features)))

        flattened = features.view(-1, 2000)
        hidden = self.lrelu(self.fc1(flattened))
        return self.fc2(hidden)

In [None]:
class Chars3(nn.Module):
    """CNN regressor: four conv + max-pool stages, a fifth conv, two linear layers.

    Produces two values per input image. The flattening step hard-codes 2000
    features per sample, i.e. 80 channels times 25 spatial positions; an
    80x80 single-channel input, for example, ends up as 80 x 5 x 5.
    """

    def __init__(self):
        super().__init__()
        # 5x5 kernel, stride 1, padding 2: convolutions keep height and width,
        # so only the 2x2 pooling layers shrink the feature maps.
        conv_kwargs = dict(kernel_size=5, stride=1, padding=2)
        pool_kwargs = dict(kernel_size=2, stride=2)

        self.conv1 = nn.Conv2d(1, 7, **conv_kwargs)
        self.pool1 = nn.MaxPool2d(**pool_kwargs)

        self.conv2 = nn.Conv2d(7, 14, **conv_kwargs)
        self.pool2 = nn.MaxPool2d(**pool_kwargs)

        self.conv3 = nn.Conv2d(14, 28, **conv_kwargs)
        self.pool3 = nn.MaxPool2d(**pool_kwargs)

        self.conv4 = nn.Conv2d(28, 56, **conv_kwargs)
        self.pool4 = nn.MaxPool2d(**pool_kwargs)

        # The last convolution is not followed by pooling.
        self.conv5 = nn.Conv2d(56, 80, **conv_kwargs)

        self.fc1 = nn.Linear(in_features=2000, out_features=130)
        self.fc2 = nn.Linear(in_features=130, out_features=2)

        # NOTE(review): the slope is negative, so negative pre-activations are
        # mapped to small positive values. Confirm this is intended before
        # changing it - existing checkpoints were presumably trained with it.
        self.lrelu = nn.LeakyReLU(negative_slope=-0.001)

    def forward(self, img):
        """Map a batch of images to a ``(batch, 2)`` tensor of predictions."""
        pooled_stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
            (self.conv4, self.pool4),
        )
        features = img
        for conv, pool in pooled_stages:
            features = pool(self.lrelu(conv(features)))
        features = self.lrelu(self.conv5(features))

        flattened = features.view(-1, 2000)
        hidden = self.lrelu(self.fc1(flattened))
        return self.fc2(hidden)

In [None]:
def trainer(model, data, numepochs, lr, show_plot=True):
    """Train ``model`` with Adam on an MSE objective.

    Args:
        model: Module to optimise in place.
        data: Iterable yielding ``(img, label)`` batches, e.g. a DataLoader.
        numepochs: Number of passes over ``data``.
        lr: Learning rate handed to Adam.
        show_plot: When True (default) the loss curve is redrawn after every
            epoch. Pass False to train without any plotting.

    Returns:
        List with the mean batch loss of each epoch.
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr)

    iters = []
    losses = []
    for epochnum in range(numepochs):
        totloss = 0.0
        num_batches = 0
        for img, label in data:
            out = model(img)  # forward pass

            # Labels arrive as (batch,) while the model emits (batch, 1).
            # Without aligning the shapes MSELoss broadcasts the pair to
            # (batch, batch) and averages over every cross pairing.
            # When the element counts differ (e.g. several outputs per
            # sample) the label is left as is.
            if label.numel() == out.numel():
                label = label.reshape(out.shape)

            # Clear first so gradients left over from earlier calls cannot
            # leak into this step.
            optimizer.zero_grad()
            loss = criterion(out, label)  # compute the total loss
            loss.backward()  # backward pass (compute parameter updates)
            optimizer.step()  # make the updates for each parameter

            totloss += float(loss.item())
            num_batches += 1

        iters.append(epochnum)
        # Average over the batches actually seen; an empty loader reports 0.
        losses.append(totloss / num_batches if num_batches else 0.0)
        print(epochnum)
        # save the current training information
        # torch.save(model.state_dict(), model_path)
        if show_plot:
            plt.title("Loss")
            plt.plot(iters, losses, label="Train")
            plt.xlabel("Epoch")  # the x values are epoch numbers
            plt.ylabel("Loss")
            plt.show()

    return losses

In [None]:
def getaccuracy(model, data, baseline=None):
    """Report the mean MSE of ``model`` over ``data``.

    Despite the name, this measures loss rather than classification accuracy.
    Each batch's label, prediction and loss are printed, followed by the
    average loss.

    Args:
        model: Module to evaluate.
        data: Iterable yielding ``(img, label)`` batches, e.g. a DataLoader.
        baseline: Optional constant. When given, every prediction is replaced
            by this value so the model can be compared against a constant
            predictor (``baseline=40`` gives the reference figure). Defaults
            to None, which scores the model's own output.

    Returns:
        Mean batch loss as a float; 0.0 when ``data`` is empty.
    """
    criterion = nn.MSELoss()

    totloss = 0.0
    num_batches = 0
    # Evaluation only: no gradients are needed.
    with torch.no_grad():
        for img, label in data:
            out = model(img)  # forward pass
            if baseline is not None:
                out = torch.full_like(out, float(baseline))

            # Align (batch,) labels with (batch, 1) outputs so MSELoss does
            # not broadcast the pair to (batch, batch).
            if label.numel() == out.numel():
                label = label.reshape(out.shape)

            loss = criterion(out, label)  # compute the total loss

            print(label, " ", out)
            print(loss)

            totloss += float(loss.item())
            num_batches += 1

    losses = totloss / num_batches if num_batches else 0.0
    print("Average loss is ", losses)
    return losses



In [None]:
# Instantiate dataset
# Instantiate the dataset from the folder configured above.
segmentation_dataset = SegmentationDataset(segmentation_dataset_path)
# Build the train / validation / test loaders with a batch size of 1.
train, valid, test = get_data_loaders(segmentation_dataset, 1)

In [None]:
#save model
#torch.save(model.state_dict(), '3Char.pth')

In [None]:
#Load Model
# Create a Chars2 network and restore its parameters from the '2Char.pth' checkpoint.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model=Chars2()
model.load_state_dict(torch.load('2Char.pth'))

<All keys matched successfully>

In [None]:
"""CLICK ONLY ONCE"""
#New Model
# Rebinds `model` to a freshly initialised Chars2; running this cell again
# replaces whatever weights `model` currently holds.
model=Chars2()

In [None]:

# Fit on the training split. The validation loader is kept out of the
# optimisation so it stays available for unbiased model selection.
trainer(model,train,1000,0.000005)

In [None]:
# Show up to 1000 items at each end of a dimension when tensors are summarised in print output.
torch.set_printoptions(edgeitems=1000)
# Print the per-batch and average loss over the test loader.
getaccuracy(model,test)

tensor([43.])   tensor([40])
tensor(9.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([41.])   tensor([40])
tensor(1.)
tensor([43.])   tensor([40])
tensor(9.)
tensor([45.])   tensor([40])
tensor(25.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([35.])   tensor([40])
tensor(25.)
tensor([41.])   tensor([40])
tensor(1.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([32.])   tensor([40])
tensor(64.)
tensor([39.])   tensor([40])
tensor(1.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([39.])   tensor([40])
tensor(1.)
tensor([43.])   tensor([40])
tensor(9.)
tensor([35.])   tensor([40])
tensor(25.)
tensor([35.])   tensor([40])
tensor(25.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([42.])   tensor([40])
tensor(4.)
tensor([42.])   tensor([40])
tensor(4.)
tensor([40.])   tensor([40])
tensor(0.)
tensor([35.])   tensor([40])
tensor(25.)
tensor([41.])   tensor([40])
tenso

In [None]:
#See Progress
# Walk through the test loader: draw each image with a blue grid whose
# vertical lines sit every 5 pixels, then print the label next to the
# model's prediction.
index = -1
for index, (img, label) in enumerate(test):
    plt.figure(figsize=(8, 8))
    plt.imshow(img.squeeze(), cmap='gray', vmin=0, vmax=1)
    ax = plt.gca()
    ax.set_xticks(np.arange(0, 80, 5))
    plt.grid(color='b', linewidth=2)
    plt.show()

    out = model(img)  # forward pass
    # Give the label the prediction's shape so both print alike.
    label = label.unsqueeze(0).reshape(out.shape)
    print(label, " ", out)
    
  


In [None]:
# Number of batches the validation loader yields per pass.
print(len(valid))