# Data Preparation

In [1]:
#importing the dependencies
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
# Folder holding the generated CAPTCHA images; the notebook lists it and moves
# its files into train_images/ and test_images/ sub-folders.
CAPTCHA_IMAGE_FOLDER = "generated_captcha_images"
# Root folder under which preprocess() writes the cropped letter images,
# one sub-folder per character.
OUTPUT_FOLDER = "extracted_letter_images"

In [2]:
#forming the train and test directories
TEST_SET_SIZE = 100  # number of CAPTCHA images held out for testing
TRAIN_FOLDER = CAPTCHA_IMAGE_FOLDER + '/' + 'train_images'
TEST_FOLDER = CAPTCHA_IMAGE_FOLDER + '/' + 'test_images'

# exist_ok=True keeps this cell re-runnable; plain os.mkdir raises
# FileExistsError the second time the cell is executed.
os.makedirs(TRAIN_FOLDER, exist_ok=True)
os.makedirs(TEST_FOLDER, exist_ok=True)

# Only regular, non-hidden files still sitting directly in CAPTCHA_IMAGE_FOLDER
# have to be distributed. This skips the two split folders themselves (which
# os.listdir returns once they exist) and OS metadata such as .DS_Store.
# os.listdir order is arbitrary, so which images land in the test set is
# arbitrary as well.
image_names = [
    name for name in os.listdir(CAPTCHA_IMAGE_FOLDER)
    if not name.startswith('.')
    and os.path.isfile(CAPTCHA_IMAGE_FOLDER + '/' + name)
]

# Top the test folder up to TEST_SET_SIZE images; everything else goes to
# training. On a fresh folder this is "first 100 -> test, rest -> train".
already_in_test = [name for name in os.listdir(TEST_FOLDER) if not name.startswith('.')]
test_slots = max(0, TEST_SET_SIZE - len(already_in_test))

for image in image_names[:test_slots]:
    os.rename(CAPTCHA_IMAGE_FOLDER + '/' + image, TEST_FOLDER + '/' + image)
for image in image_names[test_slots:]:
    os.rename(CAPTCHA_IMAGE_FOLDER + '/' + image, TRAIN_FOLDER + '/' + image)

# Read the final split back from disk so the name lists are correct even when
# the files were already moved by an earlier execution of this cell. Sorting
# makes the downstream processing order deterministic.
train_image_names = sorted(name for name in os.listdir(TRAIN_FOLDER) if not name.startswith('.'))
test_image_names = sorted(name for name in os.listdir(TEST_FOLDER) if not name.startswith('.'))

## Forming our data set of single characters using contours to segment the CAPTCHA letters

In [3]:
def preprocess(image_names):
    """Cut each training CAPTCHA into single-letter images and save them.

    Every name in ``image_names`` is looked up inside
    ``CAPTCHA_IMAGE_FOLDER/train_images``. The image is converted to
    grayscale, padded by 8 pixels, binarised with Otsu's threshold, and its
    external contours are turned into bounding boxes. A box whose width is
    more than 1.5x its height is split into two equal halves. Images that do
    not end up with exactly four boxes are skipped.

    The part of the file name before the first '.' is used as the label: its
    characters are paired, left to right, with the boxes sorted by x. Each
    crop is written to ``OUTPUT_FOLDER/<letter>/<NNNNNN>.png``, where the
    counter starts at 1 for every letter on each call (so a repeated call
    overwrites the files of the previous one).

    Parameters
    ----------
    image_names : list of str
        File names (not paths) of the CAPTCHA images to process. Names
        starting with '.' are ignored.

    Returns
    -------
    None
    """
    letter_count={}
    captcha_img_folder=CAPTCHA_IMAGE_FOLDER+'/'+'train_images'
    total=len(image_names)
    for i,image in enumerate(image_names,start=1):
        print("images processed: ",i,"out of ",total)
        if image.startswith('.'):
            continue
        im_path=captcha_img_folder+'/'+image
        img=cv2.imread(im_path) # reading the image
        if img is None:
            # cv2.imread does not raise on a missing/corrupt file, it returns
            # None; skip instead of crashing the whole run inside cvtColor.
            print("skipping unreadable image: ",im_path)
            continue
        gray=cv2.cvtColor(img,cv2.COLOR_BGR2GRAY) # converting to gray scale
        gray=cv2.copyMakeBorder(gray,8,8,8,8,cv2.BORDER_REPLICATE) # add some extra padding around the image
        thresh=cv2.threshold(gray,0,255,cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1] # convert to pure black and white
        found=cv2.findContours(thresh,cv2.RETR_EXTERNAL,cv2.CHAIN_APPROX_SIMPLE) # continuous blobs of pixels
        # OpenCV 3.x returns (image, contours, hierarchy) while 2.x and 4.x
        # return (contours, hierarchy); pick the contour list in either case.
        contours=found[0] if len(found)==2 else found[1]

        im_letters=[]
        for contour in contours:
            (x,y,w,h)=cv2.boundingRect(contour)
            if w/h>1.5:
                # Too wide to be one letter: two letters are touching, so
                # split the box down the middle.
                half_w=w//2
                im_letters.append((x,y,half_w,h))
                im_letters.append((x+half_w,y,half_w,h))
            else:
                im_letters.append((x,y,w,h))
        if len(im_letters)!=4:
            # Segmentation did not produce one box per character; drop the
            # sample rather than save mislabeled letters.
            continue
        # Left-to-right order so boxes line up with the characters of the label.
        im_letters.sort(key=lambda box: box[0])

        capcha=str(image).split('.')[0]
        for letter_box,letter in zip(im_letters,capcha): # save out each letter as a single image
            (x,y,w,h)=letter_box
            # 2-pixel margin around the letter. The start indices are clamped
            # at 0: a negative slice start would wrap around to the far edge
            # of the array and produce an empty or wrong crop.
            top=max(y-2,0)
            left=max(x-2,0)
            letter_img=gray[top:y+h+2,left:x+w+2]
            l_count=letter_count.get(letter,1)
            l_imfolder_path=OUTPUT_FOLDER+'/'+letter
            l_img_path=l_imfolder_path+'/'+str(l_count).zfill(6)+'.png'
            # Creates OUTPUT_FOLDER itself when missing and avoids re-listing
            # the output directory for every single letter.
            os.makedirs(l_imfolder_path,exist_ok=True)
            cv2.imwrite(l_img_path,letter_img) # write the letter image to a file
            letter_count[letter]=l_count+1

In [None]:
# Extract and save the single-letter images for every CAPTCHA in the training split.
preprocess(train_image_names)