# Setup

In [None]:
# Install the third-party packages imported below (opendatasets, imutils).
# The -q flag keeps pip's output quiet.
!pip install opendatasets -q
!pip install imutils -q

## Import Libraries

In [None]:
import cv2
import imutils
import glob
import matplotlib.pyplot as plt
import numpy as np
import opendatasets as od
import os
import tensorflow as tf
import tensorflow.keras.layers as layers

# Data Loading

In [None]:
# Fetch the captcha image dataset from its Kaggle URL via opendatasets.
dataset_url = "https://www.kaggle.com/datasets/johnbergmann/captcha-image-dataset"
od.download(dataset_url)

In [None]:
# Folder that is scanned for the captcha image files.
# NOTE(review): presumably where od.download() unpacks the dataset when the
# notebook runs from /content (e.g. on Colab) — confirm for other environments.
IMAGE_FOLDER = "/content/captcha-image-dataset/captchas"
# Root folder for the extracted letter images; one sub-folder per letter is
# created inside it when the letters are saved.
OUTPUT_FOLDER = "extracted_letter_images"

# Split the captcha into individual letters

## Get all the captcha images

In [None]:
# Collect every (non-hidden) entry of the captcha folder.
search_pattern = os.path.join(IMAGE_FOLDER, "*")
captcha_image_files = glob.glob(search_pattern)

# Running counter per letter, used to number the saved letter images.
counts = dict()

## Loop over the images

In [None]:
# Split every captcha into its individual letters and save each letter as
# OUTPUT_FOLDER/<letter>/<running number>.png.
# Skipped (nothing is written): files whose name carries no label, files that
# OpenCV cannot read, and captchas that do not yield exactly six letter regions.
for (i, captcha_image_file) in enumerate(captcha_image_files):
  print("[INFO] processing image {}/{}".format(i + 1, len(captcha_image_files)))

  # Grab the label from the file name: the second underscore-separated field,
  # without its extension, lower-cased
  filename = os.path.basename(captcha_image_file)
  name_parts = filename.split('_')
  if len(name_parts) < 2:
    # The glob pattern matches every file in the folder, so skip names that
    # carry no label instead of crashing with an IndexError
    print("[WARN] skipping {}: no label in file name".format(filename))
    continue
  captcha_correct_label = os.path.splitext(name_parts[1])[0].lower()

  # Load the image; cv2.imread returns None when the file cannot be decoded
  image = cv2.imread(captcha_image_file)
  if image is None:
    print("[WARN] skipping {}: not a readable image".format(filename))
    continue

  # Convert the image to grayscale
  gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

  # Add 8 pixels of padding on every side of the image
  gray_image = cv2.copyMakeBorder(gray_image, 8, 8, 8, 8, cv2.BORDER_REPLICATE)

  # Threshold the image to convert it to pure black and white
  thresh = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

  # Find the outer contours of the thresholded image
  contours = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

  # The contour list sits at index 1 of the result on OpenCV 3, index 0 otherwise
  contours = contours[1] if imutils.is_cv3() else contours[0]

  letter_image_regions = []

  # Turn every contour into one letter region (or two, if it looks conjoined)
  for contour in contours:
    # Get the rectangle that contains the contour
    (x, y, w, h) = cv2.boundingRect(contour)

    # A contour much wider than it is tall is treated as two conjoined letters
    if w / h > 1.25:
      # Conjoined letters: split the rectangle into two halves
      half_width = w // 2
      letter_image_regions.append((x, y, half_width, h))
      letter_image_regions.append((x + half_width, y, half_width, h))
    else:
      # Normal letter
      letter_image_regions.append((x, y, w, h))

  # If exactly 6 letter regions were not found then skip the image
  if len(letter_image_regions) != 6:
    continue

  # Sort the regions left to right so they line up with the label characters
  letter_image_regions = sorted(letter_image_regions, key=lambda region: region[0])

  # Save each letter as single image
  for letter_bounding_box, letter_text in zip(letter_image_regions, captcha_correct_label):
    x, y, w, h = letter_bounding_box

    # Extract the letter from the padded grayscale image with a 2 pixel margin.
    # The start indices are clamped at 0: a negative start would wrap around
    # and produce an empty or wrong crop for regions near the top/left border.
    top = max(y - 2, 0)
    left = max(x - 2, 0)
    letter_image = gray_image[top:y + h + 2, left:x + w + 2]
    save_path = os.path.join(OUTPUT_FOLDER, letter_text)

    # Create the output directory for this letter if it does not exist yet
    os.makedirs(save_path, exist_ok=True)

    # Write the letter image to a file named after the zero-padded counter
    count = counts.get(letter_text, 1)
    p = os.path.join(save_path, "{}.png".format(str(count).zfill(6)))
    cv2.imwrite(p, letter_image)

    # Increment the count for the current key
    counts[letter_text] = count + 1