## Libraries

In [7]:
# Determinadas.
import numpy as np
import pandas
import matplotlib.pyplot as plt
import random

# Procesamiento de imágenes.
import numpy as np
import cv2
import os

## Data collection

In [8]:
# Listas a utilizar.
lettersImgs = []
numbersImgs = []

for folderName1 in os.listdir('Images datasets'):
    # Folder contenedor de los folders de las imágenes de las letras.
    if folderName1.startswith("Letters Images"):
        # Folder contenedor de las imágenes de la letra en específico.
        for folderName2 in os.listdir('Images datasets/' + folderName1 ):
            imgs = []
            # Imagen de la letra en específico.
            for fileName in os.listdir('Images datasets/' + folderName1 + '/' + folderName2):
                img = cv2.imread('Images datasets/' + folderName1 + '/' + folderName2 + '/' + fileName)
                imgs.append(img)
            lettersImgs.append(imgs)
            
    # Folder contenedor de los folders de las imágenes de los números.
    if folderName1.startswith("Numbers Images"):
        # Folder contenedor de las imágenes del número en específico.
        for folderName2 in os.listdir('Images datasets/' + folderName1 ):
            imgs = []
            # Imagen del número en específico.
            for fileName in os.listdir('Images datasets/' + folderName1 + '/' + folderName2):
                img = cv2.imread('Images datasets/' + folderName1 + '/' + folderName2 + '/' + fileName)
                imgs.append(img)
            numbersImgs.append(imgs)

## Create custom dataset

### Image processing

In [9]:
def getImageProcessing(characterImg):
    # BGR a escala de grises.
    grayCharacterImg = cv2.cvtColor(characterImg.copy(), cv2.COLOR_BGR2GRAY)

    # Umbralización de Otsu después del filtrado gaussiano.
    # binaryCharacterImg = cv2.threshold(grayCharacterImg.copy(), 180, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    binaryCharacterImg = cv2.threshold(grayCharacterImg.copy(), 180, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Encontrar contornos de la imagen.
    countours, hierarchy = cv2.findContours(binaryCharacterImg.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Obtener rectángulos de cada contorno.
    rectangles = [cv2.boundingRect(countour) for countour in countours]
    
    # Buscar el mayor rectángulo.
    biggestRect = [0,0,-1,-1]
    for rect in rectangles:
        if rect[2]*rect[3] > biggestRect[2]*biggestRect[3]:
            biggestRect = rect

    # Obtener imagen del sector del rectángulo.
    characterImgn = characterImg[biggestRect[1]:biggestRect[1] + biggestRect[3], biggestRect[0]:biggestRect[0] + biggestRect[2]]

    # Redimensionar imagen a 28x28 píxeles.
    resizeWidth = 28
    resizeHeight = 28
    points = (resizeWidth, resizeHeight)
    resizeImg = cv2.resize(characterImgn.copy(), points, interpolation= cv2.INTER_LINEAR)

    # Convertir a escala de grises.
    grayImg = cv2.cvtColor(resizeImg.copy(), cv2.COLOR_BGR2GRAY)

    # Eliminar imperfecciones con filtrado gaussiano.
    blur = cv2.GaussianBlur(grayImg.copy(),(7,7),0)

    # Aplicar umbralización de Otsu después del filtrado gaussiano.
    # binary = cv2.threshold(blur, 180, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    binary = cv2.threshold(blur.copy(), 180, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    
    # Retornar imagen procesada.
    return binary

### Image treatment

In [10]:
# Listas de información de imágenes.
lettersData = []
numbersData = []

# Por cada letra.
for i in range( len(lettersImgs) ):
    # Imagen de la letra en específico.
    for j in range( len(lettersImgs[i]) ):
        # Copiar imagen original.
        letterImgCopy = lettersImgs[i][j].copy()
        
        # Aplicar procesamiento de imagen.
        processedImg = getImageProcessing(letterImgCopy)
        
        # Convertir a una sola dimensión.
        imgFlat = processedImg.flatten()
        
        # Obtener la letra en específico.
        asc = i+65
        
        # Agregar Y al vector plano.
        imgFlat = np.append(imgFlat, asc)
        
        # Agregar a la matrix de datos.
        lettersData.append(imgFlat)
        
        
# Por cada número.
for i in range( len(numbersImgs) ):
    # Imagen del número en específico.
    for j in range( len(numbersImgs[i]) ):
        # Copiar imagen original.
        numbersImgCopy = numbersImgs[i][j].copy()
        
        # Aplicar procesamiento de imagen.
        processedImg = getImageProcessing(numbersImgCopy)
        
        # Convertir a una sola dimensión.
        imgFlat = processedImg.flatten()
        
        # Agregar Y al vector plano.
        imgFlat = np.append(imgFlat, i)
        
        # Agregar a la matrix de datos.
        numbersData.append(imgFlat)

## Randomize the letters dataset

In [35]:
# Aleatorizar posición de las imágenes de letras en el dataset.
random.shuffle(lettersData)

# Convertir información a DataFrame.
lettersDF = pandas.DataFrame(lettersData)

# Mostrar dataset.
lettersDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,0,0,0,0,0,0,0,0,255,255,...,255,0,0,0,0,0,0,0,0,77
1,0,0,0,0,0,255,255,255,255,0,...,0,0,0,0,0,0,0,0,0,81
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,66
3,0,0,0,0,0,0,0,0,255,255,...,255,0,0,0,0,0,0,0,0,66
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26409,255,255,255,255,255,255,255,255,255,0,...,0,0,0,0,0,0,0,0,0,89
26410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71
26411,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,79
26412,255,255,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,86


## Randomize the numbers dataset

In [36]:
# Aleatorizar posición de las imágenes de números en el dataset.
random.shuffle(numbersData)

# Convertir información a DataFrame.
numbersDF = pandas.DataFrame(numbersData)

# Mostrar dataset.
numbersDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,0,0,0,0,0,0,255,255,0,0,...,255,255,0,0,0,0,0,0,0,3
1,0,0,0,0,0,0,0,255,255,255,...,255,255,255,255,255,255,255,255,255,2
2,0,255,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,7
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
4,0,0,0,0,0,0,0,0,0,0,...,255,255,255,255,255,255,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10155,0,0,0,0,0,0,0,0,0,0,...,255,255,255,0,0,0,0,0,0,1
10156,0,0,0,0,0,0,0,0,255,255,...,255,0,0,0,0,0,0,0,0,9
10157,0,0,0,0,255,255,255,255,255,255,...,255,255,255,255,255,255,255,255,0,2
10158,0,0,0,0,0,0,255,255,255,255,...,255,255,0,0,0,0,0,0,0,8


## Create .csv

In [37]:
# Crear dataset de letras.
# lettersDF.to_csv("Letters and numbers datasets/SimpleDatasets/SimpleLettersDataset.csv")
lettersDF.to_csv("Letters and numbers datasets/ComplexDatasets/ComplexLettersDataset.csv")

# Crear dataset de números.
# numbersDF.to_csv("Letters and numbers datasets/SimpleDatasets/SimpleNumbersDataset.csv")
numbersDF.to_csv("Letters and numbers datasets/ComplexDatasets/ComplexNumbersDataset.csv")

## Load datasets

In [38]:
# Leer dataset de letras.
# lettersDF = pandas.read_csv("Letters and numbers datasets/SimpleDatasets/SimpleLettersDataset.csv")
lettersDF = pandas.read_csv("Letters and numbers datasets/ComplexDatasets/ComplexLettersDataset.csv")

# Leer dataset de números.
# numbersDF = pandas.read_csv("Letters and numbers datasets/SimpleDatasets/SimpleNumbersDataset.csv")
numbersDF = pandas.read_csv("Letters and numbers datasets/ComplexDatasets/ComplexNumbersDataset.csv")

## Show datasets

### Letters Dataset

In [39]:
# Mostrar dataset.
lettersDF

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,775,776,777,778,779,780,781,782,783,784
0,0,0,0,0,0,0,0,0,0,255,...,255,0,0,0,0,0,0,0,0,77
1,1,0,0,0,0,0,255,255,255,255,...,0,0,0,0,0,0,0,0,0,81
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,66
3,3,0,0,0,0,0,0,0,0,255,...,255,0,0,0,0,0,0,0,0,66
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26409,26409,255,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,89
26410,26410,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,71
26411,26411,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,79
26412,26412,255,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,86


### Numbers Dataset

In [40]:
# Mostrar dataset.
numbersDF

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,775,776,777,778,779,780,781,782,783,784
0,0,0,0,0,0,0,0,255,255,0,...,255,255,0,0,0,0,0,0,0,3
1,1,0,0,0,0,0,0,0,255,255,...,255,255,255,255,255,255,255,255,255,2
2,2,0,255,255,255,255,255,255,255,255,...,0,0,0,0,0,0,0,0,0,7
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
4,4,0,0,0,0,0,0,0,0,0,...,255,255,255,255,255,255,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10155,10155,0,0,0,0,0,0,0,0,0,...,255,255,255,0,0,0,0,0,0,1
10156,10156,0,0,0,0,0,0,0,0,255,...,255,0,0,0,0,0,0,0,0,9
10157,10157,0,0,0,0,255,255,255,255,255,...,255,255,255,255,255,255,255,255,0,2
10158,10158,0,0,0,0,0,0,255,255,255,...,255,255,0,0,0,0,0,0,0,8
