## 이름 데이터셋 생성
- 폭넓은 데이터 학습을 위해 한글 이름을 로마자로 표기한 이름과 외국의 영어 이름을 함께 사용
- 데이터의 품질을 높이기 위해 크롭, 패딩, 리사이즈 등의 이미지 처리를 적용
- 결과물은 train.obj와 val.obj로 저장됨
- 사용에 따라 test.obj를 생성할 수 있음

In [None]:
# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import absolute_import

import sys
import numpy as np
import cv2
import os
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import json
import collections
import glob
import pickle
import random

# Mount Google Drive at /content/gdrive (Colab only). The data paths used
# later in this file are written relative to the working directory
# ('./gdrive/MyDrive/...'), so they resolve to this mount only when the
# working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Locate the region of the image that contains the drawn text.
def crop_image(img):
    """Return the bounding box ``(x, y, w, h)`` of the contours found in ``img``.

    The image is blurred, edge-detected with Canny and the edge map is
    morphologically closed with a 7x7 rectangle; the box spans the extreme
    x and y coordinates over all external contours of the result.

    Args:
        img: Image to analyse; it is converted with ``np.array`` before being
            handed to OpenCV (callers in this file pass a PIL RGB image).

    Returns:
        Tuple ``(x, y, w, h)`` of Python ints, where ``(x, y)`` is the
        minimum contour coordinate and ``w``/``h`` are the differences
        between the maximum and minimum coordinates. ``(0, 0, 0, 0)`` is
        returned when no contour is found.
    """
    image = np.array(img)
    blur = cv2.GaussianBlur(image, ksize=(3, 3), sigmaX=0)
    edged = cv2.Canny(blur, 10, 250)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 7))
    closed = cv2.morphologyEx(edged, cv2.MORPH_CLOSE, kernel)
    # findContours returns (contours, hierarchy) on OpenCV 2/4 and
    # (image, contours, hierarchy) on OpenCV 3; [-2] is the contour list
    # in every case.
    contours = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    if len(contours) == 0:
        return 0, 0, 0, 0

    # Contours generally have different point counts, so wrapping the list in
    # np.array() would build a ragged array (an error on recent NumPy).
    # Stack all points into a single (N, 2) array instead and take the
    # extremes once.
    points = np.vstack(contours).reshape(-1, 2)
    x_min, y_min = points.min(axis=0)
    x_max, y_max = points.max(axis=0)

    return int(x_min), int(y_min), int(x_max - x_min), int(y_max - y_min)

# Crop to the text region only, then resize while keeping the aspect ratio.
def process_image(img, x, y, w, h, canvas_size):
    """Crop ``img`` to a box, scale it and centre it on a white square canvas.

    The box is cropped with a 1-pixel margin on every side. The target width
    is 90% of ``canvas_size`` with the height following the ``h / w`` ratio;
    if that height exceeds ``canvas_size - 10`` the height is set to 80% of
    ``canvas_size`` instead and the width follows the ``w / h`` ratio.

    Args:
        img: PIL image to crop.
        x: Left coordinate of the box.
        y: Top coordinate of the box.
        w: Width of the box; must be positive.
        h: Height of the box; must be positive.
        canvas_size: Side length, in pixels, of the square output image.

    Returns:
        A new ``"L"``-mode PIL image of size ``canvas_size`` x ``canvas_size``
        with a white background and the resized crop pasted in the centre.

    Raises:
        ValueError: If ``w`` or ``h`` is not positive (for example when no
            text region was detected), since the aspect ratio is undefined.
    """
    if w <= 0 or h <= 0:
        raise ValueError(
            "cannot process an empty text region (w=%s, h=%s)" % (w, h)
        )

    new_width = int(canvas_size * 0.9)
    new_height = int(new_width * h / w)
    if new_height > canvas_size - 10:
        new_height = int(canvas_size * 0.8)
        new_width = int(new_height * w / h)
    # A very thin box can round down to 0 pixels, which resize() rejects.
    new_width = max(new_width, 1)
    new_height = max(new_height, 1)

    img = img.crop((x - 1, y - 1, x + w + 1, y + h + 1)).resize((new_width, new_height))
    new_left = int((canvas_size - img.width) / 2)
    new_top = int((canvas_size - img.height) / 2)
    result = Image.new("L", (canvas_size, canvas_size), color=255)
    result.paste(img, (new_left, new_top))

    return result

In [None]:
# Render the given string onto an image.
def draw_single_char(st, font, canvas_size, final_canvas_size, x_offset, y_offset, type):
    """Draw ``st`` in black on a white canvas and normalise it to the final size.

    Args:
        st: Text to render.
        font: Font object passed to ``ImageDraw.text``.
        canvas_size: Side length of the square scratch canvas.
        final_canvas_size: Side length of the returned square image.
        x_offset: Horizontal position at which the text is drawn.
        y_offset: Vertical position at which the text is drawn.
        type: ``0`` draws the text with an extra 1-pixel black stroke (used to
            thicken the 19th font for training); any other value draws it
            without a stroke.

    Returns:
        The image produced by ``process_image`` for the text's bounding box.
    """
    black = (0, 0, 0)
    canvas = Image.new("RGB", (canvas_size, canvas_size), (255, 255, 255))
    pen = ImageDraw.Draw(canvas)

    # Only the thickened variant passes stroke arguments.
    stroke_options = {"stroke_width": 1, "stroke_fill": black} if type == 0 else {}
    pen.text((x_offset, y_offset), st, black, font=font, **stroke_options)

    left, top, width, height = crop_image(canvas)
    return process_image(canvas, left, top, width, height, final_canvas_size)

# Render dst_img and src_img separately and join them side by side.
def draw_example(ch, src_font, dst_font, canvas_size, final_canvas_size, x_offset, y_offset):
    """Build one training example: target rendering on the left, source on the right.

    NOTE: reads the module-level global ``num``; when it equals 19 the target
    image is drawn with the thickened stroke mode (``type`` 0), otherwise with
    the plain mode (``type`` 1). The source image always uses the plain mode.

    Args:
        ch: Text to render.
        src_font: Font used for the source (right) half.
        dst_font: Font used for the target (left) half.
        canvas_size: Side length of the scratch canvas used while drawing.
        final_canvas_size: Side length of each half of the result.
        x_offset: Horizontal drawing position on the scratch canvas.
        y_offset: Vertical drawing position on the scratch canvas.

    Returns:
        An RGB PIL image of size ``(2 * final_canvas_size, final_canvas_size)``.
    """
    dst_mode = 0 if num == 19 else 1
    dst_img = draw_single_char(ch, dst_font, canvas_size, final_canvas_size, x_offset, y_offset, dst_mode)
    src_img = draw_single_char(ch, src_font, canvas_size, final_canvas_size, x_offset, y_offset, 1)

    pair = Image.new("RGB", (final_canvas_size * 2, final_canvas_size), (255, 255, 255))
    for left_edge, half in ((0, dst_img), (final_canvas_size, src_img)):
        pair.paste(half, (left_edge, 0))
    return pair

# Save every string in the collection to the samples directory as 'label_count.jpg'.
def font2img(src, dst, stringset, char_size, canvas_size, final_canvas_size, x_offset, y_offset, sample_dir, label):
    """Render each string with both fonts and save the paired images as JPEGs.

    Args:
        src: Path of the source font file.
        dst: Path of the target font file.
        stringset: Iterable of strings to render; each is drawn with a
            leading space prepended.
        char_size: Font size used when loading both fonts.
        canvas_size: Side length of the scratch canvas used while drawing.
        final_canvas_size: Side length of each half of a saved example.
        x_offset: Horizontal drawing position on the scratch canvas.
        y_offset: Vertical drawing position on the scratch canvas.
        sample_dir: Directory the images are written to.
        label: Integer used as the two-digit file-name prefix.
    """
    src_font, dst_font = [ImageFont.truetype(font_path, size=char_size) for font_path in (src, dst)]

    saved = 0
    for text in stringset:
        example = draw_example(' ' + text, src_font, dst_font, canvas_size, final_canvas_size, x_offset, y_offset)
        if not example:
            continue
        file_name = "%02d_%04d.jpg" % (label, saved)
        example.save(os.path.join(sample_dir, file_name))
        saved += 1

In [None]:
# `num` selects the target font (1-based position in the sorted listing of the
# font directory). For better training the source font is a handwriting-style
# font, and the rendered strings are student names rather than single letters.
num = 19
src_font = './gdrive/MyDrive/englishwords/font/font_unused/DancingScript-Regular.ttf'
path = './gdrive/MyDrive/englishwords/font/'
# os.listdir returns entries in arbitrary order; sort them so that `num`
# always refers to the same font from run to run.
dst_fonts = sorted(os.listdir(path))
# Close the file deterministically, and drop the trailing newline of every
# line (plus blank lines) so that only the name itself is rendered.
with open("./gdrive/MyDrive/englishwords/student_name.txt", 'r') as f:
    string_set = [line.strip() for line in f if line.strip()]
char_size, canvas_size, final_canvas_size = 100, 2000, 256
x_offset, y_offset = 400, 400
sample_dir = './samples/'
if not os.path.isdir(sample_dir):
    os.mkdir(sample_dir)
label = 1

# When building the string set for test.obj, use the list below instead.
# string_set = ['Jamie', 'Gooreum', 'Hongja', 'Deoksoo', 'Malgeul']

# The slice yields at most one font: the num-th entry of the listing.
for dst_font in dst_fonts[num-1:num]:
    font2img(src_font, path + dst_font, string_set, char_size, canvas_size, final_canvas_size, x_offset, y_offset, sample_dir, label)

  # Remove the CWD from sys.path while we load stuff.


font#01 데이터셋 생성 완료


In [None]:
# Build train.obj and val.obj.
def pickle_examples(paths, train_path, val_path, train_val_split):
    """Pickle ``(label, image_bytes)`` records into a train file and a validation file.

    For every path the label is the integer before the first underscore of
    the file name. Each record is written to the validation file when a fresh
    ``random.random()`` draw is below ``train_val_split``, otherwise to the
    train file. Each processed path and its label are printed.

    Args:
        paths: Iterable of image file paths.
        train_path: Output path of the train file (overwritten).
        val_path: Output path of the validation file (overwritten).
        train_val_split: Threshold for the random draw; records whose draw is
            below it go to the validation file.
    """
    with open(train_path, 'wb') as train_out, open(val_path, 'wb') as val_out:
        for sample_path in paths:
            file_name = os.path.basename(sample_path)
            label = int(file_name.split("_")[0])
            with open(sample_path, 'rb') as sample:
                print("img %s" % sample_path, label)
                record = (label, sample.read())
                draw = random.random()
                sink = val_out if draw < train_val_split else train_out
                pickle.dump(record, sink)

# Build test.obj.
def pickle_tests(paths, test_path):
    """Pickle one ``(label, image_bytes)`` record per path into ``test_path``.

    The label is the integer before the first underscore of each file name.
    Each processed path and its label are printed.

    Args:
        paths: Iterable of image file paths.
        test_path: Output path of the test file (overwritten).
    """
    with open(test_path, 'wb') as test_out:
        for sample_path in paths:
            file_name = os.path.basename(sample_path)
            label = int(file_name.split("_")[0])
            with open(sample_path, 'rb') as sample:
                print("img %s" % sample_path, label)
                record = (label, sample.read())
                pickle.dump(record, test_out)

# Output directories for font `num`, listed from outermost to innermost so
# that each parent exists before its child is created.
font_dir = './gdrive/MyDrive/englishwords/' + str(num) + '/'
experiment_dir = './gdrive/MyDrive/englishwords/' + str(num) + '/experiment/'
save_dir = './gdrive/MyDrive/englishwords/' + str(num) + '/experiment/data/'
for _directory in (font_dir, experiment_dir, save_dir):
    if not os.path.isdir(_directory):
        os.mkdir(_directory)
split_ratio = 0.1

# To build test.obj instead, uncomment the two lines below (and comment out
# the pickle_examples call).
# test_path = os.path.join(save_dir, "test.obj")
# pickle_tests(sorted(glob.glob(os.path.join(sample_dir, "*.jpg"))), test_path=test_path)

# Pickle every generated sample, in sorted file-name order, into train.obj / val.obj.
train_path = os.path.join(save_dir, "train.obj")
val_path = os.path.join(save_dir, "val.obj")
_sample_paths = sorted(glob.glob(os.path.join(sample_dir, "*.jpg")))
pickle_examples(_sample_paths, train_path=train_path, val_path=val_path, train_val_split=split_ratio)

img ./samples/01_0000.jpg 1
img ./samples/01_0001.jpg 1
img ./samples/01_0002.jpg 1
img ./samples/01_0003.jpg 1
img ./samples/01_0004.jpg 1
img ./samples/01_0005.jpg 1
img ./samples/01_0006.jpg 1
img ./samples/01_0007.jpg 1
img ./samples/01_0008.jpg 1
img ./samples/01_0009.jpg 1
img ./samples/01_0010.jpg 1
img ./samples/01_0011.jpg 1
img ./samples/01_0012.jpg 1
img ./samples/01_0013.jpg 1
img ./samples/01_0014.jpg 1
img ./samples/01_0015.jpg 1
img ./samples/01_0016.jpg 1
img ./samples/01_0017.jpg 1
img ./samples/01_0018.jpg 1
img ./samples/01_0019.jpg 1
img ./samples/01_0020.jpg 1
img ./samples/01_0021.jpg 1
img ./samples/01_0022.jpg 1
img ./samples/01_0023.jpg 1
img ./samples/01_0024.jpg 1
img ./samples/01_0025.jpg 1
img ./samples/01_0026.jpg 1
img ./samples/01_0027.jpg 1
img ./samples/01_0028.jpg 1
img ./samples/01_0029.jpg 1
img ./samples/01_0030.jpg 1
img ./samples/01_0031.jpg 1
img ./samples/01_0032.jpg 1
img ./samples/01_0033.jpg 1
img ./samples/01_0034.jpg 1
img ./samples/01_003