In [8]:
import json

import db
import glob
import zipfile
import pandas as pd
import re
import copy
from collections import defaultdict
from functools import reduce

%load_ext autoreload
%autoreload 2
root_path = './data/'
db.create_tables()

# 0번 디렉토리 13.한국어 글자체 json -> df 처리 함수
def kor_directory(json_data, select_cols):
    '''
    Flatten one label JSON into a DataFrame with one row per annotation.

    The ``images`` and ``annotations`` arrays are normalised separately and then
    inner-joined on ``images.id == annotations.image_id``.

    :param json_data: parsed JSON holding ``images`` and ``annotations`` arrays
    :param select_cols: columns to keep in the returned frame
    :return: DataFrame restricted to ``select_cols``
    '''
    top_level = pd.json_normalize(json_data)

    def _flatten(column):
        # Every cell of the column holds a nested JSON array: normalise each and stack them.
        return pd.concat([pd.json_normalize(nested) for nested in top_level[column]])

    images, annotations = (_flatten(column) for column in ('images', 'annotations'))

    # validate="1:m" makes pandas raise when an image id is duplicated on the left side.
    result = pd.merge(
        images, annotations,
        left_on='id', right_on='image_id', how='inner', validate="1:m",
    )

    # The label is read from 'attributes.class' when present, otherwise 'attributes.type'.
    class_column = 'attributes.class' if 'attributes.class' in result.columns else 'attributes.type'

    result = result.rename(columns={
        'file_name': 'FILE_NAME', 'text': 'TEXT', 'bbox': 'BOX', class_column: 'CLASS',
    })

    # Collapse the label spellings into two values; anything else (e.g. '문장') is kept as is.
    class_aliases = {
        '글자(음절)': '문자', '단어(어절)': '단어', 'character': '문자', 'word': '단어'
    }
    result['CLASS'] = result['CLASS'].replace(class_aliases)

    return result[select_cols]

# JSON 데이터 가져오는 함수
def get_json(file_path):
    """Open ``file_path`` as UTF-8 text and return its parsed JSON content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

# JSON 파일 경로 찾는 함수
def find_json_paths(directory):
    """
    Recursively collect every ``.json`` file below ``directory``.

    The directory is matched literally: glob metacharacters in its name
    (``[``, ``*``, ``?``) are escaped, so a folder such as ``[label]data`` is
    searched instead of being interpreted as a character class.

    :param directory: directory to search, including all of its sub-folders
    :return: list of matching paths with backslashes converted to ``/``
    """
    pattern = f"{glob.escape(directory)}/**/*.json"
    return [found.replace('\\', '/') for found in glob.glob(pattern, recursive=True)]

# json 데이터 구조 확인
def get_json_structure(data, prefix=''):
    '''
    Describe the key layout of a JSON value as a flat tuple of dotted paths.

    Every dict key contributes its dotted path. A non-empty list contributes
    ``"<prefix>[list]"`` and is then described through its first element only.

    :param data: parsed JSON value (dict, list or scalar)
    :param prefix: dotted path of the parent, empty at the top level
    :return: tuple of path strings in traversal order
    '''
    def _walk(node, path):
        if isinstance(node, dict):
            for name, child in node.items():
                child_path = name if not path else f"{path}.{name}"
                yield child_path
                if isinstance(child, (dict, list)):
                    yield from _walk(child, child_path)
        elif isinstance(node, list) and node:
            yield f"{path}[list]"
            # Only the first element is inspected; the rest is assumed to look alike.
            yield from _walk(node[0], path)

    return tuple(_walk(data, prefix))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# './data' 디렉토리와 모든 하위 폴더에서 zip 파일 리스트 가져오기
def list_zip_files_glob(root):
    """
    List every ``.zip`` file below ``root`` as a path relative to ``root``.

    ``root`` is expected to end with a path separator (it is concatenated directly
    with ``**``). It is matched literally, and only the leading ``root`` prefix is
    stripped from each hit, so a sub-folder whose name repeats ``root`` stays intact.

    :param root: directory prefix to search, e.g. ``'./data/'``
    :return: list of zip paths using ``/`` separators, without the ``root`` prefix
    """
    found = glob.glob(f"{glob.escape(root)}**/*.zip", recursive=True)

    relative = []
    for path in found:
        path = path.replace('\\', '/')
        # str.replace(root, '') would also delete later occurrences of root inside the path.
        if path.startswith(root):
            path = path[len(root):]
        relative.append(path)
    return relative

# Keywords that identify each dataset kind from a zip path.
# Order matters: the first kind with a matching keyword wins.
separate = {
    '필기체': ['open.zip', 'handwriting', '필기체'],
    '인쇄체': ['print'],
    '인쇄체(증강)': ['augmentation'],
    '인쇄체(다양한 형태)': ['textinthewild', 'Training_인쇄체'],
}


def find_key_by_value(input_string):
    """
    Return the first kind in ``separate`` one of whose keywords occurs in ``input_string``.

    :param input_string: text to inspect (a zip path in this notebook)
    :return: the matching key of ``separate``, or ``None`` when no keyword matches
    """
    candidates = (
        kind
        for kind, keywords in separate.items()
        if any(keyword in input_string for keyword in keywords)
    )
    return next(candidates, None)

zip_files = list_zip_files_glob(root_path)

# One row per archive, labelled with its kind; archives whose path contains '라벨' are left out.
zip_df = (
    pd.DataFrame(data=zip_files, columns=['ZIP_PATH'])
    .assign(KIND=lambda frame: frame['ZIP_PATH'].apply(find_key_by_value))
    .loc[lambda frame: ~frame['ZIP_PATH'].str.contains('라벨')]
)
db.insert_root(zip_df)

In [10]:
# For every registered archive, record the image members it contains.
for zip_path in zip_df['ZIP_PATH']:
    full_zip_path = f"{root_path}{zip_path}"
    with zipfile.ZipFile(full_zip_path, 'r') as zip_ref:
        zip_contents = zip_ref.namelist()

    # Keep only members whose name ends with an image extension (case-insensitive).
    image_files = list(filter(
        lambda name: name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')),
        zip_contents,
    ))
    image_df = pd.DataFrame(image_files, columns=['IMAGE_PATH']).assign(ZIP_PATH=zip_path)
    db.insert_image_path(image_df)

# 13.한국어글자체

In [11]:
search = '13.한국어글자체'
json_paths = find_json_paths(f"{root_path}{search}")

# {json file path: name of the directory that contains it}
base_directories = dict(zip(json_paths, (path.split('/')[-2] for path in json_paths)))
# {json file path: parsed json content}
json_infos = dict(zip(json_paths, map(get_json, json_paths)))

In [12]:
# Group the JSON file paths by the key layout of their content.
structure_paths = defaultdict(list)
for path, json_info in json_infos.items():
    structure_paths[get_json_structure(json_info)].append(path)

# Columns to keep per structure; matched to structure_paths by position (insertion order).
structure_features = [
    ['FILE_NAME', 'TEXT', 'CLASS'],
    ['FILE_NAME', 'TEXT', 'CLASS'],
    ['FILE_NAME', 'TEXT', 'BOX', 'CLASS'],
]

In [13]:
df_list = []


def regular_type(x):
    """Derive the TYPE label from the parent directory name of an image path."""
    parent = x.split('/')[-2]
    # Directory names starting with 'zip', or made only of digits and dashes, map to 'augmentation'.
    if re.match(r'^zip|^\d+(-\d+)*$', parent):
        return 'augmentation'
    # Otherwise the directory name is used with a leading "<digits>_" prefix removed.
    return re.sub(r'^\d+_', '', parent)


for i, (structure, json_paths) in enumerate(structure_paths.items()):
    for path in json_paths:
        # Columns kept for this structure are picked by position from structure_features.
        kor_info = kor_directory(json_infos[path], structure_features[i])

        image_list = pd.DataFrame(db.select_image(base_directories[path]))
        image_list['FILE_NAME'] = image_list['IMAGE_PATH'].str.split('/').str[-1]

        # Attach the annotations to the stored image rows; rows with any missing value are dropped.
        merged = (
            image_list
            .merge(kor_info, on='FILE_NAME', how='inner', validate='1:m')
            .dropna()
        )
        merged['TYPE'] = merged['IMAGE_PATH'].apply(regular_type)
        merged = merged.rename(columns={'ID': 'IMAGE_ID'})

        # Image-level info is only stored for structures that carry a BOX column.
        if 'BOX' in merged.columns:
            image_info = merged[['IMAGE_ID', 'TYPE']].copy().drop_duplicates()
            db.insert_image_info(image_info)

        merged = merged[merged['TEXT'] != '']

        db.insert_info(merged, 'sum')

# 다양한 형태의 한글 문자 OCR

In [14]:
from tqdm import tqdm

# Folder name under root_path that this cell searches for label archives.
search = '다양한 형태의 한글 문자 OCR'

def find_zip_paths(directory):
    """
    Recursively collect every ``.zip`` file below ``directory``.

    The directory is matched literally: glob metacharacters in its name
    (``[``, ``*``, ``?``) are escaped so they are not interpreted as a pattern.

    :param directory: directory to search, including all of its sub-folders
    :return: list of matching paths with backslashes converted to ``/``
    """
    pattern = f"{glob.escape(directory)}/**/*.zip"
    return [found.replace('\\', '/') for found in glob.glob(pattern, recursive=True)]

def get_zip_json(zip_path, paths):
    """
    Parse the given JSON members of a zip archive.

    :param zip_path: path of the zip archive to open
    :param paths: member names inside the archive, read in this order
    :return: list with the parsed JSON content of each member
    """
    def _load(archive, member):
        with archive.open(member) as stream:
            return json.load(stream)

    with zipfile.ZipFile(zip_path, 'r') as archive:
        # tqdm only wraps the iteration to display progress.
        return [_load(archive, member) for member in tqdm(paths)]

# Label archives only: their path contains '라벨'.
zip_paths = [path for path in find_zip_paths(f'{root_path}{search}') if '라벨' in path]


def _base_directory(member_name):
    """Return the first path component of a zip member name, with its text repaired."""
    try:
        # zipfile decodes member names without the UTF-8 flag as cp437; the names are
        # re-read as cp949 here to recover the original text.
        member_name = member_name.encode('cp437').decode('cp949')
    except UnicodeError:
        # The name is already proper Unicode, or is not cp949: keep it as listed.
        pass
    return member_name.split('/')[0]


base_paths = defaultdict(list)       # {base_directory: [json member path]}
structure_infos = defaultdict(list)  # {json structure: [parsed json]}

# Each member is read back from the archive it was listed in, instead of from a
# hard-coded archive path. Distinct names are used so that `json_paths` and
# `json_infos` from the earlier cells are not overwritten.
for label_zip_path in zip_paths:
    # Find the json members inside the zip file.
    with zipfile.ZipFile(label_zip_path, 'r') as zip_ref:
        label_members = [
            name for name in zip_ref.namelist() if name.lower().endswith('.json')
        ]

    members_by_base = defaultdict(list)
    for member in label_members:
        members_by_base[_base_directory(member)].append(member)

    for base_path, member_paths in members_by_base.items():
        base_paths[base_path].extend(member_paths)
        for label_json in get_zip_json(label_zip_path, member_paths):
            structure_infos[get_json_structure(label_json)].append(label_json)

100%|██████████| 60000/60000 [10:36<00:00, 94.21it/s]  
100%|██████████| 276886/276886 [00:11<00:00, 24620.30it/s]
100%|██████████| 605741/605741 [00:28<00:00, 21331.39it/s]


In [15]:
search = '원천'

# Stored image rows matching `search`, indexed by bare file name so that
# label records can be joined to them on FILE_NAME.
image_paths = (
    pd.DataFrame(db.select_image(search))
    .assign(FILE_NAME=lambda frame: frame['IMAGE_PATH'].str.split('/').str[-1])
    .rename(columns={'ID': 'IMAGE_ID'})
    .set_index('FILE_NAME')
)

In [17]:
# Label-JSON field names -> column names used for the inserts (loop-invariant).
change_columns = {'wordbox': 'BOX', 'value': 'TEXT', 'charbox': 'BOX'}

for structure, jsons_list in structure_infos.items():
    image_infos = []  # one {'FILE_NAME', 'TYPE'} record per JSON that has a 'word' list
    infos = []        # one record per single letter / per word entry

    for json_info in tqdm(jsons_list):
        image = json_info['image']
        # Falls back to the 'file_fame' key when 'file_name' is absent (None if neither exists).
        filename = image.get('file_name', image.get('file_fame', None))

        text = json_info['text']
        if 'letter' in text:  # single value
            infos.append({
                'FILE_NAME': filename,
                'CLASS': '문자',
                'TEXT': text['letter']['value'],  # text value
            })
            continue

        word = text['word']

        # Labels with an 'output' key are tagged 'cursive' / '문자'; the others 'form' / '단어'.
        if 'output' in text:
            _type, _class = 'cursive', '문자'
        else:
            _type, _class = 'form', '단어'

        # NOTE: the loaded JSON dicts are tagged in place rather than copied.
        for item in word:
            item.update({'FILE_NAME': filename, 'CLASS': _class})

        image_infos.append({'FILE_NAME': filename, 'TYPE': _type})
        infos.extend(word)

    if image_infos:
        image_frame = (
            pd.DataFrame(image_infos)
            .drop_duplicates()
            .set_index('FILE_NAME')
            .join(image_paths, how='inner')  # keep only files that have a stored image row
        )
        db.insert_image_info(image_frame[['IMAGE_ID', 'TYPE']])

    if not infos:
        # Nothing annotated for this structure; an empty frame would have no TEXT column.
        continue

    infos = pd.DataFrame(infos).rename(columns=change_columns)

    select_cols = ['IMAGE_ID', 'CLASS', 'TEXT']
    if 'BOX' in infos.columns:
        select_cols.append('BOX')

    if 'letter' in infos.columns:
        # Explode the per-word letter lists with a plain loop instead of calling
        # DataFrame.apply for its side effects; rows without a letter list are skipped.
        letter_infos = []
        for file_name, letters in zip(infos['FILE_NAME'], infos['letter']):
            if not isinstance(letters, list):
                continue
            for letter in letters:
                letter.update({'FILE_NAME': file_name, 'CLASS': '문자'})
                letter_infos.append(letter)

        if letter_infos:
            letter_frame = (
                pd.DataFrame(letter_infos)
                .rename(columns=change_columns)
                .set_index('FILE_NAME')
                .join(image_paths, how='inner')
            )[select_cols]
            letter_frame = letter_frame[letter_frame['TEXT'] != ''].dropna()
            db.insert_info(letter_frame)

    # Word-level (or single-letter) records: drop empty texts, attach IMAGE_ID, insert.
    infos = infos[infos['TEXT'] != ''].set_index('FILE_NAME')
    infos = infos.join(image_paths, how='inner')[select_cols].dropna()

    db.insert_info(infos)
    

100%|██████████| 60000/60000 [00:03<00:00, 18766.43it/s]
100%|██████████| 276886/276886 [00:06<00:00, 45437.77it/s]
100%|██████████| 605739/605739 [00:20<00:00, 28898.14it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 978.38it/s]


In [ ]:
# 저아