In [1]:
import db
import re
import copy
import zipfile
import pandas as pd
import data_preprocessing as dp
from collections import defaultdict

# Load IPython's autoreload extension in mode 2, so imported modules (including
# the local `db` and `data_preprocessing` helpers) are reloaded before each
# cell executes and edits to them take effect without a kernel restart.
%load_ext autoreload
%autoreload 2
# Base directory that every dataset path in this notebook is built from.
root_path = './data'
# Create the database tables through the project's db module.
# NOTE(review): this runs on every execution of the cell; whether it is safe
# against an already-populated database depends on db.create_tables — confirm.
db.create_tables()

# ROOT_ZIP 데이터 추가

In [2]:
# Dataset kind -> marker substrings that identify that kind inside a zip path.
# Insertion order matters: the first kind with a matching marker wins.
separate = {
    '필기체': ['교원그룹', 'handwriting', '필기체'],
    '인쇄체': ['print'],
    '인쇄체(증강)': ['augmentation'],
    '인쇄체(다양한 형태)': ['textinthewild', 'Training_인쇄체'],
    '인쇄체(간판)': ['중심대학']
}


def find_key_by_value(input_string):
    """Return the dataset kind whose marker substring occurs in ``input_string``.

    Kinds are tried in the insertion order of ``separate`` and the first one
    with at least one marker contained in ``input_string`` is returned.

    Args:
        input_string: Text to classify (the notebook passes zip paths).

    Returns:
        The matching key of ``separate``, or ``None`` when no marker occurs.
    """
    matching_kinds = (
        kind
        for kind, markers in separate.items()
        if any(marker in input_string for marker in markers)
    )
    return next(matching_kinds, None)

# Collect every zip under the data root and tag each path with its dataset kind.
zip_files = dp.find_file_paths(root_path, 'zip')
zip_df = pd.DataFrame(data=zip_files, columns=['ZIP_PATH'])
zip_df['KIND'] = zip_df['ZIP_PATH'].apply(find_key_by_value)

# Archives whose path contains '라벨' (label) are excluded from the root table;
# only the remaining archives are registered.
is_label_zip = zip_df['ZIP_PATH'].str.contains('라벨')
zip_df = zip_df[~is_label_zip]

db.insert_root(zip_df)

# 이미지 경로 추가

In [3]:
# File suffixes (compared case-insensitively) that count as images.
image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

for zip_path in zip_df['ZIP_PATH']:
    full_zip_path = f"{root_path}/{zip_path}"

    # Only the member listing is needed, so the archive is closed right away.
    with zipfile.ZipFile(full_zip_path, 'r') as zip_ref:
        zip_contents = zip_ref.namelist()

    # NOTE(review): names are stored exactly as zipfile reports them; a later
    # cell re-decodes member names (cp437 -> cp949) — confirm whether the
    # same treatment is needed for the image paths recorded here.
    image_files = [
        member for member in zip_contents
        if member.lower().endswith(image_extensions)
    ]

    # One row per image member, tagged with the archive it came from.
    image_df = pd.DataFrame(image_files, columns=['IMAGE_PATH']).assign(ZIP_PATH=zip_path)
    db.insert_image_path(image_df)

# 13.한국어글자체

In [4]:
search = '13.한국어글자체'
json_paths = dp.find_file_paths(f"{root_path}/{search}", 'json')

# {json file path: name of the directory that directly contains the file}
base_directories = {
    json_path: json_path.split('/')[-2]
    for json_path in json_paths
}

# {json file path: content returned by dp.get_default_json for that file}
json_infos = {
    json_path: dp.get_default_json(json_path)
    for json_path in json_paths
}

In [5]:
# {structure key from dp.get_json_structure: [json file paths sharing it]}
structure_paths = defaultdict(list)

for path, json_info in json_infos.items():
    structure_paths[dp.get_json_structure(json_info)].append(path)

# Feature lists handed to dp.kor_directory. The next cell picks an entry by
# the enumeration index of structure_paths, so the i-th list belongs to the
# i-th structure key in insertion order.
# NOTE(review): this positional pairing depends on the order in which the
# structures are first encountered — confirm it is stable for the dataset.
structure_features = [
    ['FILE_NAME', 'TEXT', 'CLASS'],
    ['FILE_NAME', 'TEXT', 'CLASS'],
    ['FILE_NAME', 'TEXT', 'BOX', 'CLASS'],
]

In [6]:
def regular_type(image_path):
    """Derive the TYPE label of an image from its parent directory name.

    Args:
        image_path: Slash-separated member path with at least one directory
            component.

    Returns:
        'augmentation' when the parent directory name starts with "zip" or
        consists only of digit groups joined by hyphens; otherwise the parent
        directory name with a leading "<digits>_" prefix removed.

    Raises:
        IndexError: If ``image_path`` has no parent directory component.
    """
    parent_dir = image_path.split('/')[-2]
    if re.match(r'^zip|^\d+(-\d+)*$', parent_dir):
        return 'augmentation'
    return re.sub(r'^\d+_', '', parent_dir)


# structure_features is paired with structure_paths by position; fail before
# anything is written to the database rather than midway through the inserts.
assert len(structure_paths) <= len(structure_features), (
    'structure_features has no entry for some JSON structure'
)

# The loop variable is deliberately not called `json_paths`, so the list built
# when the JSON files were discovered is not overwritten by this cell.
for i, structure_json_paths in enumerate(structure_paths.values()):
    features = structure_features[i]

    for path in structure_json_paths:
        kor_info = dp.kor_directory(json_infos[path], features)

        # Images registered for the archive named after the JSON's directory.
        image_list = pd.DataFrame(db.select_image_by_zip_path(base_directories[path]))
        image_list['FILE_NAME'] = image_list['IMAGE_PATH'].str.split('/').str[-1]

        # validate='1:m' enforces that each file name identifies a single
        # image row, while one image may carry many annotation rows.
        merged = (
            image_list
            .merge(kor_info, on='FILE_NAME', how='inner', validate='1:m')
            .dropna()
            .rename(columns={'ID': 'IMAGE_ID'})
        )
        merged['TYPE'] = merged['IMAGE_PATH'].apply(regular_type)

        # Per-image info is written only for structures that carry boxes.
        if 'BOX' in merged.columns:
            image_info = merged[['IMAGE_ID', 'TYPE']].drop_duplicates()
            db.insert_image_info(image_info)

        # Annotations with empty text are not stored.
        merged = merged[merged['TEXT'] != '']

        db.insert_info(merged, 'sum')

# 다양한 형태의 한글 문자 OCR

In [7]:
search = '다양한 형태의 한글 문자 OCR'


def _zip_member_name(info):
    """Return a zip member's name with Korean characters restored.

    zipfile decodes names as cp437 unless the entry carries the UTF-8 flag
    (general purpose bit 11, 0x800). Names written as cp949 therefore need to
    be re-encoded to their raw bytes and decoded again; names flagged as UTF-8
    are already correct and are returned untouched.
    """
    if info.flag_bits & 0x800:
        return info.filename
    return info.filename.encode('cp437').decode('cp949')


# Only the label archives (path contains '라벨') are processed in this cell.
zip_paths = [
    path for path in dp.find_file_paths(f'{root_path}/{search}', 'zip', False)
    if '라벨' in path
]

# {(label archive path, top-level directory inside it): [json member names]}
# Each member is remembered together with the archive that listed it, so it
# is later read from that same archive instead of one guessed from the
# directory name.
base_paths = defaultdict(list)
structure_infos = defaultdict(list)

# Find the json members inside each label archive.
for full_zip_path in zip_paths:
    with zipfile.ZipFile(full_zip_path, 'r') as zip_ref:
        json_members = [
            info for info in zip_ref.infolist()
            if info.filename.lower().endswith('.json')
        ]

    for info in json_members:
        base_path = _zip_member_name(info).split('/')[0]
        # The raw member name is kept because it is what the archive is
        # indexed by; the decoded name is only used for grouping.
        base_paths[(full_zip_path, base_path)].append(info.filename)

# Load the members one (archive, directory) batch at a time and bucket the
# parsed documents by their structure key. Cell-local names are used so that
# `json_paths` / `json_infos` from the earlier cells are not overwritten.
for (full_zip_path, base_path), member_names in base_paths.items():
    for json_info in dp.get_zip_json(full_zip_path, member_names):
        structure = dp.get_json_structure(json_info)
        structure_infos[structure].append(json_info)

100%|██████████| 60000/60000 [02:33<00:00, 391.39it/s] 
100%|██████████| 276886/276886 [00:10<00:00, 26059.19it/s]
100%|██████████| 605741/605741 [01:48<00:00, 5590.30it/s] 


In [8]:
search = '원천'

# Lookup table of registered images, indexed by bare file name, with the
# database ID exposed as IMAGE_ID.
# NOTE(review): presumably select_image_by_zip_path matches archives whose
# path contains '원천' — confirm against the db module.
image_paths = (
    pd.DataFrame(db.select_image_by_zip_path(search))
    .assign(FILE_NAME=lambda frame: frame['IMAGE_PATH'].str.split('/').str[-1])
    .rename(columns={'ID': 'IMAGE_ID'})
    .set_index('FILE_NAME')
)

In [9]:
# For every group of JSON documents that share a structure, write the
# per-image info and the text annotations (plus per-letter annotations when
# present) to the database. Rows are matched to registered images by joining
# on FILE_NAME with `image_paths`, which is built in an earlier cell.
for structure, jsons_list in structure_infos.items():
    # dp.variables_directory returns a pair: per-image records and annotation records.
    image_infos, infos = dp.variables_directory(jsons_list)

    # Per-image info is only written when the helper returned any.
    if image_infos:
        image_infos = pd.DataFrame(image_infos)
        image_infos.drop_duplicates(inplace=True)
        image_infos.set_index('FILE_NAME', inplace=True)
        # Inner join keeps only images that are registered in image_paths.
        image_infos = image_infos.join(image_paths, how='inner')
        image_infos = image_infos[['IMAGE_ID', 'TYPE']]
        db.insert_image_info(image_infos)

    # Normalise the source column names: both box variants become BOX, value becomes TEXT.
    change_columns = {'wordbox': 'BOX', 'value': 'TEXT', 'charbox': 'BOX'}
    infos = pd.DataFrame(infos).rename(columns=change_columns)

    # Columns written to the info table; BOX is included only when this structure has it.
    select_cols = ['IMAGE_ID', 'CLASS', 'TEXT']

    if 'BOX' in infos.columns:
        select_cols.append('BOX')

    # Structures with a 'letter' column carry nested per-letter records.
    if 'letter' in infos.columns:
        letter_infos = []
        def letter_process(row):
            """Tag every entry of row['letter'] with the row's FILE_NAME and CLASS '문자', then collect it.

            The entries are updated in place and the same objects are appended
            to the enclosing ``letter_infos`` list.
            """
            letters = row['letter']
            for letter in letters:
                letter.update({'FILE_NAME': row['FILE_NAME'], 'CLASS': '문자'})
                letter_infos.append(letter)

        # Called for its side effect on letter_infos; the return value is discarded.
        infos.apply(letter_process, axis=1)

        letter_infos = pd.DataFrame(letter_infos).rename(columns=change_columns)
        letter_infos.set_index('FILE_NAME', inplace=True)
        letter_infos = letter_infos.join(image_paths, how='inner')
        # NOTE(review): select_cols may contain BOX based on the parent frame;
        # this assumes the letter records also yield a BOX column — confirm.
        letter_infos = letter_infos[select_cols]
        # Drop letters with empty text, then rows with any missing value.
        letter_infos = letter_infos[letter_infos['TEXT'] != '']
        letter_infos.dropna(inplace=True)
        db.insert_info(letter_infos)

    # Annotations with empty text are not stored.
    # NOTE(review): infos is a filtered selection from here on, so the in-place
    # calls below may emit SettingWithCopyWarning on some pandas versions — confirm.
    infos = infos[infos['TEXT'] != '']
    infos.set_index('FILE_NAME', inplace=True)
    infos = infos.join(image_paths, how='inner')

    infos = infos[select_cols]
    # Rows with any missing value in the selected columns are discarded.
    infos.dropna(inplace=True)

    db.insert_info(infos)

100%|██████████| 60000/60000 [00:07<00:00, 7852.90it/s] 
100%|██████████| 276886/276886 [00:03<00:00, 89216.44it/s] 
100%|██████████| 605739/605739 [00:09<00:00, 65493.36it/s] 
100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<?, ?it/s]


# 2023 교원그룹 AI OCR 챌린지, SW 중심대학 공동 AI 경진대회 <본선>

In [10]:
# Archives whose labels come as a CSV file inside the zip.
search_list = ['2023 교원그룹 AI OCR 챌린지.zip', 'SW 중심대학 공동 AI 경진대회.zip']
# Every row of these datasets is stored with this CLASS value.
_class = '단어'

# The label column is called 'label' or 'text' in the CSVs; both are stored as TEXT.
change_columns = {'label': 'TEXT', 'text': 'TEXT'}
path_select = ['IMAGE_PATH', 'ZIP_PATH']
info_select = ['TEXT', 'CLASS', 'IMAGE_ID']

# One frame per archive, in the same order as search_list.
df_list = [dp.get_zip_csv(f'{root_path}/{search}', '.csv') for search in search_list]

for search, df in zip(search_list, df_list):
    # regex=False pins a literal replacement of './'. With the regex default
    # of pandas < 2.0 the pattern would mean "any character followed by a
    # slash" and would strip the last letter of every directory name.
    df['IMAGE_PATH'] = df['img_path'].str.replace('./', '', regex=False)
    df['ZIP_PATH'] = search
    df['CLASS'] = _class
    df.rename(columns=change_columns, inplace=True)

    # Register the image paths first, then read them back to obtain their IDs.
    db.insert_image_path(df[path_select])
    image_path_df = pd.DataFrame(db.select_image_by_zip_path(search))
    image_path_df.rename(columns={'ID': 'IMAGE_ID'}, inplace=True)

    # Attach IMAGE_ID to each label row via its image path and store the labels.
    df = df.merge(image_path_df, on='IMAGE_PATH')
    db.insert_info(df[info_select])