In [None]:
# Install the packages listed in requirements.txt (IPython %pip magic, run from the notebook kernel).
%pip install -r requirements.txt

In [69]:
from bs4 import BeautifulSoup
from PIL import Image
import lxml
import os, shutil
import yaml
import glob
from tqdm import tqdm

# Compute the train / val / test split boundaries
def set_divi(img_path):
    """Return slice boundaries that split the entries of *img_path* 64/16/20.

    The directory entries are only counted, not inspected. The result is a
    list of ``(start, stop, split_name)`` tuples for ``'train'``, ``'val'``
    and ``'test'``, each starting where the previous one stops.

    As a sanity check, prints whether the final boundary equals the entry
    count plus one (the train boundary carries an extra ``+ 1``).
    """
    total = len(os.listdir(img_path))

    # 80% of the data is train+val; that part is split 80/20 again.
    trainval = total * 0.8
    train_end = round(trainval * 0.8) + 1
    val_end = train_end + round(trainval * 0.2)
    test_end = val_end + round(total * 0.2)

    print(total + 1 == test_end)

    edges = [0, train_end, val_end, test_end]
    split_names = ('train', 'val', 'test')
    return [(lo, hi, tag) for lo, hi, tag in zip(edges, edges[1:], split_names)]


# Parse the annotation XML files and generate YOLO label .txt files
def xml_parse(dir_path, label_dict):
    """Convert box annotations in ``<dir_path>/xmls/*.xml`` into YOLO label files.

    For every ``<image>`` element a file ``./labels/<name>.txt`` is written,
    where ``<name>`` is the image's ``name`` attribute without its extension.
    Each accepted ``<box>`` becomes one line
    ``class x_center y_center width height``; coordinates are divided by the
    image's ``width`` / ``height`` attributes and rounded to 6 decimals.

    Args:
        dir_path: Directory containing the ``xmls`` sub-folder.
        label_dict: Class names in class-index order. Either a mapping whose
            values are the names, or a plain sequence of names.

    Boxes whose ``label`` is not one of the known class names are skipped.
    A label file is written for every image, even when no box is accepted
    (the file is then empty). A warning is printed when a normalised value
    exceeds 1; processing continues regardless.
    """
    xmls = glob.glob(os.path.join(dir_path, "xmls", "*.xml"))

    label_dir = os.path.join(".", "labels")
    os.makedirs(label_dir, exist_ok=True)

    # Build the name -> class index lookup once instead of per box.
    if isinstance(label_dict, dict):
        class_names = list(label_dict.values())
    else:
        class_names = list(label_dict)
    class_index = {}
    for idx, class_name in enumerate(class_names):
        # Keep the first position for duplicate names, like list.index().
        class_index.setdefault(class_name, idx)

    for xml in tqdm(xmls):
        with open(xml, 'r', encoding='utf-8') as f:
            rawdata = f.read()
        soup = BeautifulSoup(rawdata, "xml")

        for item in soup.select('image'):
            name = os.path.splitext(item['name'])[0]

            lines = []
            for box in item.select('box'):
                # The label filter must run per box: `box` only exists here.
                if box['label'] not in class_index:
                    continue

                img_width = int(item['width'])
                img_height = int(item['height'])

                x1 = float(box['xtl'])
                x2 = float(box['xbr'])
                y1 = float(box['ytl'])
                y2 = float(box['ybr'])

                x_pos = round(((x1 + x2) / 2) / img_width, 6)
                y_pos = round(((y1 + y2) / 2) / img_height, 6)
                width = round((x2 - x1) / img_width, 6)
                height = round((y2 - y1) / img_height, 6)

                if x_pos > 1 or y_pos > 1 or width > 1 or height > 1:
                    print("박스의 위치 계산이 적합하지 않습니다. 오류 수정 후 재실행이 필요합니다.")

                lines.append(
                    f"{class_index[box['label']]} {x_pos} {y_pos} {width} {height} \n"
                )

            with open(os.path.join(label_dir, f"{name}.txt"), 'w') as f:
                f.write("".join(lines))

# Resize images to a fixed width; the files are overwritten in place
def img_resize(img_path, basewidth=640):
    """Resize every ``*.jpg`` directly inside *img_path* to *basewidth* pixels wide.

    The height is scaled by the same factor so the aspect ratio is kept, and
    the resized image is saved over the original file.

    Args:
        img_path: Directory whose ``*.jpg`` files are resized (not recursive).
        basewidth: Target width in pixels. Defaults to 640.

    Failures on individual files are reported on stdout together with the
    file path and do not stop the loop.
    """
    imglist = glob.glob(os.path.join(img_path, "*.jpg"))

    for img in tqdm(imglist):
        try:
            # Close the source handle before the same path is overwritten.
            with Image.open(img) as imgor:
                src_width, src_height = imgor.size
                wpercent = basewidth / float(src_width)
                # Never ask for a zero-pixel height on extremely wide images.
                hsize = max(1, int(float(src_height) * float(wpercent)))
                imgrs = imgor.resize((basewidth, hsize), Image.LANCZOS)
            imgrs.save(img)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest.
            print(f"{img}: {e}")

# Move images and labels into their per-split folders
def movedir(train_test, img_true=True, lbl_true=True):
    """Move image/label pairs found under the current directory into split folders.

    The current directory is walked recursively. ``.jpg`` files in any
    directory whose path contains ``images`` and ``.txt`` files in any
    directory whose path contains ``labels`` are collected. Each image is
    paired with the label that has the same file stem, the pairs are ordered
    by image file name, and the slice ``[start:stop]`` of that order is moved
    to ``./<split>/images`` and ``./<split>/labels``.

    Args:
        train_test: Iterable of ``(start, stop, split_name)`` entries.
        img_true: Move the image of each pair when truthy.
        lbl_true: Move the label of each pair when truthy.

    Files without a same-stem counterpart are left where they are and their
    count is printed. Note that the directory test is a substring match, so
    files already sitting in e.g. ``./train/images`` are collected again if
    the function is re-run.
    """
    imgdir = []
    lbldir = []

    for root, _dirs, files in os.walk(os.curdir):
        for file in files:
            if 'images' in root and file.endswith('.jpg'):
                imgdir.append((root, file))
            if 'labels' in root and file.endswith('.txt'):
                lbldir.append((root, file))

    imgdir.sort(key=lambda x: x[1])
    lbldir.sort(key=lambda x: x[1])

    # Pair by stem rather than by position: zipping two independently sorted
    # lists shifts every later pair as soon as one side is missing a file.
    labels_by_stem = {}
    for entry in lbldir:
        labels_by_stem.setdefault(os.path.splitext(entry[1])[0], entry)

    pairs = []
    for entry in imgdir:
        match = labels_by_stem.pop(os.path.splitext(entry[1])[0], None)
        if match is not None:
            pairs.append((entry, match))

    unmatched_images = len(imgdir) - len(pairs)
    unmatched_labels = len(lbldir) - len(pairs)
    if unmatched_images or unmatched_labels:
        print(
            f"skipped {unmatched_images} image(s) and {unmatched_labels} "
            f"label(s) without a matching counterpart"
        )

    for divi in train_test:
        start, stop, split = divi[0], divi[1], divi[2]
        print(split, "이동 시작")

        img_dest_dir = os.path.join(os.curdir, split, 'images')
        lbl_dest_dir = os.path.join(os.curdir, split, 'labels')
        if img_true:
            os.makedirs(img_dest_dir, exist_ok=True)
        if lbl_true:
            os.makedirs(lbl_dest_dir, exist_ok=True)

        # Slicing a list (not a zip) lets tqdm display the total.
        for (imgroot, imgfile), (lblroot, lblfile) in tqdm(pairs[start:stop]):
            if img_true:
                shutil.move(
                    os.path.join(imgroot, imgfile),
                    os.path.join(img_dest_dir, imgfile),
                )

            if lbl_true:
                shutil.move(
                    os.path.join(lblroot, lblfile),
                    os.path.join(lbl_dest_dir, lblfile),
                )

def check_len(dir_path):
    """Print how many file names under *dir_path* contain ``.jpg`` and ``.txt``.

    The directory is walked recursively. A name is counted when it contains
    the substring anywhere, and the printed line also states whether the two
    counts are equal. Nothing is returned.
    """
    names = [
        fname
        for _root, _dirs, fnames in os.walk(dir_path)
        for fname in fnames
    ]
    jpg = sum('.jpg' in fname for fname in names)
    txt = sum('.txt' in fname for fname in names)

    print(f"jpg = {jpg}, txt = {txt}, jpg==txt = {jpg==txt}")



In [None]:
# Working locations, relative to the notebook's current directory (Windows-style separators).
img_path = ".\\images"
lbl_path = ".\\labels"
dir_path = ".\\"

# Load the dataset YAML file; its 'names' entry supplies the class names
# that xml_parse uses to map box labels to class indices.
# NOTE(review): FullLoader can build more than plain data types; if data.yaml
# only holds plain mappings/lists, yaml.safe_load would be enough — confirm.
with open('./data.yaml') as f:
    data = yaml.load(f, Loader=yaml.FullLoader)
    label_dict = data['names']

# Compute the train/val/test split boundaries from the number of entries in img_path
train_test = set_divi(img_path)

# Parse the XML annotations into label .txt files
xml_parse(dir_path, label_dict)

# Resize the images (files are overwritten in place)
img_resize(img_path)

# Move the image files and the txt files into their split folders
movedir(train_test, img_true=True, lbl_true=True)

# Check that all files arrived: print jpg/txt counts for the whole tree and for each split
check_len(".\\")
check_len(".\\train")
check_len(".\\val")
check_len(".\\test")
