<a href="https://colab.research.google.com/github/7ZXU/AI/blob/main/KoreaCarObjectDetection/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Git clone**

In [None]:
# Mount Google Drive at /content/drive; the later cells work in paths under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/TNT/KoreanCarObject

In [None]:
!git clone https://github.com/seokbongyoo/Dataset_for_LPR.git

# **Library**

In [None]:
import os, time, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline 
# 브라우저 내부(inline)에서 rich output(그림, 소리, 애니메이션)을 바로 볼 수 있음

from IPython.display import Image # show image
from tqdm.auto import tqdm # 진행바 표시
import shutil as sh # 파일 이동

# **Parsing xml file**

In [None]:
%cd /content/drive/MyDrive/TNT/KoreanCarObject/Dataset_for_LPR

In [None]:
from bs4 import BeautifulSoup 
# xml 또는 html 파일을 파싱해서 data로 변환하는 기능을 제공하는 라이브러리 
# 웹 크롤링에 사용됨

def get_metadata(file_name):
    """Parse one annotation XML file into a flat metadata dict.

    Args:
        file_name: Path to the annotation ``.xml`` file. It is read as
            EUC-KR text. The matching image is taken to be the same path
            with the extension replaced by ``.jpg``.

    Returns:
        dict with the keys ``image_id``, ``car_bb_xmin``, ``car_bb_ymin``,
        ``car_bb_w``, ``car_bb_h``, ``plate_bb_xmin``, ``plate_bb_ymin``,
        ``plate_bb_w``, ``plate_bb_h``, ``plate_type``, ``plate_code`` and
        ``image_path``. The eight bounding-box values are ints.

    Raises:
        IndexError: if the file contains no ``rexmetainfo`` element.
        ValueError: if a ``rect`` value is not four comma-separated ints.
    """
    # The file is parsed with html.parser, which lower-cases tag names; that
    # is why every tag lookup below is written in lowercase.
    with open(file_name, 'r', encoding='euc-kr') as f:
        xml_text = BeautifulSoup(f, "html.parser")

    meta = xml_text.find_all('rexmetainfo')[0]

    # Strip only the final extension. Splitting on the first '.' would break
    # on relative paths such as './dir/x.xml' or on dotted directory names.
    stem, _ = os.path.splitext(file_name)
    image_id = os.path.basename(stem)  # e.g. Image000003
    image_path = stem + '.jpg'

    # Car bounding box, stored in the XML as "xmin,ymin,w,h".
    car_bb_xmin, car_bb_ymin, car_bb_w, car_bb_h = map(
        int, meta.carinfo.rect.string.split(','))

    # Licence-plate bounding box (same format) plus its type and code.
    plate_bb_xmin, plate_bb_ymin, plate_bb_w, plate_bb_h = map(
        int, meta.plateinfo.rect.string.split(','))
    plate_type = meta.plateinfo.platetype.string
    plate_code = meta.plateinfo.code.string

    data = {'image_id' : image_id,
            'car_bb_xmin' : car_bb_xmin, 'car_bb_ymin' : car_bb_ymin, 'car_bb_w' : car_bb_w, 'car_bb_h' : car_bb_h,
            'plate_bb_xmin' : plate_bb_xmin, 'plate_bb_ymin' : plate_bb_ymin, 'plate_bb_w' : plate_bb_w, 'plate_bb_h' : plate_bb_h,
            'plate_type' : plate_type, 'plate_code' : plate_code, 'image_path' : image_path}

    return data


# Sanity check: parse one sample annotation file and display the resulting dict.
temp_data = get_metadata('/content/drive/MyDrive/TNT/KoreanCarObject/Dataset_for_LPR/Black-box_01/Image000003.xml')
temp_data

In [None]:
# Render the sample image inline.
# NOTE(review): `Image` here is IPython.display.Image. The later cell's
# `from PIL import Image` rebinds the name, so re-running this cell after that
# import would call the PIL module instead and fail.
Image('/content/drive/MyDrive/TNT/KoreanCarObject/Dataset_for_LPR/Black-box_01/Image000003.jpg')

# **Drawing Bounding Box function**

In [None]:
!pip install Pillow

from PIL import Image # 이미지 처리
import matplotlib.patches as patches # 도형 그리기
# patches : 도형 설정
# add_patch : 그래프에 도형 시각화
 
def image_with_bb(data):
    """Show the first row's image with its car and plate boxes drawn on top.

    Values are read by position from row 0 of ``data``: column 0 is printed
    as the image id, column 11 is opened as the image path, columns 1-4 are
    the car box (x, y, width, height) and columns 5-8 are the plate box in
    the same layout.

    Args:
        data: pandas DataFrame with at least one row and at least 12 columns
            arranged as described above. Only the first row is used.
    """
    print(data.iloc[0, 0])  # image_id

    # Open through a context manager so the file handle is released; imshow
    # reads the pixels into the axes before the file is closed.
    with Image.open(data.iloc[0, 11]) as img:
        plt.imshow(img)

    ax = plt.gca()
    # (first column of the box, outline colour): car in red, plate in blue.
    for first_col, edge_color in ((1, 'red'), (5, 'blue')):
        ax.add_patch(patches.Rectangle(
            (data.iloc[0, first_col], data.iloc[0, first_col + 1]),
            data.iloc[0, first_col + 2], data.iloc[0, first_col + 3],
            linewidth=2,
            edgecolor=edge_color,
            fill=False))

    plt.axis('off')
    plt.show()


# **Preprocessing Data**

In [None]:
%cd /content/drive/MyDrive/TNT/KoreanCarObject/Dataset_for_LPR

In [None]:
os.listdir() # 디렉토리 목록 불러오기 

In [None]:
# Select the data folders by name rather than by position: os.listdir()
# returns entries in arbitrary order, so a [1:7] slice is not reliable.
# NOTE(review): assumes every wanted folder is named 'Black-box*' - confirm.
black_box_list = sorted(
    entry for entry in os.listdir()
    if entry.startswith('Black-box') and os.path.isdir(entry)
)
black_box_list

In [None]:
# 메타데이터 데이터프레임 정의
meta_df = pd.DataFrame()

In [None]:
# Parse every annotation XML under the selected folders into one DataFrame.
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
# Rebuilding from scratch also means re-running this cell no longer
# duplicates rows.
records = []
for folder_name in tqdm(black_box_list):  # progress bar over folders
    # os.getcwd() is the current working directory, so each path looks like
    # <cwd>/Black-box_01/Image000003.xml
    folder_path = os.path.join(os.getcwd(), folder_name)
    xml_file_lists = [os.path.join(folder_path, entry)
                      for entry in os.listdir(folder_name)
                      if entry.endswith('.xml')]
    for xml_path in tqdm(xml_file_lists):
        records.append(get_metadata(xml_path))

meta_df = pd.DataFrame(records)

In [None]:
# Put the metadata columns into a fixed order. image_with_bb reads columns by
# position (0 = image_id, 1-4 = car box, 5-8 = plate box, 11 = image_path),
# so this order matters.
meta_df = meta_df[['image_id', 'car_bb_xmin', 'car_bb_ymin', 'car_bb_w', 'car_bb_h', 
                   'plate_bb_xmin', 'plate_bb_ymin', 'plate_bb_w', 'plate_bb_h',  
                   'plate_code', 'plate_type', 'image_path']]
meta_df

In [None]:
# Save the DataFrame as a CSV file in the current directory, without the index column.
meta_df.to_csv('meta_df.csv', index = False)

# **Confirm data**

In [None]:
# Visual spot check: draw the boxes for one randomly chosen image id.
index = list(set(meta_df['image_id']))  # unique image ids
random_img = random.choice(index)
image_data = meta_df.loc[meta_df['image_id'] == random_img]  # matching row(s)
image_with_bb(image_data)

# **데이터 정리**

In [None]:
%cd /content/drive/MyDrive/TNT/KoreanCarObject/Dataset_for_LPR

In [None]:
# Select Black-box_01 ... Black-box_06 by name rather than by position.
# os.listdir() order is arbitrary, and by this point the directory also holds
# meta_df.csv, so a [1:7] slice can pick the wrong entries.
# The name `dir` (which shadows the builtin) is kept because the next cell reads it.
# NOTE(review): assumes every wanted folder is named 'Black-box*' - confirm.
dir = sorted(
    entry for entry in os.listdir()
    if entry.startswith('Black-box') and os.path.isdir(entry)
)

In [None]:
# Flatten the per-folder files into two shared directories:
# XML files go to 'annotations', JPG files go to 'images'.
# shutil.move does not create missing parent directories, so create them first.
# NOTE(review): files with the same name in different folders would overwrite
# each other at the destination - confirm names are unique across folders.
# NOTE(review): the image_path values already written to meta_df.csv still
# point at the original Black-box_* locations after this move.
os.makedirs('annotations', exist_ok=True)
os.makedirs('images', exist_ok=True)

for folder in tqdm(dir):
    file_list = os.listdir(folder)
    xml_files = [name for name in file_list if name.endswith(".xml")]
    img_files = [name for name in file_list if name.endswith(".jpg")]

    # Move the XML annotation files.
    for xml_file_name in xml_files:
        sh.move(os.path.join(folder, xml_file_name),
                os.path.join('annotations', xml_file_name))

    # Move the JPG image files.
    for img_file_name in img_files:
        sh.move(os.path.join(folder, img_file_name),
                os.path.join('images', img_file_name))

In [None]:
len(os.listdir('annotations'))

In [None]:
len(os.listdir('images'))

# **train, test split**

In [None]:
data = pd.read_csv('/content/drive/MyDrive/TNT/KoreanCarObject/Dataset_for_LPR/meta_df.csv')
data

In [None]:
# Split the data 60% train / 40% test.
from sklearn.model_selection import train_test_split
# A fixed seed makes the split reproducible. The result is written to
# train.csv/test.csv and drives the image moves, so a re-run should yield
# the same partition.
train, test = train_test_split(data, test_size=0.4, shuffle=True, random_state=42)

In [None]:
train

In [None]:
test

In [None]:
# csv 파일로 저장
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

In [None]:
# Move each image out of 'images' into <base_path>/train or <base_path>/valid
# according to the split above.
train_img_list = train['image_id']
test_img_list = test['image_id']

base_path = "/content/drive/MyDrive/TNT/KoreanCarObject/LPRdata/images"

for split_name, image_ids in (('train', train_img_list), ('valid', test_img_list)):
    split_dir = os.path.join(base_path, split_name)
    # shutil.move does not create missing parent directories.
    os.makedirs(split_dir, exist_ok=True)
    for img in tqdm(image_ids):
        sh.move(os.path.join('images', img + '.jpg'),
                os.path.join(split_dir, img + '.jpg'))

In [None]:
# Count the training images. They were moved to <base_path>/train;
# no cell creates a 'train_images' directory.
len(os.listdir(os.path.join(base_path, 'train')))

In [None]:
# Count the held-out images. They were moved to <base_path>/valid;
# no cell creates a 'test_images' directory.
len(os.listdir(os.path.join(base_path, 'valid')))