# [ 2. Dataset ]

- __[ 실습목표 ]__  
파이썬의 데이터 가공 패키지인 Pandas를 활용하여 날 것의 데이터를 학습을 위한 데이터 형태로 가공해본다.

- __[ 실습 및 과제 내용 ]__  
1. Cover 데이터를 '(2) – 이미지 경로 설정' 과 같은 포맷으로 구성
2. Cover 데이터의 학습용 데이터프레임 생성

---

In [1]:
import os
import pathlib
import sys

# Absolute path (as a string) of the parent of the current working directory.
default_dir_path = os.fspath(pathlib.Path.cwd().parent.absolute())

# Extend the module search path with that directory; the `properties` module
# imported right after this is presumably located there.
sys.path.append(default_dir_path)

from properties import *

[2020-12-09 20:12:03,343][INFO][properties.py:30] Complete to apply the random seed, RANDOM_SEED : 777


In [2]:
# from dataset.dataframe import DataFrameFactory
import os

import pandas as pd


class DataFrameFactory(object):
    """Build training DataFrames from inspection images and a defect-label CSV.

    All functionality is exposed through classmethods; no instance state is used.
    Paths and constants are read from the module-level ``DATASET_PROPERTIES``.
    """

    def __init__(self):
        pass

    @classmethod
    def get_dataframe(cls, product, cam_number) -> pd.DataFrame:
        """Return the combined NG + OK training frame for one product and camera.

        Args:
            product: Either ``"housing"`` or ``"cover"``.
            cam_number: Camera number used to select images and label rows.

        Returns:
            A DataFrame with the columns produced by :meth:`get_data`, holding
            the NG rows (discoloration defects removed) followed by the OK
            rows, with a fresh 0..n-1 index.

        Raises:
            ValueError: If ``product`` is not a supported product name.
        """
        # product = model_data.product
        # cam_number = model_data.cam_number

        # Validate before touching the file system so that a typo fails with a
        # clear message instead of an AttributeError on None further down.
        if product == "housing":
            ng_dir_path = DATASET_PROPERTIES.HOUSING_NG_PATH
            ok_dir_path = DATASET_PROPERTIES.HOUSING_OK_PATH
        elif product == "cover":
            ng_dir_path = DATASET_PROPERTIES.COVER_NG_PATH
            ok_dir_path = DATASET_PROPERTIES.COVER_OK_PATH
        else:
            raise ValueError(
                "Unsupported product {!r}; expected 'housing' or 'cover'".format(product)
            )

        label_df = cls.get_label_df(DATASET_PROPERTIES.LABEL_PATH, sep=DATASET_PROPERTIES.LABEL_CSV_SEPARATE)

        ng_image_path_dict = cls.get_image_path_dict(dir_path=ng_dir_path, cam_number=cam_number)
        ok_image_path_dict = cls.get_image_path_dict(dir_path=ok_dir_path, cam_number=cam_number)

        ng_dataframe = cls.get_data(
            cam_num=cam_number,
            image_path_dict=ng_image_path_dict,
            label_df=label_df,
            is_NG=True
        )
        ok_dataframe = cls.get_data(
            cam_num=cam_number,
            image_path_dict=ok_image_path_dict,
            label_df=label_df,
            is_NG=False
        )

        # Discoloration samples are separated out and currently not used.
        # TODO (not part of the exercise): handle discoloration defects separately.
        ng_discolor_dataframe, ng_dataframe = cls.split_dataframe_with_defect_category(
            df=ng_dataframe,
            defect_category=DATASET_PROPERTIES.DEFECT_DISCOLOR
        )

        # pd.concat replaces DataFrame.append (removed in pandas 2.0);
        # ignore_index=True renumbers the combined rows from 0.
        return pd.concat([ng_dataframe, ok_dataframe], ignore_index=True)

    @classmethod
    def get_image_path_dict(cls, dir_path, cam_number) -> dict:
        """Map serial number -> image path for one camera under ``dir_path``.

        Every sub-directory of ``dir_path`` is scanned for ``.png`` files. The
        serial number and camera name are taken from the 5th- and 4th-from-last
        underscore-separated fields of the file name, and the camera number is
        the last character of the camera name.

        Args:
            dir_path: Directory whose immediate sub-directories hold the images.
            cam_number: Only files whose camera number equals this are kept.

        Returns:
            Dict of serial number to full file path. If several files share a
            serial number and camera, the one listed last wins.
        """
        image_path_dict = dict()

        dir_path_list_with_serial = [
            os.path.join(dir_path, directory)
            for directory in os.listdir(dir_path)
            if os.path.isdir(os.path.join(dir_path, directory))
        ]

        for dir_path_with_serial in dir_path_list_with_serial:
            for file in os.listdir(dir_path_with_serial):
                if os.path.splitext(file)[1] != ".png":
                    continue

                # NOTE(review): assumes the file name has at least five
                # "_"-separated fields; shorter names raise ValueError here.
                serial_number, file_cam_name = file.split("_")[-5:-3]
                file_cam_number = int(file_cam_name[-1])

                if cam_number == file_cam_number:
                    image_path_dict[serial_number] = os.path.join(dir_path_with_serial, file)

        return image_path_dict

    @classmethod
    def get_label_df(cls, label_path, sep='\t') -> pd.DataFrame:
        """Load the label CSV and add the derived columns used by :meth:`get_data`.

        Args:
            label_path: Path of the label CSV file.
            sep: Field separator passed to ``pandas.read_csv``.

        Returns:
            The label table indexed by ``SERIAL_NO`` with three extra columns:
            ``CAM_INDEX`` (last character of ``CAMERA_INFO`` as int),
            ``DEFECT_CATEGORY`` (``LABEL_BAD_STATUS`` without its first
            character, as int) and ``PRODUCT_CATEGORY`` (``'Housing'`` when
            ``WORK_SHOP_ID`` is 1680, otherwise ``'Cover'``).
        """
        label_df = pd.read_csv(label_path, sep=sep)

        # TODO: adapt this preprocessing to the Cover file format.
        # Preprocessing columns
        label_df.set_index('SERIAL_NO', inplace=True)
        label_df['CAM_INDEX'] = label_df['CAMERA_INFO'].map(lambda x: int(x[-1]))
        label_df['DEFECT_CATEGORY'] = label_df['LABEL_BAD_STATUS'].map(lambda x: int(x[1:]))
        label_df['PRODUCT_CATEGORY'] = label_df['WORK_SHOP_ID'].map(lambda x: 'Housing' if x == 1680 else 'Cover')

        return label_df

    @classmethod
    def get_data(cls, cam_num, image_path_dict, label_df, is_NG: bool) -> pd.DataFrame:
        """Build one row per image with its defect annotations.

        Args:
            cam_num: Camera number; stored in ``cam_list`` and, for NG data,
                used to select the matching label rows.
            image_path_dict: Dict of serial number -> image path.
            label_df: Label table as returned by :meth:`get_label_df`. It is
                only read when ``is_NG`` is true.
            is_NG: True for defective images (annotations come from
                ``label_df``); False for good images (all annotation lists
                are ``[0]``).

        Returns:
            A DataFrame whose annotation columns each hold one list per image.
            NG images without any label row for this camera are skipped.
        """
        data_dict = {
            'serial_number_list': list(),
            'image_path_list': list(),
            'is_NG_list': list(),
            'cam_list': list(),
            'defect_category_list': list(),
            'x_st_list': list(),
            'x_ed_list': list(),
            'y_st_list': list(),
            'y_ed_list': list(),
            'ratio_list': list()
        }

        # The camera filter does not depend on the serial number, so apply it
        # once instead of once per image.
        cam_label_df = None
        if is_NG and image_path_dict:
            cam_label_df = label_df[label_df['CAM_INDEX'] == cam_num]

        for serial_number, image_path in image_path_dict.items():
            if is_NG:
                rows = cam_label_df[cam_label_df.index == serial_number]

                if rows.empty:
                    continue

                # One entry per labelled defect region on this image.
                defect_categories = rows['DEFECT_CATEGORY'].tolist()
                x_starts = [int(value) for value in rows['START_X']]
                x_ends = [int(value) for value in rows['END_X']]
                y_starts = [int(value) for value in rows['START_Y']]
                y_ends = [int(value) for value in rows['END_Y']]
                ratios = rows['RESIZE_RATE'].tolist()
            else:
                # Good images carry a single zero placeholder in every list.
                defect_categories = [0]
                x_starts = [0]
                x_ends = [0]
                y_starts = [0]
                y_ends = [0]
                ratios = [0]

            data_dict['serial_number_list'].append(serial_number)
            data_dict['image_path_list'].append(image_path)
            data_dict['is_NG_list'].append(1 if is_NG else 0)
            data_dict['cam_list'].append(cam_num)
            data_dict['defect_category_list'].append(defect_categories)
            data_dict['x_st_list'].append(x_starts)
            data_dict['x_ed_list'].append(x_ends)
            data_dict['y_st_list'].append(y_starts)
            data_dict['y_ed_list'].append(y_ends)
            data_dict['ratio_list'].append(ratios)

        return pd.DataFrame(data_dict)

    @classmethod
    def split_dataframe_with_defect_category(cls, df, defect_category: int):
        """Split ``df`` by whether a row's defect list contains ``defect_category``.

        Args:
            df: DataFrame with a ``defect_category_list`` column of lists.
            defect_category: Defect category to look for.

        Returns:
            Tuple ``(defect_df, other_df)``: the rows containing the category
            and the remaining rows, each with a fresh 0..n-1 index.
        """
        defect_positions = [
            position
            for position, categories in enumerate(df['defect_category_list'])
            if defect_category in categories
        ]
        matched = set(defect_positions)
        other_positions = [position for position in range(len(df)) if position not in matched]

        # Select both halves by position from the ORIGINAL frame. Dropping by
        # the index of an already re-indexed frame would remove the first k
        # rows instead of the matched ones.
        defect_df = df.iloc[defect_positions].reset_index(drop=True)
        other_df = df.iloc[other_positions].reset_index(drop=True)

        return defect_df, other_df

In [3]:
# Load the label CSV (comma-separated) into a DataFrame indexed by SERIAL_NO,
# including the derived columns that get_label_df adds.
label_df = DataFrameFactory.get_label_df(label_path=DATASET_PROPERTIES.LABEL_PATH, sep=",")

## Label Dataframe

In [8]:
# Preview the first rows of the label table.
label_df.head()

Unnamed: 0_level_0,CAMERA_INFO,LABEL_BAD_STATUS,START_X,END_X,START_Y,END_Y,RESIZE_RATE,WORK_SHOP_ID,PICTURE_IDX,CAM_INDEX,DEFECT_CATEGORY,PRODUCT_CATEGORY
SERIAL_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20A13B2C2174C,CAM3,B003,64.0,68.0,131.0,136.0,0.398261,1680,1,3,3,Housing
20A13B2C2174C,CAM3,B003,236.0,248.0,188.0,198.0,0.398261,1680,1,3,3,Housing
20A13B2C2178B,CAM3,B003,55.0,62.0,136.0,144.0,0.398261,1680,1,3,3,Housing
20A13B2C2184B,CAM3,B003,73.0,79.0,99.0,104.0,0.398261,1680,1,3,3,Housing
20A13B2C2184B,CAM3,B003,79.0,84.0,81.0,85.0,0.398261,1680,1,3,3,Housing


In [13]:
# Position-based selection: the second row (position 1), all columns.
label_df.iloc[1, :]

CAMERA_INFO             CAM3
LABEL_BAD_STATUS        B003
START_X                  236
END_X                    248
START_Y                  188
END_Y                    198
RESIZE_RATE         0.398261
WORK_SHOP_ID            1680
PICTURE_IDX                1
CAM_INDEX                  3
DEFECT_CATEGORY            3
PRODUCT_CATEGORY     Housing
Name: 20A13B2C2174C, dtype: object

In [14]:
# Position-based selection: every row of the second column (position 1).
label_df.iloc[:, 1]

SERIAL_NO
20A13B2C2174C    B003
20A13B2C2174C    B003
20A13B2C2178B    B003
20A13B2C2184B    B003
20A13B2C2184B    B003
                 ... 
20H31B2C4930B    B003
20H31B2C4930B    B003
20H31B2C4945C    B003
20H31B2C4962C    B003
20H31B2C5231C    B003
Name: LABEL_BAD_STATUS, Length: 3699, dtype: object

In [33]:
# Label-based selection by serial number; the index is not unique, so a
# serial number with several labelled defects yields several rows.
label_df.loc["20A13B2C2174C", :]

Unnamed: 0_level_0,CAMERA_INFO,LABEL_BAD_STATUS,START_X,END_X,START_Y,END_Y,RESIZE_RATE,WORK_SHOP_ID,PICTURE_IDX,CAM_INDEX,DEFECT_CATEGORY,PRODUCT_CATEGORY
SERIAL_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20A13B2C2174C,CAM3,B003,64.0,68.0,131.0,136.0,0.398261,1680,1,3,3,Housing
20A13B2C2174C,CAM3,B003,236.0,248.0,188.0,198.0,0.398261,1680,1,3,3,Housing


In [26]:
# Collect the LABEL_BAD_STATUS value of every row by iterating with
# iterrows(), then preview the first ten entries.
defect_category_list = [row["LABEL_BAD_STATUS"] for i, row in label_df.iterrows()]
defect_category_list[:10]

['B003',
 'B003',
 'B003',
 'B003',
 'B003',
 'B003',
 'B003',
 'B001',
 'B003',
 'B003']

In [10]:
# Filter with a query expression: rows for camera CAM3 and one serial number
# (SERIAL_NO is the index name and can be referenced inside query()).
label_df.query("(CAMERA_INFO == 'CAM3') and (SERIAL_NO == '20A13B2C2184B')")

Unnamed: 0_level_0,CAMERA_INFO,LABEL_BAD_STATUS,START_X,END_X,START_Y,END_Y,RESIZE_RATE,WORK_SHOP_ID,PICTURE_IDX,CAM_INDEX,DEFECT_CATEGORY,PRODUCT_CATEGORY
SERIAL_NO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20A13B2C2184B,CAM3,B003,73.0,79.0,99.0,104.0,0.398261,1680,1,3,3,Housing
20A13B2C2184B,CAM3,B003,79.0,84.0,81.0,85.0,0.398261,1680,1,3,3,Housing


In [11]:
# Serial numbers (index values) of all rows labelled B001; a serial number
# appears once per matching row, so duplicates are possible.
label_df.query("LABEL_BAD_STATUS == 'B001'").index

Index(['20A13B2C2220C', '20A14B2C3221C', '20A14B2C3260B', '20A21B2C2628B',
       '20A29B2C3186B', '20A29B2C3190C', '20A30B2C2082C', '20A30B2C2384C',
       '20A30B2C2384C', '20A30B2C2578C', '20A30B2C2994C', '20A30B2C2994C',
       '20B04B2C2682C', '20B04B2C2872C', '20B04B2C2902B', '20B04B2C3400C',
       '20B04B2C3572C', '20B11B2C2838B', '20B11B2C3594B', '20B12B2C2681C',
       '20B12B2C3281C', '20B17B2C3558B', '20B17B2C3610B', '20B18B2C2941C',
       '20B20B2C2120B', '20B20B2C2120B', '20B20B2C2239C', '20B20B2C3214C',
       '20B20B2C3214C', '20B20B2C3244B', '20B20B2C3369C', '20B20B2C3369C',
       '20B24B2C2192C', '20B24B2C2343C', '20B24B2C3268C', '20B25B2C2508B',
       '20B25B2C3059B', '20B25B2C3227C', '20C02B2C2088C', '20D02B2C3145B',
       '20D02B2C3145B', '20D17B2C2050C', '20D24B2C2880C', '20D27B2C3357B',
       '20D28B2C2023C', '20D28B2C2108C', '20E13B2C3525C', '20E20B2C3299B',
       '20G06B2C2701C', '20G06B2C3249C', '20G06B2C3249C', '20G07B2C2100B',
       '20G07B2C2120C', '

In [30]:
# De-duplicate the serial numbers via a set and preview ten of them;
# set iteration order is arbitrary, so the preview is not sorted.
list(set(label_df.index))[:10]

['20B03B2C3232B',
 '20B18B2C2738C',
 '20B24B2C3384B',
 '20B12B2C3320C',
 '20G15B2C2237C',
 '20B25B2C3067C',
 '20B24B2C2248B',
 '20B03B2C2179D',
 '20H19B2C2958B',
 '20H18B2C3022B']

## Dictionary of Image Path

In [35]:
# Serial number -> image path dictionaries for Housing NG images, cameras 1-4.
(
    housing_cam1_NG_image_path_dict,
    housing_cam2_NG_image_path_dict,
    housing_cam3_NG_image_path_dict,
    housing_cam4_NG_image_path_dict,
) = (
    DataFrameFactory.get_image_path_dict(dir_path=DATASET_PROPERTIES.HOUSING_NG_PATH, cam_number=cam)
    for cam in (1, 2, 3, 4)
)

In [36]:
# Serial number -> image path dictionaries for Housing OK images, cameras 1-4.
(
    housing_cam1_OK_image_path_dict,
    housing_cam2_OK_image_path_dict,
    housing_cam3_OK_image_path_dict,
    housing_cam4_OK_image_path_dict,
) = (
    DataFrameFactory.get_image_path_dict(dir_path=DATASET_PROPERTIES.HOUSING_OK_PATH, cam_number=cam)
    for cam in (1, 2, 3, 4)
)

In [37]:
# Serial number -> image path dictionaries for Cover NG images, cameras 1-4.
(
    cover_cam1_NG_image_path_dict,
    cover_cam2_NG_image_path_dict,
    cover_cam3_NG_image_path_dict,
    cover_cam4_NG_image_path_dict,
) = (
    DataFrameFactory.get_image_path_dict(dir_path=DATASET_PROPERTIES.COVER_NG_PATH, cam_number=cam)
    for cam in (1, 2, 3, 4)
)

In [38]:
# Serial number -> image path dictionaries for Cover OK images, cameras 1-4.
(
    cover_cam1_OK_image_path_dict,
    cover_cam2_OK_image_path_dict,
    cover_cam3_OK_image_path_dict,
    cover_cam4_OK_image_path_dict,
) = (
    DataFrameFactory.get_image_path_dict(dir_path=DATASET_PROPERTIES.COVER_OK_PATH, cam_number=cam)
    for cam in (1, 2, 3, 4)
)

## Dataframe for training

In [39]:
# Housing NG training frames: camera n is paired with its own image-path dict.
(
    housing_cam1_NG_df,
    housing_cam2_NG_df,
    housing_cam3_NG_df,
    housing_cam4_NG_df,
) = (
    DataFrameFactory.get_data(cam_num=cam, image_path_dict=paths, label_df=label_df, is_NG=True)
    for cam, paths in enumerate(
        (
            housing_cam1_NG_image_path_dict,
            housing_cam2_NG_image_path_dict,
            housing_cam3_NG_image_path_dict,
            housing_cam4_NG_image_path_dict,
        ),
        start=1,
    )
)

In [40]:
# Housing OK training frames: camera n is paired with its own image-path dict.
(
    housing_cam1_OK_df,
    housing_cam2_OK_df,
    housing_cam3_OK_df,
    housing_cam4_OK_df,
) = (
    DataFrameFactory.get_data(cam_num=cam, image_path_dict=paths, label_df=label_df, is_NG=False)
    for cam, paths in enumerate(
        (
            housing_cam1_OK_image_path_dict,
            housing_cam2_OK_image_path_dict,
            housing_cam3_OK_image_path_dict,
            housing_cam4_OK_image_path_dict,
        ),
        start=1,
    )
)

In [41]:
# Cover NG training frames: camera n is paired with its own image-path dict.
(
    cover_cam1_NG_df,
    cover_cam2_NG_df,
    cover_cam3_NG_df,
    cover_cam4_NG_df,
) = (
    DataFrameFactory.get_data(cam_num=cam, image_path_dict=paths, label_df=label_df, is_NG=True)
    for cam, paths in enumerate(
        (
            cover_cam1_NG_image_path_dict,
            cover_cam2_NG_image_path_dict,
            cover_cam3_NG_image_path_dict,
            cover_cam4_NG_image_path_dict,
        ),
        start=1,
    )
)

In [42]:
# Cover OK training frames: camera n is paired with its own image-path dict.
(
    cover_cam1_OK_df,
    cover_cam2_OK_df,
    cover_cam3_OK_df,
    cover_cam4_OK_df,
) = (
    DataFrameFactory.get_data(cam_num=cam, image_path_dict=paths, label_df=label_df, is_NG=False)
    for cam, paths in enumerate(
        (
            cover_cam1_OK_image_path_dict,
            cover_cam2_OK_image_path_dict,
            cover_cam3_OK_image_path_dict,
            cover_cam4_OK_image_path_dict,
        ),
        start=1,
    )
)

In [43]:
# Display the Cover / camera 1 / OK frame; OK rows carry [0] placeholders
# in every annotation column.
cover_cam1_OK_df

Unnamed: 0,serial_number_list,image_path_list,is_NG_list,cam_list,defect_category_list,x_st_list,x_ed_list,y_st_list,y_ed_list,ratio_list
0,20C18B2C0053,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
1,20C18B2C0066,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
2,20C18B2C0141,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
3,20C18B2C0150,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
4,20C18B2C0960,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
5,20C18B2C0974,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
6,20C18B2C1014,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
7,20C18B2C1021,D:\AI_Lab\MH\git\newnewnew\Myunghwa-AI-Vision-...,0,1,[0],[0],[0],[0],[0],[0]
