# [농업 영상 AI 데이터 활용 아이디어 해커톤 대회](http://k-farmdata.com/hackathon/)
NIA와 경상대에서 주최한 AI 해커톤 대회입니다.  
최우수상을 수상하였고, 머신러닝과 딥러닝의 차이점을 명확히 알 수 있었습니다.


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DongChanKIM2/AI-Data-Hackathon-Competiton/blob/main/Fruit_CNN_pre.ipynb)

# Data preprocessing
Colab에 제공할 image data를 pickle(X, y) 형태로 가공하는 과정<br>
작업폴더구조<br>
data/[apple_img, apple_label,..., radish_img, radish_label]<br>
apple_img/[apple_fuji_L_1-1.png, ...]<br>
apple_label/[apple_fuji_L_1-1.json, ...]

In [None]:
import json 
import glob # 경로명을 이용해 파일 리스트 추출
import os # 현재 위치 및 경로 병합
import pandas as pd # 데이터
from tqdm import tqdm # 실행시간 보여주기
import numpy as np # 수학연산
import pickle # pickle

In [None]:
# 이미지, plotting 관련
import matplotlib.pyplot as plt
from skimage.io import imread
from skimage.transform import resize

In [None]:
def get_label(filenames):
    """
    Map each label file's base name to the ``cate3`` value stored in it.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the JSON label files to read.

    Returns
    -------
    dict
        Keys are the file names with the directory and everything after the
        first ``.`` removed; values are ``d['cate3']`` from the parsed JSON
        (the product grade, according to the original notes).

    Notes
    -----
    Files that cannot be decoded as UTF-8 are reported with
    ``'Unicode_error'`` and skipped. A missing ``cate3`` key still raises
    ``KeyError``.
    """
    labels = {}

    for f in tqdm(filenames):
        with open(f, "r", encoding='utf-8') as json_file:
            try:
                d = json.load(json_file)
            except UnicodeDecodeError:
                print('Unicode_error')
                continue

        # Accept both '\\' and '/' as separators so the key is the bare file
        # name on every platform (glob returns '/'-separated paths on POSIX).
        file_name = f.replace('\\', '/').rsplit('/', 1)[-1]
        file_name = file_name.split('.')[0]
        labels[file_name] = d['cate3']
    return labels

In [None]:
def get_raw_imgs(img_names):
    """
    Read every path in ``img_names`` with ``imread`` and return the loaded
    images as a list, in the same order as the input.

    NOTE(review): the original note described each image as 100x100x3;
    nothing in this function enforces or checks that size.
    """
    return [imread(path) for path in img_names]

In [None]:
def make_dataset(img_names, labels):
    """
    Build parallel arrays of images, labels and file names.

    Parameters
    ----------
    img_names : iterable of str
        Paths of the image files to load.
    labels : dict
        Mapping from a file's base name (no directory, no extension) to its
        label.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y, fn)``: the loaded images, their labels and their base names.
        Images whose base name has no (truthy) entry in ``labels`` are left
        out, as are images for which ``imread`` raises ``ValueError``.
    """
    X = []
    y = []
    fn = []
    for img_nm in tqdm(img_names):
        try:
            # Presumably 0-255 pixel data, per the original note.
            img_raw = imread(img_nm)
        except ValueError:
            print('Error about image')
            continue

        # Drop the 4th (alpha) channel. The ndim check keeps a 2-D image that
        # happens to be 4 pixels wide from being indexed with three axes.
        if img_raw.ndim == 3 and img_raw.shape[-1] == 4:
            img_raw = img_raw[:, :, :-1]

        # Bare file name without extension; both '\\' and '/' are treated as
        # separators so the key matches on Windows and POSIX alike.
        pure_nm = img_nm.replace('\\', '/').rsplit('/', 1)[-1].split('.')[0]
        # NOTE: resizing is disabled here, so images keep the size they were
        # stored with.
        # img_raw = resize(img_raw, (100, 100))  # min_max scaling
        label = labels.get(pure_nm, 0)
        if label:  # skip images without a label (0 is the "missing" marker)
            X.append(img_raw)
            y.append(label)
            fn.append(pure_nm)

    X = np.array(X)
    y = np.array(y)
    fn = np.array(fn)
    return X, y, fn

In [None]:
# Names of the crops (10 in total, per the original note), taken from the
# part of each entry under data/ that precedes the first underscore.
all_file_names = glob.glob('data/*')
# Treat both '\\' and '/' as separators so the directory prefix is removed on
# every platform before the name is split on '_'.
fruit_names = sorted(
    {x.replace('\\', '/').rsplit('/', 1)[-1].split('_')[0] for x in all_file_names}
)

In [None]:
# Container for the subclass names found for each crop.
subclass_dict = dict()

In [None]:
# For every crop, collect the distinct subclass names, i.e. the second
# underscore-separated field of each image file name.
for fruit_name in tqdm(fruit_names):

    img_names = glob.glob(f'data/{fruit_name}_img/*')
    # Strip the directory part with either separator ('\\' or '/') so the
    # field index is the same on Windows and POSIX.
    subclass = {x.replace('\\', '/').rsplit('/', 1)[-1].split('_')[1] for x in img_names}

    subclass_dict[fruit_name] = subclass

In [None]:
# Manually set the 'chinese' entry to the single subclass 'cabbage'.
subclass_dict['chinese'] = {'cabbage'}

In [None]:
subclass_dict  # inspect the collected subclasses

{'apple': {'fuji', 'yanggwang'},
 'cabbage': {'green', 'red'},
 'chinese': {'cabbage'},
 'garlic': {'uiseong'},
 'mandarin': {'hallabong', 'onjumilgam'},
 'onion': {'red', 'white'},
 'pear': {'chuhwang', 'singo'},
 'persimmon': {'bansi', 'booyu', 'daebong'},
 'potato': {'seolbong', 'sumi'},
 'radish': {'winter radish'}}

## Plotting

In [None]:
def get_raw_imgs(img_names):
    """
    Load each image file named in ``img_names`` via ``imread``.

    Returns a list holding one loaded image per input path, in input order.
    NOTE(review): the original note said each image is 100x100x3; this
    function does not resize or verify the shape.
    """
    return [imread(img_path) for img_path in img_names]

In [None]:
def img_plotting(imgs, titles, *args):
    """
    Draw images on a grid in a new 12x6 figure, one subplot per image.

    Parameters
    ----------
    imgs : sequence
        Images accepted by ``plt.imshow``.
    titles : sequence of str
        File paths of the images. Each subplot title is made of the first
        three underscore-separated fields of the path's file name.
    *args
        ``args[0]`` is the number of grid rows and ``args[1]`` the number of
        grid columns.

    At most ``rows * cols`` images are drawn; if fewer images are supplied,
    only those are drawn.
    """
    n_rows, n_cols = args[0], args[1]
    plt.figure(figsize=(12, 6))
    # Build the titles from the ``titles`` argument itself rather than from a
    # module-level variable, and accept both '\\' and '/' as separators.
    titles = [
        '_'.join(x.replace('\\', '/').rsplit('/', 1)[-1].split('_')[:3])
        for x in titles
    ]
    # Never index past the images/titles actually supplied.
    n_cells = min(n_rows * n_cols, len(imgs), len(titles))
    for i in range(n_cells):
        plt.subplot(n_rows, n_cols, i + 1)
        plt.imshow(imgs[i])
        plt.title(titles[i])
        plt.axis("off")

In [None]:
# Sample n images per (crop, subclass) pair, plot them on a 2x5 grid and save
# the figure as ./<crop>_<subclass>.png. The seed makes the sampling
# reproducible.
n = 20
np.random.seed(2021)
for fruit_name in fruit_names:
    # The file list depends only on the crop, so read it once per crop.
    img_names = glob.glob(f'data/{fruit_name}_img/*')
    for subclass in subclass_dict[fruit_name]:
        if subclass == 'cabbage':
            # For the 'cabbage' subclass every image of the crop is a candidate.
            img_locs = np.random.choice(img_names, n, replace=False)
        else:
            # Keep files whose second underscore-separated field equals the
            # subclass; both '\\' and '/' are treated as path separators.
            candidates = [
                x for x in img_names
                if x.replace('\\', '/').rsplit('/', 1)[-1].split('_')[1] == subclass
            ]
            img_locs = np.random.choice(candidates, n, replace=False)
        imgs_info = get_raw_imgs(img_locs)
        img_plotting(imgs_info, img_locs, 2, 5)
        plt.savefig(f'./{fruit_name}_{subclass}.png')


In [None]:
# Save one pickle per crop containing X, y and the file names.
# Create the output directory first so that open(..., 'wb') cannot fail
# because it is missing.
os.makedirs('data_colab', exist_ok=True)
for fruit_name in fruit_names:

    print(f'process {fruit_name}')
    filenames = glob.glob(f'data/{fruit_name}_label/*')
    labels = get_label(filenames)
    img_names = glob.glob(f'data/{fruit_name}_img/*')

    X, y, fn = make_dataset(img_names, labels)

    dataset = {'X': X, 'y': y, 'file_name': fn}

    with open(f'data_colab/{fruit_name}.pkl', 'wb') as f:
        pickle.dump(dataset, f)