In [None]:
'''
Préparation des données pour le challenge Makerere
 1. séparer les données originales en train, val, test
 2. transformer les labels du format COCO vers le format YOLO
 3. créer manuellement le fichier data.yaml
'''

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# paths used further down in this notebook all live under that mount point.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted - confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# imports
import shutil
import os
import csv

In [None]:
# extract an archive into a destination folder
def unzip_file(src, dest):
  """Unpack the archive stored at `src` into the directory `dest`.

  Args:
    src: path of the archive file to extract.
    dest: directory that receives the extracted content.
  """
  shutil.unpack_archive(filename=src, extract_dir=dest)

# split data into train, valid, test
def split_data(src, dest, n_train=2500):
  """Copy the raw images into the final train / valid / test image folders.

  Args:
    src: folder that contains the `Train_Images/` and `Test_Images/` folders.
    dest: folder under which `train/images/`, `valid/images/` and
      `test/images/` are filled (they are created when missing).
    n_train: how many of the initial train images, taken in sorted file-name
      order, go to the final train split; the remaining ones form the valid
      split. Defaults to 2500.
  """
  def _list_images(folder):
    # os.listdir() returns entries in arbitrary order and also reports hidden
    # files such as '.DS_Store'. Sorting makes the split reproducible between
    # runs; hidden entries and sub-directories are left out of the dataset.
    return sorted(
        name for name in os.listdir(folder)
        if not name.startswith('.') and os.path.isfile(os.path.join(folder, name))
    )

  def _copy_all(names, from_dir, to_dir):
    # shutil.copy() does not create the destination folder by itself
    os.makedirs(to_dir, exist_ok=True)
    for name in names:
      shutil.copy(os.path.join(from_dir, name), os.path.join(to_dir, name))

  train_src = os.path.join(src, 'Train_Images')
  test_src = os.path.join(src, 'Test_Images')
  train_files = _list_images(train_src)
  test_files = _list_images(test_src)

  # the first n_train initial train images make the final train images
  _copy_all(train_files[:n_train], train_src, os.path.join(dest, 'train', 'images'))

  # the remaining initial train images make the final valid images
  _copy_all(train_files[n_train:], train_src, os.path.join(dest, 'valid', 'images'))

  # the initial test images make the final test images
  _copy_all(test_files, test_src, os.path.join(dest, 'test', 'images'))

# Dictionary that maps class names to IDs
class_name_to_id_mapping = {"fruit_healthy": 0,
                            "fruit_brownspot": 1,
                            "fruit_woodiness": 2}

# to convert annotations into yolov5 format
def convert_to_yolov5(info, image_w=512, image_h=512):
  """Convert one annotation row into a YOLOv5 label tuple.

  Args:
    info: indexable row where info[1] is the class name and info[2..5] are
      xmin, ymin, box width and box height (numbers or numeric strings).
    image_w: width of the image used for normalisation. Defaults to 512.
    image_h: height of the image used for normalisation. Defaults to 512.

  Returns:
    (class_id, center_x, center_y, width, height), the four box values being
    divided by the image dimensions.

  Raises:
    KeyError: if the class name is not in class_name_to_id_mapping.
    ValueError: if a coordinate cannot be parsed as a float.
  """
  class_id = class_name_to_id_mapping[info[1]]
  xmin, ymin, b_width, b_height = (float(value) for value in info[2:6])

  # YOLO describes a box by its centre instead of its top-left corner
  b_center_x = xmin + b_width/2
  b_center_y = ymin + b_height/2

  # Normalise the co-ordinates by the dimensions of the image
  return (class_id,
          b_center_x / image_w,
          b_center_y / image_h,
          b_width / image_w,
          b_height / image_h)

# create yolov5 annotations of each images in src
def label_images(src, csv_path=None):
  """Write one YOLOv5 label file per annotated image of the train/valid splits.

  Every row of the annotation CSV (after its header line) is converted with
  convert_to_yolov5() and stored in `<split>/labels/<image id>.txt`, where
  `<split>` is the split whose `images/` folder holds `<image id>.jpg`.

  Label files are rewritten on each call, so running the function again does
  not duplicate annotations. Rows whose image is in neither split are skipped
  instead of being written to the file of a previous row.

  Args:
    src: folder that contains `train/images/` and `valid/images/`.
    csv_path: annotation CSV to read; defaults to the Train.csv of the
      initial data folder on Google Drive.
  """
  if csv_path is None:
    csv_path = '/content/drive/MyDrive/Zindi/makerere-passion-fruit-disease-detection/initial_data/Train.csv'

  # image file name -> label folder of its split (dict lookup instead of
  # scanning a list for every CSV row); valid is registered last so that it
  # takes precedence if a name were present in both splits
  label_dir_of = {}
  for split in ('train', 'valid'):
    label_dir = os.path.join(src, split, 'labels')
    os.makedirs(label_dir, exist_ok=True)
    for image_name in os.listdir(os.path.join(src, split, 'images')):
      label_dir_of[image_name] = label_dir

  lines_by_file = {}
  skipped = 0
  # newline='' is what the csv module expects for files handed to csv.reader
  with open(csv_path, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # skip the header line
    for row in csv_reader:
      if not row:
        continue
      # get image filename
      filename = row[0]
      label_dir = label_dir_of.get(filename+'.jpg')
      if label_dir is None:
        # image is in neither split: nothing to label
        skipped += 1
        continue

      # get current file annotations and convert to yolo
      class_id, b_center_x, b_center_y, b_width, b_height = convert_to_yolov5(row)

      txt_dest = os.path.join(label_dir, filename+'.txt')
      lines_by_file.setdefault(txt_dest, []).append(
          str(class_id) + " " + str(b_center_x) + " " + str(b_center_y) + " " + str(b_width) + " " + str(b_height) + '\n')

  # write yolov5 annotations, one open() per label file
  for txt_dest, lines in lines_by_file.items():
    with open(txt_dest, 'w') as f:
      f.writelines(lines)

  if skipped:
    print("label_images: skipped", skipped, "annotation(s) without a matching train/valid image")

def data_preparation():
  """Run the three preparation steps on the Google Drive project folder.

  Step 0 extracts the train and test image archives into the initial data
  folder, step 1 copies the images into the final train / valid / test
  folders and step 2 writes the YOLOv5 label files. Progress is printed
  before and after each step.
  """
  project_dir = '/content/drive/MyDrive/Zindi/makerere-passion-fruit-disease-detection/'
  raw_dir = project_dir + 'initial_data/'
  final_dir = project_dir + 'final_data/'

  # 0. the archives are extracted next to themselves, train first
  print(">>> step 0: unzip initial train and test files")
  for archive_name in ('Train_Images.zip', 'Test_Images.zip'):
    unzip_file(raw_dir + archive_name, raw_dir)
  print("step 0 done !")

  # 1. distribute the extracted images over the final splits
  print(">>> step 1: make final train, valid and test images")
  split_data(raw_dir, final_dir)
  print("step 1 done !")

  # 2. write the yolov5 label files of the final splits
  print(">>> step 2: create yolov5 labels")
  label_images(final_dir)
  print("step 2 done !")

# run the whole pipeline: unzip the archives, split the images, write the labels
data_preparation()

>>> step 0: unzip initial train and test files
step 0 done !
>>> step 1: make final train, valid and test images
step 1 done !
>>> step 2: create yolov5 labels
step 2 done !


In [None]:
# Sanity check: count the images of the train / valid splits that have no
# label file. The names of the unlabeled train images are printed as well.
# `src` is defined here on purpose: inside data_preparation() it is only a
# local variable, so on a fresh kernel this cell would fail with a NameError.
src = '/content/drive/MyDrive/Zindi/makerere-passion-fruit-disease-detection/final_data/'
train_src_img = src+'train/images/'
valid_src_img = src+'valid/images/'
train_src_label = src+'train/labels/'
valid_src_label = src+'valid/labels/'
train_files = os.listdir(train_src_img)
valid_files = os.listdir(valid_src_img)
train_labels = os.listdir(train_src_label)
valid_labels = os.listdir(valid_src_label)

# sets make each membership test constant-time instead of a list scan
train_label_names = set(train_labels)
valid_label_names = set(valid_labels)

e0 = 0  # train images without a label file
e1 = 0  # valid images without a label file
for fn in train_files:
  # splitext() strips only the extension, so names that contain extra dots
  # are still matched with their label file
  if os.path.splitext(fn)[0]+'.txt' not in train_label_names:
    print(fn)
    e0 += 1

for fn in valid_files:
  if os.path.splitext(fn)[0]+'.txt' not in valid_label_names:
    e1 += 1

print(e0, e1)

.DS_Store
1 0
