<a href="https://colab.research.google.com/github/DhruvMakwana/Global-Wheat-Detection/blob/master/Create_Label_For_Yolo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# importing libraries
import os
import pandas as pd
import numpy as np
from glob import glob
from ast import literal_eval
import shutil

In [2]:
# setup directory and files
base_dir = "/content/drive/My Drive/Dataset"
train_dataframe = os.path.join(base_dir, 'train.csv')  # path of the annotation CSV
train_dir = os.path.join(base_dir, 'Train/')           # folder holding the training images
data_dir = os.path.join(base_dir, 'Data/')             # output folder for labels (and later images)
if not os.path.exists(data_dir):
  # exist_ok guards against the directory appearing between the check and the call
  os.makedirs(data_dir, exist_ok=True)
  # report only after the directory really exists
  print("Directory Created")

Directory Created


In [3]:
# load dataframe
# Read the annotation CSV into memory and preview the first rows.
# NOTE(review): the "Unnamed: 0" column in the preview looks like a saved index — confirm before relying on it.
train_df = pd.read_csv(train_dataframe)
train_df.head()

Unnamed: 0,image_id,width,height,bbox,source
0,b6ab77fd7,1024,1024,"[834.0, 222.0, 56.0, 36.0]",usask_1
1,b6ab77fd7,1024,1024,"[226.0, 548.0, 130.0, 58.0]",usask_1
2,b6ab77fd7,1024,1024,"[377.0, 504.0, 74.0, 160.0]",usask_1
3,b6ab77fd7,1024,1024,"[834.0, 95.0, 109.0, 107.0]",usask_1
4,b6ab77fd7,1024,1024,"[26.0, 144.0, 124.0, 117.0]",usask_1


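The bounding box format as given is [xmin, ymin, width, height] in pixels. For YOLOv4, we have to convert these labels into YOLO format: [x_center, y_center, width, height], with each value divided by the image width or height so that it lies between 0 and 1. The functions below perform this conversion.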

In [0]:
def convert(size, box):
  """Scale a pixel-space box to a normalised centre/size box.

  size: (image_width, image_height) in pixels.
  box:  (xmin, xmax, ymin, ymax) in pixels.
  Returns [x_center, y_center, width, height], where the x/width values are
  multiplied by 1/image_width and the y/height values by 1/image_height.
  """
  x_scale, y_scale = 1. / size[0], 1. / size[1]
  x_lo, x_hi, y_lo, y_hi = box[0], box[1], box[2], box[3]

  # box centre and extent, still in pixels
  center_x = (x_lo + x_hi) / 2.0
  center_y = (y_lo + y_hi) / 2.0
  extent_x = x_hi - x_lo
  extent_y = y_hi - y_lo

  return [
    center_x * x_scale,
    center_y * y_scale,
    extent_x * x_scale,
    extent_y * y_scale,
  ]

def convert_to_yolo_label(coco_format_box, w = 1024, h = 1024):
  """Parse a "[xmin, ymin, width, height]" string and return the YOLO box.

  coco_format_box: string holding a Python list literal in pixel units.
  w, h: image width and height used for normalisation (default 1024 each).
  Returns the [x_center, y_center, width, height] list produced by convert().
  Prints a warning when any returned value lies outside the range 0..1.
  """
  parsed = literal_eval(coco_format_box)
  x0, y0, box_w, box_h = (parsed[k] for k in range(4))

  # convert() expects (xmin, xmax, ymin, ymax) as floats
  corners = (x0, x0 + box_w, y0, y0 + box_h)
  yolo_box = convert((w, h), tuple(float(v) for v in corners))

  highest = np.max(yolo_box)
  lowest = np.min(yolo_box)
  if highest > 1 or lowest < 0:
    print("BOX HAS AN ISSUE")
  return yolo_box

In [5]:
%%time
# Add a yolo_box column by running convert_to_yolo_label on every bbox string (default 1024x1024 size is used).
train_df['yolo_box'] = train_df.bbox.apply(convert_to_yolo_label)

CPU times: user 4.38 s, sys: 53.2 ms, total: 4.43 s
Wall time: 4.42 s


In [6]:
# Preview the frame again to check the new yolo_box column.
train_df.head()

Unnamed: 0,image_id,width,height,bbox,source,yolo_box
0,b6ab77fd7,1024,1024,"[834.0, 222.0, 56.0, 36.0]",usask_1,"[0.841796875, 0.234375, 0.0546875, 0.03515625]"
1,b6ab77fd7,1024,1024,"[226.0, 548.0, 130.0, 58.0]",usask_1,"[0.2841796875, 0.5634765625, 0.126953125, 0.05..."
2,b6ab77fd7,1024,1024,"[377.0, 504.0, 74.0, 160.0]",usask_1,"[0.404296875, 0.5703125, 0.072265625, 0.15625]"
3,b6ab77fd7,1024,1024,"[834.0, 95.0, 109.0, 107.0]",usask_1,"[0.86767578125, 0.14501953125, 0.1064453125, 0..."
4,b6ab77fd7,1024,1024,"[26.0, 144.0, 124.0, 117.0]",usask_1,"[0.0859375, 0.19775390625, 0.12109375, 0.11425..."


In [0]:
# Distinct image ids that appear in the annotation frame (one label file is written per id below).
unique_img_ids = train_df.image_id.unique()

In [0]:
# Write one label file per annotated image; every line is "<class> <x> <y> <w> <h>".
folder_location = data_dir
s = "0 %s %s %s %s \n" # the first number is the identifier of the class. If you are doing multi-class, make sure to change that

# groupby hands over each image's boxes in a single pass instead of
# re-filtering the whole frame once per image id; sort=False keeps first-seen order.
for img_id, all_boxes in train_df.groupby("image_id", sort=False)["yolo_box"]:
    file_name = os.path.join(folder_location, "{}.txt".format(img_id))

    # 'w' rather than 'a': re-running this cell must rewrite the file,
    # not append a second copy of every label.
    with open(file_name, 'w') as file:
        file.writelines(s % tuple(box) for box in all_boxes)

In [0]:
# Image ids (file name without directory or extension) of every .jpg in the training folder.
# basename/splitext strip only the directory and the final extension, whereas a plain
# str.replace(".jpg", "") would also remove ".jpg" occurring elsewhere in the name.
all_imgs = [
  os.path.splitext(os.path.basename(path))[0]
  for path in glob(os.path.join(train_dir, "*.jpg"))
]
# Image ids that have at least one row in the annotation frame.
positive_imgs = train_df.image_id.unique()

In [10]:
# Ids found on disk but absent from the annotation frame, i.e. images without bounding boxes.
negative_images = set(all_imgs).difference(positive_imgs)
summary = "Length of all images is: {} \nLength of all images with bbox is: {} \nLength of all images without bbox is: {}"
print(summary.format(len(all_imgs), len(positive_imgs), len(negative_images)))

Length of all images is: 3422 
Length of all images with bbox is: 3373 
Length of all images without bbox is: 49


We also need to create label files for the training images that have no bounding boxes; these label files are left empty.

In [0]:
# Create an empty label file for each image that has no bounding boxes.
for image_id in negative_images:
  file_name = "{}{}.txt".format(data_dir, image_id)
  # opening in 'w' mode and closing straight away leaves a zero-byte file
  open(file_name, 'w').close()

In [12]:
# Expected length should be 3422 one txt file for each image
n_entries = len(glob(data_dir + "/*"))
print("Length of label folder we created is: {}".format(n_entries))

Length of label folder we created is: 3422


Now we need to copy all images into the data directory, where the label files are already in place.

In [0]:
# Copy every entry of the training folder into the data directory, next to the label files.
files = os.listdir(train_dir)
for entry in files:
  source_path = os.path.join(train_dir, entry)
  shutil.copy(source_path, data_dir)

In [14]:
# Expected length should be 3422*2 = 6844
n_entries = len(glob(data_dir + "/*"))
print("Length of label folder we created is: {}".format(n_entries))

Length of label folder we created is: 6844


Now we need to create the train.txt file, which lists the path of every image, one per line.

In [0]:
# Full path of every .jpg in the data directory.
# os.listdir returns entries in arbitrary order, so sort to make train.txt reproducible.
image_files = sorted(
  os.path.join(data_dir, filename)
  for filename in os.listdir(data_dir)
  if filename.endswith(".jpg")
)

# One image path per line. The with-block closes the file on exit,
# so no explicit close() call is needed.
with open(os.path.join(base_dir, "train.txt"), "w") as outfile:
  outfile.writelines(image + "\n" for image in image_files)

Done! We now have a train.txt file containing the location of every image.