# Conversion of Dataset to COCO Format

In this notebook, we'll convert our Custom Dataset format to the COCO Format which can be used to train a model in Detectron2.

- This dataset consists of 428 real images stored in the Images folder; the annotation for each image is stored in the Annotations folder. **401 images are used for training and the remaining 27 images are used for validation**. 
- The annotation file for an image has the same name as the image, differing only in its extension. 
For example, if the image is named "1.jpeg", the corresponding annotation file is "1.txt".
- The annotation files are stored in the following format:
- The number of lines in an annotation text file equals the number of bounding boxes in that image.
- Each line represents a single bounding box, in the format x1, x2, x3, x4, y1, y2, y3, y4, Language. 
Here (x1,y1) is the top-left corner, (x2,y2) the top-right, (x3,y3) the bottom-right and (x4,y4) the bottom-left.
- The points are listed in clockwise order, starting from the top-left point.


You can download the dataset here: https://drive.google.com/file/d/1gZW8WiQz5UYPXo97nmcP7AI8dHH1yqPM/view?usp=sharing

Upload it either to your session storage or upload it to your drive and mount your drive to this notebook.


In [None]:
# If you uploaded the zip file to your session storage, unzip it with the command below; otherwise adjust the path accordingly.
# !unzip "/content/Text Detection Dataset.zip"

In [None]:
# Locations of the two dataset splits on the mounted Google Drive.
# Later cells read "<split>/Annotations" and "<split>/Images" underneath these.
train_path = '/content/drive/MyDrive/datasets/Text_Detection/Text Detection Dataset/Train'
# Note: despite its name, test_path points at the validation ("Val") split.
test_path = '/content/drive/MyDrive/datasets/Text_Detection/Text Detection Dataset/Val'

In [None]:
# Mount Google Drive at /content/drive so the dataset paths under
# /content/drive/MyDrive become readable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## For Train Data

In [None]:
import pandas as pd
import json

# Never truncate cell contents when a frame is displayed, so the long
# per-image annotation lists can be inspected in full.
pd.options.display.max_colwidth = None

In [None]:
# Empty result table: one row per image will be added by the conversion loop.
# "annotations" holds a Python list per row, hence the explicit object dtype.
main_result = pd.DataFrame(
    columns=["file_name", "height", "width", "annotations"]
).astype({"annotations": "object"})

In [None]:
import os
from natsort import natsorted

# Work from inside the training annotations folder so bare file names resolve.
os.chdir(train_path+"/Annotations")

# Keep only the .txt annotation files. The exported dict_train.json is written
# into this same folder, so an unfiltered listing would pick it up on a re-run.
# Natural sorting gives 0.txt, 1.txt, 2.txt, ... instead of 0.txt, 1.txt, 10.txt, ...
file_list = natsorted([name for name in os.listdir() if name.lower().endswith(".txt")])
for i in file_list:
  print(i)

In [None]:
# Language label found in the annotation files -> category id (stored as a string).
cat_dict = dict(zip(("HINDI", "ENGLISH", "OTHER"), ("0", "1", "2")))
import cv2
from google.colab.patches import cv2_imshow

# print(file_list)

In [None]:
# Display the naturally sorted list of annotation file names.
file_list
# To inspect the raw lines of the first annotation file, uncomment:
# with open(file_list[0],'r') as f:
#   y = f.readlines()
# y

### 0.txt has 3 boxes
1. 122,294,295,126,207,214,277,270,HINDI
2. 291,469,472,303,213,221,283,287,HINDI
3. 464,581,581,473,214,221,287,281,HINDI


In [None]:
  # Sanity check: show the path of the first training image (0.jpeg) to confirm
  # the "<train_path>/Images/<n>.jpeg" layout used by the conversion loop.
  # Earlier attempt, kept for reference (it lacked the "/Images" folder):
  # h = cv2.imread(train_path+'/0'+".jpeg").shape[:2]
  f"{train_path}/Images/{str(0)}.jpeg"
  /content/drive/MyDrive/datasets/Text_Detection/Text Detection Dataset/Train/Images/0.jpeg

'/content/drive/MyDrive/datasets/Text_Detection/Text Detection Dataset/Train/Images/0.jpeg'

In [None]:
# /content/drive/MyDrive/datasets/Text_Detection/Text Detection Dataset/Train/Images

In [None]:
# Column layout of one annotation line: four x coordinates, four y coordinates
# (corners clockwise from the top-left) and the language label.
header_list = ["x1", "x2", "x3", "x4","y1", "y2","y3", "y4", "category_id"]

# One dict per image is collected in a plain list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row is quadratic.
train_records = []
for ann_name in file_list:
  # Skip anything that is not an annotation file, e.g. a dict_train.json left
  # in the annotations folder by a previous run.
  if not ann_name.lower().endswith(".txt"):
    continue

  # Explicit path, so the read does not depend on the current working directory.
  boxes = pd.read_csv(os.path.join(train_path, "Annotations", ann_name),
                      header=None, index_col=False, names=header_list)

  # Box = top-left corner (x1, y1) plus the extent to the opposite corner (x3, y3).
  widths = (boxes["x1"] - boxes["x3"]).abs()
  heights = (boxes["y1"] - boxes["y3"]).abs()

  # bbox_mode 1 is Detectron2's BoxMode.XYWH_ABS, i.e. [x, y, width, height] in pixels.
  # Labels missing from cat_dict are passed through unchanged.
  annotations = [
      {"bbox": [x, y, w, h], "bbox_mode": 1, "category_id": cat_dict.get(label, label)}
      for x, y, w, h, label in zip(boxes["x1"], boxes["y1"], widths, heights, boxes["category_id"])
  ]

  # Derive the image name from the annotation name rather than from a running
  # counter, so a gap in the numbering cannot pair an annotation with the wrong image.
  image_name = os.path.splitext(ann_name)[0] + ".jpeg"
  image_path = os.path.join(train_path, "Images", image_name)
  image = cv2.imread(image_path)
  if image is None:
    # cv2.imread signals a missing/unreadable file by returning None.
    raise FileNotFoundError(f"Could not read image {image_path!r} for annotation {ann_name!r}")
  img_height, img_width = image.shape[:2]

  train_records.append({
      "file_name": image_name,
      "height": img_height,
      "width": img_width,
      "annotations": annotations,
  })

main_result = pd.DataFrame(train_records, columns=["file_name","height","width","annotations"])
print(f"Converted {len(main_result)} training images")

After running the next cell, you'll find the JSON file (`dict_train.json`) in the Annotations folder of the training data.


In [None]:
# Build the export from a copy instead of mutating main_result in place, so the
# cell can be re-run: a second in-place reset_index/rename would create a
# duplicate "image_id" column, which to_json(orient="records") rejects.
train_export = main_result.drop(columns=["image_id"], errors="ignore").reset_index(drop=True)
# image_id is simply the row position, placed as the first column.
train_export.insert(0, "image_id", train_export.index)
# Explicit destination, so the file lands in the training Annotations folder
# regardless of the current working directory.
train_export.to_json(os.path.join(train_path, "Annotations", "dict_train.json"), orient="records")


## For Validation Data

In [None]:
import pandas as pd

# Never truncate cell contents when a frame is displayed, so the long
# per-image annotation lists can be inspected in full.
pd.options.display.max_colwidth = None

In [None]:
# Fresh, empty result table for the validation split.
# "annotations" holds a Python list per row, hence the explicit object dtype.
main_result = pd.DataFrame(
    columns=["file_name", "height", "width", "annotations"]
).astype({"annotations": "object"})

In [None]:
import os
from natsort import natsorted

# Work from inside the validation annotations folder so bare file names resolve.
os.chdir(test_path+"/Annotations")

# Keep only the .txt annotation files. The exported dict_val.json is written
# into this same folder, so an unfiltered listing would pick it up on a re-run.
file_list = natsorted([name for name in os.listdir() if name.lower().endswith(".txt")])
for i in file_list:
  print(i)

In [None]:
# Language label found in the annotation files -> category id (stored as a string).
cat_dict = dict(zip(("HINDI", "ENGLISH", "OTHER"), ("0", "1", "2")))
import cv2
from google.colab.patches import cv2_imshow

# Show the validation annotation files that the conversion loop will iterate over.
print(file_list)

['401.txt', '402.txt', '403.txt', '404.txt', '405.txt', '406.txt', '407.txt', '408.txt', '409.txt', '410.txt', '411.txt', '412.txt', '413.txt', '414.txt', '415.txt', '416.txt', '417.txt', '418.txt', '419.txt', '420.txt', '421.txt', '422.txt', '423.txt', '424.txt', '425.txt', '426.txt', '427.txt']


In [None]:
# Column layout of one annotation line: four x coordinates, four y coordinates
# (corners clockwise from the top-left) and the language label.
header_list = ["x1", "x2", "x3", "x4","y1", "y2","y3", "y4", "category_id"]

# One dict per image is collected in a plain list and turned into a DataFrame
# once at the end: DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row is quadratic.
val_records = []
for ann_name in file_list:
  # Skip anything that is not an annotation file, e.g. a dict_val.json left
  # in the annotations folder by a previous run.
  if not ann_name.lower().endswith(".txt"):
    continue

  # Explicit path, so the read does not depend on the current working directory.
  boxes = pd.read_csv(os.path.join(test_path, "Annotations", ann_name),
                      header=None, index_col=False, names=header_list)

  # Box = top-left corner (x1, y1) plus the extent to the opposite corner (x3, y3).
  widths = (boxes["x1"] - boxes["x3"]).abs()
  heights = (boxes["y1"] - boxes["y3"]).abs()

  # bbox_mode 1 is Detectron2's BoxMode.XYWH_ABS, i.e. [x, y, width, height] in pixels.
  # Labels missing from cat_dict are passed through unchanged.
  annotations = [
      {"bbox": [x, y, w, h], "bbox_mode": 1, "category_id": cat_dict.get(label, label)}
      for x, y, w, h, label in zip(boxes["x1"], boxes["y1"], widths, heights, boxes["category_id"])
  ]

  # Derive the image name from the annotation name rather than from a running
  # counter, so a gap in the numbering cannot pair an annotation with the wrong image.
  image_name = os.path.splitext(ann_name)[0] + ".jpeg"
  image_path = os.path.join(test_path, "Images", image_name)
  image = cv2.imread(image_path)
  if image is None:
    # cv2.imread signals a missing/unreadable file by returning None.
    raise FileNotFoundError(f"Could not read image {image_path!r} for annotation {ann_name!r}")
  img_height, img_width = image.shape[:2]

  val_records.append({
      "file_name": image_name,
      "height": img_height,
      "width": img_width,
      "annotations": annotations,
  })

main_result = pd.DataFrame(val_records, columns=["file_name","height","width","annotations"])
print(f"Converted {len(main_result)} validation images")

In [None]:

# Build the export from a copy instead of mutating main_result in place, so the
# cell can be re-run: a second in-place reset_index/rename would create a
# duplicate "image_id" column, which to_json(orient="records") rejects.
val_export = main_result.drop(columns=["image_id"], errors="ignore").reset_index(drop=True)
# image_id is simply the row position, placed as the first column.
val_export.insert(0, "image_id", val_export.index)
# Explicit destination, so the file lands in the validation Annotations folder
# regardless of the current working directory.
val_export.to_json(os.path.join(test_path, "Annotations", "dict_val.json"), orient="records")

In [None]:
!pip install coco-assistant

Collecting coco-assistant
  Downloading https://files.pythonhosted.org/packages/7c/63/13e07f570402bdd1cc940e516e3535369f64c712c0d6c056cf1a165614fb/coco_assistant-0.3.4-py3-none-any.whl
Collecting Pillow>=8.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/33/34/542152297dcc6c47a9dcb0685eac6d652d878ed3cea83bf2b23cb988e857/Pillow-8.2.0-cp37-cp37m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 14.0MB/s 
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m
Installing collected packages: Pillow, coco-assistant
  Found existing installation: Pillow 7.1.2
    Uninstalling Pillow-7.1.2:
      Successfully uninstalled Pillow-7.1.2
Successfully installed Pillow-8.2.0 coco-assistant-0.3.4


In [None]:
import os
from coco_assistant import COCO_Assistant

# Image and annotation directories of the datasets to merge.
img_path = '/content/drive/MyDrive/datasets/sample_merging/images'
ann_path = '/content/drive/MyDrive/datasets/sample_merging/annotations'
# Both paths are absolute, so resolving them against the working directory
# leaves them unchanged.
img_dir = os.path.abspath(img_path)
ann_dir = os.path.abspath(ann_path)

In [None]:
# Create the assistant over the image and annotation directories; per the
# output below, the annotations are loaded and indexed at construction time.
cas = COCO_Assistant(img_dir, ann_dir)

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [None]:
# Exploratory: list the attributes and methods available on the assistant.
dir(cas)

In [None]:
# Merge the datasets; merge_images=True asks for the image folders to be
# merged as well as the annotation files.
cas.merge(merge_images=True)

In [None]:
def merge(self, merge_images=True):
    """
    Merge multiple COCO datasets into one.

    Reads every annotation file listed in ``self.jsonfiles`` (relative to
    ``self.ann_dir``) in order. The first file is taken as-is (string ids are
    converted to ints); image and annotation ids of every later file are
    renumbered to continue after the last id merged so far, and its
    categories are reconciled through ``CatRemapper``. The result is written
    to ``<self.res_dir>/merged/annotations/merged.json``.

    Args:
        merge_images: When true, also copy every ``.png``/``.jpg``/``.jpeg``
            file (case-insensitive) from each folder in ``self.imgfolders``
            (under ``self.img_dir``) into ``<self.res_dir>/merged/images``.

    Side effects:
        Sets ``self.resim_dir`` and ``self.resann_dir`` and empties both
        directories before writing.
    """

    def _last_id(entries):
        # -1 makes the next file start numbering at 0 when nothing has been
        # merged yet (e.g. a dataset that has images but no annotations).
        return entries[-1]["id"] if entries else -1

    self.resim_dir = os.path.join(self.res_dir, "merged", "images")
    self.resann_dir = os.path.join(self.res_dir, "merged", "annotations")

    # Start from empty output directories so results of a previous merge
    # never leak into this one.
    for out_dir in (self.resim_dir, self.resann_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.makedirs(out_dir, exist_ok=True)

    if merge_images:
        print("Merging image dirs")
        im_dirs = [os.path.join(self.img_dir, folder) for folder in self.imgfolders]
        # Match on the full suffix: slicing the last four characters can
        # never match a five-character extension such as ".jpeg".
        imext = (".png", ".jpg", ".jpeg")

        logging.debug("Merging Image Dirs...")

        for imdir in tqdm(im_dirs):
            for im in os.listdir(imdir):
                if im.lower().endswith(imext):
                    shutil.copyfile(os.path.join(imdir, im), os.path.join(self.resim_dir, im))

    else:
        logging.debug("Not merging Image Dirs...")

    cann = {"images": [], "annotations": [], "info": None, "licenses": None, "categories": None}

    logging.debug("Merging Annotations...")

    dst_ann = os.path.join(self.resann_dir, "merged.json")

    print("Merging annotations")
    # enumerate gives the true position of each file; looking the position up
    # with list.index() would report 0 again for a repeated file name and
    # merge that file without renumbering its ids.
    for ind, j in enumerate(tqdm(self.jsonfiles)):
        with open(os.path.join(self.ann_dir, j)) as a:
            cj = json.load(a)

        if ind == 0:
            # First dataset: take it as-is and use it as the base.
            cann["images"] = cann["images"] + cj["images"]
            cann["annotations"] = cann["annotations"] + cj["annotations"]
            if "info" in cj:
                cann["info"] = cj["info"]
            if "licenses" in cj:
                cann["licenses"] = cj["licenses"]
            cann["categories"] = sorted(cj["categories"], key=lambda cat: cat["id"])

            last_imid = _last_id(cann["images"])
            last_annid = _last_id(cann["annotations"])

            # Later files are numbered by integer arithmetic on these ids,
            # so string ids are replaced by their position in the list.
            if isinstance(last_imid, str) or isinstance(last_annid, str):
                logging.debug("String Ids detected. Converting to int")
                id_dict = {}
                # Change image id in images field
                for i, im in enumerate(cann["images"]):
                    id_dict[im["id"]] = i
                    im["id"] = i

                # Change annotation id & image id in annotations field
                # NOTE(review): image ids are renumbered above whenever either
                # id is a string, but annotations are only re-pointed when the
                # image id was a string - confirm this asymmetry is intended.
                for i, ann in enumerate(cann["annotations"]):
                    ann["id"] = i
                    if isinstance(last_imid, str):
                        ann["image_id"] = id_dict[ann["image_id"]]

        else:
            # Later dataset: shift its ids to continue after the merged ones.
            id_dict = {}
            for new_id, im in enumerate(cj["images"], start=last_imid + 1):
                id_dict[im["id"]] = new_id
                im["id"] = new_id

            for new_id, ann in enumerate(cj["annotations"], start=last_annid + 1):
                ann["id"] = new_id
                ann["image_id"] = id_dict[ann["image_id"]]

            # Remap categories
            cmapper = CatRemapper(cann["categories"], cj["categories"])
            cann["categories"], cj["annotations"] = cmapper.remap(cj["annotations"])

            cann["images"] = cann["images"] + cj["images"]
            cann["annotations"] = cann["annotations"] + cj["annotations"]
            # info/licenses of the most recent file that has them win.
            if "info" in cj:
                cann["info"] = cj["info"]
            if "licenses" in cj:
                cann["licenses"] = cj["licenses"]

        # Highest ids handed out so far; the next file continues from here.
        last_imid = _last_id(cann["images"])
        last_annid = _last_id(cann["annotations"])

    with open(dst_ann, "w") as aw:
        json.dump(cann, aw)
