## Prepping the TACO data

In [1]:
from pathlib import Path
import pandas as pd
import json
from typing import List
import yaml

Make sure the current working directory is the TACO repository root; if it is not, run `cd TACO/` first.

In [2]:
# Show the kernel's current working directory; the following cells use paths relative to it
pwd

'c:\\repositoryProf\\Project_E2\\TACO'

## Next download the TACO dataset from http://tacodataset.org/

In [87]:
# Once the dataset is downloaded, move into the detector folder and run the split script.
# NOTE: `cd` is relative and changes the kernel's working directory, so run this cell only once per session.
cd detector/

# Run the split script on ../data (the data folder one level above detector/)
!python split_dataset.py --dataset_dir ../data

The annotations are split into 30 JSON files:
10 train, 10 test and 10 validation files

In [3]:
# Go back up one level (undoes the earlier `cd detector/`)
cd ..

# Assign a variable for the data directory, resolved against the current working directory
DATA_TACO = Path.cwd()/"data"
DATA_TACO

WindowsPath('c:/repositoryProf/Project_E2/TACO/data')

### Function to build the image directories

In [95]:
def build_split_images_and_json_file(path_dir_annotations:Path, list_dir_split:List[str]):
    """Build the per-split image folders and annotation files used for YOLOv7.

    For every split name, a ``datasets/<split>/images`` folder is created in the
    parent of the current working directory, the images referenced by the split's
    annotation file are copied into it, and a rewritten
    ``datasets/<split>/annotations-<split>.json`` is saved whose ``file_name``
    entries point at the copied images.

    Args:
        path_dir_annotations (Path): Directory that holds the image batch folders
            (e.g. TACO/data/annotations). The split JSON files are looked up in
            its parent directory.
        list_dir_split (List[str]): Names of the splits (e.g. ["train","test","val"]).

    Raises:
        FileNotFoundError: If no annotation JSON file matches one of the split names.
    """
    # The output tree is created next to the working directory: <cwd>/../datasets
    datasets = Path.cwd().parent/"datasets"
    datasets.mkdir(exist_ok=True)
    # All annotation JSON files, sorted because iterdir() yields entries in
    # arbitrary order; sorting makes the file selection reproducible.
    file_json = sorted(
        f for f in path_dir_annotations.parent.iterdir()
        if f.is_file() and f.name.endswith(".json")
    )
    # "file_name" entries in the annotations are relative to this directory
    path_dir_image_batch = path_dir_annotations
    print(path_dir_image_batch)
    for annot in list_dir_split:
        print(annot)
        # Split folder (e.g. "datasets/train") and its images sub-folder
        dir_split = datasets/annot
        dir_split.mkdir(exist_ok=True)
        images_dir = dir_split/"images"
        images_dir.mkdir(exist_ok=True)
        # Match on the file name only: matching on the full path would select
        # every file as soon as a parent folder contains the split name.
        file_split = [f for f in file_json if annot in f.name]
        if not file_split:
            raise FileNotFoundError(
                f"No annotations json file containing '{annot}' found in {path_dir_annotations.parent}"
            )

        # Content of the new annotations file for this split. When several files
        # match, top-level keys of later files (in sorted order) replace those of
        # earlier ones, so the last file effectively wins.
        dict_js = {}
        for fic in file_split:
            with open(fic,"r", encoding="utf-8") as f:
                dict_js.update(json.load(f))

        for el in dict_js["images"]:
            file_name = el["file_name"]
            path_img = path_dir_image_batch/file_name
            # Flatten "batch_x/name.jpg" into "batch_x_name.jpg" so every image
            # of the split sits in a single folder without name clashes
            file_out = "_".join(file_name.split("/"))
            output_file = images_dir/file_out
            el["file_name"]=f"{annot}/images/{file_out}"
            # Images copied by a previous run are left untouched
            if not output_file.exists():
                output_file.write_bytes(path_img.read_bytes())

        with open(dir_split/f"annotations-{annot}.json","w", encoding="utf-8") as f:
            json.dump(dict_js,f,indent=4)

### Run Function build_split_images_and_json_file

In [97]:
# Names of the split folders to create
split_list_dir = ["train","test","val"]
# Directory passed as path_dir_annotations; the function reads the image folders
# from it and the JSON files from its parent
annotations = DATA_TACO/"annotations"

# Run Function
build_split_images_and_json_file(annotations,split_list_dir)

c:\repositoryProf\Project_E2\TACO\data\annotations
train
test
val


### Checking the image transfer

In [98]:
# Function checking count of images in directory
def check_count_imgs_in_split_dir(list_path_images_split_dir:List[Path]):
    """Print how many files each split's images directory contains.

    Args:
        list_path_images_split_dir (List[Path]): Images directories to inspect
            (e.g. datasets/train/images); the reported name is the parent folder's name.
    """
    for images_dir in list_path_images_split_dir:
        split_name = images_dir.parent.name
        # Only regular files are counted, sub-directories are ignored
        n_files = sum(1 for entry in images_dir.iterdir() if entry.is_file())
        print(f"count images of {split_name} folder is : {n_files}")


In [99]:
# The datasets folder sits two levels above the data directory
datasets = DATA_TACO.parent.parent/"datasets"
# Images directory of each split
train_images_dir = datasets/"train"/"images"
test_images_dir = datasets/"test"/"images"
val_images_dir = datasets/"val"/"images"
list_split_pathImgs = [train_images_dir,test_images_dir,val_images_dir]

In [100]:
# Print the number of files found in each split's images directory
check_count_imgs_in_split_dir(list_split_pathImgs)

count images of train folder is : 1200
count images of test folder is : 150
count images of val folder is : 150


## Creating the labels directory for each split directory 

In [None]:
def build_labels_txt_with_segmentations(data_path_dir:Path,names_dir:List[str]):
    """Create a ``labels`` folder with one txt file per annotated image in every split.

    For each split name the first ``.json`` file found in ``data_path_dir/<split>``
    is read, and for every annotation one line ``<category_id> <x1> <y1> ...`` is
    written to ``data_path_dir/<split>/labels/<image stem>.txt``. Only the first
    polygon of an annotation's ``segmentation`` is used, and the coordinates are
    written exactly as stored in the json file.
    Label files are rewritten on each call, so running the function again does
    not duplicate lines.

    Args:
        data_path_dir (Path): Dataset directory that contains the split folders.
        names_dir (List[str]): Names of the split folders (e.g. ["train","test","val"]).

    Raises:
        FileNotFoundError: If a split folder contains no ``.json`` file.
    """
    for name in names_dir:
        # Use the directory passed by the caller rather than a notebook-level global
        split_dir = data_path_dir/name

        # Annotations json file of this split (sorted for a reproducible choice)
        json_files = sorted(f for f in split_dir.iterdir() if str(f).endswith(".json"))
        if not json_files:
            raise FileNotFoundError(f"No annotations json file found in {split_dir}")
        path_annotations = json_files[0]

        labels_path = split_dir/"labels"
        labels_path.mkdir(exist_ok=True, parents=True)

        with open(path_annotations, "r", encoding="utf-8") as f:
            annotates = json.load(f)

        # One row per annotation, joined with the file name of its image
        images = pd.DataFrame(annotates["images"], columns=["id","file_name"]).rename(columns={"id":"image_id"})
        annot = pd.DataFrame(annotates["annotations"], columns=["id","image_id","category_id","segmentation",])
        df = annot.merge(images)

        # Collect the lines of every label file first, so each file is written once
        lines_by_file = {}
        for category_id, segmentation, file_name in zip(df["category_id"], df["segmentation"], df["file_name"]):
            # Skip annotations that carry no polygon list (missing, empty or non-list segmentation)
            if not isinstance(segmentation, list) or not segmentation:
                continue
            coord = " ".join(str(x) for x in segmentation[0])
            path_txt = labels_path/f"{Path(file_name).stem}.txt"
            lines_by_file.setdefault(path_txt, []).append(f"{category_id} {coord}\n")

        for path_txt, lines in lines_by_file.items():
            # "w" mode replaces the output of a previous run instead of appending to it
            with open(path_txt,"w", encoding="utf-8") as f:
                f.writelines(lines)
    print("==================finished===========================")

In [111]:
# Generate the labels folders for the three splits under the datasets directory
build_labels_txt_with_segmentations(datasets,["train","test","val"])

Index(['image_id', 'file_name'], dtype='object')
Index(['image_id', 'file_name'], dtype='object')
Index(['image_id', 'file_name'], dtype='object')


In [4]:
# Load the global annotations.json file from the data directory
with open(DATA_TACO/"annotations.json","r", encoding="utf-8") as f:
    load_json= json.load(f)

In [15]:
# Table built from the "categories" entries of the annotations file
cat = pd.DataFrame(load_json["categories"])

In [32]:
# Map each supercategory to the list of category names that belong to it
supercategory=cat.groupby("supercategory").name.apply(list).to_dict()

In [60]:
# Single-row table: one column per supercategory, each cell holding its list of category names
supcat = pd.DataFrame({k:[v] for k,v in supercategory.items()})

In [None]:
# TODO: unfinished stub - the parameter list and body are missing, so this cell raises a SyntaxError; complete or delete it
def create_labels_txt

In [6]:
def create_yaml_taco_data():
    """Write the ``taco.yaml`` dataset description into ``<cwd>/../yolov7/data``.

    The category list is read from the ``annotations.json`` file found in
    ``<cwd>/data``. Every category of that file is listed, ordered by its ``id``,
    so that the position of a name in ``names`` follows the category ids.
    Listing only the categories that occur in the annotations would shift the
    names after an unused category and make ``nc`` too small.
    NOTE(review): assumes the category ids are contiguous and start at 0 - confirm
    against the annotations file.

    Raises:
        FileNotFoundError: If no file ending with ``annotations.json`` exists in ``<cwd>/data``.
    """
    cur_dir = Path.cwd()
    taco_data = cur_dir/"data"
    yaml_data = cur_dir.parent/"yolov7"/"data"/"taco.yaml"

    candidates = [f for f in taco_data.iterdir() if str(f).endswith("annotations.json")]
    if not candidates:
        raise FileNotFoundError(f"No annotations.json file found in {taco_data}")
    with open(candidates[0], "r", encoding="utf-8") as f:
        global_annotations = json.load(f)

    # All categories ordered by id: index in the list == rank of the category id
    categories = sorted(global_annotations["categories"], key=lambda category: category["id"])
    classes = [category["name"] for category in categories]
    nc = len(classes)

    dict_taco_yaml={
        # Image folders of each split, written as relative paths
        "train": "../datasets/train/images",
        "val": "../datasets/val/images",
        "test": "../datasets/test/images",

        "nc": nc,
        "names":classes,
    }
    with open(yaml_data, "w", encoding="utf-8") as f:
        yaml.dump(dict_taco_yaml,f, indent=4)
    

In [7]:
# Write taco.yaml; paths are resolved against the current working directory
create_yaml_taco_data()