In [24]:
# Install requirements, uncomment to run
# !pip install geopandas pillow label-studio-converter

# Uploading Savmap dataset to label studio

In [55]:
import geopandas as gdp
from PIL import Image
from tqdm import tqdm
import os
from pathlib import Path
import numpy as np
import json
import pandas as pd
from torchvision.ops import nms
import torch
from label_studio_sdk import Client
from dotenv import load_dotenv
from label_studio_tools.core.utils.io import get_local_path
from urllib.parse import unquote

In [17]:
# Dataset root and its sub-folders
root = r"D:\savmap_dataset_v2"
path_to_images = os.path.join(root,'images')
path_to_labels = os.path.join(root,'labels')

# Polygon annotations (one row per annotated polygon)
polygons = gdp.read_file(os.path.join(root,'savmap_annotations_2014.geojson'))

In [57]:
# Vertices of the first polygon, scaled by 100 / (4000, 3000)
first_ring = polygons['geometry'].iloc[0].exterior
xx,yy = first_ring.coords.xy
np.array(list(zip(xx,yy))) * 100 / np.array([[4000,3000]])

In [18]:
# get bounding boxes
bbox_columns = ['x_min', 'y_min', 'x_max', 'y_max','width','height','x','y']
for cat in bbox_columns:
    polygons[cat] = None

# Values are written with polygons.iat[row, col] on the frame itself.
# The previous `polygons[col].iat[i] = value` form is chained assignment: it
# writes into a temporary Series and does not update the frame when pandas
# Copy-on-Write is active.
column_position = {cat: polygons.columns.get_loc(cat) for cat in bbox_columns}

# IMAGEUUID -> (width, height); each image file is opened (and closed) once
image_sizes = dict()

for i in tqdm(range(len(polygons))):
    x_min, y_min, x_max, y_max = polygons['geometry'].iloc[i].bounds
    image_uuid = polygons['IMAGEUUID'].iloc[i]
    if image_uuid not in image_sizes:
        image_path = os.path.join(path_to_images,f"{image_uuid}.JPG")
        with Image.open(image_path) as img:
            image_sizes[image_uuid] = img.size
    width, height = image_sizes[image_uuid]

    row_values = {
        # corners are truncated to int and clamped at 0
        'x_min': max(int(x_min),0),
        'x_max': max(int(x_max),0),
        'y_min': max(int(y_min),0),
        'y_max': max(int(y_max),0),
        # box center, rounded to the nearest pixel
        'x': round(0.5*(x_max+x_min)),
        'y': round(0.5*(y_max+y_min)),
        'width': width,
        'height': height,
    }
    for cat, value in row_values.items():
        polygons.iat[i, column_position[cat]] = value

# bounding-box size in pixels and a single class id for every annotation
polygons['bbox_w'] = polygons['x_max'] - polygons['x_min']
polygons['bbox_h'] = polygons['y_max'] - polygons['y_min']
polygons['class'] = 0

100%|██████████| 7474/7474 [00:05<00:00, 1357.18it/s]


In [19]:
# non-max suppresion
# https://pytorch.org/vision/stable/generated/torchvision.ops.nms.html

def nms_to_bbox(df_annotations:pd.DataFrame,iou_threshold:float=0.5):
    """Apply non-maximum suppression to the boxes of each image.

    Rows are grouped by ``IMAGEUUID``; within each group the boxes
    (``x_min, y_min, x_max, y_max``) are passed to ``torchvision.ops.nms``
    with a score of 1 for every box, and only the kept rows are retained.

    Args:
        df_annotations (pd.DataFrame): annotations with columns
            ``IMAGEUUID, x_min, y_min, x_max, y_max``.
        iou_threshold (float): IoU threshold forwarded to ``nms``.

    Returns:
        pd.DataFrame: the kept rows of all images concatenated, with the
        index reset (the previous indices are kept as columns).
    """
    box_columns = ['x_min','y_min','x_max','y_max']
    kept_frames = []

    for _, group in tqdm(df_annotations.groupby('IMAGEUUID')):
        group = group.reset_index()

        boxes = group[box_columns].to_numpy().astype(float)
        # every box gets the same score
        uniform_scores = np.ones((boxes.shape[0],)).astype(float)

        keep = nms(boxes=torch.from_numpy(boxes),
                   scores=torch.from_numpy(uniform_scores),
                   iou_threshold=iou_threshold).numpy()

        kept_frames.append(group.iloc[keep,:].copy())

    return pd.concat(kept_frames,axis=0).reset_index()

In [20]:
# Drop overlapping boxes (IoU above 0.5) within each image
df_filtered = nms_to_bbox(df_annotations=polygons,iou_threshold=0.5)

  0%|          | 0/654 [00:00<?, ?it/s]

100%|██████████| 654/654 [00:01<00:00, 625.36it/s]


In [21]:
# save filtered annotations as csv
df_filtered['images'] = df_filtered['IMAGEUUID'].map("{}.JPG".format)
csv_columns = ['images','class','x_min', 'y_min', 'x_max', 'y_max']
savepath = os.path.join(root,'savmap_annotations_2014_filtered.csv')
(
    df_filtered[csv_columns]
    .rename(columns={'class':'labels'})
    .to_csv(savepath,index=False)
)

In [None]:
# Tiling images
# !python ../HerdNet/tools/patcher.py "D:\savmap_dataset_v2\images" 2000 2000 100 -dest "D:\savmap_dataset_v2\images_splits" -csv "D:\savmap_dataset_v2\savmap_annotations_2014_filtered.csv" -min 0.5 -all False

In [58]:
# load groundtruth for split images
df_splits = pd.read_csv(r"D:\savmap_dataset_v2\images_splits\gt.csv")
df_splits["IMAGEUUID"] = df_splits['images'].apply(lambda x: str(x).split(".")[0])
# box center and size in pixels
df_splits['x'] = 0.5*(df_splits["x_min"]+df_splits["x_max"])
df_splits['y'] = 0.5*(df_splits["y_min"]+df_splits["y_max"])
df_splits["width"] = 0.
df_splits["height"] = 0.
df_splits['bbox_w'] = (df_splits['x_max'] - df_splits['x_min'])*1.0
df_splits['bbox_h'] = (df_splits['y_max'] - df_splits['y_min'])*1.0

# Read each tile's size once, closing the file right away, then broadcast the
# sizes to the rows with a single map instead of one boolean mask per image.
split_image_sizes = dict()
for name in tqdm(df_splits["images"].dropna().unique()):
    image_path = os.path.join(r"D:\savmap_dataset_v2\images_splits",name)
    with Image.open(image_path) as img:
        split_image_sizes[name] = img.size

# rows without an image name keep the 0.0 placeholder
df_splits['width'] = df_splits["images"].map(
    {name: size[0] for name, size in split_image_sizes.items()}).fillna(0.).astype(float)
df_splits['height'] = df_splits["images"].map(
    {name: size[1] for name, size in split_image_sizes.items()}).fillna(0.).astype(float)

100%|██████████| 1161/1161 [00:06<00:00, 167.38it/s]


In [59]:
# Preview the first three tile annotations
df_splits.head(3)

Unnamed: 0,images,labels,base_images,x_min,y_min,x_max,y_max,IMAGEUUID,x,y,width,height,bbox_w,bbox_h
0,003a34ee6b7841e6851b8fe511ebe102_0.JPG,0,003a34ee6b7841e6851b8fe511ebe102.JPG,1503,962,1537,996,003a34ee6b7841e6851b8fe511ebe102_0,1520.0,979.0,2000.0,2000.0,34.0,34.0
1,003a34ee6b7841e6851b8fe511ebe102_0.JPG,0,003a34ee6b7841e6851b8fe511ebe102.JPG,1679,858,1695,875,003a34ee6b7841e6851b8fe511ebe102_0,1687.0,866.5,2000.0,2000.0,16.0,17.0
2,0078d29a8d0b489caa3425969c7477ac_0.JPG,0,0078d29a8d0b489caa3425969c7477ac.JPG,620,1653,677,1689,0078d29a8d0b489caa3425969c7477ac_0,648.5,1671.0,2000.0,2000.0,57.0,36.0


In [10]:
# Number of annotations after NMS filtering vs. before
len(df_filtered), len(polygons)

(3593, 7474)

In [11]:
# Preview the first filtered annotation
df_filtered.head(1)

Unnamed: 0,level_0,index,IMAGEUUID,TAGUUID,geometry,x_min,y_min,x_max,y_max,width,height,x,y,bbox_w,bbox_h,class
0,0,5425,003a34ee6b7841e6851b8fe511ebe102,d57402df14a84b58a411bade4f9d14f1,"POLYGON ((1514 962, 1524 970, 1536 968, 1536 9...",1503,962,1537,996,4000,3000,1520,979,34,34,0


In [None]:
# Select labels
# data = polygons.loc[:,['x_min','x_max','y_min','y_max','IMAGEUUID','width','height','class']].copy()
# data.rename(columns={'IMAGEUUID':'filename',
#                      'x_max':'xmax',
#                      'x_min':'xmin',
#                      'y_min':'ymin',
#                      'y_max':'ymax'},inplace=True)
# data['filename'] = data['filename'].apply(lambda x: f"{x}.JPG")
# data = data.dropna()
# data = data[['filename','class','width', 'height','xmin','ymin','xmax','ymax']]
# data.head(1)


In [None]:
# # Convert CSV to COCO
# save_json_path = ... #'cocoformat.json'

# images = []
# categories = []
# annotations = []

# category = {}
# category["supercategory"] = 'None'
# category["id"] = 0
# category["category_name"] = 'wildlife'
# categories.append(category)

# data['fileid'] = data['filename'].astype('category').cat.codes
# data['categoryid']= pd.Categorical(data['class'],ordered= True).codes
# # data['categoryid'] = data['categoryid']+1
# data['annid'] = data.index

# def image(row):
#     image = {}
#     image["height"] = row.height
#     image["width"] = row.width
#     image["id"] = row.fileid
#     image["file_name"] = row.filename
#     return image

# # def category(row):
# #     category = {}
# #     category["supercategory"] = 'None'
# #     category["id"] = row.categoryid
# #     category["category_name"] = row[2]
# #     return category

# def annotation(row):
#     annotation = {}
#     area = (row.xmax -row.xmin)*(row.ymax - row.ymin)
#     annotation["segmentation"] = []
#     annotation["iscrowd"] = 0
#     annotation["area"] = area
#     annotation["image_id"] = row.fileid
#     annotation["score"] = 1.0

#     annotation["bbox"] = [row.xmin, row.ymin, row.xmax -row.xmin,row.ymax-row.ymin ]

#     annotation["category_id"] = row.categoryid
#     annotation["id"] = row.annid
#     return annotation

# for row in data.itertuples():
#     annotations.append(annotation(row))

# imagedf = data.drop_duplicates(subset=['fileid']).sort_values(by='fileid')
# for row in imagedf.itertuples():
#     images.append(image(row))

# # catdf = data.drop_duplicates(subset=['categoryid']).sort_values(by='categoryid')
# # for row in catdf.itertuples():
# #     categories.append(category(row))

# data_coco = {}
# data_coco["images"] = images
# data_coco["categories"] = categories
# data_coco["annotations"] = annotations
# # json.dump(data_coco, open(save_json_path, "w"), indent=4)

In [None]:
# groundtruth = {annot['file_name']:[] for annot in data_coco['images']}
# for annot,image_data in zip(data_coco['annotations'],data_coco['images']):
#     annot.update(image_data)
#     # pprint.pp(annot)
#     annot['category_name'] = category['category_name']
#     groundtruth[annot['file_name']].append(annot)

In [56]:
# Relative path of the .env file; the environment variables read below
# (LABEL_STUDIO_URL, LABEL_STUDIO_API_KEY) are presumably defined there.
dotenv_path = r"..\.env"
if dotenv_path is not None:
    load_dotenv(dotenv_path=dotenv_path)

In [57]:
# Create the Label Studio client from the URL and API key found in the environment.
# NOTE(review): os.getenv returns None for a missing variable, and no connection
# check is performed in this cell.
LABEL_STUDIO_URL = os.getenv('LABEL_STUDIO_URL')
API_KEY = os.getenv("LABEL_STUDIO_API_KEY")
labelstudio_client = Client(url=LABEL_STUDIO_URL, api_key=API_KEY)

In [None]:
# url = "/data/local-files/?d=Users%5Cfadel%5COneDrive%5CBureau%5CWILD-AI%5Cdatalabeling%5Cdata%5Ctrain_wildai%5Cimages%5C003a34ee6b7841e6851b8fe511ebe102.JPG"

In [None]:
# unquote(url)

In [82]:
# Uploading Polygons
def format_prediction_polygon(points:list,width:int,height:int,label:list=['wildlife',],
                              from_name:str='label',to_name:str='image'):
    """Build one Label Studio polygon result item.

    Args:
        points (list): polygon vertices as ``[x, y]`` pairs, in percent of the
            image width/height.
        width (int): original image width in pixels.
        height (int): original image height in pixels.
        label (list): labels attached to the polygon.
        from_name (str): name of the labeling control tag. The default mirrors
            ``format_prediction_bbox``; it must match the project's labeling
            config — TODO confirm for the polygon project.
        to_name (str): name of the object (image) tag the region belongs to.

    Returns:
        dict: result item with the ``from_name``/``to_name``/``type`` keys that
        Label Studio needs to attach the region to a control, plus the image
        size and the polygon ``value``.
    """
    return {
            "from_name": from_name,
            "to_name": to_name,
            "type": "polygonlabels",
            "original_width": width,
            "original_height": height,
            "image_rotation": 0,
            "value": {
                "points": points,
                "closed":True,
                # copy so results never share (and cannot mutate) the default list
                "polygonlabels": list(label)
            }
    }

def format_predictions_polygon(polygons:list,widths:list,heights:list,label:list=['wildlife',]):
    """Format several polygons as Label Studio polygon results.

    Args:
        polygons (list): geometries exposing ``exterior.coords.xy``.
        widths (list): image width for each polygon.
        heights (list): image height for each polygon.
        label (list): labels attached to every polygon.

    Returns:
        list: one ``format_prediction_polygon`` result per polygon, with the
        vertices expressed in percent of the image width/height.
    """
    def _exterior_in_percent(polygon, w, h):
        # exterior vertices scaled by 100 / (w, h)
        xx,yy = polygon.exterior.coords.xy
        scale = np.array([w,h]).reshape((1,2))
        return (np.array(list(zip(xx,yy))) * 100 / scale).tolist()

    return [
        format_prediction_polygon(_exterior_in_percent(polygon,w,h),width=w,height=h,label=label)
        for polygon,w,h in zip(polygons,widths,heights)
    ]


# uploading polygons
 # Select project
# project = labelstudio_client.get_project(id=project_id)
# # Upload predictions for each task
# tasks = project.get_tasks()
# if top_n > 0:
#     tasks = sorted(tasks,key=lambda x:x['id'])[:top_n]

# Upload the polygon annotations of the first 10 tasks as predictions tagged 'gt'.
# NOTE(review): `tasks` and `project` are only assigned in the commented-out
# lines above, so this loop fails with a NameError unless they are already
# defined in the kernel.
for task in tqdm(tasks[:10],desc="Uploading predictions"):
    task_id = task['id']
    # resolve the task's image URL to a local path; its stem is compared to IMAGEUUID
    img_url = unquote(task['data']['image'])
    img_path = get_local_path(img_url)
    img_name = Path(img_path).stem
    mask = polygons['IMAGEUUID']==img_name
    polys = polygons.loc[mask,['IMAGEUUID','geometry','width','height']]
    formatted_pred = format_predictions_polygon(polygons=polys['geometry'],widths=polys['width'],heights=polys['height'],label=['wildlife',])
    # conf_scores = [pred['score'] for pred in prediction]
    # max_score = 0.0
    project.create_prediction(task_id=task_id,
                               result=formatted_pred,model_version='gt')

In [72]:
# uploading gt bbox
def format_prediction_bbox(pred:dict,img_height:int,img_width:int,
                           from_name:str='label',
                           to_name:str='image',
                           label_type:str='rectanglelabels') -> dict:
    """Convert one box annotation into a Label Studio result item.

    Args:
        pred (dict): annotation with ``'bbox'`` as ``[x, y, width, height]`` in
            pixels and ``'category_name'`` as the label.
        img_height (int): image height in pixels.
        img_width (int): image width in pixels.
        from_name (str): value stored under ``"from_name"``.
        to_name (str): value stored under ``"to_name"``.
        label_type (str): value stored under ``"type"`` and used as the key of
            the label list inside ``"value"``.

    Returns:
        dict: result item whose box coordinates are percentages of the image
        width/height.
    """
    left, top, box_w, box_h = pred['bbox']
    category = pred['category_name']

    # coordinates as a percentage of the image size
    value = {
        label_type: [category,],
        'x': left / img_width * 100.,
        'y': top / img_height * 100.,
        'width': box_w / img_width * 100.,
        'height': box_h / img_height * 100.,
        'rotation':0
    }

    return {
        "from_name": from_name,
        "to_name": to_name,
        "type": label_type,
        "original_width":img_width,
        "original_height":img_height,
        "image_rotation":0,
        'value': value,
    }

def format_predictions_bbox(xs:list,ys:list,ws:list,hs:list,img_width:int,img_height:int,label:str='wildlife'):
    """Format several boxes of one image as Label Studio results.

    Args:
        xs, ys (list): x and y values placed first in each ``'bbox'``.
        ws, hs (list): box widths and heights.
        img_width (int): image width in pixels.
        img_height (int): image height in pixels.
        label (str): category name given to every box.

    Returns:
        list: one ``format_prediction_bbox`` result per box.
    """
    return [
        format_prediction_bbox({'bbox':[x,y,w,h],
                                'category_name':label,
                                'score':None
                                },
                               img_width=img_width,img_height=img_height)
        for x,y,w,h in zip(xs,ys,ws,hs)
    ]

def upload_predictions_bbox(df_annotations:pd.DataFrame,project_id:int):
    """Upload the boxes of ``df_annotations`` as predictions for a project's tasks.

    Each task's image name (file stem) is matched against ``IMAGEUUID``.

    Args:
        df_annotations (pd.DataFrame): annotations with columns ``IMAGEUUID``,
            ``x``, ``y`` (box center in pixels), ``bbox_w``, ``bbox_h`` (box
            size in pixels) and ``width``, ``height`` (image size in pixels).
        project_id (int): Label Studio project id.

    Returns:
        set: image names for which nothing was uploaded, either because the
        image has no annotation or because the upload raised an error (the
        error is printed).
    """
    # get tasks
    project = labelstudio_client.get_project(id=project_id)
    tasks = project.get_tasks()

    # upload
    failed_uploads = set()
    for task in tqdm(tasks[:],desc="Uploading predictions"):
        task_id = task['id']
        img_url = unquote(task['data']['image'])
        img_path = get_local_path(img_url)
        img_name = Path(img_path).stem

        mask = df_annotations['IMAGEUUID']==img_name
        df_bbox = df_annotations.loc[mask,['x','y','bbox_w','bbox_h','width','height']]
        if df_bbox.empty:
            # image without any annotation: nothing to upload
            failed_uploads.add(img_name)
            continue

        try:
            # image dimensions are whole pixels; plain ints serialize cleanly
            img_width = int(df_bbox['width'].iat[0])
            img_height = int(df_bbox['height'].iat[0])
            # Label Studio rectangles are anchored at their top-left corner,
            # while x/y hold the box center: shift by half the box size.
            formatted_pred = format_predictions_bbox(xs=df_bbox['x'] - 0.5*df_bbox['bbox_w'],
                                                    ys=df_bbox['y'] - 0.5*df_bbox['bbox_h'],
                                                    ws=df_bbox['bbox_w'],
                                                    hs=df_bbox['bbox_h'],
                                                    img_width=img_width,
                                                    img_height=img_height,
                                                    label='wildlife'
                                                    )
            project.create_prediction(task_id=task_id,
                                    result=formatted_pred,
                                    model_version='gt')
        except Exception as e:
            # best effort: keep going, but record and report the failure
            failed_uploads.add(img_name)
            tqdm.write(f"Failed for: {img_name} ({e})")
    return failed_uploads

def delete_tasks(project_id:int,failed_uploads:set):
    """Delete the tasks of a project whose image name is in ``failed_uploads``.

    Args:
        project_id (int): Label Studio project id.
        failed_uploads (set): image names (file stems) whose tasks are deleted.
    """
    project = labelstudio_client.get_project(id=project_id)

    for task in tqdm(project.get_tasks()[:],desc="deleting tasks"):
        task_id = task['id']
        # image name = stem of the local path behind the task's image URL
        img_name = Path(get_local_path(unquote(task['data']['image']))).stem
        if img_name not in failed_uploads:
            continue
        project.delete_task(task_id=task_id)

In [62]:
# Upload the tile boxes to the Label Studio project with id 3 and keep the
# names of the images for which the upload did not go through.
project_id = 3
failed_uploads = upload_predictions_bbox(df_annotations=df_splits,project_id=project_id)

Uploading predictions:   0%|          | 0/3925 [00:00<?, ?it/s]Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker

In [66]:
# Number of failed uploads, and whether one specific tile is among them
len(failed_uploads), '2da691c095e64729811fe7fe9c2241e9_5' in failed_uploads

(2764, True)

In [71]:
# delete empty images
# delete_tasks only accepts project_id and failed_uploads; passing
# df_annotations raised a TypeError.
delete_tasks(project_id=project_id,failed_uploads=failed_uploads)

deleting tasks:   0%|          | 0/3925 [00:00<?, ?it/s]Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
deleting tasks:   0%|          | 3/3925 [00:00<03:43, 17.56it/s]Using `localhost` (http://localhost:8080) in LABEL_STUDIO_URL, `localhost` is not accessible inside of docker containers. You can check your IP with utilities like `ifconfig` and set it as LABEL_STUDIO_URL.
Using `localhost` (http://localhost:8080) in LABEL_S

In [79]:
# format_predictions(polygons=polys['geometry'],widths=polys['width'],heights=polys['height'],label=['wildlife',])

# Formatting savmap dataset
Formatting the SAVMAP dataset so that it can be imported into Label Studio through a JSON file.

In [84]:
import geopandas as gdp
from PIL import Image
from tqdm import tqdm
import os
from pathlib import Path

In [92]:
# Load the polygon annotations
root = r"D:\savmap_dataset_v2"
polygons = gdp.read_file(os.path.join(root,'savmap_annotations_2014.geojson'))

# Make sure the images/ and labels/ folders exist
path_to_images = os.path.join(root,'images')
path_to_labels = os.path.join(root,'labels')
for directory in (path_to_images, path_to_labels):
    Path(directory).mkdir(exist_ok=True,parents=True)

In [3]:
# Move files
# ! mv ../data/savmap_dataset_v2/*.JPG ../data/savmap_dataset_v2/images

In [93]:
# get bounding boxes
bbox_columns = ['x_min', 'y_min', 'x_max', 'y_max','width','height','x','y']
for cat in bbox_columns:
    polygons[cat] = None

# Values are written with polygons.iat[row, col] on the frame itself.
# The previous `polygons[col].iat[i] = value` form is chained assignment: it
# writes into a temporary Series and does not update the frame when pandas
# Copy-on-Write is active.
column_position = {cat: polygons.columns.get_loc(cat) for cat in bbox_columns}

# IMAGEUUID -> (width, height); each image file is opened (and closed) once
image_sizes = dict()

for i in range(len(polygons)):
    x_min, y_min, x_max, y_max = polygons['geometry'].iloc[i].bounds
    image_uuid = polygons['IMAGEUUID'].iloc[i]
    if image_uuid not in image_sizes:
        image_path = os.path.join(path_to_images,f"{image_uuid}.JPG")
        with Image.open(image_path) as img:
            image_sizes[image_uuid] = img.size
    width, height = image_sizes[image_uuid]

    row_values = {
        # corners are truncated to int and clamped at 0
        'x_min': max(0,int(x_min)),
        'x_max': max(0,int(x_max)),
        'y_min': max(0,int(y_min)),
        'y_max': max(0,int(y_max)),
        # box center (not rounded)
        'x': 0.5*(x_max+x_min),
        'y': 0.5*(y_max+y_min),
        'width': width,
        'height': height,
    }
    for cat, value in row_values.items():
        polygons.iat[i, column_position[cat]] = value

# create bbox width and height
polygons['bbox_w'] = polygons['x_max'] - polygons['x_min']
polygons['bbox_h'] = polygons['y_max'] - polygons['y_min']
polygons['class'] = 0

In [None]:
# Preview the first annotation with its bounding-box columns
polygons.head(1)

In [94]:
# convert to yolo format
def save_df_as_yolo(df_annotation:gdp.GeoDataFrame,dest_path_labels:str):
    """Write one YOLO label file per image (grouped by ``IMAGEUUID``).

    Each line is ``class x y bbox_w bbox_h`` where ``x`` and ``bbox_w`` are
    divided by the image ``width`` and ``y`` and ``bbox_h`` by the image
    ``height``. The input frame is not modified, so calling the function
    again produces the same files.

    Args:
        df_annotation (gdp.GeoDataFrame): annotations with columns ``IMAGEUUID``,
            ``class``, ``x``, ``y``, ``bbox_w``, ``bbox_h`` (pixels) and
            ``width``, ``height`` (image size in pixels).
        dest_path_labels (str): directory receiving the ``<IMAGEUUID>.txt`` files.

    Raises:
        AssertionError: if a label column contains NaN values.
    """
    cols = ['class','x','y','bbox_w','bbox_h']
    for col in cols:
        assert df_annotation[col].isna().sum()<1,'there are NaN values. Check out.'

    # Work on a copy of the needed columns: normalizing the caller's frame in
    # place would divide by the image size again on every re-run.
    df_yolo = df_annotation[['IMAGEUUID','width','height'] + cols].copy()
    for col in ['x','y','bbox_w','bbox_h','width','height']:
        df_yolo[col] = df_yolo[col].astype(float)

    # normalize values
    df_yolo['x'] = df_yolo['x']/df_yolo['width']
    df_yolo['y'] = df_yolo['y']/df_yolo['height']
    df_yolo['bbox_w'] = df_yolo['bbox_w']/df_yolo['width']
    df_yolo['bbox_h'] = df_yolo['bbox_h']/df_yolo['height']

    os.makedirs(dest_path_labels,exist_ok=True)
    for image_name,df in tqdm(df_yolo.groupby('IMAGEUUID'),desc='Saving yolo labels'):
        txt_file = f'{image_name}.txt'
        df[cols].to_csv(os.path.join(dest_path_labels,txt_file),sep=' ',index=False,header=False)

In [95]:
# Write one YOLO label file per image into the labels folder
save_df_as_yolo(df_annotation=polygons,dest_path_labels=path_to_labels)

Saving yolo labels:   0%|          | 0/654 [00:00<?, ?it/s]

Saving yolo labels: 100%|██████████| 654/654 [00:01<00:00, 463.09it/s]


In [None]:
# tutorial here: https://github.com/HumanSignal/label-studio-converter/tree/master
# if it does not work, use a terminal
# !label-studio-converter import yolo -i "C:\Users\fadel\OneDrive\Bureau\WILD-AI\datalabeling\data\train_wildai" --image-ext ".JPG" --out-type "predictions" -o "C:\Users\fadel\OneDrive\Bureau\WILD-AI\datalabeling\data\train_wildai\ls_tasks.json"

# Formatting General Dataset from Delplanque 2021

In [9]:
import pandas as pd
import os
from pathlib import Path
from PIL import Image
from tqdm import tqdm


## Tiling

In [None]:
# Tiling test
# !python ../../HerdNet/tools/patcher.py "D:\PhD\Data per camp\Extra training data\general_dataset\test" 640 640 64  -dest "D:\PhD\Data per camp\Extra training data\general_dataset\tiled_data\test_tiled\images" -pattern "**/*.JPG" -csv "D:\PhD\Data per camp\Extra training data\general_dataset\groundtruth\csv\test_big_size_A_B_E_K_WH_WB.csv"

In [None]:
# Tiling val
# !python ../../HerdNet/tools/patcher.py "D:\PhD\Data per camp\Extra training data\general_dataset\val" 640 640 64  -dest "D:\PhD\Data per camp\Extra training data\general_dataset\val_tiled" -pattern "**/*.JPG" -csv "D:\PhD\Data per camp\Extra training data\general_dataset\groundtruth\csv\val_big_size_A_B_E_K_WH_WB.csv"

In [None]:
# Tiling Train
# !python ../../HerdNet/tools/patcher.py "D:\PhD\Data per camp\Extra training data\general_dataset\train" 640 640 64  -dest "D:\PhD\Data per camp\Extra training data\general_dataset\train_tiled" -pattern "**/*.JPG" -csv "D:\PhD\Data per camp\Extra training data\general_dataset\groundtruth\csv\train_big_size_A_B_E_K_WH_WB.csv"

## Converting to YOLO format

In [56]:
# convert to yolo format
def save_df_as_yolo(df_annotation:pd.DataFrame,dest_path_labels:str,is_detector:bool=False):
    """Write one YOLO label file per image (grouped by the ``images`` column).

    Each line is ``class x y bbox_w bbox_h`` where ``x`` and ``bbox_w`` are
    divided by the image ``width`` and ``y`` and ``bbox_h`` by the image
    ``height``. The input frame is not modified, so calling the function
    again produces the same files.

    Args:
        df_annotation (pd.DataFrame): annotations with columns ``images``,
            ``class``, ``x``, ``y``, ``bbox_w``, ``bbox_h`` (pixels) and
            ``width``, ``height`` (image size in pixels).
        dest_path_labels (str): directory receiving the ``<image stem>.txt``
            files; created if missing.
        is_detector (bool): when True every row is written with class 0.

    Raises:
        AssertionError: if a label column contains NaN values.
    """
    cols = ['class','x','y','bbox_w','bbox_h']
    for col in cols:
        assert df_annotation[col].isna().sum()<1,'there are NaN values. Check out.'

    # Work on a copy: normalizing (or re-labelling) the caller's frame in place
    # would divide by the image size again on every re-run.
    df_yolo = df_annotation.copy()
    for col in ['x','y','bbox_w','bbox_h']:
        df_yolo[col] = df_yolo[col].astype(float)

    # normalize values
    df_yolo['x'] = df_yolo['x']/df_yolo['width']
    df_yolo['y'] = df_yolo['y']/df_yolo['height']
    df_yolo['bbox_w'] = df_yolo['bbox_w']/df_yolo['width']
    df_yolo['bbox_h'] = df_yolo['bbox_h']/df_yolo['height']

    if is_detector:
        df_yolo['class'] = 0

    os.makedirs(dest_path_labels,exist_ok=True)
    for image_name,df in tqdm(df_yolo.groupby('images'),desc='Saving yolo labels'):
        txt_file = f'{Path(image_name).stem}.txt'
        df[cols].to_csv(os.path.join(dest_path_labels,txt_file),sep=' ',index=False,header=False)

In [67]:
split='train' # one of 'train', 'test', 'val'
# folder of the tiled split; holds gt.csv plus the images/ and labels/ sub-folders
root = Path(rf"D:\PhD\Data per camp\Extra training data\general_dataset\tiled_data\{split}_tiled")
path_to_csv = root/"gt.csv"
path_images = root/'images'
path_to_labels = root/'labels'
detection_mode=False # passed as is_detector to save_df_as_yolo: True writes class 0 for every box

In [68]:
df_annotations = pd.read_csv(path_to_csv)
# df_annotations.head(2)

# update df_annotations
df_annotations['width'] = 0.0
df_annotations['height'] = 0.0

# Read each tile's size once, closing the file right away, then broadcast the
# sizes to the rows with a single map instead of two boolean masks per image.
tile_sizes = dict()
for name in set(df_annotations.images):
    with Image.open(root/f"images/{name}") as img:
        tile_sizes[name] = img.size
df_annotations['width'] = df_annotations['images'].map(lambda name: float(tile_sizes[name][0]))
df_annotations['height'] = df_annotations['images'].map(lambda name: float(tile_sizes[name][1]))

# box center and size in pixels
df_annotations['x'] = 0.5*(df_annotations['x_min'] + df_annotations['x_max'])
df_annotations['y'] = 0.5*(df_annotations['y_min'] + df_annotations['y_max'])
df_annotations['bbox_h'] = df_annotations['y_max'] - df_annotations['y_min']
df_annotations['bbox_w'] = df_annotations['x_max'] - df_annotations['x_min']

# save_df_as_yolo expects the label column to be called 'class'
df_annotations = df_annotations.rename(columns={'labels':'class'})

In [69]:
# df_annotations.head(2)
# df_annotations['class'].plot(kind='hist')

In [None]:
# Write the YOLO label files of the selected split
save_df_as_yolo(df_annotations,path_to_labels,is_detector=detection_mode)

# WAID 

## Transforming labels for detector training

In [17]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [None]:
# Folder containing the WAID label files
path = Path(r"C:\Users\Machine Learning\Desktop\workspace-wildAI\datalabeling\data\WAID\labels")

# Set the class of every label to 0 (writing back to disk is disabled by default)
for p in tqdm(path.glob("*/**/*.txt")):
    df = pd.read_csv(p,sep=" ",header=None).set_axis(["class",'x','y','w','h'],axis=1)
    df['class'] = 0
    # un-comment to run
    # df.to_csv(p, sep=" ", header=False, index=False)

# Hard sample mining

In [34]:
from ultralytics import YOLO
# from ultralytics.data import YOLODataset
import yaml
import pandas as pd
import os
from pathlib import Path
from PIL import Image
import numpy as np
from torchmetrics.detection import MeanAveragePrecision
import torch
from tqdm import tqdm

In [2]:
# load model
weight_path = r"C:\Users\FADELCO\OneDrive\Bureau\datalabeling\models\best_openvino_model"
model= YOLO(weight_path,task='obb')

In [96]:
# Load dataset
# with open("..\data\data_config.yaml","r") as file:
#     data = yaml.load(file,Loader=yaml.FullLoader)
# dataset = YOLODataset(data=data,task='obb',img_path="")

In [None]:
# compute metrics per image
results = model.val(data="..\data\data_config.yaml",
                    imgsz=1280,
                    iou=0.6,
                    half=True,
                    save_json=True,
                    batch=1)

In [2]:
# Load the predictions written by the validation run and preview two rows
df_results = pd.read_json(r'runs\obb\val5\predictions.json',orient='records')
df_results.head(2)

Unnamed: 0,image_id,category_id,score,rbox,poly
0,003a34ee6b7841e6851b8fe511ebe102,0,0.05283,"[1521.094, 976.562, 27.661, 27.93, 0.771]","[1521.291, 996.216, 1540.745, 976.175, 1520.89..."
1,003a34ee6b7841e6851b8fe511ebe102,0,0.01287,"[2803.125, 360.938, 26.025, 45.947, 0.841]","[2794.683, 385.954, 2828.922, 355.314, 2811.56..."


In [19]:
# dtype of the image identifiers
df_results['image_id'].dtype

dtype('O')

In [17]:
# First prediction: 'poly' (8 values) next to 'rbox' (5 values)
df_results['poly'].iat[0], df_results['rbox'].iat[0]

([1521.291, 996.216, 1540.745, 976.175, 1520.896, 956.909, 1501.443, 976.95],
 [1521.094, 976.562, 27.661, 27.93, 0.771])

In [3]:
# load groundtruth
with open(r"..\data\data_config.yaml",'r') as file:
    yolo_config = yaml.load(file,Loader=yaml.FullLoader)
yolo_config
val_labels_path = os.path.join(yolo_config['path'],yolo_config['val'][0]).replace('images','labels')
val_images_path = os.path.join(yolo_config['path'],yolo_config['val'][0])
df_list = list()

col_names = None
for path in Path(val_labels_path).glob('*.txt'):
    df = pd.read_csv(path,sep=' ',header=None)
    if len(df.columns) == 9:
        df.columns = ['id','x1','y1','x2','y2','x3','y3','x4','y4']
    elif len(df.columns) == 5:
        df.columns = ['id','x','y','w','h']
    else:
        raise ValueError("Check features in label file.")
    
    # record features
    if col_names is None:
        col_names = list(df.columns)

    df['image_id'] = path.stem
    image_path = os.path.join(val_images_path,f"{path.stem}.JPG")
    width, height = Image.open(image_path).size
    df['width'] = width
    df['height'] = height

    # unnormalize values
    for i in range(1,5):
        df[f"x{i}"] = df[f"x{i}"]*width
        df[f"y{i}"] = df[f"y{i}"]*height

    # get x_min, x_max, y_min, y_max
    
    
    df_list.append(df)
df_labels = pd.concat(df_list,axis=0)

In [40]:
# Preview the groundtruth corners (multiplied by the image size)
df_labels.head(3)

Unnamed: 0,id,x1,y1,x2,y2,x3,y3,x4,y4,image_id,width,height
0,0,1503.0,962.0,1537.0,962.0,1537.0,996.0,1503.0,996.0,003a34ee6b7841e6851b8fe511ebe102,4000,3000
1,0,1679.0,858.0,1695.0,858.0,1695.0,875.0,1679.0,875.0,003a34ee6b7841e6851b8fe511ebe102,4000,3000
0,0,620.0,1653.0,677.0,1653.0,677.0,1689.0,620.0,1689.0,0078d29a8d0b489caa3425969c7477ac,4000,3000


In [53]:
# df_merge = df_labels.merge(df_results,on=['image_id'],how='left')
# df_merge.head(2)

In [41]:
# Metric object used to compute mAP per image (boxes given as x_min, y_min, x_max, y_max)
m_ap = MeanAveragePrecision(box_format="xyxy",iou_type="bbox",max_detection_thresholds=[10,100,300])

def get_bbox(gt:np.ndarray):
    """Return the axis-aligned bounding box of each row of corner coordinates.

    Args:
        gt (np.ndarray): 2-D array whose columns 0, 2, 4, 6 are x values and
            columns 1, 3, 5, 7 are y values.

    Returns:
        np.ndarray: array of shape ``(n, 4)`` holding
        ``x_min, y_min, x_max, y_max`` for each row.
    """
    x_values = gt[:,[0,2,4,6]]
    y_values = gt[:,[1,3,5,7]]
    return np.column_stack([
        x_values.min(axis=1),
        y_values.min(axis=1),
        x_values.max(axis=1),
        y_values.max(axis=1),
    ])

# Per-image results, aligned with imgs_ids
map_50s, maps_75s, max_scores = [], [], []
imgs_ids = df_results['image_id'].unique()
for image_id in tqdm(imgs_ids):

    # groundtruth corners and class ids of this image
    rows_gt = df_labels.loc[df_labels['image_id'] == image_id]
    gt = rows_gt[col_names].iloc[:,1:].to_numpy()
    labels = rows_gt['id'].to_numpy()

    # predicted polygons (negative coordinates clipped to 0), scores and classes
    rows_pred = df_results.loc[df_results['image_id'] == image_id]
    pred = np.array(rows_pred['poly'].to_list())
    pred = np.clip(pred,a_min=0,a_max=pred.max())
    pred_score = rows_pred['score'].to_numpy()
    classes = rows_pred['category_id'].to_numpy()
    max_scores.append(pred_score.max())

    pred_list = [{
        'boxes':torch.from_numpy(get_bbox(gt=pred)),
        'scores':torch.from_numpy(pred_score),
        'labels':torch.from_numpy(classes),
    }]
    target_list = [{
        "boxes":torch.from_numpy(get_bbox(gt=gt)),
        "labels":torch.from_numpy(labels),
    }]

    metric = m_ap(preds=pred_list,target=target_list)
    map_50s.append(metric['map_50'].item())
    maps_75s.append(metric['map_75'].item())


100%|██████████| 653/653 [00:13<00:00, 47.35it/s]


In [None]:
# One row per image: mAP@50, mAP@75, highest prediction score and image id
results_per_img = {
    "map50":map_50s,
    "map75":maps_75s,
    "max_scores":max_scores,
    "image_ids":imgs_ids,
}
df_results_per_img = pd.DataFrame(results_per_img)

In [44]:
# Preview the per-image metrics
df_results_per_img.head(2)

Unnamed: 0,map50,map75,max_scores,image_ids
0,0.504951,0.0,0.05283,003a34ee6b7841e6851b8fe511ebe102
1,0.366337,0.009901,0.11316,0078d29a8d0b489caa3425969c7477ac


In [50]:
# select images with low mAP@50 but high confidence
map_thrs = 0.3
score_thrs = 0.7

mask_low_map = (df_results_per_img['map50']<map_thrs) * (df_results_per_img['map75']<map_thrs)
mask_high_scores = df_results_per_img['max_scores']>score_thrs

mask_selected = mask_low_map * mask_high_scores 
df_hard_negatives = df_results_per_img.loc[mask_selected]

In [51]:
# Summary statistics of the selected images
df_hard_negatives.describe()

Unnamed: 0,map50,map75,max_scores
count,40.0,40.0,40.0
mean,0.084865,0.018778,0.75022
std,0.079451,0.025501,0.033289
min,0.0,0.0,0.70361
25%,0.011041,0.0,0.717527
50%,0.077707,0.004685,0.752685
75%,0.133558,0.031851,0.772213
max,0.272277,0.087621,0.82031


In [52]:
# Two random selected images (no seed is set, so the rows change between runs)
df_hard_negatives.sample(2)

Unnamed: 0,map50,map75,max_scores,image_ids
397,0.0,0.0,0.73633,9d9d02b0550544b68928760d1eec02f2
506,0.092201,0.003046,0.79492,c8135057690b494aad449b969b6fe7b8


# Converting datasets between YOLO and OBB formats

In [63]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm

def convert_yolo_to_obb(yolo_dataset_dir:str,output_dir:str)->None:
    """Convert YOLO label files (``id x y w h``) to 4-corner OBB label files.

    Every ``*.txt`` file of ``yolo_dataset_dir`` is read and written under the
    same name in ``output_dir`` as ``id x1 y1 x2 y2 x3 y3 x4 y4`` (top-left,
    top-right, bottom-right, bottom-left). All values stay normalized.

    Args:
        yolo_dataset_dir (str): directory containing the YOLO label files.
        output_dir (str): directory receiving the converted files.

    Raises:
        AssertionError: if an input value or a computed corner lies outside
            ``[0, 1]``.
    """
    cols = ['id','x1','y1','x2','y2','x3','y3','x4','y4']
    names = ['id','x','y','w','h']
    # tolerance for floating-point rounding when adding/subtracting half sizes
    tol = 1e-6

    # Iterate through labels
    for label_path in tqdm(Path(yolo_dataset_dir).glob("*.txt"),desc='yolo->obb'):
        df = pd.read_csv(label_path,sep=' ',names=names)

        for col in names[1:]:
            df[col] = df[col].astype(float)
        df['id'] = df['id'].astype(int)

        # check bounds element-wise (`.all().max()` only compared booleans and
        # therefore never failed); an empty file passes
        boxes = df[names[1:]]
        assert boxes.le(1.).all().all(), f"{label_path.name}: max value <= 1"
        assert boxes.ge(0.).all().all(), f"{label_path.name}: min value >=0"

        half_w = 0.5*df['w']
        half_h = 0.5*df['h']
        # top left
        df['x1'] = df['x'] - half_w
        df['y1'] = df['y'] - half_h
        # top right
        df['x2'] = df['x'] + half_w
        df['y2'] = df['y'] - half_h
        # bottom right
        df['x3'] = df['x'] + half_w
        df['y3'] = df['y'] + half_h
        # bottom left
        df['x4'] = df['x'] - half_w
        df['y4'] = df['y'] + half_h

        # check bounds of the corners that are actually written
        corners = df[cols[1:]]
        assert corners.le(1.+tol).all().all(), f"{label_path.name}: max value <= 1"
        assert corners.ge(-tol).all().all(), f"{label_path.name}: min value >=0"
        # remove rounding overshoot so the written values are within [0, 1]
        df[cols[1:]] = corners.clip(lower=0.,upper=1.)

        # save file
        df[cols].to_csv(Path(output_dir)/label_path.name,
                        sep=' ',index=False,header=False)

def convert_obb_to_yolo(obb_dataset_dir:str,output_dir:str)->None:
    """Convert 4-corner OBB label files to YOLO label files (``id x y w h``).

    Every ``*.txt`` file of ``obb_dataset_dir`` is read as
    ``id x1 y1 x2 y2 x3 y3 x4 y4`` and written under the same name in
    ``output_dir``. The box is the axis-aligned extent of the four corners, so
    the result does not depend on the order in which the corners are listed.

    Args:
        obb_dataset_dir (str): directory containing the OBB label files.
        output_dir (str): directory receiving the converted files.

    Raises:
        AssertionError: if a corner or a computed value lies outside ``[0, 1]``.
    """
    names = ['id','x1','y1','x2','y2','x3','y3','x4','y4']
    cols = ['id','x','y','w','h']

    # Iterate through labels
    for label_path in tqdm(Path(obb_dataset_dir).glob("*.txt"),desc='obb->yolo'):
        df = pd.read_csv(label_path,sep=' ',names=names)

        # check bounds element-wise (`.all().max()` only compared booleans and
        # therefore never failed); an empty file passes
        corners = df[names[1:]]
        assert corners.le(1.).all().all(), f"{label_path.name}: max value <= 1"
        assert corners.ge(0.).all().all(), f"{label_path.name}: min value >=0"

        xs = df[['x1','x2','x3','x4']]
        ys = df[['y1','y2','y3','y4']]
        x_low, x_high = xs.min(axis=1), xs.max(axis=1)
        y_low, y_high = ys.min(axis=1), ys.max(axis=1)

        # center
        df['x'] = (x_low + x_high)/2.
        df['y'] = (y_low + y_high)/2.
        # width
        df['w'] = x_high - x_low
        # height
        df['h'] = y_high - y_low

        # check bounds of the values that are actually written
        boxes = df[cols[1:]]
        assert boxes.le(1.).all().all(), f"{label_path.name}: max value <= 1"
        assert boxes.ge(0.).all().all(), f"{label_path.name}: min value >=0"

        # save file
        df[cols].to_csv(Path(output_dir)/label_path.name,sep=' ',index=False,header=False)

In [None]:
#uncomment to run
# convert_yolo_to_obb(yolo_dataset_dir=r"C:\Users\fadel\OneDrive\Bureau\WILD-AI\datalabeling\data\train_wildai\labels",
#                     output_dir=r"C:\Users\fadel\OneDrive\Bureau\WILD-AI\datalabeling\data\train_wildai\labels")

In [None]:
# uncomment to run
# convert_obb_to_yolo(obb_dataset_dir=r"C:\Users\fadel\OneDrive\Bureau\WILD-AI\datalabeling\data\train_wildai\labels",
#                     output_dir=r"C:\Users\fadel\OneDrive\Bureau\WILD-AI\datalabeling\data\train_wildai\labels")