# Postprocessing the data

- Please keep in mind that steps 1 and 2 are independent and should each be applied to your _original_ JSON.

### 0- Formatting of prediction COCO json

For localization, competitors are required to follow the prediction JSON format defined by MS COCO: https://cocodataset.org/#format-results , which is a list of annotations structured according to the COCO standard. For each domain, you need to generate a separate JSON file, named utokyo_1, utokyo_2, nau_1 and uq_1. You need to assign each annotation to one image: please use the correspondance.csv file to retrieve the right image_id. You can find a sample in the GitHub repository.

If you have predicted a full COCO JSON, you can just use the code below to keep only its list of annotations:

In [None]:
# Rewrite each full COCO JSON in place so that it only contains its list of
# annotations, which is the layout expected for a prediction file.
for sess in sessions_path:
    # read_text() closes the file, unlike a bare sess.open().
    data = json.loads(sess.read_text())
    # A file that is already a bare annotation list needs no conversion, so
    # running this cell a second time does not fail.
    if isinstance(data, dict):
        sess.write_text(json.dumps(data["annotations"]))

## 1-Localization

- To avoid the ambiguity of detecting wheat heads on the border, all boxes on the border are removed for the localization task. You are required to filter your solution with the following script so as not to overload the result server :)
- A square of 1004x1004px is centered on the image. All boxes that don't have a complete overlap with this square are removed.

In [107]:
import json
from pathlib import Path
import numpy as np 
from tqdm.notebook import tqdm



In [108]:
def clean_boxes(boxes, min_area=10):
    """Return the indices of the boxes whose area is strictly above ``min_area``.

    Args:
        boxes: 2-D array-like with one box per row. Columns 2 and 3 are
            multiplied to get the area (width and height for COCO
            ``[x, y, w, h]`` boxes).
        min_area: area threshold; boxes with an area <= ``min_area`` are
            dropped.

    Returns:
        A 1-D integer array of the row indices to keep. It is 1-D even when
        zero or one box is kept, so ``boxes[result]`` always stays 2-D.
    """
    boxes = np.asarray(boxes)
    area = boxes[:, 2] * boxes[:, 3]

    # np.squeeze(np.argwhere(...)) collapsed a single hit into a 0-d array,
    # which turned ``boxes[result]`` into one 1-D row instead of a (1, 4)
    # array; flatnonzero always returns a 1-D index array.
    return np.flatnonzero(area > min_area)


def filter_border(boxes, patch_size, sensitivity):
    """Return the indices of the boxes that lie inside the central window.

    The window is the patch shrunk by the fraction ``sensitivity`` of its
    size and placed around the patch centre. A box is kept when its
    intersection with the window covers its whole area.

    Args:
        boxes: array-like of boxes, one ``[x, y, w, h]`` row per box (a
            single flat box is accepted too). Boxes are expected to have a
            non-zero area, e.g. after ``clean_boxes``.
        patch_size: ``(width, height)`` of the image, in the same order as
            the ``x`` and ``y`` box coordinates.
        sensitivity: fraction of the patch size removed to build the window.

    Returns:
        A 1-D integer array with the row indices of the boxes to keep. It is
        1-D even when zero or one box is kept.
    """
    size = np.array(patch_size)
    # Top-left and bottom-right corners of the central window.
    win_x1, win_y1 = ((size * sensitivity) / 2 - 1).astype(int)
    win_x2, win_y2 = np.array([win_x1, win_y1]) + size * (1 - sensitivity)

    boxes = np.asarray(boxes)
    if boxes.size == 0:
        # Nothing to filter; column indexing below would fail on a 1-D empty array.
        return np.array([], dtype=int)
    boxes = np.atleast_2d(boxes)

    x = boxes[:, 0]
    y = boxes[:, 1]
    box_w = boxes[:, 2]
    box_h = boxes[:, 3]
    areas = box_w * box_h

    # Intersection of every box with the window. The +1 follows the
    # inclusive-pixel convention of the original script and gives boxes a
    # one-pixel tolerance at the window edge.
    inter_w = np.maximum(0, np.minimum(x + box_w, win_x2) - np.maximum(x, win_x1) + 1)
    inter_h = np.maximum(0, np.minimum(y + box_h, win_y2) - np.maximum(y, win_y1) + 1)

    # Because of the +1 the ratio can exceed 1, so it is capped before the
    # "fully covered" comparison.
    overlap = np.minimum((inter_w * inter_h) / areas, 1)

    # flatnonzero keeps the result 1-D; np.squeeze(np.argwhere(...)) returned
    # a 0-d array when exactly one box was kept.
    return np.flatnonzero(overlap == 1.)

def clean_json(
    annotations,
    sensitivity=0.01,
    patch_size=(1024, 1024),
):
    """Drop tiny boxes and boxes touching the image border from predictions.

    For every image, the annotations are first filtered with ``clean_boxes``
    and the survivors are then filtered with ``filter_border``.

    Args:
        annotations: list of annotation dicts, each with an ``"image_id"``
            and a ``"bbox"`` entry.
        sensitivity: forwarded to ``filter_border``.
        patch_size: forwarded to ``filter_border``.

    Returns:
        The list of annotation dicts that passed both filters, grouped by
        image in order of first appearance.
    """
    # Group once by image instead of rescanning every annotation per image.
    per_image = {}
    for ann in annotations:
        per_image.setdefault(ann["image_id"], []).append(ann)

    new_ann = []

    for img_anns in tqdm(per_image.values()):
        boxes = np.array([ann["bbox"] for ann in img_anns])

        # Images with a single annotation are filtered like any other.
        # np.atleast_1d keeps the picks 1-D even if a helper hands back a 0-d
        # index for a single hit; indexing with a 0-d pick used to return one
        # dict, whose keys were then appended instead of the annotation.
        pick_1 = np.atleast_1d(clean_boxes(boxes))
        kept = [img_anns[i] for i in pick_1]

        if kept:
            pick_2 = np.atleast_1d(
                filter_border(boxes[pick_1], patch_size, sensitivity)
            )
            kept = [kept[i] for i in pick_2]

        new_ann += kept

    return new_ann

In [127]:
import copy


# Folder holding one prediction JSON per session, and the folder that
# receives the filtered submission files.
reference_data = Path("/home/etdavid/Projects/1_research/3_wheat_counting/3-GWHD/kaggle-globalwheathead-winners/corrected")
output = Path("/home/etdavid/Projects/1_research/3_wheat_counting/3-GWHD/biased-result/dungnb/submit")

output.mkdir(exist_ok=True, parents=True)

for jsonp in reference_data.glob("*.json"):

    sess = jsonp.with_suffix("").name
    print(sess)

    # read_text() closes the file, unlike a bare jsonp.open().
    annotations = json.loads(jsonp.read_text())
    print(len(annotations))

    # You can uncomment the line below if you have a complete COCO JSON
    # annotations = annotations["annotations"]

    # Give every annotation the same constant score, working on copies.
    scored_annotations = []
    for ann in annotations:
        scored = copy.copy(ann)
        scored["score"] = 0.7
        scored_annotations.append(scored)

    # Keep only real annotation records that carry a score. This replaces a
    # bare ``except:`` that silently discarded anything ``ann["score"]``
    # failed on; the explicit test drops the same entries (non-dict items and
    # dicts without a score) without hiding unrelated errors.
    new_annotations = [
        ann
        for ann in clean_json(scored_annotations)
        if isinstance(ann, dict) and "score" in ann
    ]

    (output / jsonp.name).write_text(json.dumps(new_annotations))
    

uq_1
4791


HBox(children=(FloatProgress(value=0.0, max=119.0), HTML(value='')))


utokyo_2
2647


HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))


utokyo_1
27419


HBox(children=(FloatProgress(value=0.0, max=994.0), HTML(value='')))


nau_1
1094


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




## 2-Counting

- For the sake of simplicity, counting per image is equal to the number of annotated boxes

In [132]:
import pandas as pd

def generate_countcsv(
    sessions_path,
    correspondance_path,
    out_name,
    ):
    """Write a CSV with the number of predicted boxes per image.

    Args:
        sessions_path: iterable of ``pathlib.Path`` objects, one prediction
            JSON per session. Each file holds either a list of annotations or
            a dict with an ``"annotations"`` list; every annotation needs
            ``"image_id"``, ``"bbox"`` and ``"score"`` entries.
        correspondance_path: CSV file with ``image_id`` and ``image_name``
            columns, used to translate image ids into image names.
        out_name: destination of the generated CSV.

    The CSV has one row per image that has at least one annotation, with the
    columns ``session``, ``image_name``, ``count`` (boxes kept by
    ``clean_boxes``), ``control_count`` (all boxes) and ``score`` (mean score
    of the kept boxes, NaN when none is kept).

    Raises:
        KeyError: if an annotation refers to an ``image_id`` that is missing
            from the correspondance file.
    """
    correspondance = pd.read_csv(correspondance_path)
    correspondance_dict = dict(
        zip(correspondance["image_id"].values, correspondance["image_name"].values)
    )

    count_csv = []

    for sessp in sessions_path:
        # read_text() closes the file, unlike a bare sessp.open().
        annotations = json.loads(sessp.read_text())

        if isinstance(annotations, dict):
            annotations = annotations["annotations"]

        session_name = sessp.with_suffix("").name

        # Group once by image instead of rescanning every annotation per image.
        per_image = {}
        for ann in annotations:
            per_image.setdefault(ann["image_id"], []).append(ann)

        for img_id, img_anns in tqdm(per_image.items()):
            img_name = correspondance_dict[img_id]

            boxes = np.array([ann["bbox"] for ann in img_anns])
            scores = np.array([ann["score"] for ann in img_anns])

            # np.atleast_1d keeps the pick 1-D even if clean_boxes returns a
            # 0-d index for a single hit, so the count is the number of kept
            # boxes and never the length of one box.
            pick_1 = np.atleast_1d(clean_boxes(boxes))
            count = pick_1.size

            # ``score`` used to be left unset (or stale from the previous
            # image) when nothing was counted; report NaN explicitly instead.
            score = np.mean(scores[pick_1]) if count else float("nan")

            count_csv.append([session_name, img_name, count, len(boxes), score])

    count_df = pd.DataFrame(count_csv,columns=["session","image_name","count","control_count","score"])

    count_df.to_csv(out_name)

In [133]:
# Collect every per-session prediction file, then write the per-image counts
# next to the submission files.
sessions_path = [*reference_data.glob("*.json")]

generate_countcsv(
    sessions_path,
    "correspondance.csv",
    output / "count.csv",
)

    

HBox(children=(FloatProgress(value=0.0, max=119.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=994.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


