In [1]:
import os
import numpy as np
from PIL import Image
import yaml

In [2]:
# Root of the formatted Pfeifer dataset and its two sub-folders (images and label files).
DATASET_FOLDER = r'/gpfs/gibbs/project/jetz/eec42/data/formatted_data_/global_birds_pfeifer'
IMG_FOLDER, LABEL_FOLDER = (
    os.path.join(DATASET_FOLDER, subfolder) for subfolder in ("images", "labels")
)

In [3]:
# Verify that every image of the dataset has the same dimensions.
# The first listed image provides the reference size; a warning is printed
# for each image whose size differs from it.

# List the image files
img_list = os.listdir(IMG_FOLDER)
IM_w, IM_h = 0, 0

for i, img in enumerate(img_list):
    # Only the size is needed, so the file is closed as soon as it has been read
    with Image.open(os.path.join(IMG_FOLDER, img)) as im:
        image_w, image_h = im.size

    if i == 0:
        # Reference dimensions, taken from the first image
        IM_w, IM_h = image_w, image_h
        print(f"Pfeifer dataset dimensions: {IM_w}, {IM_h}")

    if (image_w, image_h) != (IM_w, IM_h):
        print("!!!!! Alll images are not the same size !!!!!")

print(f"{len(img_list)} in Pfeifer dataset")


Pfeifer dataset dimensions: 448, 448
3384 in Pfeifer dataset


In [4]:
# Check duplicated labels
# For every label file, count the annotation lines that repeat an earlier line
# of the same file and report the files that contain at least one repeat.

label_list = os.listdir(LABEL_FOLDER)

for label in label_list:
    with open(os.path.join(LABEL_FOLDER, label)) as label_file:
        seen = set()
        duplicates = 0
        for line in label_file:
            # Normalise before comparing: drop the line terminator and collapse
            # whitespace runs, so a repeated annotation on a final line without
            # a trailing newline (or with different spacing) is still detected.
            line_lower = " ".join(line.split()).lower()
            if not line_lower:
                # Blank lines are not annotations and must not count as duplicates
                continue
            if line_lower in seen:
                duplicates += 1
            else:
                seen.add(line_lower)

    if duplicates != 0:
        print(f"{duplicates} duplicated annotations found in dataset file {label} !")
        # The files are only reported here, not rewritten. If de-duplication is
        # ever automated, write back the original lines in first-seen order:
        # the keys stored in `seen` are normalised (lower-cased) and unordered.
        


In [1]:
# Count the number of annotations before preprocessing (patching, etc.)
# NOTE: this rebinds DATASET_FOLDER from the formatted dataset to the original one;
# IMG_FOLDER and LABEL_FOLDER are not recomputed here.

CSV_FILES = [
    'pfeifer_test.csv',
    'pfeifer_train.csv',
]
DATASET_FOLDER = r'/gpfs/gibbs/project/jetz/eec42/data/original/global_birds_pfeifer'

In [7]:
import glob
import pandas as pd
from tqdm import tqdm


def retrieve_detections_from_csv(current_folder):
    '''
    Load every CSV annotation file found under a dataset folder into one DataFrame.

    The folder is searched recursively for ``*.csv`` files (should be 1 or 2 max:
    train+test, or everything together).

    Args:
        - current_folder (str): path to the current images and labels folder
    Returns:
        df (pd.DataFrame) with all images annotations. Rows keep the index they
        had in their source file, so index values may repeat across files.
        An empty DataFrame is returned when no CSV file is found.
    '''
    # glob returns matches in arbitrary order; sorting makes the row order reproducible
    csv_files = sorted(glob.glob(current_folder + '/**/*.csv', recursive=True))

    # Read everything first and concatenate once, instead of growing the frame
    # inside the loop starting from an empty DataFrame.
    frames = [pd.read_csv(annotation_file) for annotation_file in tqdm(csv_files)]
    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    return pd.concat(frames)

In [8]:
# retrieve_detections_from_csv must already be defined in this session
# (alternatively: from preprocessing_utils import retrieve_detections_from_csv)

# Load all annotations and keep a single row per (image, label, bounding box)
df = retrieve_detections_from_csv(DATASET_FOLDER).drop_duplicates(
    subset=['image_path', 'label', 'xmin', 'ymin', 'xmax', 'ymax']
)

len(df)

100%|██████████| 2/2 [00:00<00:00, 29.67it/s]


27226

In [9]:
# Check labels are all 0
# Every non-blank line of every label file must start with the class id "0";
# a warning is printed for each line that does not.

label_list = os.listdir(LABEL_FOLDER)
print(len(label_list))

for label in label_list:
    with open(os.path.join(LABEL_FOLDER, label)) as label_file:
        for line in label_file:
            fields = line.split()
            if not fields:
                # Blank lines carry no annotation
                continue
            # Compare the whole first token rather than the first character,
            # so ids such as "01" are caught and leading whitespace is tolerated
            if fields[0] != '0':
                print("Annotations are wrong !!")


3009
