## This code extracts labels from a Zooniverse export and reformats them into the YOLO format

In [1]:
import pandas as pd
import ast
import numpy as np
import re
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os
import cv2
from collections import Counter
import shutil
from tqdm import tqdm
import filecmp
import ast

In [2]:
# Reformats the coordinates for use with YOLOv5
# In:
#      x (list) : x coordinates for all labels for a single image 
#      y (list) : y coordinates for all labels for a single image 
#      w (list) : width distances for all labels for a single image 
#      h (list) : height distances for all labels for a single image 
#      origW (number) : original width of the target image
#      origH (number) : original height of the target image
# Out: 
#      x2 (list) : x coordinates after YOLOv5 preparation
#      y2 (list) : y coordinates after YOLOv5 preparation
#      w2 (list) : width distances after YOLOv5 preparation
#      h2 (list) : height distances after YOLOv5 preparation
def PrepForYolo (x,y,w,h,origW,origH):
    """Re-anchor boxes at their centre and normalise them by the image size.

    x, y, w and h are parallel lists holding one entry per bounding box;
    (x, y) is the box anchor that gets shifted by half the width / height.
    origW and origH are the image width and height used as divisors.

    Returns a tuple of four lists: (centre x, centre y, width, height), each
    divided by the matching image dimension.
    """
    centreX, centreY, normW, normH = [], [], [], []
    for idx, left in enumerate(x):
        # shift the anchor to the middle of the bounding box
        midX = left + (w[idx] / 2)
        midY = y[idx] + (h[idx] / 2)
        # scale every value by the original image size
        centreX.append(midX / origW)
        centreY.append(midY / origH)
        normW.append(w[idx] / origW)
        normH.append(h[idx] / origH)
    return (centreX, centreY, normW, normH)


# This function saves one image of the dataset that combines all volunteers. It writes the label file,
# the volunteer file and a copy of the image into the relevant (already existing) folder structure.
# In:
#      fmat (pandas DataFrame) : formatted bounding box information in yolo format for a single image
#      ip (list) : the encrypted ip address of the annotators for each bounding box for a single image
#      path (str) :  location of the intended save path 
#      pathType (str) : name of the dataset folder (IID or all users) 
#      saveName (str) : name of the current image
#      imgLoc (str) : location of the images 
# Out: 
#      None
def SaveCrowd (fmat, ip, path, pathType, saveName, imgLoc):
    """Write one image of the crowdsourced dataset into the train split.

    Args:
        fmat: object exposing ``.values`` (a pandas DataFrame in this file)
            with one row per box: label, x, y, width, height.
        ip: list with one volunteer identifier per bounding box.
        path: root folder the datasets are saved under.
        pathType: name of the dataset folder inside ``path``.
        saveName: image name without extension.
        imgLoc: folder holding the source ``<saveName>.JPG`` image.

    The ``labels/train``, ``volunteers/train`` and ``images/train`` folders
    must already exist; nothing is returned.
    """
    # os.path.join keeps the function independent of the platform separator
    root = os.path.join(path, pathType)
    fileName = saveName+'.txt'

    np.savetxt(os.path.join(root, 'labels', 'train', fileName), fmat.values, fmt='%i %1.10f %1.10f %1.10f %1.10f')
    np.savetxt(os.path.join(root, 'volunteers', 'train', fileName), ip, fmt='%s')

    shutil.copyfile(os.path.join(imgLoc, saveName+'.JPG'), os.path.join(root, 'images', 'train', saveName+'.JPG'))

# This function saves one annotator's labels for one image of the IID dataset. It writes the label file
# and a copy of the image into the relevant (already existing) folder structure.
# In:
#      fmat (pandas DataFrame) : formatted bounding box information in yolo format for a single image and annotator
#      ip (str) : the encrypted ip address for a single annotator for a single image
#      path (str) :  location of the intended save path 
#      pathType (str) : name of the dataset folder (IID or all users) 
#      saveName (str) : name of the current image
#      imgLoc (str) : location of the images 
# Out: 
#      None
def SaveIID (fmat, ip, path, pathType, saveName, imgLoc):
    """Write one annotator's labels for one image into the IID train split.

    Args:
        fmat: object exposing ``.values`` (a pandas DataFrame in this file)
            with one row per box: label, x, y, width, height.
        ip: identifier of the single annotator; it becomes part of the
            output file names (``<saveName>.<ip>.txt`` / ``.JPG``).
        path: root folder the datasets are saved under.
        pathType: name of the dataset folder inside ``path``.
        saveName: image name without extension.
        imgLoc: folder holding the source ``<saveName>.JPG`` image.

    The ``labels/train`` and ``images/train`` folders must already exist;
    nothing is returned.
    """
    # os.path.join keeps the function independent of the platform separator
    root = os.path.join(path, pathType)
    stem = saveName+'.'+ip

    np.savetxt(os.path.join(root, 'labels', 'train', stem+'.txt'), fmat.values, fmt='%i %1.10f %1.10f %1.10f %1.10f')

    shutil.copyfile(os.path.join(imgLoc, saveName+'.JPG'), os.path.join(root, 'images', 'train', stem+'.JPG'))

    
    
    
# This function handles the train / val splitting
def valSplit(TTs, path, pathType, isIID):
    """Move a share of the train split into the val split.

    Args:
        TTs: fraction of the images that stays in train; ``round(count * (1-TTs))``
            images are moved to val.
        path: root folder the datasets are saved under.
        pathType: name of the dataset folder inside ``path``.
        isIID: when falsy, the matching file in ``volunteers/train`` is moved
            as well (the IID dataset has no volunteers folder).

    Files are taken in the order ``os.listdir`` returns them. For each moved
    image the label file ``<name>.txt`` is moved too, so it must exist.
    """
    root = os.path.join(path, pathType)
    trainImages = os.listdir(os.path.join(root, 'images', 'train'))
    # number of images held out for validation (named so it no longer shadows the function)
    numVal = round(len(trainImages) * (1-TTs))
    # only images/labels are always present; volunteers exist for the crowdsourced dataset only
    textGroups = ['labels'] if isIID else ['labels', 'volunteers']
    for imgFile in trainImages[:max(numVal, 0)]:
        name, extension = os.path.splitext(imgFile)

        shutil.move(os.path.join(root, 'images', 'train', imgFile), os.path.join(root, 'images', 'val', imgFile))
        for group in textGroups:
            shutil.move(os.path.join(root, group, 'train', name+'.txt'), os.path.join(root, group, 'val', name+'.txt'))
        
        
    
    
# This function takes the location of the test dataset and copies it into the correct format
def injectTestDS(testLocImages, testLocLabels, trgtLoc, pathType, iid):
    """Copy the test images and labels into the dataset's test split.

    Args:
        testLocImages: folder containing the test images.
        testLocLabels: folder containing one ``<name>.txt`` label file per image.
        trgtLoc: root folder the datasets are saved under.
        pathType: name of the dataset folder inside ``trgtLoc``.
        iid: when falsy, a volunteers file is also written that contains the
            word 'expert' once for every line of the label file.

    The ``images/test``, ``labels/test`` (and ``volunteers/test``) folders
    must already exist; nothing is returned.
    """
    root = os.path.join(trgtLoc, pathType)
    for imgFile in os.listdir(testLocImages):
        name, extension = os.path.splitext(imgFile)
        labelSrc = os.path.join(testLocLabels, name+'.txt')
        shutil.copyfile(os.path.join(testLocImages, imgFile), os.path.join(root, 'images', 'test', imgFile))
        shutil.copyfile(labelSrc, os.path.join(root, 'labels', 'test', name+'.txt'))
        if not iid:
            # the context manager closes the label file again (the handle used to be leaked)
            with open(labelSrc, "r") as label:
                rows = sum(1 for _ in label)
            np.savetxt(os.path.join(root, 'volunteers', 'test', name+'.txt'), ['expert'] * rows, fmt='%s')
            
            

# This function removes repeated images in the train dataset that are already in the test dataset. So the test dataset is fully unseen
def removeRepeated(testLocImg, trainLoc, pathType):
    """Delete train images (plus labels / volunteer files) that are also test images.

    Args:
        testLocImg: folder containing the test images.
        trainLoc: root folder the datasets are saved under.
        pathType: name of the dataset folder inside ``trainLoc``.

    A train file counts as repeated when a test image name (without
    extension) is contained in the train file name (without extension), which
    also matches the ``<image>.<annotator>`` names of the IID dataset.
    Prints how many files were deleted and their names.
    """
    delImg=[]
    root = os.path.join(trainLoc, pathType)
    trainImgDir = os.path.join(root, 'images', 'train')
    # split the train names once instead of once per test image
    trainFiles = [(trn, os.path.splitext(trn)[0]) for trn in os.listdir(trainImgDir)]
    # train files already deleted, so a second matching test name cannot delete them again
    removed = set()
    for tst in os.listdir(testLocImg):
        name, extension = os.path.splitext(tst)
        for trn, name2 in trainFiles:
            if trn in removed or name not in name2:
                continue
            # Counts repeated images
            delImg.append(name2)
            # deletes image in the train dataset
            os.remove(os.path.join(trainImgDir, trn))
            # deletes label in the train dataset
            os.remove(os.path.join(root, 'labels', 'train', name2 + '.txt'))
            # deletes volunteer information in the train dataset; it is absent for the IID dataset
            try:
                os.remove(os.path.join(root, 'volunteers', 'train', name2 + '.txt'))
            except FileNotFoundError:
                pass
            removed.add(trn)
    print(str(len(delImg)) + ' Repeated Images. They Have Been Deleted: ')
    print(delImg)


# this function ensures each bounding box is within the bounds of the image
def fixOutOfBoundsBoxes(x, y, w, h, l, ip):
    """Clip normalised, centre-anchored boxes to the image and drop invalid ones.

    Args:
        x, y: lists of box centre coordinates (image spans 0..1).
        w, h: lists of box widths / heights.
        l: list of class labels, one per box.
        ip: list of volunteer identifiers, one per box, or an empty list when
            volunteers are not tracked.

    A box that crosses one image edge is shrunk so that it ends on that edge.
    Boxes whose centre is still outside 0..1, or whose width / height is
    negative or larger than 0.7, are removed from every list.

    The lists are modified in place and returned as ``(x, y, w, h, l, ip)``.
    """
    keep = []
    for pos in range(len(x)):
        cx, cy, bw, bh = x[pos], y[pos], w[pos], h[pos]
        # the bounding box extends past the right of the image
        if (cx+(bw/2))>1:
            bw = bw-((cx+(bw/2))-1)
            cx = 1-(bw/2)
        # the bounding box extends past the left of the image
        elif (cx-(bw/2))<0:
            bw = bw+(cx-(bw/2))
            cx = bw/2
        # the bounding box extends past the bottom of the image
        if (cy+(bh/2))>1:
            # the overflow is measured from the bottom edge (cy + bh/2), mirroring the
            # right-edge case; using cy - bh/2 here made the box grow instead of shrink
            bh = bh-((cy+(bh/2))-1)
            cy = 1-(bh/2)
        # the bounding box extends past the top of the image
        elif (cy-(bh/2))<0:
            bh = bh+(cy-(bh/2))
            cy = bh/2
        x[pos], y[pos], w[pos], h[pos] = cx, cy, bw, bh

        # boxes that are still out of bounds, or that span an unreasonable distance, are dropped
        invalid = (cx>1) or (cx<0) or (cy>1) or (cy<0) or (bw>0.7) or (bw<0) or (bh>0.7) or (bh<0)
        if not invalid:
            keep.append(pos)

    if len(keep) != len(x):
        tracked = [x, y, w, h, l]
        if ip:
            tracked.append(ip)
        # slice assignment keeps the callers' list objects, as the in-place pops did
        for seq in tracked:
            seq[:] = [seq[i] for i in keep]
    if ip and len(x) != len(ip):
        print('ERROR: volunteer list doesnt match the labels')

    return(x,y,w,h,l,ip)

# switches the 2 labels of the user's choice
def switchLabel(swapLabel, label):
    """Exchange the two class ids listed in swapLabel on a single annotation.

    label is a dictionary whose "value" entry holds the class id. When that id
    equals one entry of swapLabel it is replaced by the other entry; any other
    id is left alone. The dictionary is changed in place and also returned.
    """
    current = label["value"]
    if current == swapLabel[0]:
        replacement = swapLabel[1]
    elif current == swapLabel[1]:
        replacement = swapLabel[0]
    else:
        # not one of the two swapped classes
        return(label)
    label["value"] = replacement
    return(label)
        
# # tests in turn A(anchor before bounds), A(anchor past bounds), C(anchor before bounds), C(anchor past bounds), B(anchor before bounds), B(anchor past bounds), D(anchor before bounds), D(anchor past bounds)
# # then repeated for the corner examples (AB, CD, DA, BC) with anchor inside and outside of bounds
# X1, Y1, W1, H1 = fixOutOfBoundsBoxes([0.95,1.1,0.002,-0.065,0.5,0.5,0.5,0.5],[0.5,0.5,0.5,0.5,0.95,1.1,0.002,-0.065],[0.28,0.28,0.28,0.28,0.28,0.28,0.28,0.28],[0.24,0.24,0.24,0.24,0.24,0.24,0.24,0.24])
# X2, Y2, W2, H2 = fixOutOfBoundsBoxes([0.95,1.1,0.002,-0.065,0.95,1.1,0.002,-0.065],[0.95,1.1,0.002,-0.065,0.002,-0.065,0.95,1.1],[0.28,0.28,0.28,0.28,0.28,0.28,0.28,0.28],[0.24,0.24,0.24,0.24,0.24,0.24,0.24,0.24])
# print('test1 (A1,A2,C1,C2,B1,B2,D1,D2): ',X1,W1)
# print('test1 (AB1,AB2,CD1,CD2,DA1,DA2,BC1,BC2): ',X2,W2)

In [3]:
# This section extracts all images and labels from the Zooniverse output csv.
# Everything below up to the "Params" block is machine-specific: adjust the paths before running.


##### File Locations #####

# file path to the Zooniverse classification export (csv)
dataset = pd.read_csv (r'C:\Users\rb01243\OneDrive - University of Surrey\Desktop\zoon data\Labels\dental-disease-labelling-easy-classifications.csv')
# file path that contains the images
imageLocation = r'C:\Users\rb01243\OneDrive - University of Surrey\Desktop\zoon data\Data'
# The location you want to save the output to
saveLoaction = r'C:\Users\rb01243\OneDrive - University of Surrey\Desktop\zoon data\Finished Data'
# location of the test dataset images (completed by a single expert 'ground truth labels')
testLocationImg = r'C:\Users\rb01243\OneDrive - University of Surrey\Desktop\zoon data\expert\images' #testLocationImg = r'C:\Users\rb01243\OneDrive - University of Surrey\Documents\GitHub\miccai\miccai_dental_disease\data\datasets\master\images'
# location of the test dataset labels (completed by a single expert 'ground truth labels')
testLocationLbl = r'C:\Users\rb01243\OneDrive - University of Surrey\Desktop\zoon data\expert\labels'


#expertTestData = r'test'

#########################

######## Params #########

# prefix that marks dental experts; user names containing it are kept by the expert filter
dentPrefix = 'dnt_'     # 'dnt_'
# Toggle if you want to save the results or not
saveToggle = True
# the split ratio for train and val datasets (ground truth Train data is already in the correct format) (float between 0.0 and 1.0; 1.0 keeps everything in train)
trainValSplit = 1.0
# if you want only expert labels change this filter to True
filterExpert = False
# minimum number of annotators per image (counted as classification rows per subject)
minAnnotators = 3
# secondary remove annotators, removes images without a minimum number of annotators after all the preprocessing is completed
secMinAnno = 2
# True if you want to create an IID dataset as well as a crowdsourced dataset (IID: crowdsourced labels for a single image are split up into their own .txt files, to act as repeated ground truth labels)
includeIID = True   # True
# True to copy the expert test dataset (testLocationImg / testLocationLbl) into the test split of the output
includeTestDS = True
# Parameter to choose the minimum number of classifications a volunteer has to complete (users with fewer are dropped)
minClss = 50
# The minimum workflow version; only rows with a strictly greater workflow_version are kept
minWorkflowVersion = 20.00
# use this to remove a label type from the dataset
remveLbl = 0 #       -1 = none     0 = Calculus Plaque,   1 = Dental Caries,      2 = Bone Loss        This parameter will also shift all subsequent labels down by 1 to compensate for the removal of the value
# swap label (done before label removal [remveLbl])
swapLabel = [1,2] #    Leave empty to not swap any labels      ZOONIVERSE EXPORT IS IN THE FORMAT OF (0 = CALCULUS, 1 = CARIES, 2 = BONE LOSS) [0,2] = 3 label format, [1,2] = 2 label format
# specific images and labels to remove from the final dataset (compared against the subject's Filename with .jpg replaced by .JPG)
imgRemove = ['Unknown-X-20200928-091843-XIQ+LFYOKA+S-0-YunpengÔÇÖs iMac.JPG']
########################

# Name prefix of the output dataset folders ('_Crowdsourced' / '_IID' is appended to it)
pthType = 'Zoon_50min_3perImg' # All_Volunteers   All_Volunteers_Calc_Removed_John_Test # Calc_Removed_New_Test_50min

# filters expert labels only
if filterExpert == True:
    pthType = 'Expert_Volunteers'
    tempDS = dataset
    dataset = tempDS.loc[(tempDS['expert'] == 'expert')|(tempDS['gold_standard'] == True)|(tempDS['user_name'].str.contains(dentPrefix))] # filters for experts only
    # reset_index returns a new frame, so the result has to be assigned to take effect
    dataset = dataset.reset_index(drop=True)
    

# removes banned users
# the context manager closes banned.txt again (the handle used to be leaked)
with open("banned.txt", "r") as fle:
    # NOTE(review): eval() executes whatever banned.txt contains; if the file only ever
    # holds a plain list literal, ast.literal_eval would be the safe replacement - confirm.
    banedUsers = eval(fle.read())
for bnd in banedUsers:
    dataset = dataset[dataset['user_name'] != bnd]
    


dataset = dataset[dataset["workflow_version"] > minWorkflowVersion].reset_index(drop=True)# removes pre-beta test data

dataset = dataset[dataset["annotations"].str.contains('"value":null') == False].reset_index(drop=True) # removes null bounding boxes
# raw string: '\[' is not a valid escape in a normal string literal and triggers a warning on recent Python versions
dataset = dataset[dataset["annotations"].str.contains(r'"value":\[\]') == False].reset_index(drop=True) # removes empty classifications

dataset = dataset[dataset.groupby('user_name').user_name.transform('count') >= minClss].reset_index(drop=True) # removes users with less total classifications than the minClss variable




# Creates Directories
# Every dataset root gets <group>/train, <group>/test and <group>/val folders. The crowdsourced
# dataset has images, labels and volunteers groups; the IID dataset only images and labels.
pathType = pthType+'_Crowdsourced'
datasetLayouts = [(pathType, ('images', 'labels', 'volunteers'))]
if includeIID == True:
    pathTypeIID = pthType+'_IID'
    datasetLayouts.append((pathTypeIID, ('images', 'labels')))

for rootName, groups in datasetLayouts:
    rootDir = os.path.join(saveLoaction, rootName)
    if os.path.isdir(rootDir):
        print(rootName+' Root Already Created')
    for group in groups:
        for split in ('train', 'test', 'val'):
            # exist_ok completes a partially created tree; the previous grouped os.mkdir calls
            # stopped at the first existing folder and skipped the remaining ones
            os.makedirs(os.path.join(rootDir, group, split), exist_ok=True)




    
# Rows that have already been handled, recorded by their index in `dataset`,
# so that every image is processed once.
indexDone = []
# set mirror of indexDone that keeps the per-row membership test cheap
indexDoneLookup = set()
#first volunteer for one image
for index, row in tqdm(dataset.iterrows()):
    width, height, xCord, yCord, labels, IP = [], [], [], [], [], []
    if index in indexDoneLookup:
        continue
    # Removes Unwanted Images
    # NOTE(review): eval() executes text taken from the export, and replacing "null" also
    # rewrites any file name that happens to contain "null"; json.loads would avoid both,
    # but the 'null' string checks further down would then have to be adapted.
    remove = eval(row['subject_data'].replace("null", "'null'"))
    remove = remove[str(row['subject_ids'])]
    remove = remove['Filename'].replace(".jpg", ".JPG")
    if remove in imgRemove:
        indexDone.append(index)
        indexDoneLookup.add(index)
        print('REMOVED UNWANTED IMAGES: ', remove)
        continue
    # All classifications of the same subject. The index is deliberately NOT reset: index2
    # below must be the row's index in `dataset`, otherwise the skip list is filled with
    # 0..k-1 and unrelated rows are skipped while the same image is processed repeatedly.
    temp = dataset.loc[dataset['subject_ids'] == (row['subject_ids'])]
    numAnno = len(temp)
    # removes annotations and images if there aren't enough annotators (see parameters)
    if numAnno < minAnnotators:
        continue
    # all other volunteers for one image
    for index2, row2, in temp.iterrows():

        # extracts the username (encrypted IP address)
        userIP = row2['user_name']

        # extracts the name of the image
        imageName = row2['subject_data']
        imageName = imageName.replace("null", "'null'")
        imageName = eval(imageName)
        imageName = imageName[str(row2["subject_ids"])]
        imageName = imageName['Filename']
        name, extension= os.path.splitext(imageName)

        # fixes file extension for consistency
        if extension == '.jpg':
            imageName = (name+'.JPG')

        # extracts the annotation made by the user (encrypted IP address)
        annotation = row2['annotations']
        annotation = annotation.replace("null", "'null'")
        annotation = eval(annotation)
        annotation = annotation[0]["value"]

        #extracts the image width and height
        try:
            imgSze = row2['metadata']
            imgSze = imgSze.replace("true", "True")
            imgSze = imgSze.replace("null", "'null'")
            imgSze = eval(imgSze.replace("false", "False"))
            imgSze = imgSze['subject_dimensions'][0]
            originalwidth, originalheight = imgSze['naturalWidth'], imgSze['naturalHeight']
        except Exception as sizeError:
            # best effort as before: the size read for the previous row stays in use,
            # but the problem is now reported instead of being hidden
            print('WARNING: could not read the image size for', imageName, '-', sizeError)

        # traverses the annotation dictionary inside the dataframe to extract all annotations.
        widthIID, heightIID, xCordIID, yCordIID, labelsIID = [], [], [], [], []
        for ann in annotation:
            label = ann["details"]
            label = label[0]
            # removes unwanted labels and null classes
            # NOTE(review): this compares against remveLbl BEFORE the swap below, while the
            # parameter comment says the swap happens before the removal - confirm the intent.
            if label["value"] == 'null' or label["value"] == remveLbl:
                continue
            else:
                # swaps the two labels listed in swapLabel
                if swapLabel:
                    label = switchLabel(swapLabel, label)
                # changes class label numbers based on the removed value
                if label["value"] > remveLbl:
                    if remveLbl != -1:
                        label["value"] -= 1


                # original values to view all the labels
                labels.append(label["value"])
                width.append(ann["width"])
                height.append(ann["height"])
                xCord.append(ann["x"])
                yCord.append(ann["y"])
                IP.append(userIP)
                if includeIID == True:
                    labelsIID.append(label["value"])
                    widthIID.append(ann["width"])
                    heightIID.append(ann["height"])
                    xCordIID.append(ann["x"])
                    yCordIID.append(ann["y"])
                    IPIID = userIP
        # per-annotator (IID) output: one label file per image and volunteer
        if xCordIID:
            x2, y2, w2, h2 = PrepForYolo(xCordIID,yCordIID,widthIID,heightIID,originalwidth,originalheight)
            x2, y2, w2, h2, labelsIID, _ = fixOutOfBoundsBoxes(x2, y2, w2, h2, labelsIID, [])
            formats = pd.DataFrame({'label':labelsIID, 'xCord':x2, 'yCord':y2, 'width':w2, 'height':h2})
            if saveToggle == True and not formats.empty:
                SaveIID(formats, IPIID, saveLoaction, pathTypeIID, name, imageLocation)

        # adds completed rows to the skip list, to avoid repeating labels.
        indexDone.append(index2)
        indexDoneLookup.add(index2)
    # crowdsourced output: all volunteers' boxes for the image in one label file
    if xCord:
        x3, y3, w3, h3 = PrepForYolo(xCord,yCord,width,height,originalwidth,originalheight)
        x3, y3, w3, h3, labels, IP = fixOutOfBoundsBoxes(x3, y3, w3, h3, labels, IP)
        formats = pd.DataFrame({'label':labels, 'xCord':x3, 'yCord':y3, 'width':w3, 'height':h3})
        if saveToggle == True and not formats.empty:
            SaveCrowd(formats, IP, saveLoaction, pathType, name, imageLocation)
        # adds data to a new csv for easy data analysis
      
        
print("############ EXTRACTION DONE ############")
if saveToggle == True:         
    # removes repeated images in the train dataset that are already in the test dataset
    removeRepeated(testLocationImg, saveLoaction, pathType)
    if includeIID == True:
        removeRepeated(testLocationImg, saveLoaction, pathTypeIID)

    # performs train val split, and imports test dataset
    valSplit(trainValSplit, saveLoaction, pathType, False)
    if includeTestDS:
        injectTestDS(testLocationImg, testLocationLbl, saveLoaction, pathType, False)
    if includeIID == True:
        valSplit(trainValSplit, saveLoaction, pathTypeIID, True)
        if includeTestDS:
            injectTestDS(testLocationImg, testLocationLbl, saveLoaction, pathTypeIID, True)
            
# removes images with less than the minimum number of volunteers
crowdRoot = os.path.join(saveLoaction, pathType)
directory = os.path.join(crowdRoot, 'volunteers', 'train')
for file in os.listdir(directory):
    # the file has to be closed before it is deleted, otherwise os.remove fails on Windows
    with open(os.path.join(directory, file), "r") as f:
        users = len(set(f.read().split()))
    if users < secMinAnno:
        os.remove(os.path.join(directory, file))
        os.remove(os.path.join(crowdRoot, 'labels', 'train', file))
        # the image shares the stem of the volunteer file but is stored as .JPG;
        # deleting '<stem>.txt' from the images folder raised FileNotFoundError
        os.remove(os.path.join(crowdRoot, 'images', 'train', os.path.splitext(file)[0]+'.JPG'))

print("############ DONE ############")

763it [00:09, 74.49it/s] 

REMOVED UNWANTED IMAGES:  Unknown-X-20200928-091843-XIQ+LFYOKA+S-0-YunpengÔÇÖs iMac.JPG


3628it [00:44, 77.58it/s]

REMOVED UNWANTED IMAGES:  Unknown-X-20200928-091843-XIQ+LFYOKA+S-0-YunpengÔÇÖs iMac.JPG


5069it [01:02, 84.69it/s]

REMOVED UNWANTED IMAGES:  Unknown-X-20200928-091843-XIQ+LFYOKA+S-0-YunpengÔÇÖs iMac.JPG


5545it [01:08, 81.46it/s]

REMOVED UNWANTED IMAGES:  Unknown-X-20200928-091843-XIQ+LFYOKA+S-0-YunpengÔÇÖs iMac.JPG


8293it [01:41, 79.19it/s]

REMOVED UNWANTED IMAGES:  Unknown-X-20200928-091843-XIQ+LFYOKA+S-0-YunpengÔÇÖs iMac.JPG


11215it [02:20, 80.01it/s]


597 Repeated Images. They Have Been Deleted: 
['CI20191121_073916_0128_0000399F', 'IS20180611_123847_0494_000000AF', 'IS20180614_161235_0835_000000CD', 'IS20180621_130042_0380_000000F0', 'IS20180711_081634_0805_00000127', 'IS20180711_081804_0006_00000128', 'IS20180730_082940_0490_00000184', 'IS20180730_083057_0336_00000185', 'IS20180914_142754_0665_00000253', 'IS20180914_142904_0007_00000254', 'IS20180924_121421_0662_00000284', 'IS20180926_090826_0085_0000028E', 'IS20180926_095803_0241_0000028F', 'IS20181001_080944_0233_000002A0', 'IS20181015_114142_0734_000002E6', 'IS20181015_114252_0445_000002E7', 'IS20181018_082427_0315_000002FA', 'IS20181024_151228_0198_0000032E', 'IS20181120_101614_0371_000003D4', 'IS20190121_094120_0903_00000702', 'IS20190204_091438_0079_0000089B', 'IS20190204_091558_0702_0000089C', 'IS20190319_120205_0492_00000C98', 'IS20190319_120205_0960_00000C99', 'IS20190326_150935_0640_00000D73', 'IS20190402_112255_0594_00000EA6', 'IS20190408_113253_0402_00000F8F', 'IS20190