In [1]:
import json
import pandas as pd
import os
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the annotation file (JSON) into a dictionary.
# The context manager closes the file handle as soon as parsing is done, and the
# encoding is stated explicitly because open()'s default is platform-dependent.
with open('annotations.json', encoding='utf-8') as main_file:
    main_data = json.load(main_file)

In [3]:
# Image table built from the 'images' section of the annotations:
# the 'id' key is exposed as 'image_id' and the licence / URL / date columns are discarded.
df_images = (
    pd.DataFrame(main_data['images'])
    .rename(columns={'id': 'image_id'})
    .drop(columns=['license', 'coco_url', 'flickr_url', 'date_captured', 'flickr_640_url'])
)
df_images

Unnamed: 0,image_id,width,height,file_name
0,0,1537,2049,batch_1/000006.jpg
1,1,1537,2049,batch_1/000008.jpg
2,2,1537,2049,batch_1/000010.jpg
3,3,2049,1537,batch_1/000019.jpg
4,4,1537,2049,batch_1/000026.jpg
...,...,...,...,...
1495,1495,1824,4000,batch_9/000095.jpg
1496,1496,1824,4000,batch_9/000096.jpg
1497,1497,4000,1824,batch_9/000097.jpg
1498,1498,1824,4000,batch_9/000098.jpg


In [4]:
# Annotation table built from the 'annotations' section of the annotations file,
# without the 'id', 'segmentation', 'area' and 'iscrowd' columns.
df_annot = pd.DataFrame(main_data['annotations']).drop(
    columns=['id', 'segmentation', 'area', 'iscrowd']
)
df_annot

Unnamed: 0,image_id,category_id,bbox
0,0,6,"[517.0, 127.0, 447.0, 1322.0]"
1,1,18,"[1.0, 457.0, 1429.0, 1519.0]"
2,1,14,"[531.0, 292.0, 1006.0, 672.0]"
3,2,5,"[632.0, 987.0, 500.0, 374.0]"
4,2,7,"[632.0, 989.0, 44.0, 51.0]"
...,...,...,...
4779,1498,16,"[228.7143, 1550.0476, 1007.9998999999999, 578...."
4780,1498,7,"[1041.3334, 1721.7142, 141.0, 138.0]"
4781,1499,39,"[862.0274, 1331.25, 505.97260000000006, 612.5]"
4782,1499,14,"[966.0, 1996.0, 211.0, 336.0]"


In [5]:
# Join the annotations onto the image table via 'image_id'.
# The left join keeps every image row, including images without any annotation.
df_merge = df_images.merge(df_annot, how='left', on='image_id')
df_merge.head(5)

Unnamed: 0,image_id,width,height,file_name,category_id,bbox
0,0,1537,2049,batch_1/000006.jpg,6,"[517.0, 127.0, 447.0, 1322.0]"
1,1,1537,2049,batch_1/000008.jpg,18,"[1.0, 457.0, 1429.0, 1519.0]"
2,1,1537,2049,batch_1/000008.jpg,14,"[531.0, 292.0, 1006.0, 672.0]"
3,2,1537,2049,batch_1/000010.jpg,5,"[632.0, 987.0, 500.0, 374.0]"
4,2,1537,2049,batch_1/000010.jpg,7,"[632.0, 989.0, 44.0, 51.0]"


In [15]:
# Category table built from the 'categories' section of the annotations file;
# its 'id' key is exposed as 'category_id' so it can be joined with the annotations.
df_cat = pd.DataFrame(main_data['categories']).rename(columns={'id': 'category_id'})
df_cat.head(5)

Unnamed: 0,supercategory,category_id,name
0,Aluminium foil,0,Aluminium foil
1,Battery,1,Battery
2,Blister pack,2,Aluminium blister pack
3,Blister pack,3,Carded blister pack
4,Bottle,4,Other plastic bottle


In [7]:
# Encode every supercategory name as an integer and store it in a new
# 'supercategory_id' column: the encoder is fitted first, then applied to the same column.
label_encoder = LabelEncoder().fit(df_cat['supercategory'])
df_cat['supercategory_id'] = label_encoder.transform(df_cat['supercategory'])
df_cat.head(5)

Unnamed: 0,supercategory,category_id,name,supercategory_id
0,Aluminium foil,0,Aluminium foil,0
1,Battery,1,Battery,1
2,Blister pack,2,Aluminium blister pack,2
3,Blister pack,3,Carded blister pack,2
4,Bottle,4,Other plastic bottle,3


In [19]:
# Attach the category information to every annotation row via 'category_id',
# then count the annotations per category name.
df_final = df_merge.merge(df_cat, how='left', on='category_id')
df_final['name'].value_counts(sort=True, ascending=False)

Cigarette                    667
Unlabeled litter             517
Plastic film                 451
Clear plastic bottle         285
Other plastic                273
Other plastic wrapper        260
Drink can                    229
Plastic bottle cap           209
Plastic straw                157
Broken glass                 138
Styrofoam piece              112
Disposable plastic cup       104
Glass bottle                 104
Pop tab                       99
Other carton                  93
Normal paper                  82
Metal bottle cap              80
Plastic lid                   77
Paper cup                     67
Corrugated carton             64
Aluminium foil                62
Single-use carrier bag        61
Other plastic bottle          50
Drink carton                  45
Tissues                       42
Crisp packet                  39
Disposable food container     38
Plastic utensils              37
Food Can                      34
Garbage bag                   31
Meal carto

In [None]:
# Number of annotations per supercategory, most frequent first
df_final['supercategory'].value_counts(sort=True, ascending=False)

Plastic bag & wrapper    850
Cigarette                667
Unlabeled litter         517
Bottle                   439
Bottle cap               289
Can                      273
Other plastic            273
Carton                   251
Cup                      192
Straw                    161
Paper                    148
Broken glass             138
Styrofoam piece          112
Pop tab                   99
Lid                       87
Plastic container         72
Aluminium foil            62
Plastic utensils          37
Rope & strings            29
Paper bag                 27
Scrap metal               20
Food waste                 8
Shoe                       7
Squeezable tube            7
Blister pack               7
Glass jar                  6
Plastic glooves            4
Battery                    2
Name: supercategory, dtype: int64

In [20]:
# Drop the fine-grained category columns; the label files below only use the supercategory.
# errors='ignore' keeps this cell re-runnable: df_final is reassigned to its own trimmed
# version, so without it a second execution raises a KeyError on the missing columns.
df_final = df_final.drop(columns=['category_id', 'name'], errors='ignore')
df_final

Unnamed: 0,image_id,width,height,file_name,bbox,supercategory
0,0,1537,2049,batch_1/000006.jpg,"[517.0, 127.0, 447.0, 1322.0]",Bottle
1,1,1537,2049,batch_1/000008.jpg,"[1.0, 457.0, 1429.0, 1519.0]",Carton
2,1,1537,2049,batch_1/000008.jpg,"[531.0, 292.0, 1006.0, 672.0]",Carton
3,2,1537,2049,batch_1/000010.jpg,"[632.0, 987.0, 500.0, 374.0]",Bottle
4,2,1537,2049,batch_1/000010.jpg,"[632.0, 989.0, 44.0, 51.0]",Bottle cap
...,...,...,...,...,...,...
4779,1498,1824,4000,batch_9/000098.jpg,"[228.7143, 1550.0476, 1007.9998999999999, 578....",Carton
4780,1498,1824,4000,batch_9/000098.jpg,"[1041.3334, 1721.7142, 141.0, 138.0]",Bottle cap
4781,1499,1824,4000,batch_9/000099.jpg,"[862.0274, 1331.25, 505.97260000000006, 612.5]",Plastic bag & wrapper
4782,1499,1824,4000,batch_9/000099.jpg,"[966.0, 1996.0, 211.0, 336.0]",Carton


In [10]:
# Write one label file (.txt) per image. Each line holds the supercategory id followed by
# the bounding box as centre x, centre y, width and height, all normalised by the image size.
# Grouping by image_id covers every image present in df_final (no hard-coded image count)
# and avoids re-filtering the whole frame once per image.
for _, image_rows in df_final.groupby('image_id'):
    # The label file sits next to the image and only swaps the extension. with_suffix()
    # works for any extension, so an image that is not .jpg/.JPG is never opened for
    # writing (and truncated) in place of its label file.
    label_path = Path(image_rows['file_name'].iloc[0]).with_suffix('.txt')

    label_lines = []
    for _, row in image_rows.iterrows():
        bbox = row['bbox']
        # The left merge leaves NaN instead of a box for images without annotations:
        # such rows produce no label line, so the image gets an empty label file.
        if not isinstance(bbox, (list, tuple)):
            continue

        # int() keeps the id free of a trailing ".0" if the merge upcast the column to float
        category_id = int(row['supercategory_id'])

        # bbox is read as [x_top_left, y_top_left, width, height], truncated to whole pixels.
        # NOTE(review): int() drops sub-pixel precision - confirm this is intended.
        b_x_top_left, b_y_top_left, b_width, b_height = (int(value) for value in bbox[:4])
        image_width = int(row['width'])
        image_height = int(row['height'])

        # Centre of the box, normalised by the dimensions of the image
        b_x_center = (b_x_top_left + b_width / 2) / image_width
        b_y_center = (b_y_top_left + b_height / 2) / image_height

        # Box size, normalised the same way
        b_width /= image_width
        b_height /= image_height

        label_lines.append(f"{category_id} {b_x_center} {b_y_center} {b_width} {b_height}")

    # The context manager closes the file; no explicit close() is needed
    with label_path.open(mode="w") as f:
        f.writelines(line + "\n" for line in label_lines)

In [12]:
# Move and rename the images and their labels into the split folders.
# First step: load the train / validation / test split annotation files.
# Each file is closed as soon as it has been parsed (the handles were previously left open).
with open('annotations_0_train.json', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

with open('annotations_0_val.json', encoding='utf-8') as val_file:
    val_data = json.load(val_file)

with open('annotations_0_test.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)

def move_to_folder(dataset, folder, data_dir="/Users/loyk/Desktop/E2/data"):
    """Move the images listed in ``dataset`` and their label files into a split folder.

    Every image ``<data_dir>/<file_name>`` is moved to
    ``<data_dir>/split_data/<folder>/images/<n>.jpg`` and its label file (same path with a
    ``.txt`` extension) to ``<data_dir>/split_data/<folder>/labels/<n>.txt``, where ``n`` is
    the 0-based position of the image in ``dataset['images']``. The image is always renamed
    with a ``.jpg`` extension, whatever the case of the source extension. The number of
    moved images is printed at the end.

    The working directory of the process is not modified: all paths are built from
    ``data_dir`` instead of relying on ``os.chdir``.

    Parameters
    ----------
    dataset : dict
        Annotation dictionary whose ``'images'`` entry is a list of records, each with a
        ``'file_name'`` key relative to ``data_dir`` (e.g. ``'batch_1/000006.jpg'``).
    folder : str
        Name of the split sub-folder under ``split_data`` (e.g. ``'train_data'``).
    data_dir : str or Path, optional
        Root directory holding the batch folders and ``split_data``. Defaults to the
        location that used to be hard-coded in this function.

    Raises
    ------
    FileNotFoundError
        If an image or its label file does not exist. Both are checked before either is
        moved, so a missing label never leaves its image moved on its own.
    """
    data_dir = Path(data_dir)
    images_dir = data_dir / "split_data" / folder / "images"
    labels_dir = data_dir / "split_data" / folder / "labels"

    # Create the destination folders on first use instead of failing on the first rename
    images_dir.mkdir(parents=True, exist_ok=True)
    labels_dir.mkdir(parents=True, exist_ok=True)

    count = 0
    for image in dataset['images']:
        image_path = data_dir / image['file_name']
        label_path = image_path.with_suffix('.txt')

        # Validate the pair up front so image and label are always moved together
        for source in (image_path, label_path):
            if not source.is_file():
                raise FileNotFoundError(f"Missing file for split '{folder}': {source}")

        image_path.rename(images_dir / f"{count}.jpg")
        label_path.rename(labels_dir / f"{count}.txt")
        count += 1

    print(count)

In [13]:
# Move each split into its own folder, in the order train, validation, test
for split_data, split_folder in (
    (train_data, 'train_data'),
    (val_data, 'val_data'),
    (test_data, 'test_data'),
):
    move_to_folder(split_data, split_folder)

1200
150
150


In [12]:
# Number of distinct supercategories
len(pd.unique(df_cat['supercategory']))

28