In [2]:
# Install the gdcm conda package from a bundled archive, using conda's --offline mode.
!cp /kaggle/input/gdcm-conda-install/gdcm.tar .
# Unpack the archive into ./gdcm/
!tar -xvzf gdcm.tar
!conda install --offline ./gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2
# Remove the copied archive once the package has been installed.
!rm -rf ./gdcm.tar

gdcm/
gdcm/conda-4.8.4-py37hc8dfbb8_2.tar.bz2
gdcm/gdcm-2.8.9-py37h71b2a6d_0.tar.bz2
gdcm/libjpeg-turbo-2.0.3-h516909a_1.tar.bz2

Downloading and Extracting Packages
######################################################################## | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


# Imports

In [3]:
import os
import ast
import numpy as np
import pandas as pd
from path import Path
import datetime
import glob
import json
import shutil
import random
from PIL import Image
from tqdm.auto import tqdm
import pydicom
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2
import wandb
from sklearn.model_selection import train_test_split
from pydicom.pixel_data_handlers.util import apply_voi_lut


In [4]:
from kaggle_secrets import UserSecretsClient

# Fetch the W&B API key from the secret named "wandb-key" so it is never hardcoded in the notebook.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb-key") 
# Authenticate the wandb client with that key.
wandb.login(key=wandb_api,relogin=True)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
# Start the W&B run; `run` is used later in the notebook to log the generated
# COCO annotation files as an artifact.
run = wandb.init(project='Final-Covid19-Detection',
                 name='various_sizes_train_test_coco_file_generation',
                job_type='split_file_generation',
                notes='Generation of a single artifact containing all different sizes annotation files')

[34m[1mwandb[0m: Currently logged in as: [33malvaromoureupm[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Paths Definition

In [6]:
# Random seed; passed as `random_state` to train_test_split so the train/val split is reproducible.
SEED = 42

## Duplicate images without bounding boxes

In [7]:
# Load the image-level and the study-level label tables.
image_level_df = pd.read_csv('/kaggle/input/siim-covid19-detection/train_image_level.csv')
study_level_df = pd.read_csv('/kaggle/input/siim-covid19-detection/train_study_level.csv')

# Derive the join key by removing '_study' from the study-level id, then drop the raw id column.
study_level_df = (
    study_level_df
    .assign(StudyInstanceUID=[raw_id.replace('_study','') for raw_id in study_level_df['id']])
    .drop(columns='id')
)

# One row per image, carrying the labels of the study it belongs to.
train_df = image_level_df.merge(study_level_df, on='StudyInstanceUID')

In [13]:
# Select the image ids to keep for each study:
#   * a study with a single image keeps that image;
#   * a study with several images keeps only the images that have bounding boxes
#     (discarding those without bboxes);
#   * if none of its images has a bounding box, all of its images are kept.
image_ids = []
# Group once by study instead of re-filtering the whole frame for every study id.
for study_id, study_df in tqdm(train_df.groupby('StudyInstanceUID', sort=False)):
    if len(study_df) == 1:
        image_ids.extend(study_df['id'].values.tolist())
        continue

    # 'none 1 0 0 1 1' is the label value of an image without bounding boxes.
    rows = study_df[study_df['label'] != 'none 1 0 0 1 1']['id']
    if len(rows) >= 1:
        image_ids.extend(rows.values.tolist())
    else:
        image_ids.extend(study_df[study_df['label'] == 'none 1 0 0 1 1']['id'].values.tolist())


  0%|          | 0/6054 [00:00<?, ?it/s]

In [14]:
# Number of image ids kept by the duplicate filtering above.
len(image_ids) 

6117

# Resizing Images 

### Util Functions

In [8]:
from pydicom.pixel_data_handlers.util import apply_voi_lut

def read_xray(path, voi_lut = True, fix_monochrome = True):
    """Read a DICOM file and return its pixel data as an 8-bit array.

    Original from: https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        path: Path of the DICOM file to read.
        voi_lut: When True, the pixel data is passed through ``apply_voi_lut``;
            otherwise the raw ``pixel_array`` is used.
        fix_monochrome: When True, images whose ``PhotometricInterpretation`` is
            "MONOCHROME1" are inverted.

    Returns:
        ``np.uint8`` array shifted to start at 0 and rescaled so its maximum maps
        to 255. An image with a single constant value maps to all zeros.
    """
    # dcmread is the current name of the reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)
    data = apply_voi_lut(dicom.pixel_array, dicom) if voi_lut else dicom.pixel_array
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0; skip the division to avoid producing NaNs.
    if peak > 0:
        data = data / peak
    return (data * 255).astype(np.uint8)

def get_image_metadata(study_id, df):
    """Return the class label and the boxes of the row selected by ``study_id``.

    Args:
        study_id: Value matched against the ``"id"`` column of ``df`` (despite the
            parameter name, the lookup is on ``"id"``).
        df: DataFrame with the columns ``"id"``, ``"Negative for Pneumonia"``,
            ``"Typical Appearance"``, ``"Indeterminate Appearance"`` and ``"boxes"``.

    Returns:
        Tuple ``(label, bboxes)``: ``label`` is the name of the first of the three
        columns above whose value is 1, or ``'Atypical Appearance'`` when none is;
        ``bboxes`` is the list of ``"boxes"`` values of the selected rows.
    """
    data = df[df["id"] == study_id]

    # The label is the column name itself; columns are checked in priority order.
    label = 'Atypical Appearance'
    for column in ("Negative for Pneumonia", "Typical Appearance", "Indeterminate Appearance"):
        if data[column].values == 1:
            label = column
            break

    bboxes = list(data["boxes"].values)

    return label, bboxes

def get_box_cords(box):
    """Convert an ``x``/``y``/``width``/``height`` mapping into integer corners.

    Returns:
        Tuple ``(x1, y1, x2, y2)`` where ``x2 = x + width`` and ``y2 = y + height``,
        each truncated with ``int()``.
    """
    left, top = box['x'], box['y']
    right = box['x'] + box['width']
    bottom = box['y'] + box['height']
    return tuple(int(coord) for coord in (left, top, right, bottom))

def scale_bbox(img, bboxes,img_size):
    """Rescale corner-format boxes from ``img``'s resolution to ``img_size``.

    Args:
        img: Array whose ``shape[0]`` / ``shape[1]`` give the source resolution.
        bboxes: Iterable of ``[xmin, ymin, xmax, ymax]`` boxes.
        img_size: Target size; index 0 is paired with ``img.shape[0]`` and index 1
            with ``img.shape[1]``.

    Returns:
        List of ``[xmin, ymin, xmax, ymax]`` lists with integer coordinates
        (each value is rounded to 4 decimals and then truncated by ``int``).
    """
    # x coordinates are divided by the axis-1 factor, y coordinates by the axis-0 factor.
    axis0_factor = img.shape[0] / img_size[0]
    axis1_factor = img.shape[1] / img_size[1]
    divisors = (axis1_factor, axis0_factor, axis1_factor, axis0_factor)

    return [
        [int(np.round(bbox[pos] / divisors[pos], 4)) for pos in range(4)]
        for bbox in bboxes
    ]
    

In [9]:
# Folder used by the cells below for the resized images and the per-size metadata CSVs.
# NOTE(review): this lives under /kaggle/input, which is presumably read-only on Kaggle —
# confirm that makedirs and the later writes into this folder can succeed.
images_folder = '/kaggle/input/siim-covid19-images-metadata-256-512-768/images_metadata_256_512_768'
os.makedirs(images_folder, exist_ok=True)

# Define sizes
# Each 2-tuple is passed to cv2.resize and used to build the per-size file and folder names.
new_sizes = [(256,256),(512,512),(768,768)]

In [None]:
# Resize every DICOM of the split to each target size, save it as a PNG and write a
# per-size metadata CSV holding the ids and the ORIGINAL width/height of every image.
for new_size in new_sizes:
    print(new_size)
    # Use ['test'] instead to process the test split.
    for split in ['train']:
        save_dir = f'{images_folder}/{split}_{new_size[0]}x{new_size[1]}/'
        dcm_paths = glob.glob(f'/kaggle/input/siim-covid19-detection/{split}/*/*/*')
        os.makedirs(save_dir, exist_ok=True)

        # Rows are collected in a dedicated list so the notebook-level `image_ids`
        # (the de-duplicated ids consumed by the train/val split) is not overwritten.
        meta_records = []

        for path in tqdm(dcm_paths):
            xray = read_xray(path)
            # The image is resized straight to `new_size`.
            im = cv2.resize(xray,new_size)

            # The last three path components are <study_id>/<folder_id>/<file>.dcm
            path_split = path.split('/')
            study_id = path_split[-3]
            folder_id = path_split[-2]
            image_name = path_split[-1].replace('.dcm', '_image')

            cv2.imwrite(os.path.join(save_dir, image_name+'.png'),im)

            meta_records.append({'id': image_name, 'folder_id': folder_id,
                                 'study_id': study_id, 'width': xray.shape[1],
                                 'height': xray.shape[0]})

        # Explicit columns keep the CSV header stable even when no file was found.
        df = pd.DataFrame(meta_records,
                          columns=['id', 'folder_id', 'study_id', 'width', 'height'])
        df.to_csv(f'{images_folder}/{split}_meta_{new_size[0]}x{new_size[1]}.csv', index=False)

: 

## Resizing Bounding Boxes (pipeline)

In [13]:
# For every target size, rescale each annotated box from the original image resolution
# to the resized resolution and save one row per box (xmin, ymin, xmax, ymax) to CSV.
BOX_COLUMNS = ['id', 'StudyInstanceUID', 'folder_id', 'study_id',
               'width', 'height', 'xmin', 'ymin', 'xmax', 'ymax']

for new_size in new_sizes:
    print(f"*****{new_size}*****\n")
    df_train_meta = pd.read_csv(f'{images_folder}/train_meta_{new_size[0]}x{new_size[1]}.csv')
    # Join the competition annotations with the original image dimensions.
    df_train = pd.read_csv("/kaggle/input/siim-covid19-detection/train_image_level.csv")
    df_train_meta = df_train.merge(df_train_meta, on='id')

    imagepaths = df_train_meta.id.unique()
    print("Number of Images with Covid_Abnormality:",len(imagepaths))

    display(df_train_meta.head(3))
    print()

    # Collect plain dicts and build the frame once; concatenating a one-row frame
    # per box is quadratic in the number of boxes.
    box_records = []

    for idx, row in tqdm(df_train_meta.iterrows(), total=df_train_meta.shape[0]):
        # Images without annotations have an empty `boxes` cell; they yield no rows.
        if pd.isna(row.boxes):
            continue

        height_ratio, width_ratio = (new_size[0]/row.height, new_size[1]/row.width)

        for box in ast.literal_eval(row.boxes):
            # Convert x/y/width/height to corner format before scaling.
            xmin, ymin = box['x'], box['y']
            xmax, ymax = box['x'] + box['width'], box['y'] + box['height']

            box_records.append({'id':row.id,
                                'StudyInstanceUID':row.StudyInstanceUID,
                                'folder_id':row.folder_id,
                                'study_id':row.study_id,
                                'width':row.width,
                                'height':row.height,
                                'xmin':round(xmin*width_ratio),
                                'ymin':round(ymin*height_ratio),
                                'xmax':round(xmax*width_ratio),
                                'ymax':round(ymax*height_ratio)})

    df_train_processed = pd.DataFrame(box_records, columns=BOX_COLUMNS)

    display(df_train_processed.head(3))
    print()
    df_train_processed.to_csv(f'{images_folder}/df_train_processed_meta_{new_size[0]}x{new_size[1]}.csv',
                              index=False)

*****(256, 256)*****

Number of Images with Covid_Abnormality: 4294


Unnamed: 0,id,boxes,label,StudyInstanceUID,folder_id,study_id,width,height
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,4d47bc042ee6,28dddc8559b2,4280,3520





  0%|          | 0/4294 [00:00<?, ?it/s]

Unnamed: 0,id,StudyInstanceUID,folder_id,study_id,width,height,xmin,ymin,xmax,ymax
0,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,47,43,109,183
1,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,135,43,201,173
2,0012ff7358bc_image,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544,57,20,129,121



*****(512, 512)*****

Number of Images with Covid_Abnormality: 4294


Unnamed: 0,id,boxes,label,StudyInstanceUID,folder_id,study_id,width,height
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,4d47bc042ee6,28dddc8559b2,4280,3520





  0%|          | 0/4294 [00:00<?, ?it/s]

Unnamed: 0,id,StudyInstanceUID,folder_id,study_id,width,height,xmin,ymin,xmax,ymax
0,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,95,85,218,367
1,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,270,87,402,345
2,0012ff7358bc_image,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544,113,40,259,241



*****(768, 768)*****

Number of Images with Covid_Abnormality: 4294


Unnamed: 0,id,boxes,label,StudyInstanceUID,folder_id,study_id,width,height
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,4d47bc042ee6,28dddc8559b2,4280,3520





  0%|          | 0/4294 [00:00<?, ?it/s]

Unnamed: 0,id,StudyInstanceUID,folder_id,study_id,width,height,xmin,ymin,xmax,ymax
0,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,142,128,328,550
1,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,405,130,603,518
2,0012ff7358bc_image,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544,170,60,388,362



*****(1024, 1024)*****

Number of Images with Covid_Abnormality: 4294


Unnamed: 0,id,boxes,label,StudyInstanceUID,folder_id,study_id,width,height
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,4d47bc042ee6,28dddc8559b2,4280,3520





  0%|          | 0/4294 [00:00<?, ?it/s]

Unnamed: 0,id,StudyInstanceUID,folder_id,study_id,width,height,xmin,ymin,xmax,ymax
0,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,190,171,437,734
1,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,540,174,804,691
2,0012ff7358bc_image,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544,227,80,518,482



*****(1280, 1280)*****

Number of Images with Covid_Abnormality: 4294


Unnamed: 0,id,boxes,label,StudyInstanceUID,folder_id,study_id,width,height
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488
2,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544
3,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,4d47bc042ee6,28dddc8559b2,4280,3520





  0%|          | 0/4294 [00:00<?, ?it/s]

Unnamed: 0,id,StudyInstanceUID,folder_id,study_id,width,height,xmin,ymin,xmax,ymax
0,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,237,214,546,917
1,000a312787f2_image,5776db0cec75,81456c9c5423,5776db0cec75,4256,3488,675,217,1005,863
2,0012ff7358bc_image,9d514ce429a7,22897cd1daa0,9d514ce429a7,3056,2544,284,100,647,603





# COCO Dataset Style Generation

In [11]:
# Empty COCO-style skeleton for the train/val files; `images` and `annotations`
# are filled in by the annotation-generation cell.
now = datetime.datetime.now()
data = {
    'info': {
        'description': 'SIIM Covid 19 Train',
        'url': None,
        'version': 1,
        'year': now.year,
        'contributor': None,
        'date_created': now.strftime('%Y-%m-%d %H:%M:%S.%f'),
    },
    'licenses': [{'url': None, 'id': 0, 'name': None}],
    'images': [],
    'type': 'instances',
    'annotations': [],
    # Single detection class.
    'categories': [{'id': 1, 'name': 'Covid_Opacity'}],
}
data

{'info': {'description': 'SIIM Covid 19 Train',
  'url': None,
  'version': 1,
  'year': 2022,
  'contributor': None,
  'date_created': '2022-10-01 18:41:43.073130'},
 'licenses': [{'url': None, 'id': 0, 'name': None}],
 'images': [],
 'type': 'instances',
 'annotations': [],
 'categories': [{'id': 1, 'name': 'Covid_Opacity'}]}

## Train/Val Splitting

In [15]:
# 90/10 train/validation split over the de-duplicated image ids.
# NOTE(review): the split is made per image row, not per study, so images of the same
# StudyInstanceUID can end up on both sides — confirm this is intended (the K-Fold
# section groups by study instead).
df = pd.read_csv("/kaggle/input/siim-covid19-detection/train_image_level.csv")
print(f'Original image level size: {len(df)}')
df = df[df['id'].isin(image_ids)]
print(f'Size of df after removing duplicates: {len(df)}')
train_df,val_df = train_test_split(df,train_size=0.9,random_state=SEED)
# .assign returns new frames, so tagging the split never writes into a slice of `df`.
train_df = train_df.assign(split='train')
val_df = val_df.assign(split='val')
df = pd.concat([train_df,val_df])
df['split'].value_counts()

Original image level size: 6334
Size of df after removing duplicates: 6117


train    5505
val       612
Name: split, dtype: int64

## Training/Validation Annotation File Generation

In [None]:
# Write one COCO-style train file and one val file per target size into /kaggle/working.
train_ids = train_df['id'].unique()
val_ids = val_df['id'].unique()
# Annotations must point at the category declared in the `data` template; a
# hard-coded id can silently disagree with it.
category_id = data['categories'][0]['id']

for size in new_sizes:
    print(f'Creating json annotation file for size {size}')
    size = size[0]
    print(f'Train images: {len(train_ids)} \n Validation images: {len(val_ids)}')
    df_annotations = pd.read_csv(f'{images_folder}/df_train_processed_meta_{size}x{size}.csv')
    # Index the boxes by image once instead of filtering the whole frame per image.
    boxes_by_image = {
        img_id: group[['xmin', 'ymin', 'xmax', 'ymax']].to_numpy()
        for img_id, group in df_annotations.groupby('id')
    }
    out_train_json = f'coco_train_{size}x{size}.json'
    out_val_json = f'coco_val_{size}x{size}.json'

    # Train and val files are built by the same code path.
    for split_ids, out_json in ((train_ids, out_train_json), (val_ids, out_val_json)):
        # Shallow copy of the template with fresh image/annotation lists.
        coco = data.copy()
        coco['images'] = []
        coco['annotations'] = []

        for i,img_id in enumerate(tqdm(split_ids)):
            coco['images'].append(
                dict(
                    license=0,
                    url=None,
                    file_name=img_id+'.png',
                    height=size,
                    width=size,
                    date_captured=None,
                    id=i
                )
            )
            # Images without boxes get an image entry but no annotations.
            for x1,y1,x2,y2 in boxes_by_image.get(img_id, ()):
                area = round((x2-x1)*(y2-y1),1)
                coco['annotations'].append(
                    dict(
                        id=len(coco['annotations']),
                        image_id=i,
                        category_id=category_id,
                        area=int(area),
                        # COCO bbox layout: [x, y, width, height]
                        bbox=[int(x1), int(y1), int(x2-x1), int(y2-y1)],
                        segmentation=[],
                        iscrowd=0)
                )

        with open(os.path.join('/kaggle/working',out_json),'w') as fp:
            json.dump(coco,fp,indent=4)

               

: 

### Wandb Upload of the split files

In [22]:
# Close the W&B run.
# NOTE(review): the K-Fold section further down calls run.log_artifact(...) on the same
# `run` object and then finishes again — confirm that finishing here does not break a
# top-to-bottom execution of the notebook.
wandb.finish()

VBox(children=(Label(value='18.360 MB of 18.360 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

## K-Fold

In [15]:
from sklearn.model_selection import GroupKFold, train_test_split

# Keep only the images that have at least one bounding box.
df_kfold = pd.read_csv("/kaggle/input/siim-covid19-detection/train_image_level.csv")
print("Shape before removing images without bboxes:", df_kfold.shape)
has_boxes = df_kfold['label'] != 'none 1 0 0 1 1'
df_kfold = df_kfold[has_boxes].reset_index(drop=True)
print("Shape after removing images without bboxes:", df_kfold.shape)

# Assign each image a fold number, grouping by study so that all images of one
# study fall into the same fold.
kfold = 5
group_kfold = GroupKFold(n_splits=kfold)
df_kfold = df_kfold.assign(fold=-1)
study_groups = df_kfold['StudyInstanceUID'].tolist()

for fold, (train_index, val_index) in enumerate(group_kfold.split(df_kfold, groups=study_groups)):
    # Rows that form the validation part of split `fold` are tagged with that number.
    df_kfold.loc[val_index, 'fold'] = fold

display(df_kfold.head(3))
df_kfold.to_csv("/kaggle/working/df_meta_kfold.csv")

Shape before removing images without bboxes: (6334, 4)
Shape after removing images without bboxes: (4294, 4)


Unnamed: 0,id,boxes,label,StudyInstanceUID,fold
0,000a312787f2_image,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,4
1,0012ff7358bc_image,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,2
2,001398f4ff4f_image,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,2


In [18]:
# Empty COCO-style skeleton for the per-fold files; `images` and `annotations`
# are filled in by the fold-generation cell.
now = datetime.datetime.now()
data = {
    'info': {
        'description': 'SIIM Covid-19 GroupKfold',
        'url': None,
        'version': 1,
        'year': now.year,
        'contributor': None,
        'date_created': now.strftime('%Y-%m-%d %H:%M:%S.%f'),
    },
    'licenses': [{'url': None, 'id': 0, 'name': None}],
    'images': [],
    'type': 'instances',
    'annotations': [],
    # Single detection class.
    'categories': [{'id': 0, 'name': 'Covid_Opacity'}],
}
data

{'info': {'description': 'SIIM Covid-19 GroupKfold',
  'url': None,
  'version': 1,
  'year': 2022,
  'contributor': None,
  'date_created': '2022-08-23 16:15:38.322615'},
 'licenses': [{'url': None, 'id': 0, 'name': None}],
 'images': [],
 'type': 'instances',
 'annotations': [],
 'categories': [{'id': 0, 'name': 'Covid_Opacity'}]}

In [19]:
# Write one COCO-style train file and one val file per fold into /kaggle/working.
SIZE = 512
K_FOLDS = 5

# The annotation table does not depend on the fold: read and index it once.
df_annotations = pd.read_csv(f'/kaggle/input/siim-covid-19-256-512-768-1024-1280/df_train_processed_meta_{SIZE}x{SIZE}.csv')
boxes_by_image = {
    img_id: group[['xmin', 'ymin', 'xmax', 'ymax']].to_numpy()
    for img_id, group in df_annotations.groupby('id')
}
# Annotations must point at the category declared in the `data` template.
category_id = data['categories'][0]['id']

for fold in range(K_FOLDS):
    # Fold `fold` is the validation set; every other fold is training data.
    train_ids = df_kfold[df_kfold['fold'] != fold].id.unique()
    val_ids = df_kfold[df_kfold['fold'] == fold].id.unique()
    print(f'Train images: {len(train_ids)} \n Validation images: {len(val_ids)}')
    out_train_json = f'coco_train_{SIZE}x{SIZE}_fold_{fold}.json'
    out_val_json = f'coco_val_{SIZE}x{SIZE}_fold_{fold}.json'

    # Train and val files are built by the same code path.
    for split_ids, out_json in ((train_ids, out_train_json), (val_ids, out_val_json)):
        # Shallow copy of the template with fresh image/annotation lists.
        coco = data.copy()
        coco['images'] = []
        coco['annotations'] = []

        for i,img_id in enumerate(tqdm(split_ids)):
            coco['images'].append(
                dict(
                    license=0,
                    url=None,
                    file_name=img_id+'.png',
                    height=SIZE,
                    width=SIZE,
                    date_captured=None,
                    id=i
                )
            )
            # Images without rows in the annotation table get no annotations.
            for x1,y1,x2,y2 in boxes_by_image.get(img_id, ()):
                area = round((x2-x1)*(y2-y1),1)
                coco['annotations'].append(
                    dict(
                        id=len(coco['annotations']),
                        image_id=i,
                        category_id=category_id,
                        area=int(area),
                        # COCO bbox layout: [x, y, width, height]
                        bbox=[int(x1), int(y1), int(x2-x1), int(y2-y1)],
                        segmentation=[],
                        iscrowd=0)
                )

        with open(os.path.join('/kaggle/working',out_json),'w') as fp:
            json.dump(coco,fp,indent=4)

Train images: 3435 
 Validation images: 859


  0%|          | 0/3435 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

Train images: 3435 
 Validation images: 859


  0%|          | 0/3435 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

Train images: 3435 
 Validation images: 859


  0%|          | 0/3435 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

Train images: 3435 
 Validation images: 859


  0%|          | 0/3435 [00:00<?, ?it/s]

  0%|          | 0/859 [00:00<?, ?it/s]

Train images: 3436 
 Validation images: 858


  0%|          | 0/3436 [00:00<?, ?it/s]

  0%|          | 0/858 [00:00<?, ?it/s]

In [20]:
# Bundle the train/val COCO file of every fold into one W&B dataset artifact
# and log it on the current run.
artifact = wandb.Artifact('5_fold_train_val_coco_files', type='dataset')
fold_files = [
    file_name
    for fold in range(K_FOLDS)
    for file_name in (f'coco_train_{SIZE}x{SIZE}_fold_{fold}.json',
                      f'coco_val_{SIZE}x{SIZE}_fold_{fold}.json')
]
for file_name in fold_files:
    artifact.add_file(os.path.join('/kaggle/working', file_name))
run.log_artifact(artifact)


<wandb.sdk.wandb_artifacts.Artifact at 0x7ff56d2fb410>

In [21]:
# Close the W&B run after the fold artifact has been logged.
wandb.finish()

VBox(children=(Label(value='16.279 MB of 16.279 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…