In [1]:
import os, shutil
from keras.models import Model
from keras.optimizers import Adam
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Dropout, Flatten
import numpy as np
from PIL import Image
from collections import defaultdict



### 1. PREPARE DATASET

#### EDA

In [2]:
def get_image_sizes(folder_path, image_sizes = defaultdict(int)):

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with Image.open(file_path) as img:
                    width, height = img.size
                    size_str = f"{width}x{height}"
                    image_sizes[size_str] += 1
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    return image_sizes

In [3]:
covid_sizes = get_image_sizes('/kaggle/input/dlai3-hackathon-phase3-covid19-cxr-challenge/DLAI3_Phase3/COVID-19')
nonfind = get_image_sizes('/kaggle/input/dlai3-hackathon-phase3-covid19-cxr-challenge/DLAI3_Phase3/NOFINDING', covid_sizes)
thora = get_image_sizes('/kaggle/input/dlai3-hackathon-phase3-covid19-cxr-challenge/DLAI3_Phase3/THORAXDISEASE', nonfind)

In [4]:
top_3_size = sorted(thora.items(), key=lambda x: x[1], reverse=True)[:3]
top_3_size

[('1024x1024', 223), ('3050x2539', 103), ('2498x1971', 7)]

We can see that **1024x1024** size is the most popular shape for dataset. However, this ideal size is not approprate to our property so I decide to reduce it to **512x512** pixels to fit what we have.😢

#### Preprocessing

In [5]:
os.makedirs(os.path.join('/kaggle/working', 'Covid19_dataset'))

In [6]:
def copy_image_to_other(orig_folder, rate:list):
    # Parameters:
    # + orig_folder: original folder of dataset
    # + rate: list of ratio to split dataset into train/val/test
    des_path = '/kaggle/working/Covid19_dataset'
    if os.path.exists(os.path.join(des_path, 'train')) == False:
        os.makedirs(os.path.join(des_path, 'train'))
        os.makedirs(os.path.join(des_path, 'val'))
        os.makedirs(os.path.join(des_path, 'test'))
    
    # Split each category folder into train/val/test
    foldername = os.path.basename(orig_folder)
    folder_list = [train, val, test]
    img_list = os.listdir(orig_folder)
    flag = 0
    for i in range(3):
        point = flag + int(len(img_list)*rate[i])
        os.makedirs(os.path.join(des_path, folder_list[i], foldername))
        try:
            for img in img_list[flag:point+1]:
                src_img = os.path.join(orig_folder, img)
                des_img = os.path.join(des_path, folder_list[i],foldername, img)
                shutil.copyfile(src_img, des_img)
            flag = point
        except IndexError:
            for img in img_list[flag:point]:
                src_img = os.path.join(orig_folder, img)
                des_img = os.path.join(des_path, folder_list[i],foldername, img)
                shutil.copyfile(src_img, des_img)
            flag = point
        print(f'Copy image to {folder_list[i]} done!')

In [7]:
rate = [0.9, 0.05, 0.05]
covid = '/kaggle/input/dlai3-hackathon-phase3-covid19-cxr-challenge/DLAI3_Phase3/COVID-19'
copy_image_to_other(covid, rate)

Copy image to val done!
Copy image to test done!
Copy image to train done!


In [8]:
nonfiding = '/kaggle/input/dlai3-hackathon-phase3-covid19-cxr-challenge/DLAI3_Phase3/NOFINDING'
copy_image_to_other(nonfiding, rate)

Copy image to val done!
Copy image to test done!
Copy image to train done!


In [9]:
thora = '/kaggle/input/dlai3-hackathon-phase3-covid19-cxr-challenge/DLAI3_Phase3/THORAXDISEASE'
copy_image_to_other(thora, rate)

Copy image to val done!
Copy image to test done!
Copy image to train done!


In [10]:
generator = ImageDataGenerator(rescale = 1./255)

train = '/kaggle/working/Covid19_dataset/train'
val = '/kaggle/working/Covid19_dataset/val'
test = '/kaggle/working/Covid19_dataset/test'
val_generator = generator.flow_from_directory(train, target_size = (512, 512), class_mode='categorical')
train_generator = generator.flow_from_directory(val, target_size = (512, 512), class_mode='categorical')
test_generator = generator.flow_from_directory(test, target_size = (512, 512), class_mode='categorical')

Found 277 images belonging to 3 classes.
Found 4958 images belonging to 3 classes.
Found 277 images belonging to 3 classes.


### 2. DEFINE MODEL

In [11]:
vgg = VGG16(weights='imagenet', include_top=False, input_shape=(512, 512, 3))

# Freeze all layers in the base model
for layer in vgg.layers:
    layer.trainable = False

# Add custom classification layers
x = Flatten()(vgg.output)
x = Dense(1024, activation='relu')(x)
x = Dense(256, activation='relu')(x)
x = Dense(3, activation='softmax')(x)

model = Model(inputs=vgg.input, outputs=x)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [12]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [13]:
model.fit(train_generator,
          batch_size = 64,
          validation_data=val_generator, 
          epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f0800fd7e80>