In [None]:
!pip install ultralytics
!pip install -qU python-gdcm pydicom pylibjpeg

In [None]:
from google.colab import files
files.upload()

!pip install kaggle

# make kaggle directory
!mkdir ~/.kaggle

# change token's directory
!mv kaggle.json ~/.kaggle/

# change the permissions of the file.
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c rsna-pneumonia-detection-challenge

# make data directory
!mkdir /content/data

# moving to the directory
!mv rsna-pneumonia-detection-challenge.zip /content/data

# unzip data
!unzip -q /content/data/rsna-pneumonia-detection-challenge.zip -d /content/data

# delete token
!rm /root/.kaggle/kaggle.json

In [5]:
import os
import sys

import numpy as np
import pandas as pd
import cv2
import glob
import gdcm
import yaml
import torch
import pydicom
import zipfile
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from joblib import Parallel, delayed
from skimage.io import imread, imshow, imread_collection, concatenate_images
from skimage.transform import resize
from skimage.morphology import label
from sklearn.model_selection import train_test_split

In [None]:
# Output layout for the converted images, plus the conversion settings.
DATASET_FOLDER = "./dataset/"
TRAIN_SAVE_FOLDER = "./dataset/images/"
TEST_SAVE_FOLDER = "./dataset/test/"
SIZE = 640  # side length, in pixels, of the exported square images
EXTENSION = "png"  # file extension of the exported images

# Create the output directories; ones that already exist are left untouched.
for _folder in (DATASET_FOLDER, TRAIN_SAVE_FOLDER, TEST_SAVE_FOLDER):
    os.makedirs(_folder, exist_ok=True)

In [None]:
# Convert a single DICOM file into a resized 8-bit image file.
def process(f, size=640, save_folder="", extension="png"):
    """Read the DICOM at ``f``, rescale it to 0-255 and save it as an image.

    Args:
        f: Path to the DICOM file to convert.
        size: Side length, in pixels, of the square output image.
        save_folder: Directory the image is written to ("" means the current
            working directory).
        extension: Output file extension without the leading dot.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the image was not written.
    """
    # Output name = input file name without directory or extension.
    # os.path handles the platform's separators and any extension length.
    image = os.path.splitext(os.path.basename(f))[0]

    dicom = pydicom.dcmread(f)
    # Work in float so that max - min cannot overflow an integer pixel dtype.
    img = dicom.pixel_array.astype(np.float64)

    lo, hi = img.min(), img.max()
    if hi > lo:
        img = (img - lo) / (hi - lo)
    else:
        # Constant image: min-max scaling would divide by zero and produce NaN.
        img = np.zeros_like(img)

    # MONOCHROME1 is inverted here so both photometric interpretations
    # end up with the same polarity in the output.
    if dicom.PhotometricInterpretation == "MONOCHROME1":
        img = 1 - img

    img = cv2.resize(img, (size, size))

    out_path = os.path.join(save_folder, f"{image}.{extension}")
    # cv2.imwrite signals failure through its return value; surface it instead
    # of silently leaving an image missing from the dataset.
    if not cv2.imwrite(out_path, (img * 255).astype(np.uint8)):
        raise OSError(f"could not write image: {out_path}")

In [None]:
# Collect the DICOM paths of the train and test splits (same pattern, differing
# only in the split name), then show how many files each split contains.
train_images, test_images = (
    glob.glob(f'/content/data/stage_2_{split}_images/*.dcm')
    for split in ('train', 'test')
)

len(train_images), len(test_images)

In [None]:
# Convert every training DICOM into an image file, using 4 parallel jobs.
train_jobs = (
    delayed(process)(
        dcm_path,
        size=SIZE,
        save_folder=TRAIN_SAVE_FOLDER,
        extension=EXTENSION,
    )
    for dcm_path in tqdm(train_images[:])
)
_ = Parallel(n_jobs=4)(train_jobs)

In [None]:
# Convert every test DICOM into an image file, using 4 parallel jobs.
test_jobs = (
    delayed(process)(
        dcm_path,
        size=SIZE,
        save_folder=TEST_SAVE_FOLDER,
        extension=EXTENSION,
    )
    for dcm_path in tqdm(test_images[:])
)
_ = Parallel(n_jobs=4)(test_jobs)

In [6]:
# Load the competition's training labels (one row per annotation) and display them.
labels_csv = '/content/data/stage_2_train_labels.csv'
train = pd.read_csv(labels_csv)
train

Unnamed: 0,patientId,x,y,width,height,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1
...,...,...,...,...,...,...
30222,c1ec14ff-f6d7-4b38-b0cb-fe07041cbdc8,185.0,298.0,228.0,379.0,1
30223,c1edf42b-5958-47ff-a1e7-4f23d99583ba,,,,,0
30224,c1f6b555-2eb1-4231-98f6-50a963976431,,,,,0
30225,c1f7889a-9ea9-4acb-b64c-b737c929599a,570.0,393.0,261.0,345.0,1


In [7]:
# Add to the train frame the path of the converted PNG for each row.
image_root = '/content/dataset/images'
paths = [
  os.path.join(image_root, patient_id) + '.png'
  for patient_id in tqdm(train['patientId'])
]

train['path'] = paths

  0%|          | 0/30227 [00:00<?, ?it/s]

In [15]:
# Display the labels frame, which now carries the added `path` column.
train

Unnamed: 0,patientId,x,y,width,height,Target,path
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0,./dataset/train\0004cfab-14fd-4e49-80ba-63a80b...
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0,./dataset/train\00313ee0-9eaa-42f4-b0ab-c148ed...
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0,./dataset/train\00322d4d-1c29-4943-afc9-b6754b...
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0,./dataset/train\003d8fa0-6bf1-40ed-b54c-ac657f...
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1,./dataset/train\00436515-870c-4b36-a041-de9104...
...,...,...,...,...,...,...,...
30222,c1ec14ff-f6d7-4b38-b0cb-fe07041cbdc8,185.0,298.0,228.0,379.0,1,./dataset/train\c1ec14ff-f6d7-4b38-b0cb-fe0704...
30223,c1edf42b-5958-47ff-a1e7-4f23d99583ba,,,,,0,./dataset/train\c1edf42b-5958-47ff-a1e7-4f23d9...
30224,c1f6b555-2eb1-4231-98f6-50a963976431,,,,,0,./dataset/train\c1f6b555-2eb1-4231-98f6-50a963...
30225,c1f7889a-9ea9-4acb-b64c-b737c929599a,570.0,393.0,261.0,345.0,1,./dataset/train\c1f7889a-9ea9-4acb-b64c-b737c9...


In [9]:
# Given the labels DataFrame, write one YOLO-format .txt label file per patient.

def get_YOLO_txt(df, label_dir='/content/dataset/labels', img_size=1024):
  """Write YOLO label files (``<patientId>.txt``) for every patient in ``df``.

  Each file is written once, in write mode, with all of that patient's boxes,
  so calling the function again replaces the files instead of appending
  duplicate lines to them.

  Args:
    df: DataFrame with columns ``patientId``, ``x``, ``y``, ``width``,
      ``height`` and ``Target``. ``x``/``y`` are treated as the box's minimum
      corner, in pixels.
    label_dir: Directory the label files are written to; created if missing.
    img_size: Pixel size used to normalise the box coordinates. The default
      keeps the previously hard-coded 1024 (presumably the side length of the
      source images -- confirm against the data).

  Output line format: ``0 x_center y_center width height`` (class 0, values
  divided by ``img_size``). Patients whose rows all have ``Target == 0`` get
  an empty file.
  """
  os.makedirs(label_dir, exist_ok=True)
  scale = 1 / img_size
  box_cols = ['x', 'y', 'width', 'height']

  # sort=True visits patients in patientId order; groupby keeps the original
  # row order inside each patient, so box order within a file is deterministic.
  by_patient = df.groupby('patientId', sort=True)

  for patient_id, rows in tqdm(by_patient, total=by_patient.ngroups):
    # Only Target != 0 rows carry a box; drop incomplete boxes so that 'nan'
    # is never written into a label file.
    boxes = rows.loc[rows['Target'] != 0, box_cols].dropna()

    lines = []
    for x_min, y_min, w, h in boxes.itertuples(index=False, name=None):
      x_center = (x_min + w/2) * scale
      y_center = (y_min + h/2) * scale
      values = [float(v) for v in (x_center, y_center, w * scale, h * scale)]
      lines.append('0' + ' ' + ' '.join(map(str, values)))

    with open(os.path.join(label_dir, patient_id + '.txt'), 'w') as f:
      f.write(''.join(line + '\n' for line in lines))

In [10]:
# Fill the `labels` directory of the YOLO dataset.

# exist_ok=True: unlike a plain `mkdir`, this does not fail when the cell is
# run again and the directory is already there.
os.makedirs('/content/dataset/labels', exist_ok=True)
get_YOLO_txt(train)

  0%|          | 0/30227 [00:00<?, ?it/s]

In [22]:
# Split the converted images into train / validation lists (90% / 10%) and
# write one image path per line to train.txt and val.txt.

# glob returns files in arbitrary, filesystem-dependent order. Sorting first
# makes the shuffled split below reproducible for the fixed random_state.
images_paths = sorted(glob.glob('/content/dataset/images/*.png'))

train_path, valid_path = train_test_split(images_paths,
                                          test_size=0.1,
                                          random_state=777,
                                          shuffle=True)

with open('/content/dataset/train.txt', 'w') as f:
  f.write('\n'.join(train_path) + '\n')

with open('/content/dataset/val.txt', 'w') as f:
  f.write('\n'.join(valid_path) + '\n')

In [23]:
# Paths of the split lists to reference, and of the data.yaml file to create.
train_file = '/content/dataset/train.txt'
val_file = '/content/dataset/val.txt'
data_file = '/content/dataset/data.yaml'

# List of class names.
classes = ['pneumonia']

# Contents of data.yaml.
data = {
    'train': train_file,
    'val': val_file,
    'nc': len(classes),
    'names': classes,
}


def _quoted_str(dumper, value):
    """Represent every Python str as a double-quoted YAML scalar."""
    return dumper.represent_scalar('tag:yaml.org,2002:str', value, style='"')


# Registered on yaml's default dumper (original note: a setting for
# Korean-string support).
yaml.add_representer(str, _quoted_str)

with open(data_file, 'w', encoding='UTF-8') as f:
    yaml.dump(data, f, allow_unicode=True)

In [None]:
# Train a detection model through the `yolo` command-line interface,
# pointing it at the dataset config written to /content/dataset/data.yaml.
# Arguments passed: model definition yolov8m.yaml, 50 epochs, batch size 16,
# image size 640, patience 10, device 0, run name 'my_yolo'.
!yolo detect train data='/content/dataset/data.yaml' model='yolov8m.yaml' epochs=50 batch=16 imgsz=640 patience=10 save=True device=0 name='my_yolo'