In [2]:
import h5py
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import cv2
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
import glob
import SimpleITK as sitk

In [3]:
# read metadata
path = "/media/yesindeed/WD5T/data/COVID-CT-MD/"

demo_data = pd.read_csv(path + "Clinical-data.csv")
demo_data

Unnamed: 0,clinical_data,Diagnosis,Folder,Patient Gender,Patient Age,Weight,Clinical characteristics,Surgery,Follow-up,PCR
0,0,COVID-19,P001,M,039Y,80.0,"Dyspnea, Cough, Fever",,,
1,1,COVID-19,P002,F,048Y,75.0,"Dyspnea, Flu-like symptoms",No,Uneventful,
2,2,COVID-19,P003,M,068Y,90.0,Cough,,Hospitalized,
3,3,COVID-19,P004,M,065Y,75.0,"Dyspnea, fever",No,Uneventful,
4,4,COVID-19,P005,M,029Y,78.0,"Cough, Fatigue",No,Uneventful,
...,...,...,...,...,...,...,...,...,...,...
300,302,Normal,normal072,M,031Y,105.0,Dyspnea,,,
301,303,Normal,normal073,M,039Y,96.0,"Headache, Myalgia",Yes,,
302,304,Normal,normal074,M,040Y,97.0,Chest pain,Yes,,
303,305,Normal,normal075,M,056Y,92.0,Dyspnea,,,


In [3]:
demo_data["Patient Gender"].value_counts()

Patient Gender
M    183
F    122
Name: count, dtype: int64

In [40]:
# resize 3d volumes and save as .nii.gz

for i in range(len(demo_data)):
    item = demo_data.iloc[i]

    diagnosis = item["Diagnosis"]
    folder = item["Folder"]

    diagnosis = "Cap" if diagnosis == "CAP" else diagnosis

    reader = sitk.ImageSeriesReader()
    dicom_names = reader.GetGDCMSeriesFileNames(
        os.path.join(path, f"{diagnosis} Cases", folder))
    reader.SetFileNames(dicom_names)
    image = reader.Execute()

    array = sitk.GetArrayFromImage(image)
    dtype = array.dtype
    array = torch.from_numpy(array).float()

    if array.shape[1] != 512 or array.shape[2] != 512:
        array = F.interpolate(array.unsqueeze(
            0), size=(512, 512), mode="bicubic")

    # resize to 34 since we need to stack nearest two slices for fake RGB input
    # the actual input to NN is 32 x 512 x 512
    array = F.interpolate(array.unsqueeze(0).unsqueeze(0), size=(
        34, 512, 512), mode="trilinear").numpy().astype(dtype)
    image = sitk.GetImageFromArray(array)

    save_folder = os.path.join(path, "nifti", f"{diagnosis} Cases")
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    sitk.WriteImage(image, os.path.join(save_folder, f"{folder}.nii.gz"))

    # break

In [51]:
def split_82(all_meta, max_test_size_per_group):
    males = all_meta[all_meta["Patient Gender"] == "M"]
    females = all_meta[all_meta["Patient Gender"] == "F"]

    males_train, males_test = train_test_split(
        np.unique(males["Folder"]), test_size=max_test_size_per_group, random_state=0
    )
    females_train, females_test = train_test_split(
        np.unique(females["Folder"]), test_size=max_test_size_per_group, random_state=0
    )

    sub_train = np.concatenate([males_train, females_train])
    sub_test = np.concatenate([males_test, females_test])

    train_meta = all_meta[all_meta["Folder"].isin(sub_train)]
    test_meta = all_meta[all_meta["Folder"].isin(sub_test)]

    return train_meta, test_meta


unique_males = demo_data[demo_data["Patient Gender"] == "M"]["Folder"].nunique()
unique_females = demo_data[demo_data["Patient Gender"] == "F"]["Folder"].nunique()
max_test_size_per_group = min(int(0.2 * unique_females), int(0.2 * unique_males))

# sub_train_meta, sub_test_meta = split_82(demo_data, max_test_size_per_group)


sub_train, sub_test = split_82(demo_data, max_test_size_per_group)

sub_train.to_csv(os.path.join(path, "train.csv"), index=False)
sub_test.to_csv(os.path.join(path, "test.csv"), index=False)

In [46]:
sub_train

Unnamed: 0,clinical_data,Diagnosis,Folder,Patient Gender,Patient Age,Weight,Clinical characteristics,Surgery,Follow-up,PCR
0,0,COVID-19,P001,M,039Y,80.0,"Dyspnea, Cough, Fever",,,
1,1,COVID-19,P002,F,048Y,75.0,"Dyspnea, Flu-like symptoms",No,Uneventful,
2,2,COVID-19,P003,M,068Y,90.0,Cough,,Hospitalized,
3,3,COVID-19,P004,M,065Y,75.0,"Dyspnea, fever",No,Uneventful,
4,4,COVID-19,P005,M,029Y,78.0,"Cough, Fatigue",No,Uneventful,
...,...,...,...,...,...,...,...,...,...,...
298,300,Normal,normal070,M,060Y,77.0,Dyspnea,No,,
301,303,Normal,normal073,M,039Y,96.0,"Headache, Myalgia",Yes,,
302,304,Normal,normal074,M,040Y,97.0,Chest pain,Yes,,
303,305,Normal,normal075,M,056Y,92.0,Dyspnea,,,


In [47]:
sub_test

Unnamed: 0,clinical_data,Diagnosis,Folder,Patient Gender,Patient Age,Weight,Clinical characteristics,Surgery,Follow-up,PCR
6,6,COVID-19,P007,F,060Y,70.0,Dyspnea,,,
8,8,COVID-19,P009,M,046Y,75.0,Cough,,Uneventful,
12,12,COVID-19,P013,M,059Y,82.0,Cough,No,,
17,17,COVID-19,P018,F,066Y,75.0,Cough,,,
19,19,COVID-19,P020,F,047Y,78.0,COVID-19 suspected,No,,
23,23,COVID-19,P024,F,036Y,75.0,Dyspnea,No,Uneventful,
33,33,COVID-19,P034,M,070Y,95.0,"Cough, Fever",No,Uneventful,
44,45,COVID-19,P045,F,033Y,70.0,Cough,No,,
52,53,COVID-19,P053,M,034Y,65.0,Follow-up,,,
58,59,COVID-19,P059,M,054Y,84.0,"Dyspnea, Cough",No,Uneventful,


In [4]:
df_test = pd.read_csv(os.path.join(path, "test.csv"))

df_test = df_test[~df_test["Patient Age"].isnull()]

df_test["Patient Age"] = df_test["Patient Age"].apply(lambda x: x[1:3])

df_test["age_binary"] = df_test["Patient Age"].values.astype("float")
df_test["age_binary"] = np.where(df_test["age_binary"].between(-1, 60), 0, df_test["age_binary"])
df_test["age_binary"] = np.where(df_test["age_binary"] >= 60, 1, df_test["age_binary"])

class_counts = df_test["age_binary"].value_counts()
print(class_counts)
min_count = class_counts.min()
balanced_test_meta = df_test.groupby("age_binary").apply(lambda x: x.sample(min_count)).reset_index(drop=True)


balanced_test_meta.to_csv(os.path.join(path, "test_age.csv"), index=False)

balanced_test_meta["age_binary"].value_counts()

age_binary
0.0    37
1.0    11
Name: count, dtype: int64


  balanced_test_meta = df_test.groupby("age_binary").apply(lambda x: x.sample(min_count)).reset_index(drop=True)


age_binary
0.0    11
1.0    11
Name: count, dtype: int64