In [27]:
import os
import numpy as np

import pandas as pd

# Notebook display settings: allow long cell contents (up to 250 characters)
# and never truncate the list of columns when a DataFrame is rendered.
pd.options.display.max_colwidth = 250
pd.options.display.max_columns = None

## Cohort Creation

### Cohort Conditions

In [28]:
# ---------------------------------------------------------------------------
# Cohort configuration: every tunable used by the filtering cells lives here.
# ---------------------------------------------------------------------------

# Folder that holds the input CSV and receives the cohort CSV.
working_directory = '/mnt/datasets/fastcovnet/'

resolution = '224'

# Target column. Rows where it is NULL are removed during filtering.
columnToPredict = 'Case_type'

# Day-offset windows (relative to the positive test) used to keep and label images:
#   * offsets within [min_days_case_since_positive, max_days_case_since_positive] are kept as cases;
#   * offsets <= max_days_control_since_positive (or missing) are kept as controls;
#   * anything in between is ambiguous and dropped when filterDaysSincePositive is on.
min_days_case_since_positive = -2
max_days_case_since_positive = 14
max_days_control_since_positive = -60

# Feature switches for the filtering steps (1 = enabled, 0 = disabled).
patientOnlyInOneCase = 0              # keep a patient in a single Case_type, with preference for "Case"
keepOneImage_PatientViewPosition = 0  # keep one image per (PatientId, ViewPosition)
keepOnlyBurnedInAnnotationNO = 0      # restrict to the BurnedInAnnotation values listed below
keepOnlyConsistentData = 1            # drop rows whose StudyDate disagrees with StudyDateTime
filterDaysSincePositive = 1           # apply the day-offset windows above
filterByModalities = 0
filterByBodyPartExamined = 1
filterByViewPosition = 1
filterByManufacturer = 1
filterByManufacturerModelName = 1
filterByDeviceSerialNumber = 0

# Accepted values for each filter (only used when the matching switch is on).

# To be safe, we start by only selecting BurnedInAnnotation = NO.
# Looks like all of the main ones have NO.
BurnedInAnnotation = ["NO"]

Modalities = ["DX"]

# Spelling variants of "chest"/"thorax" to accept.
BodyPartExamined = ["CHEST", "TORAX", "THORAX", "TóRAX"]

# Only "PA" is active; "LL" and "AP" are currently excluded.
ViewPosition = ["PA"]

Manufacturer = ["Philips Medical Systems"]

ManufacturerModelName = ["DigitalDiagnost"]

DeviceSerialNumber = ["****"]  # (...)

# Seed used for the reproducible shuffles.
random_seed = 1

In [29]:
# Load the full pipe-separated image metadata table from the working directory.
# The separator is passed by keyword: positional `sep` is deprecated in
# pandas 1.x and raises a TypeError from pandas 2.0 onwards.
full_dataset = pd.read_csv(os.path.join(working_directory, 'full_covid_dataset.csv'), sep="|")
# Alternative input file:
# full_dataset = pd.read_csv(os.path.join(working_directory, 'full_covid_dataset_inferenceVP.csv'), sep="|")

### Data filtering

In [30]:
# Apply the configured cohort filters one after another, printing the number of
# images left after each enabled step, and label every image as Case or Control.
# Each step builds a new frame, so `full_dataset` itself is never modified.
print(f"Number of initial images: {full_dataset.shape[0]}")

data = full_dataset

# Remove images where we can't be sure in which category they'll go because
# their day offset falls between the control window and the case window.
# Rows with a missing day offset are kept (they end up as controls below).
if filterDaysSincePositive:
    days = data.Dies_desde_positiu
    in_control_window = (days <= max_days_control_since_positive) | days.isna()
    in_case_window = (days >= min_days_case_since_positive) & (days <= max_days_case_since_positive)
    data = data[in_control_window | in_case_window]
    print(f"Number of images after removing RX done in days where we can't be sure whether it's Case or Control: {data.shape[0]}")

# Remove images where the DICOM date doesn't match the SIMDCAT database date.
# The date is rebuilt from characters 0-3, 5-6 and 8-9 of StudyDateTime
# (presumably a "YYYY-MM-DD..." string - confirm) and compared to StudyDate as an integer.
if keepOnlyConsistentData:
    study_datetime = data["StudyDateTime"]
    date_from_datetime = (study_datetime.str[:4] + study_datetime.str[5:7] + study_datetime.str[8:10]).astype(int)
    data = data[data.StudyDate == date_from_datetime]
    print(f"Number of images after removing cases where DICOM Date doesn't match SIMDCAT BBDD Date: {data.shape[0]}")

# Select only images with BurnedInAnnotation = NO
if keepOnlyBurnedInAnnotationNO:
    data = data[data.BurnedInAnnotation.isin(BurnedInAnnotation)]
    print(f"Number of images after keeping only BurnedInAnnotation = NO: {data.shape[0]}")

# Select only specific Body Parts
if filterByBodyPartExamined:
    data = data[data.BodyPartExamined.isin(BodyPartExamined)]
    print(f"Number of images after filtering by Body Part: {data.shape[0]}")

# Select only specific ViewPositions
if filterByViewPosition:
    data = data[data.ViewPosition.isin(ViewPosition)]
    print(f"Number of images after filtering by View Position: {data.shape[0]}")

# Select only specific Modalities
if filterByModalities:
    data = data[data.Modality.isin(Modalities)]
    print(f"Number of images after filtering by Modality: {data.shape[0]}")

# Select only specific Manufacturer
if filterByManufacturer:
    data = data[data.Manufacturer.isin(Manufacturer)]
    print(f"Number of images after filtering by Manufacturer: {data.shape[0]}")

# Select only specific ManufacturerModelName (should be superseded by the SerialNumber)
if filterByManufacturerModelName:
    data = data[data.ManufacturerModelName.isin(ManufacturerModelName)]
    print(f"Number of images after filtering by Manufacturer Model Name: {data.shape[0]}")

# Select only specific RX Devices by SerialNumber, just to make sure our images are similar
if filterByDeviceSerialNumber:
    data = data[data.DeviceSerialNumber.isin(DeviceSerialNumber)]
    print(f"Number of images after filtering by DeviceSN: {data.shape[0]}")

# Assign whether they're Cases or Controls: rows in the control window (or with
# a missing day offset) become "Control", everything else becomes "Case".
# `assign` returns a new frame, which avoids pandas' SettingWithCopyWarning on
# the filtered slice and keeps `full_dataset` untouched even when no filter ran.
# (np.select would be the choice if more than two labels were ever needed.)
is_control = (data.Dies_desde_positiu <= max_days_control_since_positive) | data.Dies_desde_positiu.isna()
data = data.assign(Case_type=np.where(is_control, "Control", "Case"))

# NULL removal of predicted variable
data = data[data[columnToPredict].notna()]
print(f"Number of images after removing cases where predicted variable is NULL: {data.shape[0]}")

# Prevent patients to be in different cases. COVID preference: for a patient
# that has at least one "Case" image, only the "Case" images are kept.
if patientOnlyInOneCase:
    case_patient_ids = data.PatientId[data.Case_type == "Case"]
    data = data[~data.PatientId.isin(case_patient_ids) | (data.Case_type == "Case")]
    print(f"Number of images after limiting patients to be in just one case_type: {data.shape[0]}")

# Keep just one image per Patient and ViewPosition (the first one encountered)
if keepOneImage_PatientViewPosition:
    data = data.drop_duplicates(subset=["PatientId", "ViewPosition"])
    print(f"Number of images after keeping just one image per Patient and ViewPosition: {data.shape[0]}")

Number of initial images: 293849
Number of images after removing RX done in days where we can't be sure whether it's Case or Control: 243758
Number of images after removing cases where DICOM Date doesn't match SIMDCAT BBDD Date: 242128
Number of images after filtering by Body Part: 207427
Number of images after filtering by View Position: 62119
Number of images after filtering by Manufacturer: 39629
Number of images after filtering by Manufacturer Model Name: 38063
Number of images after removing cases where predicted variable is NULL: 38063


### Data cleaning / manipulation

In [17]:
# Drop the columns that are no longer needed and, in the same chain, cast the
# target column to string so numeric labels cannot cause errors later on.
data = (
    data
    .drop(columns=["AFECTAT", "AFECTAT_PCR", "StudyDate", "BurnedInAnnotation"])
    .astype({columnToPredict: str})
)
# Optional rename, currently disabled:
# data = data.rename(columns = {'destination_path':'path'})

# Classes to predict: the sorted distinct labels and how many there are.
target_values = data[columnToPredict]
class_names = sorted(target_values.unique())
num_class = len(class_names)

# Cell output: number of images per class.
target_values.value_counts()

Control    31048
Case        7015
Name: Case_type, dtype: int64

In [26]:
# Reorder the cohort so that images of patients with a single RX sit at the
# bottom of the dataset. The intent is for that tail to feed validation/test,
# which guarantees those patients were not seen in the training portion.
# Both groups are shuffled here (with a fixed seed) so the later
# Train/Validation/Test split can be done without shuffling, which allows
# retraining without mixing the datasets.

# True for every image whose patient appears more than once (computed once, reused for both groups).
has_multiple_rx = data.duplicated(subset=["PatientId"], keep=False)

data_multiple_rx = data[has_multiple_rx].sample(frac=1, random_state=random_seed)
data_unique_rx = data[~has_multiple_rx].sample(frac=1, random_state=random_seed)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported equivalent.
data = pd.concat([data_multiple_rx, data_unique_rx], ignore_index=True)

print(f'Total: {data.shape[0]} | RX with patients in more than one RX: {data_multiple_rx.shape[0]} | RX with patients in just that RX: {data_unique_rx.shape[0]}')


Total: 38063 | RX with patients in more than one RX: 15157 | RX with patients in just that RX: 22906


In [23]:
# Persist the final cohort as a pipe-separated CSV (without the index column)
# in the working directory.
cohort_csv_path = os.path.join(working_directory, 'cohort_covid_20201110.csv')
data.to_csv(cohort_csv_path, sep='|', index=False)