In [1]:
import os
import numpy as np

import pandas as pd

# Notebook display settings: allow up to 250 characters per cell and never
# hide columns when a DataFrame is rendered.
pd.options.display.max_colwidth = 250
pd.options.display.max_columns = None

## Cohort Creation

### Cohort Conditions

In [2]:
# ---------------------------------------------------------------------------
# Paths and general settings
# ---------------------------------------------------------------------------
working_directory = '/mnt/datasets/fastcovnet/'   # input CSV is read from here, cohort CSV is written here
resolution = '224'                                # not referenced by the cells visible in this notebook section

# Target column. Rows where it is NULL are removed during filtering, and its
# distinct values become the class names.
columnToPredict = 'ViewPosition'

# Seed for the dataset shuffle, so the row order is reproducible.
random_seed = 1

# ---------------------------------------------------------------------------
# Case / Control windows, compared (inclusive) against `Dies_desde_positiu`
# ---------------------------------------------------------------------------
min_days_case_since_positive, max_days_case_since_positive = -2, 14
max_days_control_since_positive = -60

# ---------------------------------------------------------------------------
# Filter switches: a truthy value enables the step, 0 skips it
# ---------------------------------------------------------------------------
patientOnlyInOneCase = 0
keepOneImage_PatientViewPosition = 0
keepOnlyBurnedInAnnotationNO = 0
keepOnlyConsistentData = 0
filterDaysSincePositive = 0
filterByModalities = 0
filterByBodyPartExamined = 1
filterByViewPosition = 1
filterByManufacturer = 0
filterByManufacturerModelName = 0
filterByDeviceSerialNumber = 0

# ---------------------------------------------------------------------------
# Allowed values for each filter (only used when its switch above is enabled)
# ---------------------------------------------------------------------------
BurnedInAnnotation = ["NO"]   # conservative start: keep only images flagged as having no burned-in annotation
Modalities = ["DX"]
BodyPartExamined = ["CHEST"]
ViewPosition = ["PA", "LL", "AP"]
Manufacturer = ["Philips Medical Systems"]
ManufacturerModelName = ["DigitalDiagnost"]
DeviceSerialNumber = ["****"]

In [3]:
# Load the complete pipe-delimited image metadata table.
# `sep` is passed by keyword: pandas deprecated positional arguments after the
# file path for read_csv and rejects them from 2.0 on.
full_dataset = pd.read_csv(os.path.join(working_directory, 'full_covid_dataset.csv'), sep="|")

### Data filtering

In [4]:
# Build the cohort by applying each enabled filter in turn and reporting the
# number of remaining images after every step.
print("Number of initial images: " + str(full_dataset.shape[0]))

# Every step below rebinds `data` to a new frame (boolean indexing / assign),
# so `full_dataset` itself is never modified by this cell.
data = full_dataset

# Remove images where we can't be sure in which category they'll go because they're a bit earlier than the PCR.
# Kept rows: Control window (<= max_days_control_since_positive), no positive date at all (NaN),
# or inside the Case window [min_days_case_since_positive, max_days_case_since_positive].
if filterDaysSincePositive:
    data = data[(data.Dies_desde_positiu<=max_days_control_since_positive) | (data.Dies_desde_positiu.isna()) | ((data.Dies_desde_positiu>=min_days_case_since_positive) & (data.Dies_desde_positiu<=max_days_case_since_positive))]
    print("Number of images after removing RX done in days where we can't be sure whether it's Case or Control: " + str(data.shape[0]))

# Remove images where DICOM Date doesn't match SIMDCAT BBDD Date.
# StudyDateTime is sliced as "YYYY-MM-DD..." and rebuilt as the integer YYYYMMDD to compare with StudyDate.
if keepOnlyConsistentData:
    data = data[data.StudyDate==(data["StudyDateTime"].str[:4] + data["StudyDateTime"].str[5:7] + data["StudyDateTime"].str[8:10]).astype(int)]
    print("Number of images after removing cases where DICOM Date doesn't match SIMDCAT BBDD Date: " + str(data.shape[0]))

# Select only images with BurnedInAnnotation = NO
if keepOnlyBurnedInAnnotationNO:
    data = data[data.BurnedInAnnotation.isin(BurnedInAnnotation)]
    print("Number of images after keeping only BurnedInAnnotation = NO: " + str(data.shape[0]))

# Select only specific Body Parts
if filterByBodyPartExamined:
    data = data[data.BodyPartExamined.isin(BodyPartExamined)]
    print("Number of images after filtering by Body Part: " + str(data.shape[0]))

# Select only specific ViewPositions
if filterByViewPosition:
    data = data[data.ViewPosition.isin(ViewPosition)]
    print("Number of images after filtering by View Position: " + str(data.shape[0]))

# Select only specific Modalities
if filterByModalities:
    data = data[data.Modality.isin(Modalities)]
    print("Number of images after filtering by Modality: " + str(data.shape[0]))

# Select only specific Manufacturer
if filterByManufacturer:
    data = data[data.Manufacturer.isin(Manufacturer)]
    print("Number of images after filtering by Manufacturer: " + str(data.shape[0]))

# Select only specific ManufacturerModelName (should be superseded by the SerialNumber)
if filterByManufacturerModelName:
    data = data[data.ManufacturerModelName.isin(ManufacturerModelName)]
    print("Number of images after filtering by Manufacturer Model Name: " + str(data.shape[0]))

# Select only specific RX Devices by SerialNumber, just to make sure our images are similar
if filterByDeviceSerialNumber:
    data = data[data.DeviceSerialNumber.isin(DeviceSerialNumber)]
    print("Number of images after filtering by DeviceSN: " + str(data.shape[0]))

# Assign whether they're Cases or Controls.
# `assign` returns a new frame instead of writing into a filtered slice, which avoids
# pandas' SettingWithCopyWarning and never adds the column to `full_dataset`.
# NOTE: every row that is not a Control is labelled "Case"; when filterDaysSincePositive
# is disabled this includes images outside the Case window defined in the configuration.
data = data.assign(
    Case_type=np.where(
        (data.Dies_desde_positiu<=max_days_control_since_positive) | (data.Dies_desde_positiu.isna()),
        "Control",
        "Case",
    )
)
    # We could've used np.select when we have more than 2 conditions: https://stackoverflow.com/questions/19913659/pandas-conditional-creation-of-a-series-dataframe-column

# NULL removal of predicted variable
data = data[data[columnToPredict].notna()]
print("Number of images after removing cases where predicted variable is NULL: " + str(data.shape[0]))

# # Remove duplicate exact images, keeping just one of them
# data["duplicate"] = data.duplicated(subset=["MD5"])
# data = data[data.duplicate == False]
# data = data.drop(columns=["duplicate", "MD5"])
# print("Number of images after removing duplicate images: " + str(data.shape[0]))

# Prevent patients to be in different cases. COVID preference:
# a patient with at least one "Case" image keeps only their "Case" images.
if patientOnlyInOneCase:
    data = data[~(data.PatientId.isin(data.PatientId[data.Case_type == "Case"])) | (data.Case_type == "Case")]
    print("Number of images after limiting patients to be in just one case: " + str(data.shape[0]))

# Keep just one image per Patient and ViewPosition (the first occurrence in the current row order)
if keepOneImage_PatientViewPosition:
    data = data.drop_duplicates(subset=["PatientId", "ViewPosition"])
    print("Number of images after keeping just one image per Patient and ViewPosition: " + str(data.shape[0]))

Number of initial images: 293849
Number of images after filtering by Body Part: 234706
Number of images after filtering by View Position: 164121
Number of images after removing cases where predicted variable is NULL: 164121


### Data cleaning / manipulation

In [5]:
# Final tidy-up, done as one chain:
#   1. drop the columns that are not needed downstream,
#   2. cast the target column to str so numeric labels cannot cause errors later,
#   3. shuffle all rows once with a fixed seed, so Train/Validation/Test can be cut
#      later without reshuffling (retraining then never mixes the splits).
data = (
    data
    .drop(columns=["AFECTAT", "AFECTAT_PCR", "StudyDate", "BurnedInAnnotation"])
    .assign(**{columnToPredict: lambda frame: frame[columnToPredict].astype(str)})
    .sample(frac=1, random_state=random_seed)
)
# data = data.rename(columns = {'destination_path':'path'})

# Classes to predict: the distinct target values in sorted order, and how many there are.
class_names = sorted(data[columnToPredict].unique())
num_class = len(class_names)

# Displayed as the cell output: number of images per class.
data[columnToPredict].value_counts()

PA    67145
LL    49328
AP    47648
Name: ViewPosition, dtype: int64

In [6]:
# Preview the first five rows of the shuffled cohort.
# NOTE(review): the rendered output contains patient-level fields (e.g. PatientId,
# dates); consider clearing it before sharing the notebook.
data.head(n=5)

Unnamed: 0,PatientId,Sexe,EDAT_PROVA_DX,EDAT_POSITIU_PCR,DATA_PRIMERA_PCR_POSITIVA,EXITUS,DATA_EXITUS,StudyDescription,StudyDateTime,Dies_desde_positiu,Source,InstitutionName,StudyInstanceUID,SOPInstanceUID,BodyPartExamined,DeviceSerialNumber,Manufacturer,ManufacturerModelName,Modality,StudyTime,ViewPosition,MD5,path_jpg,Case_type
278553,7277788,Home,52.0,52.0,2020-08-18 00:00:00,No,,TORAX PA Y LAT,2020-08-18 12:05:00.563,0.0,H08000089,H. Santa Creu i Sant Pau,1.3.51.0.1.1.193.146.12.142.4235872.4235850,1.3.46.670589.30.1.6.1.963335863035.1597745181671.2,CHEST,963335863035,Philips Medical Systems,DigitalDiagnost,DX,120500.562,LL,8d730e474f870fbbe458dba723adc139,/mnt/datasets/SIMDCAT/jpg/224/2020_08/1.3.46.670589.30.1.6.1.963335863035.1597745181671.2.jpg,Case
27352,3789678,Home,38.0,,,No,,Rx de tòrax 1-2 projeccions (F i P),2020-03-29 16:38:52.0,,H08002022,SDPI Sant Felix-Sabadell,1.2.826.0.1.3680043.2.403.41.1200329163323404.1.3450,1.3.46.670589.30.1.6.1.963333941249.1585492777968.2,CHEST,963333941249,Philips Medical Systems,DigitalDiagnost,DX,163852.687,PA,a9146e60233246b3eac2e33979af3e25,/mnt/datasets/SIMDCAT/jpg/224/2020_03/1.3.46.670589.30.1.6.1.963333941249.1585492777968.2.jpg,Control
35466,2369396,Dona,51.0,51.0,2020-03-21 00:00:00,No,,Rx de tòrax 1-2 projeccions (F i P),2020-03-22 12:38:11.0,1.0,H08002022,HOSPITAL GERMANS TRIAS Y PUJOL,1.2.826.0.1.3680043.2.403.41.1200322123726649.1.3450,1.3.46.670589.30.1.6.1.963334011495.1584877201625.2,CHEST,963334011495,Philips Medical Systems,DigitalDiagnost,DX,123811.843,LL,11d5cffec3c0b201dc979f59fcf93298,/mnt/datasets/SIMDCAT/jpg/224/2020_03/1.3.46.670589.30.1.6.1.963334011495.1584877201625.2.jpg,Case
33182,3852025,Dona,88.0,89.0,2020-04-23 00:00:00,No,,TORAX PA Y LAT,2019-06-06 08:46:24.0,-322.0,H08000089,H. Santa Creu i Sant Pau,1.3.51.0.1.1.193.146.12.142.3878787.3878756,1.3.46.670589.30.1.6.1.963333651186.1559810989312.2,CHEST,963333651186,Philips Medical Systems,DigitalDiagnost,DX,104624.328,AP,c937717936583bd8a1396a6af0a3927f,/mnt/datasets/SIMDCAT/jpg/224/2019_06/1.3.46.670589.30.1.6.1.963333651186.1559810989312.2.jpg,Control
207269,6183208,Dona,92.0,93.0,2020-04-28 00:00:00,Si,2020-05-07 00:00:00,Tórax,2018-11-05 06:51:49.0,-540.0,H08858656,Hospital Moises Broggi,1.2.840.113845.11.1000000001973866488.20181105071552.3441298,1.3.46.670589.30.1.6.1.963333689245.1541400750562.2,CHEST,963333689245,Philips Medical Systems,DigitalDiagnost,DX,75149.468,AP,b025dd1b37076e56b00b27b657279f04,/mnt/datasets/SIMDCAT/jpg/224/2018_11/1.3.46.670589.30.1.6.1.963333689245.1541400750562.2.jpg,Control


In [7]:
# Write the cohort to a pipe-delimited CSV in the working directory (row index omitted).
data.to_csv(
    os.path.join(working_directory, 'cohort_viewposition_20201116.csv'),
    sep='|',
    index=False,
)

In [8]:
# NULL check: count every ViewPosition value in the unfiltered dataset, including missing (NaN) entries

In [9]:
# Frequency of each ViewPosition in the unfiltered dataset; dropna=False keeps the NaN bucket visible.
full_dataset["ViewPosition"].value_counts(dropna=False)

NaN                88422
PA                 76154
AP                 67277
LL                 50634
LATERAL             6430
RL                  2933
LAT                 1615
RLO                  137
LLO                  100
SUPINE                38
LLD                   27
TOWNE VIEW            16
RLD                   10
AP AXIAL              10
LAO                    9
AP OBL INT ROT         6
BILAT FROG             5
ERECT                  3
AXIAL                  3
RPO                    3
OBLIQUE                2
PA DLI                 2
LPO                    2
XTABLE LATERAL         1
PA OBL                 1
AP INT ROT             1
OBL                    1
XTABLE                 1
AP(SUPINE)             1
PANORAMICO             1
WATERS                 1
CUCLILLAS BILAT        1
PA Y VIEW              1
OBLI                   1
Name: ViewPosition, dtype: int64