# **Fetch, Load and Draw Insights from Data**

In [None]:
# Clone the repository that contains data
# (creates ./covid-chestxray-dataset, from which later cells read metadata.csv
# and the files under images/)
!git clone https://github.com/ieee8023/covid-chestxray-dataset.git

Cloning into 'covid-chestxray-dataset'...
remote: Enumerating objects: 15, done.[K
remote: Counting objects: 100% (15/15), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 3614 (delta 7), reused 11 (delta 5), pack-reused 3599[K
Receiving objects: 100% (3614/3614), 633.00 MiB | 39.75 MiB/s, done.
Resolving deltas: 100% (1430/1430), done.
Checking out files: 100% (1173/1173), done.


In [None]:
# Load the dataset's metadata table into a pandas dataframe and discard the
# columns listed below
import pandas as pd
import pickle as pkl

# Columns removed from the metadata right after loading
unused_columns = [
    "Unnamed: 29", "patientid", "temperature", "pO2_saturation",
    "leukocyte_count", "survival", "neutrophil_count", "lymphocyte_count",
    "modality", "date", "location", "folder", "doi", "url", "license",
    "extubated", "other_notes",
]

metadata = pd.read_csv("covid-chestxray-dataset/metadata.csv").drop(columns=unused_columns)
metadata.head()

Unnamed: 0,offset,sex,age,finding,RT_PCR_positive,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,view,filename,clinical_notes
0,0.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ..."
1,3.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ..."
2,5.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ..."
3,6.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ..."
4,0.0,F,52.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,N,PA,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs


In [None]:
# Analyse what different types of diseases are present
# (shows every distinct value of the "finding" column)
metadata["finding"].unique()

array(['Pneumonia/Viral/COVID-19', 'Pneumonia', 'Pneumonia/Viral/SARS',
       'Pneumonia/Fungal/Pneumocystis',
       'Pneumonia/Bacterial/Streptococcus', 'No Finding',
       'Pneumonia/Bacterial/Chlamydophila', 'Pneumonia/Bacterial/E.Coli',
       'Pneumonia/Bacterial/Klebsiella', 'Pneumonia/Bacterial/Legionella',
       'Unknown', 'Pneumonia/Lipoid', 'Pneumonia/Viral/Varicella',
       'Pneumonia/Bacterial', 'Pneumonia/Bacterial/Mycoplasma',
       'Pneumonia/Viral/Influenza', 'todo', 'Tuberculosis',
       'Pneumonia/Viral/Influenza/H1N1', 'Pneumonia/Fungal/Aspergillosis',
       'Pneumonia/Viral/Herpes ', 'Pneumonia/Aspiration',
       'Pneumonia/Bacterial/Nocardia', 'Pneumonia/Viral/MERS-CoV',
       'Pneumonia/Bacterial/Staphylococcus/MRSA'], dtype=object)

# **Preprocess Data**

In [None]:
# Add a new column "label" containing the label 1 if finding is "Pneumonia/Viral/COVID-19" else 0.
# A boolean comparison followed by an explicit cast is used instead of mapping
# every distinct finding through Series.replace(): replace() on a string column
# only yields integers through implicit downcasting (deprecated in recent
# pandas), whereas this always produces an int64 column.
metadata["label"] = (metadata["finding"] == "Pneumonia/Viral/COVID-19").astype("int64")
metadata.head()

Unnamed: 0,offset,sex,age,finding,RT_PCR_positive,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,view,filename,clinical_notes,label
0,0.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
1,3.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
2,5.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
3,6.0,M,65.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,Y,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
4,0.0,F,52.0,Pneumonia/Viral/COVID-19,Y,N,N,N,N,N,PA,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs,1


In [None]:
# Shorten the finding "Pneumonia/Viral/COVID-19" to "COVID-19".
# The replacement is applied to the whole frame and the result is stored under
# a new name, covid_metadata; `metadata` itself is not reassigned.
covid_metadata = metadata.replace("Pneumonia/Viral/COVID-19", "COVID-19")
covid_metadata.head()

Unnamed: 0,offset,sex,age,finding,RT_PCR_positive,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,view,filename,clinical_notes,label
0,0.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
1,3.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
2,5.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
3,6.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1
4,0.0,F,52.0,COVID-19,Y,N,N,N,N,N,PA,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs,1


# **Generate Image Embeddings**
I've used the state-of-the-art EfficientNetB7 architecture to generate image embeddings.

In [None]:
# If you already have image embeddings, load them and skip the execution of next cell
# NOTE: unpickling can execute arbitrary code -- only load a file you generated yourself.
with open('image_embeddings.pkl', 'rb') as embeddings_file:  # ensures the handle is closed
  image_embeddings = pkl.load(embeddings_file)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.applications.efficientnet import EfficientNetB7, preprocess_input

# Load the model
# The embedding is read from the third-to-last layer of the full classifier
# (presumably the pooled feature vector that precedes dropout and the
# prediction head -- confirm against the summary printed below).
model = EfficientNetB7(include_top=True)
new_model = Model(inputs=model.input, outputs=model.layers[-3].output)
new_model.summary()


# Generate image embeddings
import cv2
import os
from tqdm import tqdm

image_embeddings = list()
image_series = covid_metadata.filename

c = 0  # number of rows for which no embedding could be produced
for image_name in tqdm(image_series):
  try:
    image = cv2.imread(os.path.join("covid-chestxray-dataset/images", image_name))
    if image is None:
      # cv2.imread does not raise on a missing/unsupported file, it returns None
      raise ValueError(f"could not read image: {image_name}")
    # NOTE(review): cv2 loads channels in BGR order while the pretrained Keras
    # weights expect RGB; this makes no difference for grayscale X-rays but
    # should be confirmed if the dataset contains colour images.
    image = cv2.resize(image, (600, 600))
    # Add a leading batch axis: (height, width, channels) -> (1, height, width, channels)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    image_embedding = new_model.predict(image)
    image_embeddings.append(image_embedding[0])
  except Exception:
    # Keep one entry per metadata row so the list stays aligned with the
    # dataframe; failed rows are marked with None and dropped later.
    # `Exception` (not a bare except) lets Ctrl-C / kernel interrupts through.
    c += 1
    image_embeddings.append(None)

# Persist the embeddings; the with-block guarantees the file is flushed and closed
with open('image_embeddings.pkl', 'wb') as embeddings_file:
  pkl.dump(image_embeddings, embeddings_file)

print(f"\nFailed to generate image embeddings for {c} images.")

In [None]:
# Add image embeddings as a new column to dataframe
# (the generation cell appends one entry per metadata row, in row order, with
# None for rows whose image could not be processed)
covid_metadata['image_embeddings'] = image_embeddings
covid_metadata.head()

Unnamed: 0,offset,sex,age,finding,RT_PCR_positive,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,view,filename,clinical_notes,label,image_embeddings
0,0.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.16646779, 0.37802455, 0.043559056, 0.090420..."
1,3.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.24965343, 0.12475621, -0.047701336, 0.18407..."
2,5.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.18539263, 0.16858068, 0.020502465, 0.060199..."
3,6.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.20033258, 0.12199223, 0.019694442, 0.064149..."
4,0.0,F,52.0,COVID-19,Y,N,N,N,N,N,PA,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs,1,"[0.11523124, 0.33968022, 0.024405614, 0.166120..."


In [None]:
 # Discard the rows that have no image embedding (value None), reporting the
 # dataframe shape before and after
 print(f"Shape without removing null image embeddings: {covid_metadata.shape}")
 covid_metadata = covid_metadata.dropna(subset=['image_embeddings'])
 print(f"Shape after removing null image embeddings: {covid_metadata.shape}")

Shape without removing null image embeddings: (950, 15)
Shape after removing null image embeddings: (929, 15)


# **Generate Text Embeddings (Clinical Notes Embeddings)**
Clinical notes are also important for drawing inferences about a patient's condition, so we cannot neglect them. I've used the BioSentVec model, which is specifically trained on a very large corpus of medical text, to convert these clinical notes into numbers and generate text embeddings.

In [None]:
# If you already have text_embeddings, load them and skip the execution of next cell
# NOTE: unpickling can execute arbitrary code -- only load a file you generated yourself.
with open('text_embeddings.pkl', 'rb') as embeddings_file:  # ensures the handle is closed
  text_embeddings = pkl.load(embeddings_file)

In [None]:
# Download BioSentVec model for generating clinical notes embeddings
!wget https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioSentVec_PubMed_MIMICIII-bigram_d700.bin

# Install sent2vec required for loading BioSentVec model
!wget https://github.com/epfml/sent2vec/archive/master.zip
!unzip master.zip
%cd sent2vec-master
!make
!sudo pip install .

# Load BioSentVec model
import sent2vec

model = sent2vec.Sent2vecModel()
model.load_model('BioSentVec_PubMed_MIMICIII-bigram_d700.bin')

# Generate text embeddings
text_embeddings = list()
text_series = covid_metadata.clinical_notes

c = 0
for clinical_data in text_series:
  try:
    text_embedding = model.embed_sentence(clinical_data)
    text_embeddings.append(text_embedding[0])
  except:
    c += 1
    text_embeddings.append(model.embed_sentence("No clinical notes are available for this patient.")[0])

pkl.dump(text_embeddings, open('text_embeddings.pkl', 'wb'))
print(f"\nFailed to generate text embeddings for {c} texts.")

In [None]:
# Add text embeddings as a new column to the dataframe
# (the generation cell appends one vector per row of covid_metadata, in row order)
covid_metadata['text_embeddings'] = text_embeddings
covid_metadata.head()

Unnamed: 0,offset,sex,age,finding,RT_PCR_positive,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,view,filename,clinical_notes,label,image_embeddings,text_embeddings
0,0.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.16646779, 0.37802455, 0.043559056, 0.090420...","[-0.010609061, -0.03510914, 0.013853411, -0.08..."
1,3.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.24965343, 0.12475621, -0.047701336, 0.18407...","[0.015778346, -0.058980826, -0.0014589406, -0...."
2,5.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.18539263, 0.16858068, 0.020502465, 0.060199...","[0.015778346, -0.058980826, -0.0014589406, -0...."
3,6.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.20033258, 0.12199223, 0.019694442, 0.064149...","[-0.011988744, -0.03202686, 0.035648797, -0.07..."
4,0.0,F,52.0,COVID-19,Y,N,N,N,N,N,PA,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs,1,"[0.11523124, 0.33968022, 0.024405614, 0.166120...","[0.056925807, -0.031200163, -0.33779252, 0.028..."


# **Deal with Missing Values in Features**
For the offset and age features, I've replaced the missing values with the most frequently occurring value, as it reflects the general trend among the admitted patients.  

For all the other features, I've replaced missing values with the tag 'Unclear'.

In [None]:
# Deal with null values present in columns by replacing null value with 'Unclear' string except in case of age and
# offset where null values are replaced with the mode of columns.
#
# Series.mode() is used because it returns the most frequent VALUE;
# value_counts().max() would return how many times that value occurs instead.
for numeric_column in ['age', 'offset']:
  most_frequent_values = covid_metadata[numeric_column].mode()
  if not most_frequent_values.empty:  # empty only when the whole column is null
    covid_metadata[numeric_column] = covid_metadata[numeric_column].fillna(most_frequent_values.iloc[0])

# RT_PCR_positive is treated like every other categorical column: a missing
# test result is recorded as 'Unclear', not as a negative ('N') result.
for categorical_column in ['sex', 'RT_PCR_positive', 'intubated', 'intubation_present',
                           'in_icu', 'went_icu', 'needed_supplemental_O2']:
  covid_metadata[categorical_column] = covid_metadata[categorical_column].fillna('Unclear')
covid_metadata.head()

Unnamed: 0,offset,sex,age,finding,RT_PCR_positive,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,view,filename,clinical_notes,label,image_embeddings,text_embeddings
0,0.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.16646779, 0.37802455, 0.043559056, 0.090420...","[-0.010609061, -0.03510914, 0.013853411, -0.08..."
1,3.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.24965343, 0.12475621, -0.047701336, 0.18407...","[0.015778346, -0.058980826, -0.0014589406, -0...."
2,5.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.18539263, 0.16858068, 0.020502465, 0.060199...","[0.015778346, -0.058980826, -0.0014589406, -0...."
3,6.0,M,65.0,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.20033258, 0.12199223, 0.019694442, 0.064149...","[-0.011988744, -0.03202686, 0.035648797, -0.07..."
4,0.0,F,52.0,COVID-19,Y,N,N,N,N,N,PA,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs,1,"[0.11523124, 0.33968022, 0.024405614, 0.166120...","[0.056925807, -0.031200163, -0.33779252, 0.028..."


# **Transform Age feature into Categorical feature**:
I've transformed the age feature into a categorical feature, because nearby ages such as 23, 24 or 25 won't make much difference. So I've divided the ages into appropriate bins.

In [None]:
# Replace every age by the label of the bin it falls into.
# With pd.cut's defaults the bins are right-closed: (18, 28] -> 23,
# (28, 33] -> 30, ..., (68, 100] -> 70. Ages outside (18, 100] match no bin
# and become NaN.
age_bin_edges = [18, 28, 33, 38, 43, 48, 53, 58, 63, 68, 100]
age_bin_labels = [23, 30, 35, 40, 45, 50, 55, 60, 65, 70]
covid_metadata['age'] = pd.cut(covid_metadata['age'], bins=age_bin_edges, labels=age_bin_labels)
covid_metadata.head()

Unnamed: 0,offset,sex,age,finding,RT_PCR_positive,intubated,intubation_present,went_icu,in_icu,needed_supplemental_O2,view,filename,clinical_notes,label,image_embeddings,text_embeddings
0,0.0,M,65,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.16646779, 0.37802455, 0.043559056, 0.090420...","[-0.010609061, -0.03510914, 0.013853411, -0.08..."
1,3.0,M,65,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.24965343, 0.12475621, -0.047701336, 0.18407...","[0.015778346, -0.058980826, -0.0014589406, -0...."
2,5.0,M,65,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.18539263, 0.16858068, 0.020502465, 0.060199...","[0.015778346, -0.058980826, -0.0014589406, -0...."
3,6.0,M,65,COVID-19,Y,N,N,N,N,Y,PA,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.20033258, 0.12199223, 0.019694442, 0.064149...","[-0.011988744, -0.03202686, 0.035648797, -0.07..."
4,0.0,F,50,COVID-19,Y,N,N,N,N,N,PA,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs,1,"[0.11523124, 0.33968022, 0.024405614, 0.166120...","[0.056925807, -0.031200163, -0.33779252, 0.028..."


# **One-Hot Encode Categorical Features** 

In [None]:
# For encoding categorical data, install category_encoders
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 14.8MB/s eta 0:00:01[K     |████████▏                       | 20kB 13.6MB/s eta 0:00:01[K     |████████████▏                   | 30kB 9.5MB/s eta 0:00:01[K     |████████████████▎               | 40kB 8.2MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 5.6MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 5.9MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 6.1MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.4MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [None]:
import category_encoders as ce

# Columns to expand into one indicator column per category
categorical_columns = [
    'sex', 'age', 'RT_PCR_positive', 'intubated', 'intubation_present',
    'went_icu', 'in_icu', 'needed_supplemental_O2', 'view',
]

encoder = ce.OneHotEncoder(
    cols=categorical_columns,
    handle_unknown='return_nan',
    return_df=True,
    use_cat_names=True,  # new columns are named "<column>_<category>"
)
covid_metadata = encoder.fit_transform(covid_metadata)
covid_metadata.head()

  import pandas.util.testing as tm
  elif pd.api.types.is_categorical(cols):


Unnamed: 0,offset,sex_M,sex_F,sex_Unclear,age_23.0,age_30.0,age_35.0,age_40.0,age_45.0,age_50.0,age_55.0,age_60.0,age_65.0,age_70.0,age_nan,finding,RT_PCR_positive_Y,RT_PCR_positive_N,RT_PCR_positive_Unclear,intubated_N,intubated_Y,intubated_Unclear,intubation_present_N,intubation_present_Y,intubation_present_Unclear,went_icu_N,went_icu_Y,went_icu_Unclear,in_icu_N,in_icu_Y,in_icu_Unclear,needed_supplemental_O2_Y,needed_supplemental_O2_N,needed_supplemental_O2_Unclear,view_PA,view_AP,view_L,view_Axial,view_AP Supine,view_Coronal,view_AP Erect,filename,clinical_notes,label,image_embeddings,text_embeddings
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,COVID-19,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.16646779, 0.37802455, 0.043559056, 0.090420...","[-0.010609061, -0.03510914, 0.013853411, -0.08..."
1,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,COVID-19,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.24965343, 0.12475621, -0.047701336, 0.18407...","[0.015778346, -0.058980826, -0.0014589406, -0...."
2,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,COVID-19,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.18539263, 0.16858068, 0.020502465, 0.060199...","[0.015778346, -0.058980826, -0.0014589406, -0...."
3,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,COVID-19,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,"On January 22, 2020, a 65-year-old man with a ...",1,"[0.20033258, 0.12199223, 0.019694442, 0.064149...","[-0.011988744, -0.03202686, 0.035648797, -0.07..."
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,COVID-19,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,nejmc2001573_f1a.jpeg,diffuse infiltrates in the bilateral lower lungs,1,"[0.11523124, 0.33968022, 0.024405614, 0.166120...","[0.056925807, -0.031200163, -0.33779252, 0.028..."


In [None]:
# Summarise the encoded dataframe: column names, non-null counts and dtypes
covid_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 929 entries, 0 to 949
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   offset                          929 non-null    float64
 1   sex_M                           929 non-null    float64
 2   sex_F                           929 non-null    float64
 3   sex_Unclear                     929 non-null    float64
 4   age_23.0                        929 non-null    float64
 5   age_30.0                        929 non-null    float64
 6   age_35.0                        929 non-null    float64
 7   age_40.0                        929 non-null    float64
 8   age_45.0                        929 non-null    float64
 9   age_50.0                        929 non-null    float64
 10  age_55.0                        929 non-null    float64
 11  age_60.0                        929 non-null    float64
 12  age_65.0                        929 

# **Random Forest Classifiers**

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Settings shared by the three classifiers below
test_size = 0.15  # fraction of the samples held out by train_test_split
random_state_split = 16  # seed of every train/test split (the same value is reused for all three)
random_state_classifier = 0  # seed passed to every RandomForestClassifier

## Random Forest Classifier for Image Embeddings:

In [None]:
# Load data
X = covid_metadata.image_embeddings  # one embedding vector per sample (object-dtype column)
y = covid_metadata.label  # 1 for COVID-19, 0 for every other finding

In [None]:
# Preview the first image-embedding inputs
X.head()

0    [0.16646779, 0.37802455, 0.043559056, 0.090420...
1    [0.24965343, 0.12475621, -0.047701336, 0.18407...
2    [0.18539263, 0.16858068, 0.020502465, 0.060199...
3    [0.20033258, 0.12199223, 0.019694442, 0.064149...
4    [0.11523124, 0.33968022, 0.024405614, 0.166120...
Name: image_embeddings, dtype: object

In [None]:
# Preview the first labels
y.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [None]:
# Hold out `test_size` of the samples for evaluation, then report the shapes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state_split
)

for description, shape in (
    ("Shape of original dataset :", covid_metadata.shape),
    ("Shape of input - training set", X_train.shape),
    ("Shape of output - training set", y_train.shape),
    ("Shape of input - testing set", X_test.shape),
    ("Shape of output - testing set", y_test.shape),
):
  print(description, shape)

Shape of original dataset : (929, 46)
Shape of input - training set (789,)
Shape of output - training set (789,)
Shape of input - testing set (140,)
Shape of output - testing set (140,)


In [None]:
# Prepare data: the inputs are a Series holding one vector per sample, so they
# are unpacked into 2-D arrays (samples x features); the labels become 1-D arrays.
tX_train = np.array([list(embedding) for embedding in X_train])
tX_test = np.array([list(embedding) for embedding in X_test])

ty_train = np.array([label for label in y_train])
ty_test = np.array([label for label in y_test])

In [None]:
# Train classifier
clf_image = RandomForestClassifier(n_estimators=100, random_state=random_state_classifier)
clf_image.fit(tX_train, ty_train)

# Predict using classifier
y_pred_image = clf_image.predict(tX_test)
# Accuracy in percent: share of held-out samples whose prediction equals the label
acc = np.mean(y_pred_image == ty_test) * 100

# Formatted with two decimals (rather than slicing str(acc)) and with the same
# "%." suffix as the other accuracy reports in this notebook
print(f"Validation set accuracy: {acc:.2f}%.")

Valdiation set accuracy: 80.71


## Random Forest Classifier for Clinical Notes (Text Embeddings)

In [None]:
# Load data
X = covid_metadata.text_embeddings  # one clinical-notes embedding vector per sample (object-dtype column)
y = covid_metadata.label  # 1 for COVID-19, 0 for every other finding

In [None]:
# Preview the first text-embedding inputs
X.head()

0    [-0.010609061, -0.03510914, 0.013853411, -0.08...
1    [0.015778346, -0.058980826, -0.0014589406, -0....
2    [0.015778346, -0.058980826, -0.0014589406, -0....
3    [-0.011988744, -0.03202686, 0.035648797, -0.07...
4    [0.056925807, -0.031200163, -0.33779252, 0.028...
Name: text_embeddings, dtype: object

In [None]:
# Preview the first labels
y.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [None]:
# Hold out `test_size` of the samples for evaluation, then report the shapes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state_split
)

for description, shape in (
    ("Shape of original dataset :", covid_metadata.shape),
    ("Shape of input - training set", X_train.shape),
    ("Shape of output - training set", y_train.shape),
    ("Shape of input - testing set", X_test.shape),
    ("Shape of output - testing set", y_test.shape),
):
  print(description, shape)

Shape of original dataset : (929, 46)
Shape of input - training set (789,)
Shape of output - training set (789,)
Shape of input - testing set (140,)
Shape of output - testing set (140,)


In [None]:
# Prepare data: the inputs are a Series holding one vector per sample, so they
# are unpacked into 2-D arrays (samples x features); the labels become 1-D arrays.
tX_train = np.array([list(embedding) for embedding in X_train])
tX_test = np.array([list(embedding) for embedding in X_test])

ty_train = np.array([label for label in y_train])
ty_test = np.array([label for label in y_test])

In [None]:
# Train classifier
clf_text = RandomForestClassifier(random_state=random_state_classifier)
clf_text.fit(tX_train, ty_train)

# Predict using classifier
y_pred_text = clf_text.predict(tX_test)
# Accuracy in percent: share of held-out samples whose prediction equals the label
acc = np.mean(y_pred_text == ty_test) * 100

# Formatted with two decimals rather than by slicing str(acc)
print(f"Validation set accuracy: {acc:.2f}%.")

Valdiation set accuracy: 78.57%.


## Random Forest Classifier for Patient Features

In [None]:
# Load data: the label is the target; the inputs are the remaining tabular
# columns, i.e. everything except free text, file names, embeddings and the
# finding/label themselves.
# NOTE(review): RT_PCR_positive looks like a COVID test result, so it may leak
# the label into the features -- confirm that keeping it is intended.
non_feature_columns = ["finding", "filename", "clinical_notes", "label", "image_embeddings", "text_embeddings"]
y = covid_metadata.label
X = covid_metadata.drop(columns=non_feature_columns)

In [None]:
# Preview the first rows of the tabular patient features
X.head()

Unnamed: 0,offset,sex_M,sex_F,sex_Unclear,age_23.0,age_30.0,age_35.0,age_40.0,age_45.0,age_50.0,age_55.0,age_60.0,age_65.0,age_70.0,age_nan,RT_PCR_positive_Y,RT_PCR_positive_N,RT_PCR_positive_Unclear,intubated_N,intubated_Y,intubated_Unclear,intubation_present_N,intubation_present_Y,intubation_present_Unclear,went_icu_N,went_icu_Y,went_icu_Unclear,in_icu_N,in_icu_Y,in_icu_Unclear,needed_supplemental_O2_Y,needed_supplemental_O2_N,needed_supplemental_O2_Unclear,view_PA,view_AP,view_L,view_Axial,view_AP Supine,view_Coronal,view_AP Erect
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Preview the first labels
y.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [None]:
# Hold out `test_size` of the samples for evaluation, then report the shapes
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state_split
)

for description, shape in (
    ("Shape of original dataset :", covid_metadata.shape),
    ("Shape of input - training set", X_train.shape),
    ("Shape of output - training set", y_train.shape),
    ("Shape of input - testing set", X_test.shape),
    ("Shape of output - testing set", y_test.shape),
):
  print(description, shape)

Shape of original dataset : (929, 46)
Shape of input - training set (789, 40)
Shape of output - training set (789,)
Shape of input - testing set (140, 40)
Shape of output - testing set (140,)


In [None]:
# Train classifier
clf_features = RandomForestClassifier(random_state=random_state_classifier)
clf_features.fit(X_train, y_train)

# Predict using classifier
y_pred_features = clf_features.predict(X_test)
# Accuracy in percent: share of held-out samples whose prediction equals the label.
# np.asarray compares by position, independent of the Series index of y_test.
acc = np.mean(y_pred_features == np.asarray(y_test)) * 100

# Formatted with two decimals (rather than slicing str(acc)) and with the same
# "%." suffix as the other accuracy reports in this notebook
print(f"Validation set accuracy: {acc:.2f}%.")

Valdiation set accuracy: 96.42


# **Calculating Resultant Accuracy by Combining the predictions of each classifier**

In [None]:
def covid_19_detection(y_pred_image, y_pred_text, y_pred_features):
  """Combine three sets of binary predictions by majority vote.

  The three arguments are iterated in parallel; for each position the result
  is 1 when at least two of the three predictions add up to 2 or more, else 0.
  Iteration stops at the shortest input.

  Returns:
    A list of 0/1 integers, one per combined position.
  """
  return [
      1 if sum(votes) >= 2 else 0
      for votes in zip(y_pred_image, y_pred_text, y_pred_features)
  ]

# Majority vote over the test-set predictions of the image, text and feature classifiers
y_pred = covid_19_detection(y_pred_image, y_pred_text, y_pred_features)

In [None]:
# Accuracy in percent of the combined (majority-vote) predictions.
# Both sides are converted to arrays so the comparison is purely positional,
# and the value is formatted with two decimals rather than by slicing str(res_acc).
res_acc = np.mean(np.asarray(y_pred) == np.asarray(y_test)) * 100
print(f"Resultant validation accuracy is: {res_acc:.2f}%.")

Resultant validation accuracy is: 92.14%.
