### Analysis of the chest x-ray dataset

In [1]:
# to handle datasets
import pandas as pd
import numpy as np

# for plotting
# import matplotlib.pyplot as plt
# %matplotlib inline

# to display all the columns of the dataframe in the notebook
pd.pandas.set_option('display.max_columns', None)

In [3]:
# load dataset
df = pd.read_csv('CheXpert-v1.0-small/train.csv', usecols=[
    'Path',
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices'
]) # This features are the features i consider relevent to this project.

In [4]:
df.columns

Index(['Path', 'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
       'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
       'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
       'Fracture', 'Support Devices'],
      dtype='object')

In [5]:
df.head()

Unnamed: 0,Path,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00001/study1/...,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0-small/train/patient00002/study1/...,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0-small/train/patient00002/study1/...,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0-small/train/patient00003/study1/...,,,,,,1.0,,,,0.0,,,,


In [6]:
# Columns name transformation.

df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '').str.replace('/', '_or_')

In [7]:
df.columns

Index(['path', 'no_finding', 'enlarged_cardiomediastinum', 'cardiomegaly',
       'lung_opacity', 'lung_lesion', 'edema', 'consolidation', 'pneumonia',
       'atelectasis', 'pneumothorax', 'pleural_effusion', 'pleural_other',
       'fracture', 'support_devices'],
      dtype='object')

In [8]:
# -1.0: Uncertainty
# 0.0: means the disease has not occured 
# 1.0
# Nan: Unmentioned


def feature_value_count(df, variable):
    print('The feature "{}" has: \n'.format(variable), df[variable].value_counts(), 'values \n')

for features in df.columns:
    if features != 'path':
        feature_value_count(df, variable = features)

The feature "no_finding" has: 
 1.0    22381
Name: no_finding, dtype: int64 values 

The feature "enlarged_cardiomediastinum" has: 
  0.0    21638
-1.0    12403
 1.0    10798
Name: enlarged_cardiomediastinum, dtype: int64 values 

The feature "cardiomegaly" has: 
  1.0    27000
 0.0    11116
-1.0     8087
Name: cardiomegaly, dtype: int64 values 

The feature "lung_opacity" has: 
  1.0    105581
 0.0      6599
-1.0      5598
Name: lung_opacity, dtype: int64 values 

The feature "lung_lesion" has: 
  1.0    9186
-1.0    1488
 0.0    1270
Name: lung_lesion, dtype: int64 values 

The feature "edema" has: 
  1.0    52246
 0.0    20726
-1.0    12984
Name: edema, dtype: int64 values 

The feature "consolidation" has: 
  0.0    28097
-1.0    27742
 1.0    14783
Name: consolidation, dtype: int64 values 

The feature "pneumonia" has: 
 -1.0    18770
 1.0     6039
 0.0     2799
Name: pneumonia, dtype: int64 values 

The feature "atelectasis" has: 
 -1.0    33739
 1.0    33376
 0.0     1328
Name: 

In [59]:
# df.isnull().any(axis=1).value_counts()
# df_copy = df.dropna()

In [9]:
# Check column values type
df.dtypes

path                           object
no_finding                    float64
enlarged_cardiomediastinum    float64
cardiomegaly                  float64
lung_opacity                  float64
lung_lesion                   float64
edema                         float64
consolidation                 float64
pneumonia                     float64
atelectasis                   float64
pneumothorax                  float64
pleural_effusion              float64
pleural_other                 float64
fracture                      float64
support_devices               float64
dtype: object

In [10]:
# Removing Uncertinty from the dataset
temp_df = df.replace(to_replace = -1.0, value = None)

In [11]:
# Checking dataset distribution after removing uncertainty

for features in df.columns:
    if features != 'path':
        feature_value_count(temp_df, variable = features)

The feature "no_finding" has: 
 1.0    22381
Name: no_finding, dtype: int64 values 

The feature "enlarged_cardiomediastinum" has: 
 0.0    23055
1.0    11621
Name: enlarged_cardiomediastinum, dtype: int64 values 

The feature "cardiomegaly" has: 
 1.0    28555
0.0    11509
Name: cardiomegaly, dtype: int64 values 

The feature "lung_opacity" has: 
 1.0    108235
0.0      6806
Name: lung_opacity, dtype: int64 values 

The feature "lung_lesion" has: 
 1.0    9318
0.0    1272
Name: lung_lesion, dtype: int64 values 

The feature "edema" has: 
 1.0    56545
0.0    21975
Name: edema, dtype: int64 values 

The feature "consolidation" has: 
 0.0    31664
1.0    17269
Name: consolidation, dtype: int64 values 

The feature "pneumonia" has: 
 1.0    6853
0.0    3081
Name: pneumonia, dtype: int64 values 

The feature "atelectasis" has: 
 1.0    40172
0.0     1572
Name: atelectasis, dtype: int64 values 

The feature "pneumothorax" has: 
 0.0    57239
1.0    19935
Name: pneumothorax, dtype: int64 va

In [12]:
# creating the the disease column list
label_features = [x for x in df.columns if x != 'path']

# Remove column where there are no findings or diseases

# checking rows that have more than one disease

In [13]:
df.columns

Index(['path', 'no_finding', 'enlarged_cardiomediastinum', 'cardiomegaly',
       'lung_opacity', 'lung_lesion', 'edema', 'consolidation', 'pneumonia',
       'atelectasis', 'pneumothorax', 'pleural_effusion', 'pleural_other',
       'fracture', 'support_devices'],
      dtype='object')

In [86]:
# temp_df[label_features] = temp_df.replace(to_replace = 'Nan', value = 0.0)

In [14]:
temp_df.fillna(0, inplace=True)

In [15]:
temp_df

Unnamed: 0,path,no_finding,enlarged_cardiomediastinum,cardiomegaly,lung_opacity,lung_lesion,edema,consolidation,pneumonia,atelectasis,pneumothorax,pleural_effusion,pleural_other,fracture,support_devices
0,CheXpert-v1.0-small/train/patient00001/study1/...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,CheXpert-v1.0-small/train/patient00002/study1/...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,CheXpert-v1.0-small/train/patient00002/study1/...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,CheXpert-v1.0-small/train/patient00003/study1/...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223409,CheXpert-v1.0-small/train/patient64537/study2/...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
223410,CheXpert-v1.0-small/train/patient64537/study1/...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
223411,CheXpert-v1.0-small/train/patient64538/study1/...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
223412,CheXpert-v1.0-small/train/patient64539/study1/...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [16]:
label_features

['no_finding',
 'enlarged_cardiomediastinum',
 'cardiomegaly',
 'lung_opacity',
 'lung_lesion',
 'edema',
 'consolidation',
 'pneumonia',
 'atelectasis',
 'pneumothorax',
 'pleural_effusion',
 'pleural_other',
 'fracture',
 'support_devices']

In [17]:
# temp_df[label_features] = pd.to_numeric(temp_df[label_features])

temp_df['enlarged_cardiomediastinum'] = pd.to_numeric(temp_df['enlarged_cardiomediastinum'])
temp_df['cardiomegaly'] = pd.to_numeric(temp_df['cardiomegaly'])
temp_df['lung_opacity'] = pd.to_numeric(temp_df['lung_opacity'])
temp_df['lung_lesion'] = pd.to_numeric(temp_df['lung_lesion'])
temp_df['edema'] = pd.to_numeric(temp_df['edema'])
temp_df['consolidation'] = pd.to_numeric(temp_df['consolidation'])
temp_df['pneumonia'] = pd.to_numeric(temp_df['pneumonia'])
temp_df['atelectasis'] = pd.to_numeric(temp_df['atelectasis'])
temp_df['pneumothorax'] = pd.to_numeric(temp_df['pneumothorax'])
temp_df['pleural_effusion'] = pd.to_numeric(temp_df['pleural_effusion'])
temp_df['pleural_other'] = pd.to_numeric(temp_df['pleural_other'])
temp_df['fracture'] = pd.to_numeric(temp_df['fracture'])
temp_df['support_devices'] = pd.to_numeric(temp_df['support_devices'])

In [18]:
temp_df.dtypes

path                           object
no_finding                    float64
enlarged_cardiomediastinum    float64
cardiomegaly                  float64
lung_opacity                  float64
lung_lesion                   float64
edema                         float64
consolidation                 float64
pneumonia                     float64
atelectasis                   float64
pneumothorax                  float64
pleural_effusion              float64
pleural_other                 float64
fracture                      float64
support_devices               float64
dtype: object

In [19]:
temp_df['no_of_labels'] = temp_df['enlarged_cardiomediastinum']+temp_df['cardiomegaly']+temp_df['lung_opacity']+temp_df['lung_lesion']+temp_df['edema']+temp_df['consolidation']+temp_df['pneumonia']+temp_df['atelectasis']+temp_df['pneumothorax']+temp_df['pleural_effusion']+temp_df['pleural_other']+temp_df['fracture']+temp_df['support_devices']

In [21]:
temp_df['no_of_labels'].value_counts()

# The result below shows how many disease(s) are associated with row

3.0    55933
2.0    54840
1.0    44464
4.0    33099
0.0    22659
5.0    10456
6.0     1743
7.0      201
8.0       19
Name: no_of_labels, dtype: int64

In [22]:
temp_df[temp_df.no_of_labels == 8.0]

# Below we see the peculiar cases of the 2 observations with 8 differnt disease case.

Unnamed: 0,path,no_finding,enlarged_cardiomediastinum,cardiomegaly,lung_opacity,lung_lesion,edema,consolidation,pneumonia,atelectasis,pneumothorax,pleural_effusion,pleural_other,fracture,support_devices,no_of_labels
21841,CheXpert-v1.0-small/train/patient05319/study6/...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,8.0
32510,CheXpert-v1.0-small/train/patient07931/study1/...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,8.0
62843,CheXpert-v1.0-small/train/patient15162/study11...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,8.0
70452,CheXpert-v1.0-small/train/patient16888/study1/...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,8.0
70453,CheXpert-v1.0-small/train/patient16888/study1/...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,8.0
77854,CheXpert-v1.0-small/train/patient18704/study1/...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,8.0
79364,CheXpert-v1.0-small/train/patient19056/study1/...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,8.0
79365,CheXpert-v1.0-small/train/patient19056/study1/...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,8.0
126702,CheXpert-v1.0-small/train/patient30351/study1/...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,8.0
130832,CheXpert-v1.0-small/train/patient31349/study19...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,8.0


We will then save a sample of the temp_df for further analysis whenever needed

In [23]:
temp_df.to_csv('CheXpert-v1.0-small/train_data.csv', index=False)