## Load Libraries

In [None]:
import seaborn as sns
import numpy as np
import kagglehub
from kagglehub import KaggleDatasetAdapter
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import urllib.request
import os

In [None]:
!pip install kagglehub
!pip install kagglehub[pandas-datasets]
!pip install wget
!pip install -q tensorflow tensorflow-addons

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
inflect 7.5.0 requires typeguard>=4.0.1, but you have typeguard 2.13.3 which is incompatible.[0m[31m
[0m

In [None]:
from google.colab import drive
drive.mount('/content/drive')

zip_path = '/content/drive/MyDrive/images_resized.zip'
extract_path = '/content/chest_xray_images'

!unzip -q "$zip_path" -d "$extract_path"

Mounted at /content/drive


In [None]:
# Sanity check - the extracted folder is expected to hold 38008 images
image_dir = os.path.join(extract_path, 'images_resized')

if not os.path.exists(image_dir):
    print(f"Directory 'images_resized' not found in '{extract_path}'")
else:
    # Count regular files only; sub-directories are not counted
    num_files = sum(
        1
        for name in os.listdir(image_dir)
        if os.path.isfile(os.path.join(image_dir, name))
    )
    print(f"Number of files in 'images_resized': {num_files}")

Number of files in 'images_resized': 38008


## Load Dataset

In [None]:
# Kaggle dataset handle.
dataset_name = "nih-chest-xrays/data"
# NOTE(review): `version` is not referenced by the loading code visible in this
# notebook section - confirm whether it is still needed.
version = 3

# CSV files inside the dataset: per-image metadata and bounding-box annotations.
file_path, file_path_bbox = "Data_Entry_2017.csv", "BBox_List_2017.csv"

# Base URL of the public Google Cloud Storage bucket holding the PNG images.
gcloud_url_base = 'https://storage.googleapis.com/gcs-public-data--healthcare-nih-chest-xray/png/'

In [None]:
# Load the two metadata tables straight into pandas DataFrames.
# `kagglehub.load_dataset` is deprecated and warns on every call;
# `kagglehub.dataset_load` is its replacement and takes the same arguments.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  dataset_name,
  file_path,
  # Additional arguments such as sql_query or pandas_kwargs can be passed
  # here. See the documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Bounding-box annotations from the same dataset.
df_box_list = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  dataset_name,
  file_path_bbox
)

  df = kagglehub.load_dataset(
  df_box_list = kagglehub.load_dataset(


In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112120 entries, 0 to 112119
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Image Index                  112120 non-null  object 
 1   Finding Labels               112120 non-null  object 
 2   Follow-up #                  112120 non-null  int64  
 3   Patient ID                   112120 non-null  int64  
 4   Patient Age                  112120 non-null  int64  
 5   Patient Gender               112120 non-null  object 
 6   View Position                112120 non-null  object 
 7   OriginalImage[Width          112120 non-null  int64  
 8   Height]                      112120 non-null  int64  
 9   OriginalImagePixelSpacing[x  112120 non-null  float64
 10  y]                           112120 non-null  float64
 11  Unnamed: 11                  0 non-null       float64
dtypes: float64(3), int64(5), object(4)
memory usage: 10.3+ MB


None

# Remove all where "View Position" column value is "AP"
AP stands for "anteroposterior", i.e. an X-ray taken from front to back. Training on a mix of front-to-back (AP) and back-to-front (PA) chest X-rays would affect the model, so only the PA images are kept.

In [None]:
# View-position counts before the AP rows are dropped
print(f"Before 'AP' removal: {df['View Position'].value_counts()}")

# Keep only the rows whose view position is not 'AP'
df = df.loc[df['View Position'].ne('AP')]

# View-position counts afterwards: 67,310 of the 112,120 rows (about 60%) remain
print(f"After 'AP' removal: {df['View Position'].value_counts()}")

Before 'AP' removal: View Position
PA    67310
AP    44810
Name: count, dtype: int64
After 'AP' removal: View Position
PA    67310
Name: count, dtype: int64


In [None]:
# Box "shared/static" URLs of twelve .gz archives.
# NOTE(review): presumably one archive per images_001..images_012 folder of the
# NIH chest X-ray dataset - confirm. The list is not referenced by any other
# cell in this part of the notebook.
links = [
    "https://nihcc.box.com/shared/static/vfk49d74nhbxq3nqjg0900w5nvkorp5c.gz",
    "https://nihcc.box.com/shared/static/i28rlmbvmfjbl8p2n3ril0pptcmcu9d1.gz",
    "https://nihcc.box.com/shared/static/f1t00wrtdk94satdfb9olcolqx20z2jp.gz",
    "https://nihcc.box.com/shared/static/0aowwzs5lhjrceb3qp67ahp0rd1l1etg.gz",
    "https://nihcc.box.com/shared/static/v5e3goj22zr6h8tzualxfsqlqaygfbsn.gz",
    "https://nihcc.box.com/shared/static/asi7ikud9jwnkrnkj99jnpfkjdes7l6l.gz",
    "https://nihcc.box.com/shared/static/jn1b4mw4n6lnh74ovmcjb8y48h8xj07n.gz",
    "https://nihcc.box.com/shared/static/tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j.gz",
    "https://nihcc.box.com/shared/static/upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj.gz",
    "https://nihcc.box.com/shared/static/l6nilvfa9cg3s28tqv1qc1olm3gnz54p.gz",
    "https://nihcc.box.com/shared/static/hhq8fkdgvcari67vfhs7ppg2w6ni4jze.gz",
    "https://nihcc.box.com/shared/static/ioqwiy20ihqwyr8pf4c24eazhh281pbu.gz",
]

In [None]:
# Inclusive, 0-based row-index range covered by each image folder.
# NOTE(review): presumably these are row positions in the unfiltered
# Data_Entry_2017.csv (112,120 rows, index 0..112119) - confirm.
folder_ranges = {
    "images_001": (0, 4998),
    "images_002": (4999, 14998),
    "images_003": (14999, 24998),
    "images_004": (24999, 34998),
    "images_005": (34999, 44998),
    "images_006": (44999, 54998),
    "images_007": (54999, 64998),
    "images_008": (64999, 74998),
    "images_009": (74999, 84998),
    "images_010": (84999, 94998),
    "images_011": (94999, 104998),
    "images_012": (104999, 112119)  # last valid row index, not the row count
}

def get_image_folder(df, image_name):
    """Return the folder whose index range contains the row of `image_name`.

    The lookup compares the *index label* of the first row whose
    "Image Index" equals `image_name` against `folder_ranges`. `df` must
    therefore still carry the row index the ranges refer to. A frame whose
    index was rebuilt (e.g. `pd.concat(..., ignore_index=True)` or
    `reset_index`) gives wrong folders.

    Args:
        df: DataFrame with an "Image Index" column.
        image_name: File name to look up, e.g. "00000001_000.png".

    Returns:
        The folder name (a key of `folder_ranges`), or None when the image is
        not in `df` or its index falls outside every range.
    """
    # One pass over the column instead of a membership test plus a second filter.
    matches = df.index[df["Image Index"] == image_name]
    if len(matches) == 0:
        return None

    image_index = matches[0]  # first matching row wins
    for folder, (start, end) in folder_ranges.items():
        if start <= image_index <= end:
            return folder

    return None  # index not covered by any folder range

In [None]:
# Sanity check: first rows, last rows and column names of the filtered frame
display(df.head(), df.tail(), df.columns)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168,
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,
112119,00030805_000.png,No Finding,0,30805,27,M,PA,2048,2500,0.171,0.171,


Index(['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID',
       'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width',
       'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Unnamed: 11'],
      dtype='object')

# We want to have 7 generalized classes from the original 15
Take values from "Finding Labels" and convert them into more generalized labels

In [None]:
# Collect every distinct label found in the pipe-separated 'Finding Labels'
# column. Splitting the column once avoids a slow row-by-row iterrows() loop,
# and sorting makes the printed order reproducible (a bare set has none).
all_labels = sorted(
    {label for labels in df['Finding Labels'].str.split('|') for label in labels}
)
print(f"All possible options in 'Finding Labels': {all_labels}")

All possible options in 'Finding Labels': ['Pleural_Thickening', 'Emphysema', 'Mass', 'Cardiomegaly', 'Infiltration', 'Pneumothorax', 'No Finding', 'Pneumonia', 'Fibrosis', 'Edema', 'Atelectasis', 'Consolidation', 'Hernia', 'Effusion', 'Nodule']


In [None]:
def generalize_labels(label):
    """Map one original finding label to its generalized class.

    Labels that belong to no group (such as 'No Finding') are returned
    unchanged.
    """
    groups = (
        (('Pneumonia', 'Consolidation', 'Infiltration'), 'Infection/Infiltration'),
        (('Edema', 'Effusion', 'Pleural_Thickening'), 'Fluid Related Issues'),
        (('Atelectasis', 'Pneumothorax', 'Fibrosis', 'Emphysema'), 'Lung Structure Issues'),
        (('Nodule', 'Mass'), 'Nodule/Mass'),
        (('Cardiomegaly',), 'Cardiac Issues'),
        (('Hernia',), 'Hernia'),
    )
    for members, generalized in groups:
        if label in members:
            return generalized
    # No group matched, e.g. 'No Finding': keep the label as it is
    return label


# Replace every original label with its generalized class. Several original
# labels share one class (e.g. 'Pneumonia' and 'Infiltration'), so duplicates
# are dropped while keeping first-seen order. Otherwise a row could read
# 'Infection/Infiltration|Infection/Infiltration'.
df['Finding Labels'] = df['Finding Labels'].apply(
    lambda x: '|'.join(dict.fromkeys(generalize_labels(label) for label in x.split('|')))
)

# View the updated DataFrame
display(df.head())

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiac Issues,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiac Issues|Lung Structure Issues,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiac Issues|Fluid Related Issues,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [None]:
# Summary statistics of the numeric columns
display(df.describe())
# info() prints its own report and returns None; passing it to display() would
# only add a stray "None" to the output.
df.info()

Unnamed: 0,Follow-up #,Patient ID,Patient Age,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
count,67310.0,67310.0,67310.0,67310.0,67310.0,67310.0,67310.0,0.0
mean,4.786317,14396.542802,47.352979,2632.590016,2652.208468,0.153868,0.153868,
std,9.403191,8559.885944,16.28955,374.573816,396.607849,0.017179,0.017179,
min,0.0,1.0,1.0,1143.0,1001.0,0.115,0.115,
25%,0.0,7157.25,36.0,2500.0,2411.0,0.143,0.143,
50%,1.0,14112.0,49.0,2678.0,2885.0,0.143,0.143,
75%,5.0,21117.75,59.0,2992.0,2991.0,0.168,0.168,
max,156.0,30805.0,412.0,3056.0,3056.0,0.194336,0.194336,


<class 'pandas.core.frame.DataFrame'>
Index: 67310 entries, 0 to 112119
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Image Index                  67310 non-null  object 
 1   Finding Labels               67310 non-null  object 
 2   Follow-up #                  67310 non-null  int64  
 3   Patient ID                   67310 non-null  int64  
 4   Patient Age                  67310 non-null  int64  
 5   Patient Gender               67310 non-null  object 
 6   View Position                67310 non-null  object 
 7   OriginalImage[Width          67310 non-null  int64  
 8   Height]                      67310 non-null  int64  
 9   OriginalImagePixelSpacing[x  67310 non-null  float64
 10  y]                           67310 non-null  float64
 11  Unnamed: 11                  0 non-null      float64
dtypes: float64(3), int64(5), object(4)
memory usage: 6.7+ MB


None

In [None]:
# Summary of the label column, followed by a preview of the whole frame.
# Only the last expression of a cell is rendered, so a bare `.head()` in the
# middle of the cell produced no output and is left out.
df['Finding Labels'].info()
df.head()

<class 'pandas.core.series.Series'>
Index: 67310 entries, 0 to 112119
Series name: Finding Labels
Non-Null Count  Dtype 
--------------  ----- 
67310 non-null  object
dtypes: object(1)
memory usage: 1.0+ MB


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiac Issues,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiac Issues|Lung Structure Issues,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiac Issues|Fluid Related Issues,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


# Pre-processing - Column Clean Up and Encoding

In [None]:
# Cap the 'No Finding' class at 10,000 rows; every other row is kept.
is_no_finding = df['Finding Labels'].eq('No Finding')
no_finding_df = df.loc[is_no_finding]
other_findings_df = df.loc[~is_no_finding]

# Sample only when the class exceeds the cap (fixed seed for reproducibility)
if len(no_finding_df) > 10000:
    no_finding_df = no_finding_df.sample(n=10000, random_state=42)

# 'No Finding' rows come first, then the rest; the index is renumbered from 0
df = pd.concat([no_finding_df, other_findings_df], ignore_index=True)

In [None]:
# Sanity check: every image referenced in df must exist in the extracted folder.
# Reuses `image_dir` (built from `extract_path` in the earlier file-count check)
# instead of hard-coding the same path a second time.
image_files = set(os.listdir(image_dir))
missing_files = df[~df['Image Index'].isin(image_files)]
print("Number of missing images:", len(missing_files))


Number of missing images: 0


In [None]:
# Give the awkwardly split CSV headers readable names, then discard the empty
# trailing 'Unnamed: 11' column (ignored when it is already gone).
df = (
    df
    .rename(columns={
        "OriginalImage[Width": "width",
        "Height]": "height",
        "OriginalImagePixelSpacing[x": "pixel_spacing x",
        "y]": "pixel_spacing y",
    })
    .drop(columns=['Unnamed: 11'], errors='ignore')
)

display(df.head())

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,width,height,pixel_spacing x,pixel_spacing y
0,00019856_000.png,No Finding,0,19856,57,M,PA,2992,2991,0.143,0.143
1,00001020_000.png,No Finding,0,1020,52,M,PA,2500,2048,0.171,0.171
2,00008187_001.png,No Finding,1,8187,59,M,PA,2500,2048,0.168,0.168
3,00003360_003.png,No Finding,3,3360,8,M,PA,2048,2500,0.168,0.168
4,00014364_000.png,No Finding,0,14364,26,F,PA,2454,2991,0.143,0.143


In [None]:
# Alphabetically sorted list of every distinct class name in 'Finding Labels'
all_labels = sorted(df['Finding Labels'].str.split('|').explode().unique())
display(all_labels)

# Encode multi-labels
def encode_multilabel(labels, classes=None):
    """Multi-hot encode a pipe-separated label string.

    Args:
        labels: Labels joined by '|', e.g. 'Cardiac Issues|Hernia'.
        classes: Ordered class names defining the output positions. Defaults
            to the module-level `all_labels`, looked up at call time.

    Returns:
        A list of 0/1 ints, one per entry of `classes`, with 1 where that
        class occurs in `labels`.
    """
    if classes is None:
        classes = all_labels
    present = set(labels.split('|'))  # real set: O(1) membership tests
    return [1 if label in present else 0 for label in classes]

# One multi-hot vector per row, stored next to the label string
df['encoded_labels'] = df['Finding Labels'].map(encode_multilabel)
display(df.head())

# Stack the per-row vectors into a 2-D array (rows x classes)
y = np.array(df['encoded_labels'].to_list())

['Cardiac Issues',
 'Fluid Related Issues',
 'Hernia',
 'Infection/Infiltration',
 'Lung Structure Issues',
 'No Finding',
 'Nodule/Mass']

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,width,height,pixel_spacing x,pixel_spacing y,encoded_labels
0,00019856_000.png,No Finding,0,19856,57,M,PA,2992,2991,0.143,0.143,"[0, 0, 0, 0, 0, 1, 0]"
1,00001020_000.png,No Finding,0,1020,52,M,PA,2500,2048,0.171,0.171,"[0, 0, 0, 0, 0, 1, 0]"
2,00008187_001.png,No Finding,1,8187,59,M,PA,2500,2048,0.168,0.168,"[0, 0, 0, 0, 0, 1, 0]"
3,00003360_003.png,No Finding,3,3360,8,M,PA,2048,2500,0.168,0.168,"[0, 0, 0, 0, 0, 1, 0]"
4,00014364_000.png,No Finding,0,14364,26,F,PA,2454,2991,0.143,0.143,"[0, 0, 0, 0, 0, 1, 0]"
