<a href="https://colab.research.google.com/github/AdiVM/Neuro240/blob/main/Neuro240MidtermCheckpoint.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import os
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive', force_remount=True)


path = "/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240"
print("Contents of the directory:", os.listdir(path))

Mounted at /content/drive
Contents of the directory: ['images', 'Data_Entry_2017_v2020.csv']


In [25]:
print("Contents of the directory:", os.listdir(path))

Contents of the directory: ['images', 'Data_Entry_2017_v2020.csv']


In [26]:
import pandas as pd

metadata_path = "/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/Data_Entry_2017_v2020.csv"
image_folder = "/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/images"

metadata = pd.read_csv(metadata_path)

print("Metdata loaded")

# Filtering the metadata to find images labeled either no finding or those containing the word mass
filtered_metadata = metadata[
    (metadata["Finding Labels"] == "No Finding") |
    (metadata["Finding Labels"].str.contains("Mass", na=False))
]

filtered_image_indexes = set(filtered_metadata["Image Index"])

print("Getting list of all images")
# Load only the first 100 images just to see that things are working
subset_size = 100
available_images = os.listdir(image_folder)[:subset_size]

# Match those images with the indexes we pulled above
matching_images = filtered_image_indexes.intersection(available_images)

# Convert to stored list
matching_images = sorted(list(matching_images))

print(f"Total matching images found: {len(matching_images)}")

Metdata loaded
Getting list of all images
Total matching images found: 57


In [27]:
# Now to perform stratified shuffle split
from sklearn.model_selection import train_test_split
import pandas as pd

 # Check class distribution before splitting
print(filtered_metadata["Finding Labels"].value_counts())

# There are many small classes of mass, so need to group them all together before splitting
# Standardize labels: Convert anything containing "Mass" to just "Mass"
filtered_metadata["Finding Labels"] = filtered_metadata["Finding Labels"].apply(
    lambda x: "Mass" if "Mass" in x else x
)

# Verify new label counts
print(filtered_metadata["Finding Labels"].value_counts())


Finding Labels
No Finding                                                           60361
Mass                                                                  2139
Infiltration|Mass                                                      420
Effusion|Mass                                                          402
Mass|Nodule                                                            394
                                                                     ...  
Cardiomegaly|Mass|Nodule                                                 1
Cardiomegaly|Effusion|Fibrosis|Mass                                      1
Consolidation|Effusion|Mass|Nodule|Pleural_Thickening|Atelectasis        1
Effusion|Mass|Pneumonia|Pneumothorax                                     1
Atelectasis|Consolidation|Mass|Pleural_Thickening|Pneumothorax           1
Name: count, Length: 295, dtype: int64
Finding Labels
No Finding    60361
Mass           5782
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_metadata["Finding Labels"] = filtered_metadata["Finding Labels"].apply(


In [28]:
# Split the data while ensuring proportional distribution of classes
train_metadata, test_metadata = train_test_split(
    filtered_metadata,
    test_size=0.2,
    stratify=filtered_metadata["Finding Labels"],
    random_state=42
)

# Class distribution in train and test sets
print("Training set:")
print(train_metadata["Finding Labels"].value_counts())

print("Testing Set:")
print(test_metadata["Finding Labels"].value_counts())

Training set:
Finding Labels
No Finding    48288
Mass           4626
Name: count, dtype: int64
Testing Set:
Finding Labels
No Finding    12073
Mass           1156
Name: count, dtype: int64


In [29]:
print(f"Train metadata entries: {len(train_metadata)}")
print(f"Test metadata entries: {len(test_metadata)}")

Train metadata entries: 52914
Test metadata entries: 13229


In [30]:
# Filtering metdata
# Convert "Image Index" column to a set for fast lookup
train_image_files = set(train_metadata["Image Index"])
test_image_files = set(test_metadata["Image Index"])

# Get the list of all available images in the extracted folder
available_images = set(os.listdir(image_folder))

# Keep only images that exist in the extracted folder
train_images = sorted(list(train_image_files.intersection(available_images)))[:80]
test_images = sorted(list(test_image_files.intersection(available_images) - set(train_images)))[:20]


print(f"Total train images found: {len(train_images)}")
print(f"Total test images found: {len(test_images)}")

# Print a few samples
print("Sample train images:", train_images[:10])
print("Sample test images:", test_images[:10])

Total train images found: 80
Total test images found: 20
Sample train images: ['00000002_000.png', '00000004_000.png', '00000005_000.png', '00000005_002.png', '00000005_003.png', '00000005_005.png', '00000006_000.png', '00000007_000.png', '00000008_001.png', '00000011_002.png']
Sample test images: ['00000005_001.png', '00000005_004.png', '00000011_001.png', '00000011_003.png', '00000013_000.png', '00000013_016.png', '00000013_017.png', '00000013_019.png', '00000013_023.png', '00000013_024.png']


In [31]:
# Will use TensorFlow for model training
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import pandas as pd

In [35]:
# Image preprocessing parameters
image_size = (224, 224)  # Resize images
batch_size = 32

# Data augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,  # Normalize pixel values
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

# Only rescale for testing
test_datagen = ImageDataGenerator(rescale=1.0 / 255)

train_metadata_filtered = train_metadata[train_metadata["Image Index"].isin(train_images)]
test_metadata_filtered = test_metadata[test_metadata["Image Index"].isin(test_images)]

# Load train images from directory
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_metadata_filtered,  # Use filtered metadata
    directory=image_folder,
    x_col="Image Index",
    y_col="Finding Labels",
    target_size=image_size,
    batch_size=batch_size,
    class_mode="binary"
)

# Load test images
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_metadata_filtered,  # Use filtered metadata
    directory=image_folder,
    x_col="Image Index",
    y_col="Finding Labels",
    target_size=image_size,
    batch_size=batch_size,
    class_mode="binary"
)

Found 80 validated image filenames belonging to 2 classes.
Found 20 validated image filenames belonging to 2 classes.


In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Adam Optimizer
model = Sequential([
    Conv2D(32, (3, 3), activation="relu", input_shape=(224, 224, 3)),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(128, (3, 3), activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")  # Binary classification
])

# Compile model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Print model summary
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [37]:
# Train the model
history = model.fit(
    train_generator,
    validation_data=test_generator,
    epochs=5,
    verbose=1
)

  self._warn_if_super_not_called()


Epoch 1/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 13s/step - accuracy: 0.8370 - loss: 2.2045 - val_accuracy: 0.8000 - val_loss: 1.3619
Epoch 2/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3s/step - accuracy: 0.6870 - loss: 1.0255 - val_accuracy: 0.8000 - val_loss: 0.6698
Epoch 3/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3s/step - accuracy: 0.6984 - loss: 0.6464 - val_accuracy: 0.8000 - val_loss: 0.5095
Epoch 4/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.8586 - loss: 0.4744 - val_accuracy: 0.8000 - val_loss: 0.5282
Epoch 5/5
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3s/step - accuracy: 0.8573 - loss: 0.4505 - val_accuracy: 0.8000 - val_loss: 0.4669


In [38]:
# Evaluate on test set
test_loss, test_acc = model.evaluate(test_generator)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.8000 - loss: 0.4669   
Test Accuracy: 80.00%
