<a href="https://colab.research.google.com/github/AdiVM/Neuro240/blob/main/Neuro240FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
# Session entry point: every later cell relies on these imports and on the Drive mount.
import os

import pandas as pd
from google.colab import drive

# Attach Google Drive at /content/drive, forcing a remount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [79]:
# Locations of the metadata CSV and of the image directory on the mounted Drive.
metadata_path = "/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/Data_Entry_2017_v2020.csv"
image_folder = "/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/images"

metadata = pd.read_csv(metadata_path)

print("Metdata loaded")

# Keep rows labelled exactly "No Finding" plus every row whose label mentions "Mass".
filtered_metadata = metadata[
    (metadata["Finding Labels"] == "No Finding") |
    (metadata["Finding Labels"].str.contains("Mass", na=False))
]

# Image names of every matching row, captured before the row cap below.
filtered_image_indexes = set(filtered_metadata["Image Index"])

# Cap at the first 50,000 matching rows. The explicit copy makes this an independent
# frame, so later cells can assign columns without pandas' SettingWithCopyWarning.
filtered_metadata = filtered_metadata.head(50000).copy()

# Sorted list of the retained image file names (sorting once is sufficient).
matching_images = sorted(filtered_metadata["Image Index"])

print(f"Total matching images found: {len(matching_images)}")

Metdata loaded
Total matching images found: 50000


In [80]:
# Now to perform stratified shuffle split
from sklearn.model_selection import train_test_split
import pandas as pd

# Check class distribution before splitting
print(filtered_metadata["Finding Labels"].value_counts())

# Many multi-label combinations that include "Mass" have only a handful of rows, so
# collapse every label containing the substring "Mass" into the single class "Mass".
# `assign` builds a new frame instead of writing into a possible slice of `metadata`.
mentions_mass = filtered_metadata["Finding Labels"].str.contains("Mass", na=False, regex=False)
filtered_metadata = filtered_metadata.assign(
    **{"Finding Labels": filtered_metadata["Finding Labels"].mask(mentions_mass, "Mass")}
)

# Verify new label counts
print(filtered_metadata["Finding Labels"].value_counts())


Finding Labels
No Finding                                                                          45542
Mass                                                                                 1708
Infiltration|Mass                                                                     329
Mass|Nodule                                                                           293
Effusion|Mass                                                                         285
                                                                                    ...  
Effusion|Emphysema|Mass|Nodule                                                          1
Atelectasis|Consolidation|Effusion|Fibrosis|Infiltration|Mass|Pleural_Thickening        1
Atelectasis|Consolidation|Effusion|Fibrosis|Infiltration|Mass                           1
Cardiomegaly|Consolidation|Effusion|Infiltration|Mass|Nodule                            1
Edema|Fibrosis|Infiltration|Mass                                                     

In [81]:
# Integer encoding used by the "Label" column: 0 = "No Finding", 1 = "Mass".
label_map = {"No Finding": 0, "Mass": 1}
filtered_metadata = filtered_metadata.assign(
    Label=filtered_metadata["Finding Labels"].map(label_map)
)

# `map` yields NaN for any label missing from `label_map`; fail here rather than
# letting NaN labels reach the stratified split.
assert filtered_metadata["Label"].notna().all(), "Unmapped values in 'Finding Labels'"

In [82]:
# Row count per integer label.
print(filtered_metadata.loc[:, "Label"].value_counts())

Label
0    45542
1     4458
Name: count, dtype: int64


In [83]:
# 80/20 split; stratifying on "Label" keeps the class ratio the same in both parts.
# NOTE(review): file names sharing a prefix (e.g. 00000005_*) end up in both splits; if the
# prefix identifies a patient this is patient-level leakage — confirm and consider a grouped split.
train_metadata, test_metadata = train_test_split(
    filtered_metadata,
    stratify=filtered_metadata["Label"],
    test_size=0.2,
    random_state=42,
)

# Per-class row counts of each split.
print("Training set:", train_metadata["Label"].value_counts(), sep="\n")
print("Testing Set:", test_metadata["Label"].value_counts(), sep="\n")

Training set:
Label
0    36434
1     3566
Name: count, dtype: int64
Testing Set:
Label
0    9108
1     892
Name: count, dtype: int64


In [84]:
# Number of metadata rows in each split.
print(f"Train metadata entries: {train_metadata.shape[0]}")
print(f"Test metadata entries: {test_metadata.shape[0]}")

Train metadata entries: 40000
Test metadata entries: 10000


In [85]:
# Unique image file names referenced by each split.
train_image_files = set(train_metadata["Image Index"])
test_image_files = set(test_metadata["Image Index"])

# The same file in both splits would leak test images into training.
assert train_image_files.isdisjoint(test_image_files), "Train and test splits share image files"

# Sorted lists for a consistent order (one sort each is enough).
train_images = sorted(train_image_files)
test_images = sorted(test_image_files)


print(f"Total train images found: {len(train_images)}")
print(f"Total test images found: {len(test_images)}")

# Print a few samples
print("Sample train images:", train_images[:10])
print("Sample test images:", test_images[:10])

Total train images found: 40000
Total test images found: 10000
Sample train images: ['00000002_000.png', '00000004_000.png', '00000005_000.png', '00000005_002.png', '00000005_003.png', '00000005_004.png', '00000005_005.png', '00000006_000.png', '00000007_000.png', '00000011_002.png']
Sample test images: ['00000005_001.png', '00000008_001.png', '00000011_001.png', '00000011_003.png', '00000013_000.png', '00000013_017.png', '00000013_029.png', '00000013_030.png', '00000018_000.png', '00000032_049.png']


In [86]:
# Will use TensorFlow for model training
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import pandas as pd

In [87]:
# Image preprocessing parameters
image_size = (224, 224)  # Images are resized to this (height, width)
batch_size = 32

# Training images: multiply pixel values by 1/255 and apply light random augmentation.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

# Test images: rescale only, no augmentation.
test_datagen = ImageDataGenerator(rescale=1.0 / 255)

# class_mode="raw" passes the integer "Label" column through as the target.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_metadata,
    directory=image_folder,
    x_col="Image Index",
    y_col="Label",
    target_size=image_size,
    batch_size=batch_size,
    class_mode="raw"
)

# shuffle=False keeps batches in `test_metadata` row order, so the output of
# `model.predict(test_generator)` lines up with the labels (Keras shuffles by default).
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_metadata,
    directory=image_folder,
    x_col="Image Index",
    y_col="Label",
    target_size=image_size,
    batch_size=batch_size,
    class_mode="raw",
    shuffle=False
)

Found 40000 validated image filenames.
Found 10000 validated image filenames.


In [88]:
# Per-class row counts of both splits, shown once more before training.
print("Train distribution:", train_metadata["Label"].value_counts(), sep="\n")
print("\nTest distribution:", test_metadata["Label"].value_counts(), sep="\n")

Train distribution:
Label
0    36434
1     3566
Name: count, dtype: int64

Test distribution:
Label
0    9108
1     892
Name: count, dtype: int64


In [11]:
def train_model_with_subset(train_metadata, test_metadata, image_folder,
                            train_size,
                            image_size=(224, 224), batch_size=32, epochs=5,
                            output_dir="/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/results"):
    """Train the baseline CNN on a random subset of the training split and save its results.

    Args:
        train_metadata: DataFrame with "Image Index" (file name) and "Finding Labels"
            ("Mass" / "No Finding") columns; ``train_size`` rows are sampled from it.
        test_metadata: DataFrame with the same columns; ``int(train_size / 4)`` rows are
            sampled and used both as validation data and for the final evaluation.
        image_folder: Directory containing the files named in "Image Index".
        train_size: Number of training rows to sample.
        image_size: (height, width) images are resized to; also sets the model input shape.
        batch_size: Batch size of both generators.
        epochs: Number of training epochs.
        output_dir: Directory (created if missing) receiving the class-distribution CSVs and
            plot, confusion matrix, per-image predictions and ROC curve, all prefixed
            with ``train_{train_size}``.

    Returns:
        Tuple ``(model, history, test_acc, auc)``.
    """
    import os
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
    from tensorflow.keras.preprocessing.image import ImageDataGenerator

    # Integer label = position in this list (Mass -> 0, No Finding -> 1). Passing it to the
    # generators pins the mapping instead of relying on which labels happen to be sampled.
    class_names = ["Mass", "No Finding"]

    # Sample training and test subsets (fixed seed for repeatability)
    train_metadata_subset = train_metadata.sample(train_size, random_state=42)
    test_metadata_subset = test_metadata.sample(int(train_size / 4), random_state=42)

    os.makedirs(output_dir, exist_ok=True)
    result_prefix = os.path.join(output_dir, f"train_{train_size}")

    def save_distribution(df, name):
        """Write count and percentage per label to CSV and return them as a DataFrame."""
        counts = df["Finding Labels"].value_counts()
        percents = counts / counts.sum() * 100
        dist_df = pd.DataFrame({"Count": counts, "Percent": percents})
        dist_df.to_csv(f"{result_prefix}_{name}_distribution.csv")
        return dist_df

    train_dist = save_distribution(train_metadata_subset, "train")
    test_dist = save_distribution(test_metadata_subset, "test")

    # Side-by-side class percentages. Naming the index before reset_index gives the
    # "Label" column directly, without mutating `columns.values` in place.
    combined_df = pd.concat([
        train_dist["Percent"].rename("Train"),
        test_dist["Percent"].rename("Test")
    ], axis=1).fillna(0).rename_axis("Label").reset_index()
    combined_df = pd.melt(combined_df, id_vars="Label", var_name="Set", value_name="Percent")

    plt.figure(figsize=(6, 4))
    sns.barplot(data=combined_df, x="Label", y="Percent", hue="Set")
    plt.title("Class Distribution: Train vs Test")
    plt.ylabel("Percentage")
    plt.xlabel("Class Label")
    plt.tight_layout()
    plt.savefig(f"{result_prefix}_class_distribution_comparison.png")
    plt.close()

    # Augmentation for training, rescale only for testing
    train_datagen = ImageDataGenerator(
        rescale=1.0 / 255,
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True
    )
    test_datagen = ImageDataGenerator(rescale=1.0 / 255)

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_metadata_subset,
        directory=image_folder,
        x_col="Image Index",
        y_col="Finding Labels",
        classes=class_names,
        target_size=image_size,
        batch_size=batch_size,
        class_mode="binary",
        shuffle=True
    )
    # shuffle=False keeps predictions aligned with `test_generator.classes` / `.filenames`.
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_metadata_subset,
        directory=image_folder,
        x_col="Image Index",
        y_col="Finding Labels",
        classes=class_names,
        target_size=image_size,
        batch_size=batch_size,
        class_mode="binary",
        shuffle=False
    )

    # CNN model; the input shape follows `image_size` rather than a fixed 224x224.
    model = Sequential([
        Conv2D(32, (3, 3), activation="relu", input_shape=(*image_size, 3)),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(128, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="sigmoid")
    ])

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    history = model.fit(
        train_generator,
        validation_data=test_generator,
        epochs=epochs,
        verbose=1
    )

    test_loss, test_acc = model.evaluate(test_generator)
    # The sigmoid output is the probability of class index 1 ("No Finding").
    y_pred_proba = model.predict(test_generator).flatten()
    y_pred = (y_pred_proba > 0.5).astype(int)
    y_true = test_generator.classes
    auc = roc_auc_score(y_true, y_pred_proba)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is never seen or predicted.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Save confusion matrix (rows = true class, columns = predicted class)
    confusion_df = pd.DataFrame(
        cm,
        index=class_names,
        columns=[f"Predicted {class_name}" for class_name in class_names])
    confusion_df.to_csv(f"{result_prefix}_confusion_matrix.csv")

    # Save predictions
    filenames = test_generator.filenames
    results_df = pd.DataFrame({
        "Filename": filenames,
        "TrueLabel": y_true,
        "PredictedLabel": y_pred,
        "PredictedProb": y_pred_proba
    })
    results_df.to_csv(f"{result_prefix}_predictions.csv", index=False)

    # Save ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.savefig(f"{result_prefix}_roc_curve.png")
    plt.close()

    print(f"Test Accuracy (train size={train_size}): {test_acc * 100:.2f}%")
    print(f"AUC: {auc:.4f}")

    return model, history, test_acc, auc

In [12]:
# Baseline CNN on a 500-row random training subset.
model, history, acc, auc = train_model_with_subset(
    train_metadata, test_metadata, image_folder, train_size=500
)

In [13]:
# Baseline CNN on a 250-row random training subset.
model_250, history_250, acc_250, auc_250 = train_model_with_subset(
    train_metadata, test_metadata, image_folder, train_size=250
)

In [14]:
# Baseline CNN on a 1000-row random training subset.
model_1000, history_1000, acc_1000, auc_1000 = train_model_with_subset(
    train_metadata, test_metadata, image_folder, train_size=1000
)

In [15]:
# NOTE(review): `train_model_with_noise` is defined further down in this notebook, so on a
# fresh top-to-bottom run this cell would raise NameError unless that cell was run first.
# NOTE(review): the target names say 250 but train_size is 500, and they reuse the names
# assigned by the `train_model_with_subset(train_size=250)` cell above, overwriting those
# results — confirm which run these variables are meant to hold.
model_250, history_250, acc_250, auc_250 = train_model_with_noise(
    train_metadata=train_metadata,
    test_metadata=test_metadata,
    image_folder=image_folder,
    train_size=500
)

In [90]:
def train_model_with_class_weighting(train_metadata, test_metadata, image_folder,
                            train_size,
                            image_size=(224, 224), batch_size=32, epochs=5,
                            output_dir="/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/results_class_weight"):
    """Train the CNN on a class-balanced subset with class weights and a tuned threshold.

    The training subset holds ``train_size // 2`` "Mass" rows and the same number of
    "No Finding" rows, so the "balanced" class weights come out equal; they are still
    computed and passed to ``fit``. The test subset is a plain random sample.

    Args:
        train_metadata: DataFrame with "Image Index" and "Finding Labels"
            ("Mass" / "No Finding") columns; must hold at least ``train_size // 2``
            rows of each class.
        test_metadata: DataFrame with the same columns; ``int(train_size / 4)`` rows are
            sampled and used for validation and for the final evaluation.
        image_folder: Directory containing the files named in "Image Index".
        train_size: Total number of training rows (split evenly between the classes).
        image_size: (height, width) images are resized to; also sets the model input shape.
        batch_size: Batch size of both generators.
        epochs: Number of training epochs.
        output_dir: Directory (created if missing) receiving the CSV/PNG/TXT artefacts,
            all prefixed with ``train_{train_size}``.

    Returns:
        Tuple ``(model, history, test_acc, auc)``.
    """
    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, roc_curve
    from sklearn.utils.class_weight import compute_class_weight
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
    from tensorflow.keras.preprocessing.image import ImageDataGenerator

    # Integer label = position in this list (Mass -> 0, No Finding -> 1). Passing it to the
    # generators pins the mapping, so weights, metrics and CSV headers all agree with it.
    class_names = ["Mass", "No Finding"]
    mass_index = class_names.index("Mass")

    # Balanced training subset: the same number of rows from each class, then shuffled
    half_size = train_size // 2
    finding_labels = train_metadata["Finding Labels"]
    mass_subset = train_metadata[finding_labels == "Mass"].sample(half_size, random_state=42)
    no_finding_subset = train_metadata[finding_labels == "No Finding"].sample(half_size, random_state=42)
    train_metadata_subset = pd.concat([mass_subset, no_finding_subset]).sample(frac=1, random_state=42)
    test_metadata_subset = test_metadata.sample(int(train_size / 4), random_state=42)

    os.makedirs(output_dir, exist_ok=True)
    result_prefix = os.path.join(output_dir, f"train_{train_size}")

    def save_distribution(df, name):
        """Write count and percentage per label to CSV and return them as a DataFrame."""
        counts = df["Finding Labels"].value_counts()
        percents = counts / counts.sum() * 100
        dist_df = pd.DataFrame({"Count": counts, "Percent": percents})
        dist_df.to_csv(f"{result_prefix}_{name}_distribution.csv")
        return dist_df

    train_dist = save_distribution(train_metadata_subset, "train")
    test_dist = save_distribution(test_metadata_subset, "test")

    # Side-by-side class percentages. Naming the index before reset_index gives the
    # "Label" column directly, without mutating `columns.values` in place.
    combined_df = pd.concat([
        train_dist["Percent"].rename("Train"),
        test_dist["Percent"].rename("Test")
    ], axis=1).fillna(0).rename_axis("Label").reset_index()
    combined_df = pd.melt(combined_df, id_vars="Label", var_name="Set", value_name="Percent")

    plt.figure(figsize=(6, 4))
    sns.barplot(data=combined_df, x="Label", y="Percent", hue="Set")
    plt.title("Class Distribution: Train vs Test")
    plt.ylabel("Percentage")
    plt.xlabel("Class Label")
    plt.tight_layout()
    plt.savefig(f"{result_prefix}_class_distribution_comparison.png")
    plt.close()

    # Augmentation for training (no horizontal flip here), rescale only for testing
    train_datagen = ImageDataGenerator(
        rescale=1.0 / 255,
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=False
    )
    test_datagen = ImageDataGenerator(rescale=1.0 / 255)

    # shuffle=True so batches are re-drawn each epoch, as in the other training functions.
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_metadata_subset,
        directory=image_folder,
        x_col="Image Index",
        y_col="Finding Labels",
        classes=class_names,
        target_size=image_size,
        batch_size=batch_size,
        class_mode="binary",
        shuffle=True
    )
    # shuffle=False keeps predictions aligned with `test_generator.classes` / `.filenames`.
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_metadata_subset,
        directory=image_folder,
        x_col="Image Index",
        y_col="Finding Labels",
        classes=class_names,
        target_size=image_size,
        batch_size=batch_size,
        class_mode="binary",
        shuffle=False
    )

    # CNN model; the input shape follows `image_size` rather than a fixed 224x224.
    model = Sequential([
        Conv2D(32, (3, 3), activation="relu", input_shape=(*image_size, 3)),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(128, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="sigmoid")
    ])

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # Compute the weights from the labels the generator actually feeds to the model, so
    # the dict keys use the same integer encoding as the training targets.
    weights = compute_class_weight(
        class_weight="balanced",
        classes=np.array([0, 1]),
        y=np.asarray(train_generator.classes)
    )
    class_weight_dict = {0: float(weights[0]), 1: float(weights[1])}

    print("Class weights:", class_weight_dict)

    history = model.fit(
        train_generator,
        validation_data=test_generator,
        epochs=epochs,
        verbose=1,
        class_weight=class_weight_dict
    )

    test_loss, test_acc = model.evaluate(test_generator)
    # The sigmoid output is the probability of class index 1 ("No Finding").
    y_pred_proba = model.predict(test_generator).flatten()
    y_true = test_generator.classes

    # Sweep thresholds on P("No Finding") and keep the one with the best F1 for the
    # "Mass" class (the minority class of interest), not for the majority class.
    thresholds = np.linspace(0.1, 0.9, 9)
    best_f1 = 0
    best_threshold = 0.5  # Baseline
    best_preds = None

    for t in thresholds:
        preds = (y_pred_proba > t).astype(int)
        f1 = f1_score(y_true, preds, pos_label=mass_index, zero_division=0)
        print(f"Threshold {t:.2f} → F1 Score: {f1:.4f}")
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t
            best_preds = preds

    # If no threshold scored above zero, fall back to the baseline threshold's
    # predictions so the steps below still have an array to work with.
    if best_preds is None:
        best_preds = (y_pred_proba > best_threshold).astype(int)

    print(f"\nBest Threshold: {best_threshold:.2f} → F1 Score: {best_f1:.4f}")

    auc = roc_auc_score(y_true, y_pred_proba)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is never seen or predicted.
    cm = confusion_matrix(y_true, best_preds, labels=[0, 1])

    # Saving confusion matrix (rows = true class, columns = predicted class)
    confusion_df = pd.DataFrame(
        cm,
        index=class_names,
        columns=[f"Predicted {class_name}" for class_name in class_names])
    confusion_df.to_csv(f"{result_prefix}_confusion_matrix.csv")

    # Save predictions
    filenames = test_generator.filenames
    results_df = pd.DataFrame({
        "Filename": filenames,
        "TrueLabel": y_true,
        "PredictedLabel": best_preds,
        "PredictedProb": y_pred_proba
    })
    results_df.to_csv(f"{result_prefix}_predictions.csv", index=False)

    # Save ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.savefig(f"{result_prefix}_roc_curve.png")
    plt.close()

    print(f"Test Accuracy (train size={train_size}): {test_acc * 100:.2f}%")

    with open(f"{result_prefix}_best_threshold.txt", "w") as f:
        f.write(f"Best threshold: {best_threshold:.2f}, F1: {best_f1:.4f}")

    return model, history, test_acc, auc

In [17]:
# Class-balanced run with 250 training rows.
model_250_class, history_250_class, acc_250_class, auc_250_class = train_model_with_class_weighting(
    train_metadata, test_metadata, image_folder, train_size=250
)

In [91]:
# Class-balanced run with 500 training rows.
model_500_class, history_500_class, acc_500_class, auc_500_class = train_model_with_class_weighting(
    train_metadata, test_metadata, image_folder, train_size=500
)

Found 500 validated image filenames belonging to 2 classes.
Found 125 validated image filenames belonging to 2 classes.
Class weights:

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


 {0: np.float64(1.0), 1: np.float64(1.0)}
Epoch 1/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 11s/step - accuracy: 0.4848 - loss: 1.4409 - val_accuracy: 0.8160 - val_loss: 0.6913
Epoch 2/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 5s/step - accuracy: 0.4435 - loss: 0.6941 - val_accuracy: 0.1120 - val_loss: 0.6998
Epoch 3/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 5s/step - accuracy: 0.5313 - loss: 0.6944 - val_accuracy: 0.8880 - val_loss: 0.6673
Epoch 4/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 5s/step - accuracy: 0.5329 - loss: 0.6901 - val_accuracy: 0.8880 - val_loss: 0.6708
Epoch 5/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 5s/step - accuracy: 0.5129 - loss: 0.6977 - val_accuracy: 0.3920 - val_loss: 0.6915
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.3953 - loss: 0.6923
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8

In [None]:
# Train at several training-set sizes to see how a generic classifier responds as the
# training set grows. The collected results are then used to plot how AUC and accuracy
# change with training size for the simple classifier.
#
# NOTE(review): this cell previously called `train_model_with_class_subset`, which is not
# defined in the visible notebook. The `_class` result names match the other
# `train_model_with_class_weighting` runs, so that function is used here — confirm.
sweep_train_sizes = (350, 450, 550, 650, 750, 850, 950)

# train_size -> (model, history, test_acc, auc); handy for plotting metrics against size.
class_sweep_results = {
    size: train_model_with_class_weighting(
        train_metadata=train_metadata,
        test_metadata=test_metadata,
        image_folder=image_folder,
        train_size=size,
    )
    for size in sweep_train_sizes
}

# Per-size names, following the naming of the other class-weighting runs.
model_350_class, history_350_class, acc_350_class, auc_350_class = class_sweep_results[350]
model_450_class, history_450_class, acc_450_class, auc_450_class = class_sweep_results[450]
model_550_class, history_550_class, acc_550_class, auc_550_class = class_sweep_results[550]
model_650_class, history_650_class, acc_650_class, auc_650_class = class_sweep_results[650]
model_750_class, history_750_class, acc_750_class, auc_750_class = class_sweep_results[750]
model_850_class, history_850_class, acc_850_class, auc_850_class = class_sweep_results[850]
model_950_class, history_950_class, acc_950_class, auc_950_class = class_sweep_results[950]

In [20]:
# Class-balanced run with 1000 training rows.
model_1000_class, history_1000_class, acc_1000_class, auc_1000_class = train_model_with_class_weighting(
    train_metadata, test_metadata, image_folder, train_size=1000
)

In [19]:
# Class-balanced run with 7000 training rows (3500 per class).
model_7000_class, history_7000_class, acc_7000_class, auc_7000_class = train_model_with_class_weighting(
    train_metadata, test_metadata, image_folder, train_size=7000
)

In [22]:
def train_model_with_noise(train_metadata, test_metadata, image_folder,
                            train_size,
                            image_size=(224, 224), batch_size=32, epochs=5,
                            output_dir="/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/results_noise",
                            noise_std=0.05):
    """Train the CNN on a random subset using heavier augmentation plus Gaussian pixel noise.

    Args:
        train_metadata: DataFrame with "Image Index" (file name) and "Finding Labels"
            ("Mass" / "No Finding") columns; ``train_size`` rows are sampled from it.
        test_metadata: DataFrame with the same columns; ``int(train_size / 4)`` rows are
            sampled and used for validation and for the final evaluation.
        image_folder: Directory containing the files named in "Image Index".
        train_size: Number of training rows to sample.
        image_size: (height, width) images are resized to; also sets the model input shape.
        batch_size: Batch size of both generators.
        epochs: Number of training epochs.
        output_dir: Directory (created if missing) receiving the CSV/PNG artefacts,
            all prefixed with ``train_{train_size}``.
        noise_std: Standard deviation of the Gaussian noise added to training images,
            expressed on the rescaled [0, 1] pixel range.

    Returns:
        Tuple ``(model, history, test_acc, auc)``.
    """
    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
    from tensorflow.keras.preprocessing.image import ImageDataGenerator

    # Integer label = position in this list (Mass -> 0, No Finding -> 1). Passing it to the
    # generators pins the mapping instead of relying on which labels happen to be sampled.
    class_names = ["Mass", "No Finding"]

    # Sample training and test subsets (fixed seed for repeatability)
    train_metadata_subset = train_metadata.sample(train_size, random_state=42)
    test_metadata_subset = test_metadata.sample(int(train_size / 4), random_state=42)

    os.makedirs(output_dir, exist_ok=True)
    result_prefix = os.path.join(output_dir, f"train_{train_size}")

    def save_distribution(df, name):
        """Write count and percentage per label to CSV and return them as a DataFrame."""
        counts = df["Finding Labels"].value_counts()
        percents = counts / counts.sum() * 100
        dist_df = pd.DataFrame({"Count": counts, "Percent": percents})
        dist_df.to_csv(f"{result_prefix}_{name}_distribution.csv")
        return dist_df

    train_dist = save_distribution(train_metadata_subset, "train")
    test_dist = save_distribution(test_metadata_subset, "test")

    # Side-by-side class percentages. Naming the index before reset_index gives the
    # "Label" column directly, without mutating `columns.values` in place.
    combined_df = pd.concat([
        train_dist["Percent"].rename("Train"),
        test_dist["Percent"].rename("Test")
    ], axis=1).fillna(0).rename_axis("Label").reset_index()
    combined_df = pd.melt(combined_df, id_vars="Label", var_name="Set", value_name="Percent")

    plt.figure(figsize=(6, 4))
    sns.barplot(data=combined_df, x="Label", y="Percent", hue="Set")
    plt.title("Class Distribution: Train vs Test")
    plt.ylabel("Percentage")
    plt.xlabel("Class Label")
    plt.tight_layout()
    plt.savefig(f"{result_prefix}_class_distribution_comparison.png")
    plt.close()

    def add_gaussian_noise(image):
        """Add zero-mean Gaussian noise to one image and keep it in the 0-255 range."""
        # Keras runs preprocessing_function before `rescale`, so pixels are still on the
        # 0-255 scale here; scale the std accordingly so it equals `noise_std` after rescaling.
        noisy = image + np.random.normal(loc=0.0, scale=noise_std * 255.0, size=image.shape)
        return np.clip(noisy, 0.0, 255.0).astype("float32")

    # Data generators with a noisy training set
    train_datagen_noisy = ImageDataGenerator(
        rescale=1.0 / 255,
        rotation_range=20,
        width_shift_range=0.15,
        height_shift_range=0.15,
        horizontal_flip=True,
        brightness_range=[0.8, 1.2],
        zoom_range=0.1,
        preprocessing_function=add_gaussian_noise
    )
    test_datagen = ImageDataGenerator(rescale=1.0 / 255)

    # Use the noisy generator defined above (not a notebook-level `train_datagen`),
    # otherwise the noise and the extra augmentation are never applied.
    train_generator = train_datagen_noisy.flow_from_dataframe(
        dataframe=train_metadata_subset,
        directory=image_folder,
        x_col="Image Index",
        y_col="Finding Labels",
        classes=class_names,
        target_size=image_size,
        batch_size=batch_size,
        class_mode="binary",
        shuffle=True
    )
    # shuffle=False keeps predictions aligned with `test_generator.classes` / `.filenames`.
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_metadata_subset,
        directory=image_folder,
        x_col="Image Index",
        y_col="Finding Labels",
        classes=class_names,
        target_size=image_size,
        batch_size=batch_size,
        class_mode="binary",
        shuffle=False
    )

    # CNN model; the input shape follows `image_size` rather than a fixed 224x224.
    model = Sequential([
        Conv2D(32, (3, 3), activation="relu", input_shape=(*image_size, 3)),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(128, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="sigmoid")
    ])

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # No class weighting in this experiment.
    history = model.fit(
        train_generator,
        validation_data=test_generator,
        epochs=epochs,
        verbose=1
    )

    test_loss, test_acc = model.evaluate(test_generator)
    # The sigmoid output is the probability of class index 1 ("No Finding").
    y_pred_proba = model.predict(test_generator).flatten()
    y_pred = (y_pred_proba > 0.5).astype(int)
    y_true = test_generator.classes
    auc = roc_auc_score(y_true, y_pred_proba)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is never seen or predicted.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Save confusion matrix (rows = true class, columns = predicted class)
    confusion_df = pd.DataFrame(
        cm,
        index=class_names,
        columns=[f"Predicted {class_name}" for class_name in class_names])
    confusion_df.to_csv(f"{result_prefix}_confusion_matrix.csv")

    # Save predictions
    filenames = test_generator.filenames
    results_df = pd.DataFrame({
        "Filename": filenames,
        "TrueLabel": y_true,
        "PredictedLabel": y_pred,
        "PredictedProb": y_pred_proba
    })
    results_df.to_csv(f"{result_prefix}_predictions.csv", index=False)

    # Save ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.savefig(f"{result_prefix}_roc_curve.png")
    plt.close()

    print(f"Test Accuracy (train size={train_size}): {test_acc * 100:.2f}%")
    print(f"AUC: {auc:.4f}")

    return model, history, test_acc, auc

In [23]:
# Noisy-augmentation run with 250 training rows.
# NOTE(review): these names are also assigned by the baseline `train_model_with_subset`
# run with train_size=250 earlier in the notebook; this cell overwrites them.
model_250, history_250, acc_250, auc_250 = train_model_with_noise(
    train_metadata, test_metadata, image_folder, train_size=250
)

Found 250 validated image filenames belonging to 2 classes.
Found 62 validated image filenames belonging to 2 classes.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 14s/step - accuracy: 0.8507 - loss: 1.5540 - val_accuracy: 0.8548 - val_loss: 0.4187
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 7s/step - accuracy: 0.9391 - loss: 0.2866 - val_accuracy: 0.8548 - val_loss: 0.4984
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 5s/step - accuracy: 0.9390 - loss: 0.2459 - val_accuracy: 0.8548 - val_loss: 0.4411
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 6s/step - accuracy: 0.9403 - loss: 0.2539 - val_accuracy: 0.8548 - val_loss: 0.4379
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 7s/step - accuracy: 0.9226 - loss: 0.2831 - val_accuracy: 0.8548 - val_loss: 0.4618
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 946ms/step - accuracy: 0.8303 - loss: 0.5247
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1s/step
Test Accuracy (train size=250): 85.4

In [25]:
# Noisy-augmentation run with 500 training rows.
model_500, history_500, acc_500, auc_500 = train_model_with_noise(
    train_metadata, test_metadata, image_folder, train_size=500
)

In [26]:
# Noisy-augmentation run with 1000 training rows.
# NOTE(review): these names are also assigned by the baseline `train_model_with_subset`
# run with train_size=1000 earlier in the notebook; this cell overwrites them.
model_1000, history_1000, acc_1000, auc_1000 = train_model_with_noise(
    train_metadata, test_metadata, image_folder, train_size=1000
)

Nothing seems to be working: our dataset is preventing the model from distinguishing between the classes well, and the addition of Gaussian noise appears to have little effect because the model's classification is already so poor.

In [42]:
# Now let us see if we can distinguish between different types of Mass, rather than distinguishing between No Finding
metadata_path = "/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/Data_Entry_2017_v2020.csv"
image_folder = "/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/images"

# Reload the metadata so the labels start again from the raw multi-label strings.
metadata = pd.read_csv(metadata_path)

print("Metdata loaded")

# Keep rows labelled exactly "No Finding" plus every row whose label mentions "Mass".
filtered_metadata = metadata[
    (metadata["Finding Labels"] == "No Finding") |
    (metadata["Finding Labels"].str.contains("Mass", na=False))
]

# Image names of every matching row, captured before the row cap below.
filtered_image_indexes = set(filtered_metadata["Image Index"])

# Cap at the first 50,000 matching rows. The explicit copy makes this an independent
# frame, so later cells can assign columns without pandas' SettingWithCopyWarning.
filtered_metadata = filtered_metadata.head(50000).copy()

# Sorted list of the retained image file names (sorting once is sufficient).
matching_images = sorted(filtered_metadata["Image Index"])

print(f"Total matching images found: {len(matching_images)}")

Metdata loaded
Total matching images found: 50000


In [43]:
from sklearn.model_selection import train_test_split
import pandas as pd

 # Check class distribution before splitting
# Class distribution before the labels are collapsed
print(filtered_metadata.loc[:, "Finding Labels"].value_counts())

# Collapse multi-finding label strings in a single pass:
#   - any label containing "Edema" becomes just "Edema" (checked first, so it wins when both appear)
#   - otherwise any label containing "Cardiomegaly" becomes just "Cardiomegaly"
#   - every other label is left unchanged
filtered_metadata["Finding Labels"] = filtered_metadata["Finding Labels"].apply(
    lambda label: "Edema" if "Edema" in label
    else ("Cardiomegaly" if "Cardiomegaly" in label else label)
)



Finding Labels
No Finding                                                                          45542
Mass                                                                                 1708
Infiltration|Mass                                                                     329
Mass|Nodule                                                                           293
Effusion|Mass                                                                         285
                                                                                    ...  
Effusion|Emphysema|Mass|Nodule                                                          1
Atelectasis|Consolidation|Effusion|Fibrosis|Infiltration|Mass|Pleural_Thickening        1
Atelectasis|Consolidation|Effusion|Fibrosis|Infiltration|Mass                           1
Cardiomegaly|Consolidation|Effusion|Infiltration|Mass|Nodule                            1
Edema|Fibrosis|Infiltration|Mass                                                     

In [45]:
# Count how many rows carry each distinct "Finding Labels" value
print(filtered_metadata.loc[:, "Finding Labels"].value_counts())

Finding Labels
No Finding                                                    45542
Mass                                                           1708
Infiltration|Mass                                               329
Mass|Nodule                                                     293
Effusion|Mass                                                   285
                                                              ...  
Effusion|Infiltration|Mass|Pleural_Thickening|Pneumothorax        1
Mass|Nodule|Atelectasis                                           1
Infiltration|Mass|Pleural_Thickening|Pneumothorax                 1
Effusion|Nodule|Pneumothorax|Mass                                 1
Infiltration|Mass|Nodule|Pneumonia                                1
Name: count, Length: 191, dtype: int64


We will now try to distinguish between Cardiomegaly and Edema

In [46]:
# Numeric targets: Cardiomegaly -> 0, Edema -> 1.
# Any "Finding Labels" value missing from the map becomes NaN in the new "Label" column.
label_map = dict(Cardiomegaly=0, Edema=1)
finding_labels = filtered_metadata["Finding Labels"]
filtered_metadata["Label"] = finding_labels.map(label_map)

In [47]:
# Inspect the mapped "Label" column (NaN marks rows whose finding is not in label_map)
print(filtered_metadata.loc[:, "Label"])

3       NaN
12      NaN
13      NaN
14      NaN
15      NaN
         ..
83732   NaN
83734   NaN
83735   NaN
83736   NaN
83737   NaN
Name: Label, Length: 50000, dtype: float64


In [50]:
# Keep only the rows that received a numeric label
has_label = filtered_metadata["Label"].notna()
filtered_metadata = filtered_metadata.loc[has_label]

In [51]:
# Show the current filtered_metadata frame as this cell's rich output
filtered_metadata

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Label
41,00000013_025.png,Cardiomegaly,3,13,56,M,PA,2992,2991,0.143,0.143,0.0
43,00000013_027.png,Cardiomegaly,5,13,56,M,AP,2500,2048,0.168,0.168,0.0
44,00000013_028.png,Cardiomegaly,6,13,56,M,AP,2500,2048,0.168,0.168,0.0
45,00000013_029.png,Cardiomegaly,7,13,56,M,AP,2500,2048,0.168,0.168,0.0
46,00000013_030.png,Cardiomegaly,8,13,56,M,AP,2500,2048,0.168,0.168,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
82776,00020333_001.png,Edema,1,20333,64,F,AP,3056,2544,0.139,0.139,1.0
82784,00020337_000.png,Cardiomegaly,0,20337,53,M,PA,2922,2991,0.143,0.143,0.0
82851,00020363_000.png,Cardiomegaly,0,20363,36,F,PA,2544,2916,0.139,0.139,0.0
82934,00020396_001.png,Edema,1,20396,41,M,PA,2544,3040,0.139,0.139,1.0


In [52]:
# Stratified 80/20 split: stratifying on "Label" keeps the class ratio the same in both parts
train_metadata, test_metadata = train_test_split(
    filtered_metadata, test_size=0.2, random_state=42,
    stratify=filtered_metadata["Label"],
)

# Report the per-class row counts of each split
for split_title, split_frame in (("Training set:", train_metadata),
                                 ("Testing Set:", test_metadata)):
    print(split_title)
    print(split_frame["Label"].value_counts())

Training set:
Label
1.0    83
0.0    53
Name: count, dtype: int64
Testing Set:
Label
1.0    21
0.0    14
Name: count, dtype: int64


In [53]:
# Unique image filenames referenced by each split
train_image_files = set(train_metadata["Image Index"])
test_image_files = set(test_metadata["Image Index"])

# Sorted lists of those filenames (one sort is enough; the result is already a list)
train_images, test_images = sorted(train_image_files), sorted(test_image_files)

print(f"Total train images found: {len(train_images)}")
print(f"Total test images found: {len(test_images)}")

# Preview the first ten filenames of each split
print("Sample train images:", train_images[:10])
print("Sample test images:", test_images[:10])

Total train images found: 136
Total test images found: 35
Sample train images: ['00000013_025.png', '00000013_027.png', '00000013_028.png', '00000013_029.png', '00000013_030.png', '00000013_044.png', '00000211_035.png', '00000376_008.png', '00000435_000.png', '00001097_000.png']
Sample test images: ['00001301_044.png', '00004746_003.png', '00006829_007.png', '00006829_008.png', '00007872_003.png', '00008613_007.png', '00009608_023.png', '00009640_003.png', '00010294_048.png', '00010314_016.png']


In [54]:
# Image preprocessing parameters
image_size = (224, 224)  # every image is resized to this target size
batch_size = 32

# Training pipeline: scale pixels to [0, 1] and apply light augmentation
# (small rotations and shifts; no horizontal flips).
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=False
)

# Evaluation pipeline: rescale only, no augmentation
test_datagen = ImageDataGenerator(rescale=1.0 / 255)

# Training batches; class_mode="raw" passes the numeric "Label" column through unchanged
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_metadata,
    directory=image_folder,
    x_col="Image Index",
    y_col="Label",
    target_size=image_size,
    batch_size=batch_size,
    class_mode="raw",
    shuffle=True
)

# Test batches. shuffle=False keeps the batches in test_metadata row order, so predictions
# made from this generator line up with the labels in the dataframe (the default is True).
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_metadata,
    directory=image_folder,
    x_col="Image Index",
    y_col="Label",
    target_size=image_size,
    batch_size=batch_size,
    class_mode="raw",
    shuffle=False
)

Found 136 validated image filenames.
Found 35 validated image filenames.


In [59]:
# Class balance of the training split, by finding name
print(train_metadata.loc[:, "Finding Labels"].value_counts())

Finding Labels
Edema           83
Cardiomegaly    53
Name: count, dtype: int64


In [76]:
def train_model_with_subset(train_metadata, test_metadata, image_folder,
                            train_size,
                            image_size=(224, 224), batch_size=32, epochs=5,
                            output_dir="/content/drive/MyDrive/NIH_ChestXRay_Data_Neuro240/results"):
    """Train a small CNN to separate Cardiomegaly from Edema and save evaluation artifacts.

    Rows whose "Finding Labels" value is not "Cardiomegaly" or "Edema" are dropped first,
    then up to ``train_size`` training rows and up to ``train_size / 4`` test rows are
    sampled with a fixed seed. Requests larger than the available rows are clamped instead
    of raising.

    Files written under ``output_dir`` (all prefixed with ``train_{train_size}``):
    class-distribution CSVs and a comparison bar plot, the confusion matrix, the
    per-image predictions, and the ROC curve.

    Args:
        train_metadata: DataFrame with "Image Index" (filename) and "Finding Labels" columns.
        test_metadata: DataFrame with the same columns, used for validation and testing.
        image_folder: Directory that contains the image files.
        train_size: Requested number of training images.
        image_size: Target size the images are resized to; also sets the model input shape.
        batch_size: Batch size for both generators.
        epochs: Number of training epochs.
        output_dir: Directory for the result files; created if missing.

    Returns:
        Tuple ``(model, history, test_acc, auc)``.
    """
    import os
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
    from tensorflow.keras.preprocessing.image import ImageDataGenerator
    from sklearn.utils.class_weight import compute_class_weight
    import numpy as np

    # Fixed class order so that Cardiomegaly -> 0 and Edema -> 1 in BOTH generators,
    # even if one class happens to be missing from a sampled subset.
    class_names = ["Cardiomegaly", "Edema"]

    # Filter first, then sample, so train_size counts usable rows only.
    train_pool = train_metadata[train_metadata["Finding Labels"].isin(class_names)]
    test_pool = test_metadata[test_metadata["Finding Labels"].isin(class_names)]

    # Clamp the request to what is available; DataFrame.sample raises otherwise.
    n_train = min(train_size, len(train_pool))
    n_test = min(int(train_size / 4), len(test_pool))
    train_metadata_subset = train_pool.sample(n_train, random_state=42)
    test_metadata_subset = test_pool.sample(n_test, random_state=42)

    os.makedirs(output_dir, exist_ok=True)
    result_prefix = os.path.join(output_dir, f"train_{train_size}")

    def save_distribution(df, name):
        """Write the per-class count and percentage of `df` to CSV and return them."""
        counts = df["Finding Labels"].value_counts()
        percents = counts / counts.sum() * 100
        dist_df = pd.DataFrame({"Count": counts, "Percent": percents})
        dist_df.to_csv(f"{result_prefix}_{name}_distribution.csv")
        return dist_df

    train_dist = save_distribution(train_metadata_subset, "train")
    test_dist = save_distribution(test_metadata_subset, "test")

    # Long-format table (Label, Set, Percent) for the side-by-side bar plot.
    # rename_axis names the label column explicitly instead of mutating the column Index.
    combined_df = (
        pd.concat(
            [train_dist["Percent"].rename("Train"), test_dist["Percent"].rename("Test")],
            axis=1,
        )
        .fillna(0)
        .rename_axis("Label")
        .reset_index()
        .melt(id_vars="Label", var_name="Set", value_name="Percent")
    )

    plt.figure(figsize=(6, 4))
    sns.barplot(data=combined_df, x="Label", y="Percent", hue="Set")
    plt.title("Class Distribution: Train vs Test")
    plt.ylabel("Percentage")
    plt.xlabel("Class Label")
    plt.tight_layout()
    plt.savefig(f"{result_prefix}_class_distribution_comparison.png")
    plt.close()

    # Training images get light augmentation; test images are only rescaled.
    train_datagen = ImageDataGenerator(
        rescale=1.0 / 255,
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=False
    )
    test_datagen = ImageDataGenerator(rescale=1.0 / 255)

    # Options shared by both generators
    flow_kwargs = dict(
        directory=image_folder,
        x_col="Image Index",
        y_col="Finding Labels",
        target_size=image_size,
        batch_size=batch_size,
        class_mode="binary",
        classes=class_names,
    )
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_metadata_subset, shuffle=True, **flow_kwargs
    )
    # shuffle=False is required here: predictions are compared with test_generator.classes
    # and test_generator.filenames, which are in file order. A shuffled generator would
    # pair each prediction with the wrong label.
    test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_metadata_subset, shuffle=False, **flow_kwargs
    )

    # CNN model; the input shape follows image_size (3 channels, the generators' default RGB)
    model = Sequential([
        Input(shape=(*image_size, 3)),
        Conv2D(32, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(128, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="sigmoid")
    ])

    # Balanced class weights computed on the label names present in the training subset
    y_train_labels = train_metadata_subset["Finding Labels"]
    present_classes = np.unique(y_train_labels)
    balanced_weights = compute_class_weight(
        class_weight="balanced",
        classes=present_classes,
        y=y_train_labels
    )
    named_weights = {name: float(w) for name, w in zip(present_classes, balanced_weights)}

    # Keras expects class_weight keyed by integer class index, not by label string,
    # so translate the names through the generator's class_indices mapping.
    class_weight_dict = {
        train_generator.class_indices[name]: weight
        for name, weight in named_weights.items()
    }

    print("Class weights:", named_weights)

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # NOTE(review): the test generator doubles as validation data, so the val_* metrics
    # are not an estimate independent of the final test numbers.
    history = model.fit(
        train_generator,
        validation_data=test_generator,
        epochs=epochs,
        verbose=1,
        class_weight=class_weight_dict
    )

    test_loss, test_acc = model.evaluate(test_generator)
    y_pred_proba = model.predict(test_generator).flatten()
    y_pred = (y_pred_proba > 0.5).astype(int)
    y_true = np.asarray(test_generator.classes)
    auc = roc_auc_score(y_true, y_pred_proba)
    # labels=[0, 1] forces a 2x2 matrix even when a class is never observed or predicted
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Save confusion matrix
    confusion_df = pd.DataFrame(
        cm,
        index=["Cardiomegaly (0)", "Edema (1)"],
        columns=["Predicted Cardiomegaly (0)", "Predicted Edema (1)"]
    )
    confusion_df.to_csv(f"{result_prefix}_confusion_matrix.csv")

    # Save per-image predictions
    results_df = pd.DataFrame({
        "Filename": test_generator.filenames,
        "TrueLabel": y_true,
        "PredictedLabel": y_pred,
        "PredictedProb": y_pred_proba
    })
    results_df.to_csv(f"{result_prefix}_predictions.csv", index=False)

    # Save ROC curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure()
    plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.grid(True)
    plt.savefig(f"{result_prefix}_roc_curve.png")
    plt.close()

    print(f"Test Accuracy (train size={train_size}): {test_acc * 100:.2f}%")
    print(f"AUC: {auc:.4f}")

    return model, history, test_acc, auc

In [77]:
# Run train_model_with_subset with train_size=136 on the current train/test split
model_small, history_small, acc_small, auc_small = train_model_with_subset(
    train_metadata=train_metadata, test_metadata=test_metadata,
    image_folder=image_folder, train_size=136,
)

Found 136 validated image filenames belonging to 2 classes.
Found 34 validated image filenames belonging to 2 classes.
Class weights: {'Cardiomegaly': np.float64(1.2830188679245282), 'Edema': np.float64(0.8192771084337349)}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 4s/step - accuracy: 0.6218 - loss: 2.0099 - val_accuracy: 0.3824 - val_loss: 0.7250
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4s/step - accuracy: 0.5384 - loss: 0.6959 - val_accuracy: 0.6176 - val_loss: 0.6748
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 4s/step - accuracy: 0.6072 - loss: 0.6987 - val_accuracy: 0.6176 - val_loss: 0.6716
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 5s/step - accuracy: 0.6194 - loss: 0.6729 - val_accuracy: 0.6176 - val_loss: 0.6813
Epoch 5/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 4s/step - accuracy: 0.5787 - loss: 0.6871 - val_accuracy: 0.6176 - val_loss: 0.6862
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 106ms/step - accuracy: 0.6201 - loss: 0.6860
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 139ms/step
Test Accuracy (train size=136): 61.