<a href="https://colab.research.google.com/github/10710arnav/Noesis/blob/main/Aryan%20Basnet%2C%20Arnav%20Maharjan%20and%20Ashila%20A%20M%20Ardiyansyah/06_dataset6_LMIC_Bangladesh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NOTE ON RUNTIME AND OUTPUTS**

Google Colab sometimes disconnected or reset the runtime during long training sessions, crashes, or memory interruptions. When this happened, some previously displayed outputs in the notebook were no longer visible. However, all results were still saved and logged correctly: each model’s complete metrics and metadata were stored as JSON files in my Google Drive folder:

[https://drive.google.com/drive/folders/1ejlJaZhHEBm-1khLBJ--mbG2pg5TZHoJ?usp=sharing](https://drive.google.com/drive/folders/1ejlJaZhHEBm-1khLBJ--mbG2pg5TZHoJ?usp=sharing)

These JSON files contain the full and reliable outputs for all models across all datasets, even where notebook outputs were lost due to runtime resets.

## Setup: Freeze all package versions

Ensure reproducibility by installing the exact versions of packages used in these notebooks. This includes pre-installed packages in Colab.

The packages and versions used are:

- numpy==1.25.2
- pandas==2.1.1
- matplotlib==3.8.0
- seaborn==0.12.2
- scikit-learn==1.3.2
- tensorflow==2.15.0
- keras==2.15.0
- scipy==1.11.2
- opencv-python==4.9.0.73
- Pillow==10.0.1
- h5py==3.9.0
- google-colab==2.0.0

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the cells below read from this mount point
drive.mount(mountpoint='/content/mydrive')

Mounted at /content/mydrive


In [None]:
import os

# Start from root of your mounted drive
root = "/content/mydrive/MyDrive/"

# Keep only directories (sorted) so the output matches the "folders" label and
# unrelated personal files stored in Drive are not echoed into the notebook.
if os.path.isdir(root):
    top_level_folders = sorted(
        entry for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )
else:
    # Same message style as the split check in the next cell
    top_level_folders = []
    print(f"[ERROR] Drive root not found (is Drive mounted?): {root}")

# List top-level folders in your Drive
print("Top-level folders in MyDrive:")
print(top_level_folders)

Top-level folders in MyDrive:
['Getting started.pdf', 'Copy of Retro Internet Aesthetic Interface Theme for Marketing by Slidesgo.gslides', 'THE RESCUE - [NARRATIVE WRITING].gdoc', 'POEM-.gdoc', 'Chest-X-Ray Epic Hospital Chittagong, Bangladesh pneumonia', 'A Primary Chest X-ray Dataset of Normal and Pneumo', 'A Primary Chest X-ray Dataset of Normal and Pneumonia.zip', 'Colab Notebooks']


In [None]:
import os

# Dataset location on the mounted Drive
dataset_root = "/content/mydrive/MyDrive/Chest-X-Ray Epic Hospital Chittagong, Bangladesh pneumonia"

# Sub-folders the dataset is expected to provide
splits = ["Training", "Testing"]

# Files with one of these (case-insensitive) extensions are counted as images
image_suffixes = (".jpg", ".jpeg", ".png")

for split in splits:
    split_path = os.path.join(dataset_root, split)

    if os.path.exists(split_path):
        print(f"\n--- {split.upper()} ---")

        # One line per class directory, in the order the filesystem lists them
        for cls in os.listdir(split_path):
            cls_path = os.path.join(split_path, cls)
            if not os.path.isdir(cls_path):
                continue
            image_count = sum(
                1 for name in os.listdir(cls_path)
                if name.lower().endswith(image_suffixes)
            )
            print(f"{cls.lower()}: {image_count} images")
    else:
        print(f"[ERROR] Split folder not found: {split_path}")


--- TRAINING ---
pneumonia: 1050 images
normal: 1063 images

--- TESTING ---
pneumonia: 256 images
normal: 257 images


In [None]:
# ==========================================
# CHEST X-RAY CLASSIFICATION - BANGLADESH DATASET
# Using Pneumonia Dataset from Epic Hospital, Chittagong, Bangladesh
# ==========================================

# -------------------------------
# IMPORT LIBRARIES
# -------------------------------
# Core utilities for file handling, timing, memory cleanup, JSON output, and warnings
# Numpy for arrays, TensorFlow/Keras for deep learning
# Sklearn for evaluation metrics and splitting data
import os, zipfile, shutil, time, gc, json, warnings
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')  # suppress unnecessary warnings

# -------------------------------
# STEP 0: PATHS & DATASET LOCATION
# -------------------------------
# Google Drive holds both the source dataset and the saved results
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Location of the original dataset on Drive
DATASET_ROOT = "/content/drive/MyDrive/Chest-X-Ray Epic Hospital Chittagong, Bangladesh pneumonia"

# The dataset ships pre-split into Training and Testing folders
train_orig, test_orig = (
    os.path.join(DATASET_ROOT, split_name) for split_name in ("Training", "Testing")
)

# -------------------------------
# STEP 1: METADATA
# -------------------------------
# Identifying information printed here and stored alongside each model's results
CLASS_NAMES = ['Normal', 'Pneumonia']
NUM_CLASSES = len(CLASS_NAMES)
COUNTRY_INCOME_LEVEL = "LMIC"
DATASET_NAME = "Bangladesh_chest_xray"

print(f"Dataset: {DATASET_NAME}, Income Level: {COUNTRY_INCOME_LEVEL}, Classes: {CLASS_NAMES}")

# -------------------------------
# STEP 2: SPLIT DATA (optional)
# -------------------------------
# Create standardized folder structure for train/val/test
base_dir = "/content/Bangladesh_split"
train_dir = os.path.join(base_dir, "train")
val_dir   = os.path.join(base_dir, "val")
test_dir  = os.path.join(base_dir, "test")

# Always start from an empty split directory. Without this, re-running the cell
# copies every original training image back into train/ while val/ still holds
# the files moved there by the previous run, so validation images would also be
# trained on (train/val leakage).
shutil.rmtree(base_dir, ignore_errors=True)

for d in [train_dir, val_dir, test_dir]:
    for c in CLASS_NAMES:
        os.makedirs(os.path.join(d, c), exist_ok=True)

# Copy original training and testing images into new standardized structure
for cls in CLASS_NAMES:
    cls_folder = cls.lower()  # source class folders are named "normal" / "pneumonia"
    for src_root, dst_root in [(train_orig, train_dir), (test_orig, test_dir)]:
        src_cls_dir = os.path.join(src_root, cls_folder)
        for f in os.listdir(src_cls_dir):
            shutil.copy(os.path.join(src_cls_dir, f), os.path.join(dst_root, cls))

# Split validation set from training data (20%) for hyperparameter tuning
for cls in CLASS_NAMES:
    # os.listdir returns entries in arbitrary order; sort first so that
    # random_state=42 selects the same validation files on every run.
    files = sorted(os.listdir(os.path.join(train_dir, cls)))
    train_files, val_files = train_test_split(files, test_size=0.2, random_state=42)
    val_cls_dir = os.path.join(val_dir, cls)
    for f in val_files:
        shutil.move(os.path.join(train_dir, cls, f), os.path.join(val_cls_dir, f))

# -------------------------------
# STEP 3: DATA AUGMENTATION
# -------------------------------
# Image preprocessing and augmentation to improve generalization
IMG_SIZE = (224, 224)
BATCH_SIZE = 8  # small batch due to dataset size
EPOCHS = 20
SEED = 42  # same value as the random_state used for the train/val split

# Seed Python, NumPy and TensorFlow in one call so that weight initialisation,
# shuffling and augmentation are repeatable after Restart & Run All.
tf.keras.utils.set_random_seed(SEED)

# Training generator with augmentations: rotation, shift, shear, zoom, brightness, flip
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=25,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    brightness_range=[0.8,1.2],
    horizontal_flip=True,
    fill_mode='nearest'
)

# Validation/test generator only rescales pixels
val_test_datagen = ImageDataGenerator(rescale=1./255)

# Generators create batches and feed data to the model.
# Only the training iterator shuffles and augments, so only it needs a seed.
train_generator = train_datagen.flow_from_directory(
    train_dir, target_size=IMG_SIZE, batch_size=BATCH_SIZE,
    class_mode='categorical', shuffle=True, seed=SEED
)
# shuffle=False keeps batch order aligned with `.classes`, which the evaluation relies on
validation_generator = val_test_datagen.flow_from_directory(
    val_dir, target_size=IMG_SIZE, batch_size=BATCH_SIZE,
    class_mode='categorical', shuffle=False
)
test_generator = val_test_datagen.flow_from_directory(
    test_dir, target_size=IMG_SIZE, batch_size=BATCH_SIZE,
    class_mode='categorical', shuffle=False
)

# -------------------------------
# STEP 4: CLASS WEIGHTS
# -------------------------------
# Compute class weights to handle imbalance in Normal vs Pneumonia images
y_train = train_generator.classes
train_labels = np.unique(y_train)
class_weights_array = compute_class_weight('balanced', classes=train_labels, y=y_train)
# Key each weight by the label it was computed for (not by its position in the
# array) and store plain Python numbers so the dict prints without np.float64(...).
class_weights = {
    int(label): float(weight)
    for label, weight in zip(train_labels, class_weights_array)
}
print(f"\nClass weights: {class_weights}")

# -------------------------------
# STEP 5: MODEL DEFINITIONS
# -------------------------------
# Baseline CNN: simple small architecture for comparison
def create_baseline_cnn(input_shape=(224,224,3), num_classes=2):
    """Build the small from-scratch CNN used as the comparison baseline.

    The network stacks three 3x3 ReLU convolutions (32, 64 and 64 filters) with
    2x2 max-pooling after the first two, then flattens into a 64-unit ReLU
    dense layer, 50% dropout and a softmax output.

    Args:
        input_shape: Shape of one input image as (height, width, channels).
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled `keras.Sequential` model.
    """
    model = keras.Sequential()

    # Convolutional feature extractor
    model.add(layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2,2)))
    model.add(layers.Conv2D(64, (3,3), activation='relu'))
    model.add(layers.MaxPooling2D((2,2)))
    model.add(layers.Conv2D(64, (3,3), activation='relu'))

    # Classification head
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(num_classes, activation='softmax'))

    return model

# Transfer learning models: MobileNetV2, EfficientNetB0, ResNet50
# Freeze 85% of layers to prevent overfitting on small dataset
def create_transfer_model(base_model_name, input_shape=(224,224,3), num_classes=2):
    """Build a fine-tunable classifier on top of an ImageNet-pretrained backbone.

    The data generators in this notebook rescale pixels to [0, 1], but each
    pretrained backbone was trained with its own input convention
    (EfficientNetB0 expects raw 0-255 pixels, MobileNetV2 expects [-1, 1],
    ResNet50 expects BGR mean-centred 0-255 pixels). The model therefore maps
    its [0, 1] input back to 0-255 and applies the backbone's own
    `preprocess_input`, so the pretrained weights see the inputs they expect.

    Args:
        base_model_name: One of 'MobileNetV2', 'EfficientNetB0', 'ResNet50'.
        input_shape: Shape of one input image as (height, width, channels).
        num_classes: Number of units in the softmax output layer.

    Returns:
        An uncompiled `keras.Model` that takes images scaled to [0, 1].

    Raises:
        ValueError: If `base_model_name` is not one of the supported names.
    """
    # Backbone constructor paired with the preprocessing it was trained with
    backbones = {
        'MobileNetV2': (tf.keras.applications.MobileNetV2,
                        tf.keras.applications.mobilenet_v2.preprocess_input),
        'EfficientNetB0': (tf.keras.applications.EfficientNetB0,
                           tf.keras.applications.efficientnet.preprocess_input),
        'ResNet50': (tf.keras.applications.ResNet50,
                     tf.keras.applications.resnet50.preprocess_input),
    }
    if base_model_name not in backbones:
        raise ValueError(f"Unknown model: {base_model_name}")

    build_base, preprocess_input = backbones[base_model_name]
    base = build_base(input_shape=input_shape, include_top=False, weights='imagenet')

    # Freeze the first 85% of backbone layers to limit overfitting on a small
    # dataset; only the last 15% are fine-tuned.
    base.trainable = True
    freeze_until = int(len(base.layers) * 0.85)
    for layer in base.layers[:freeze_until]:
        layer.trainable = False

    inputs = keras.Input(shape=input_shape)
    # Undo the generators' 1/255 rescale, then apply the backbone-specific preprocessing
    x = layers.Rescaling(255.0)(inputs)
    x = preprocess_input(x)
    # training=False keeps the backbone's BatchNorm layers in inference mode
    x = base(x, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    return keras.Model(inputs, outputs)

# -------------------------------
# STEP 6: TRAINING FUNCTION
# -------------------------------
# Compile, train, and evaluate any given model, save results
def train_and_evaluate(model, model_name):
    """Compile, train and test `model`, then write its metrics to Google Drive.

    Training reads the module-level `train_generator`, `validation_generator`,
    `class_weights` and `EPOCHS`; evaluation reads `test_generator`.

    Args:
        model: Uncompiled Keras model. It is compiled here with Adam
            (learning rate 1e-4) and categorical cross-entropy.
        model_name: Label used in the log output and in the results file name.

    Returns:
        dict: The metrics and metadata that were also saved as JSON.
    """
    print(f"\n{'='*50}\nTraining {model_name}\n{'='*50}")

    model.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(1e-4),
        metrics=['accuracy'],
    )

    # Both callbacks watch validation loss: the learning rate is halved after 2
    # stagnant epochs, and training stops (restoring the best weights) after 4.
    training_callbacks = [
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-7),
    ]

    fit_started = time.time()
    history = model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=EPOCHS,
        class_weight=class_weights,
        callbacks=training_callbacks,
        verbose=1,
    )
    training_time = (time.time() - fit_started) / 60  # minutes

    # Test-set evaluation. test_generator is created with shuffle=False, so the
    # prediction order matches test_generator.classes.
    y_true = test_generator.classes
    y_pred = np.argmax(model.predict(test_generator), axis=1)

    cm = confusion_matrix(y_true, y_pred)
    f1_weighted = f1_score(y_true, y_pred, average='weighted')
    f1_per_class = f1_score(y_true, y_pred, average=None).tolist()
    n_params = model.count_params()

    # Everything that goes into the JSON record, converted to plain Python types
    results = dict()
    results['dataset_name'] = DATASET_NAME
    results['country_income'] = COUNTRY_INCOME_LEVEL
    results['model_name'] = model_name
    results['num_classes'] = NUM_CLASSES
    results['class_names'] = CLASS_NAMES
    results['f1_per_class'] = f1_per_class
    results['f1_weighted'] = float(f1_weighted)
    results['confusion_matrix'] = cm.tolist()
    results['training_time_minutes'] = float(training_time)
    results['num_images_train'] = train_generator.samples
    results['num_images_val'] = validation_generator.samples
    results['num_images_test'] = test_generator.samples
    results['num_parameters'] = int(n_params)
    results['epochs_trained'] = len(history.history['loss'])

    # One JSON file per (dataset, model) pair in the Drive results folder
    os.makedirs('/content/drive/MyDrive/xray_research_results', exist_ok=True)
    filename = f'/content/drive/MyDrive/xray_research_results/{DATASET_NAME}_{model_name}_results.json'
    with open(filename, 'w') as out_file:
        json.dump(results, out_file, indent=2)

    print(f"{model_name} Weighted F1: {f1_weighted:.4f}, F1 per class: {f1_per_class}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Training time: {training_time:.2f} min, Params: {n_params:,}")

    # Drop this function's reference to the model and reset Keras state
    del model
    tf.keras.backend.clear_session()
    gc.collect()

    return results

# -------------------------------
# STEP 7: TRAIN ALL MODELS
# -------------------------------
all_results = []
for model_name in ['BaselineCNN', 'MobileNetV2', 'EfficientNetB0', 'ResNet50']:
    if model_name == 'BaselineCNN':
        model = create_baseline_cnn(num_classes=NUM_CLASSES)
    else:
        model = create_transfer_model(model_name, num_classes=NUM_CLASSES)
    results = train_and_evaluate(model, model_name)
    all_results.append(results)

    # The cleanup inside train_and_evaluate only removes that function's local
    # name; this loop variable would otherwise keep the trained network alive
    # while the next one is being built. Release it here so memory is actually
    # freed between models. (After the loop, `model` is intentionally undefined.)
    del model
    tf.keras.backend.clear_session()
    gc.collect()

# -------------------------------
# STEP 8: SUMMARY
# -------------------------------
print("\n" + "="*60)
print("ALL TRAINING COMPLETE")
for r in all_results:
    print(f"{r['model_name']}: Weighted F1: {r['f1_weighted']:.4f}, Epochs: {r['epochs_trained']}, Params: {r['num_parameters']:,}")
print(f"\nAll results saved to: /content/drive/MyDrive/xray_research_results/")

Mounted at /content/drive
Dataset: Bangladesh_chest_xray, Income Level: LMIC, Classes: ['Normal', 'Pneumonia']
Found 1690 images belonging to 2 classes.
Found 423 images belonging to 2 classes.
Found 513 images belonging to 2 classes.

Class weights: {0: np.float64(0.9941176470588236), 1: np.float64(1.005952380952381)}

Training BaselineCNN
Epoch 1/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m265s[0m 1s/step - accuracy: 0.6739 - loss: 0.5878 - val_accuracy: 0.8865 - val_loss: 0.2909 - learning_rate: 1.0000e-04
Epoch 2/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 1s/step - accuracy: 0.8586 - loss: 0.3689 - val_accuracy: 0.8960 - val_loss: 0.2516 - learning_rate: 1.0000e-04
Epoch 3/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 1s/step - accuracy: 0.8490 - loss: 0.3728 - val_accuracy: 0.8889 - val_loss: 0.2481 - learning_rate: 1.0000e-04
Epoch 4/20
[1m212/212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 1s/st