<a href="https://colab.research.google.com/github/10710arnav/Noesis/blob/main/Aryan%20Basnet%2C%20Arnav%20Maharjan%20and%20Ashila%20A%20M%20Ardiyansyah/03_dataset3_LMIC_Nigeria.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NOTE ON RUNTIME AND OUTPUTS**

# Google Colab occasionally disconnected or reset the runtime during long training sessions, crashes, or memory interruptions. When this happened, some previously displayed outputs were no longer visible in the notebook. The results themselves were not affected: each model’s complete metrics and metadata were saved as JSON files in my Google Drive folder:

# [https://drive.google.com/drive/folders/1ejlJaZhHEBm-1khLBJ--mbG2pg5TZHoJ?usp=sharing](https://drive.google.com/drive/folders/1ejlJaZhHEBm-1khLBJ--mbG2pg5TZHoJ?usp=sharing)

# These JSON files contain the full and reliable outputs for all models across all datasets, even if certain notebook outputs were lost due to runtime resets.

# SETUP: Freeze all package versions
Ensure reproducibility by installing the exact versions of packages used in these notebooks. This includes pre-installed packages in Colab.

The packages and versions used are:

- numpy==1.25.2
- pandas==2.1.1
- matplotlib==3.8.0
- seaborn==0.12.2
- scikit-learn==1.3.2
- tensorflow==2.15.0
- keras==2.15.0
- scipy==1.11.2
- opencv-python==4.9.0.73
- Pillow==10.0.1
- h5py==3.9.0
- google-colab==2.0.0

In [None]:
# ==========================================
# CHEST X-RAY CLASSIFICATION - DATASET 3
# NIGERIA CHEST X-RAY DATABASE
# ==========================================

# STEP 1: METADATA
# Experiment identifiers. DATASET_NAME and COUNTRY_INCOME_LEVEL are written
# into every results JSON and are part of the results file name;
# NUM_CLASSES selects the output head and loss used further down.
DATASET_NAME = "dataset3_nigeria_chest_xray"
COUNTRY_INCOME_LEVEL = "LMIC"  # Low- and Middle-Income Country (Nigeria)
# Informational only: the Kaggle page the images come from.
DATASET_SOURCE = "https://www.kaggle.com/datasets/aminumusa/nigeria-chest-x-ray-dataset"
NUM_CLASSES = 3  # Normal, Pneumonia, TB (dropping COVID-19)

# Echo the configuration so it is visible in the cell output.
print(f"Dataset: {DATASET_NAME}")
print(f"Income Level: {COUNTRY_INCOME_LEVEL}")
print(f"Classes: Normal, Pneumonia, Tuberculosis (COVID-19 dropped)")

# STEP 2: MOUNT DRIVE
# Result files are written under /content/drive so they are kept on
# Google Drive rather than on the Colab runtime's local disk.
from google.colab import drive
drive.mount('/content/drive')

# Create results directory
# (IPython shell escape, not Python; `-p` makes it a no-op if the folder exists)
!mkdir -p /content/drive/MyDrive/xray_research_results

# STEP 3: IMPORT DATASET
import kagglehub
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import json
import time
import gc
import matplotlib.pyplot as plt
import warnings
import shutil
warnings.filterwarnings('ignore')

# Download dataset
# kagglehub returns the local directory that holds the dataset files.
print("Downloading dataset...")
path = kagglehub.dataset_download("aminumusa/nigeria-chest-x-ray-dataset")
print("Path to dataset files:", path)

# Explore dataset structure
# Prints an indented directory tree; at most five file names per folder are
# listed, followed by a count of the remaining files.
print("\nDataset structure:")
for root, dirs, files in os.walk(path):
    # Depth below `path` = number of separators left after stripping the prefix.
    level = root.replace(path, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in files[:5]:  # Show first 5 files only
        print(f"{subindent}{file}")
    if len(files) > 5:
        print(f"{subindent}... and {len(files)-5} more files")

# STEP 4: ORGANIZE DATA INTO TRAIN/VAL/TEST SPLITS
# Dataset has train_folder and test_folder with 4 classes each
# We need to combine them, drop COVID-19, and create new 70/15/15 splits

print("\nOrganizing data into train/val/test splits...")
print("NOTE: Dropping all COVID-19 images as per project requirements")

# Create organized directory structure
# Layout: <base_dir>/<split>/<class>/, the per-class folder layout that
# flow_from_directory reads below. exist_ok=True makes re-running safe.
base_dir = '/content/organized_data'
for split in ['train', 'val', 'test']:
    for class_name in ['Normal', 'Pneumonia', 'TB']:
        os.makedirs(os.path.join(base_dir, split, class_name), exist_ok=True)

# Function to organize dataset
def organize_dataset(source_path, base_dir, train_ratio=0.70, val_ratio=0.15, test_ratio=0.15):
    """
    Organize Nigeria dataset into train/val/test splits
    Drop COVID-19 class, keep Normal, Pneumonia, TB

    Every .png/.jpg/.jpeg file under ``source_path`` is assigned to a class
    from the lower-cased name of its immediate parent folder, then each class
    is shuffled with a fixed seed and copied into
    ``base_dir/<split>/<class>/`` as ``<class>_<split>_<index><ext>``.

    Args:
        source_path: Root directory of the downloaded dataset (searched
            recursively).
        base_dir: Destination root. Split/class folders are created if
            missing. Files already present there are not removed.
        train_ratio: Fraction of each class copied to ``train``.
        val_ratio: Fraction of each class copied to ``val``.
        test_ratio: Accepted for interface compatibility only; the test
            split is whatever remains after the train and validation slices.

    Returns:
        None. Files are copied on disk and a summary is printed.
    """
    # Collect images by class
    class_images = {
        'Normal': [],
        'Pneumonia': [],
        'TB': []
    }

    covid_count = 0

    for root, dirs, files in os.walk(source_path):
        # The class is decided by the folder that directly contains the file.
        parent_folder = os.path.basename(root).lower()
        for file in files:
            if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            file_path = os.path.join(root, file)

            # Classify based on folder name
            if 'normal' in parent_folder:
                class_images['Normal'].append(file_path)
            elif 'pneumonia' in parent_folder:
                class_images['Pneumonia'].append(file_path)
            elif 'tb' in parent_folder or 'tuberculosis' in parent_folder:
                class_images['TB'].append(file_path)
            elif 'covid' in parent_folder:
                # COVID-19 images are counted for the report but never copied.
                covid_count += 1

    print(f"\nFound images:")
    print(f"  Normal: {len(class_images['Normal'])}")
    print(f"  Pneumonia: {len(class_images['Pneumonia'])}")
    print(f"  TB: {len(class_images['TB'])}")
    print(f"  COVID-19 (dropped): {covid_count}")

    # Split each class
    for class_name, images in class_images.items():
        if not images:
            print(f"WARNING: No images found for class {class_name}")
            continue

        # os.walk yields entries in a filesystem-dependent order, so sort
        # first; otherwise the seeded shuffle is not reproducible across
        # machines/runs. A private RandomState keeps the same seed-42
        # permutation without resetting NumPy's global RNG.
        images.sort()
        np.random.RandomState(42).shuffle(images)

        # Calculate split indices
        n = len(images)
        train_end = int(n * train_ratio)
        val_end = train_end + int(n * val_ratio)

        # Split data (test receives the remainder, including rounding leftovers)
        splits = [
            (images[:train_end], 'train'),
            (images[train_end:val_end], 'val'),
            (images[val_end:], 'test'),
        ]
        train_files, val_files, test_files = (part for part, _ in splits)

        print(f"\n{class_name} split: Train={len(train_files)}, Val={len(val_files)}, Test={len(test_files)}")

        # Copy files
        for split_files, split in splits:
            dest_dir = os.path.join(base_dir, split, class_name)
            # Do not rely on the caller having created the folder tree.
            os.makedirs(dest_dir, exist_ok=True)
            for i, src in enumerate(split_files):
                dest = os.path.join(dest_dir, f"{class_name}_{split}_{i}{os.path.splitext(src)[1]}")
                shutil.copy2(src, dest)

# Organize the dataset
organize_dataset(path, base_dir)

# STEP 5: SETUP DATA PREPROCESSING
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 20

# CLASS_NAMES is stored in the results JSON next to the per-class F1 scores
# and the confusion matrix, so its order must equal the label indices the
# generators assign (enforced below via `classes=CLASS_NAMES`).
CLASS_NAMES = ['Normal', 'Pneumonia', 'TB']
CLASS_MAPPING = {
    'Normal': 0,      # Healthy
    'Pneumonia': 1,   # Pneumonia
    'TB': 2           # Tuberculosis
}

# Data augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    zoom_range=0.1,
    brightness_range=[0.9, 1.1],
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1
)

# No augmentation for validation and test
val_test_datagen = ImageDataGenerator(rescale=1./255)

# Setup data paths
train_dir = os.path.join(base_dir, 'train')
val_dir = os.path.join(base_dir, 'val')
test_dir = os.path.join(base_dir, 'test')

# Check if directories exist
print("\nChecking directories:")
print(f"Train dir exists: {os.path.exists(train_dir)}")
print(f"Val dir exists: {os.path.exists(val_dir)}")
print(f"Test dir exists: {os.path.exists(test_dir)}")

# Create data generators
# `classes=CLASS_NAMES` pins label index i to CLASS_NAMES[i] instead of
# relying on alphabetical folder order; `seed` makes the training shuffle
# and augmentation stream repeatable between runs.
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES,
    shuffle=True,
    seed=42
)

# Validation and test keep file order (shuffle=False) so predictions can be
# compared position-by-position with `generator.classes`.
validation_generator = val_test_datagen.flow_from_directory(
    val_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES,
    shuffle=False
)

test_generator = val_test_datagen.flow_from_directory(
    test_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES,
    shuffle=False
)

# Print dataset info
print(f"\nTraining samples: {train_generator.samples}")
print(f"Validation samples: {validation_generator.samples}")
print(f"Test samples: {test_generator.samples}")
print(f"Classes found: {train_generator.class_indices}")

# STEP 6: DEFINE MODEL ARCHITECTURES

def create_baseline_cnn(input_shape=(224, 224, 3), num_classes=3):
    """Build the small from-scratch CNN used as the baseline.

    Three 3x3 convolution stages (32, 64, 64 filters; max-pooling after the
    first two), then Flatten -> Dense(64) -> Dropout(0.5) -> output layer.

    Args:
        input_shape: Shape of one input image, (height, width, channels).
        num_classes: Number of target classes. Exactly 2 yields a single
            sigmoid unit; any other value yields ``num_classes`` softmax units.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    # Pick the output head once instead of inline in the layer call.
    if num_classes == 2:
        head_units, head_activation = 1, 'sigmoid'
    else:
        head_units, head_activation = num_classes, 'softmax'

    cnn = keras.Sequential()

    # Feature extractor
    cnn.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    cnn.add(layers.MaxPooling2D((2, 2)))
    cnn.add(layers.Conv2D(64, (3, 3), activation='relu'))
    cnn.add(layers.MaxPooling2D((2, 2)))
    cnn.add(layers.Conv2D(64, (3, 3), activation='relu'))

    # Classifier
    cnn.add(layers.Flatten())
    cnn.add(layers.Dense(64, activation='relu'))
    cnn.add(layers.Dropout(0.5))
    cnn.add(layers.Dense(head_units, activation=head_activation))

    return cnn

def create_transfer_model(base_model_name, input_shape=(224, 224, 3), num_classes=3):
    """Create transfer learning model

    Wraps a frozen ImageNet backbone with a small trainable head
    (GlobalAveragePooling -> Dense(128) -> Dropout(0.5) -> output layer).

    The model takes images scaled to [0, 1], which is what this notebook's
    ``ImageDataGenerator(rescale=1./255)`` produces. Each ImageNet backbone
    expects its own input convention, so the model first restores the 0-255
    range and then applies the backbone's ``preprocess_input``. Feeding
    [0, 1] pixels straight into the backbone does not match the
    distribution the pretrained weights were trained on.

    Args:
        base_model_name: One of 'MobileNetV2', 'EfficientNetB0', 'ResNet50',
            'Xception', 'InceptionV3'.
        input_shape: Shape of one input image, (height, width, channels).
        num_classes: Number of target classes. Exactly 2 yields a single
            sigmoid unit; any other value yields ``num_classes`` softmax units.

    Returns:
        An uncompiled ``keras.Model``.

    Raises:
        ValueError: If ``base_model_name`` is not one of the supported names.
    """
    apps = tf.keras.applications
    # name -> (backbone constructor, matching input preprocessing)
    backbones = {
        'MobileNetV2': (apps.MobileNetV2, apps.mobilenet_v2.preprocess_input),
        'EfficientNetB0': (apps.EfficientNetB0, apps.efficientnet.preprocess_input),
        'ResNet50': (apps.ResNet50, apps.resnet50.preprocess_input),
        'Xception': (apps.Xception, apps.xception.preprocess_input),
        'InceptionV3': (apps.InceptionV3, apps.inception_v3.preprocess_input),
    }
    if base_model_name not in backbones:
        raise ValueError(f"Unknown model: {base_model_name}")
    build_base, preprocess_input = backbones[base_model_name]

    # Load base model
    base = build_base(input_shape=input_shape, include_top=False, weights='imagenet')

    # Freeze base layers
    base.trainable = False

    inputs = keras.Input(shape=input_shape)
    # Undo the generators' 1/255 rescale, then apply the backbone-specific
    # preprocessing. Neither step adds weights.
    x = layers.Rescaling(255.0)(inputs)
    x = preprocess_input(x)
    # training=False keeps BatchNorm layers of the frozen base in inference mode.
    x = base(x, training=False)

    # Add custom head
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1 if num_classes == 2 else num_classes,
                          activation='sigmoid' if num_classes == 2 else 'softmax')(x)

    model = keras.Model(inputs, outputs)
    return model

# STEP 7: TRAINING AND EVALUATION FUNCTION

def train_and_evaluate(model, model_name, train_gen, val_gen, test_gen):
    """Train model and return results

    Compiles ``model`` with Adam, trains for up to ``EPOCHS`` epochs with
    early stopping on ``val_loss``, evaluates on ``test_gen``, writes the
    metrics to a JSON file on Drive and returns them.

    Args:
        model: Uncompiled Keras model.
        model_name: Label used in log output and in the results file name.
        train_gen: Training data generator.
        val_gen: Validation data generator.
        test_gen: Test data generator; must not shuffle, because predictions
            are compared position-by-position with ``test_gen.classes``.

    Returns:
        dict with dataset/model identifiers, per-class and weighted F1,
        confusion matrix, training time in minutes, sample counts, parameter
        count, test accuracy and number of epochs actually trained.
    """
    print(f"\n{'='*50}")
    print(f"Training {model_name}")
    print(f"{'='*50}")

    binary = NUM_CLASSES == 2

    # Compile
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy' if binary else 'categorical_crossentropy',
        metrics=['accuracy']
    )

    # Callbacks
    early_stop = keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True
    )

    # Train
    start_time = time.time()
    history = model.fit(
        train_gen,
        epochs=EPOCHS,
        validation_data=val_gen,
        callbacks=[early_stop],
        verbose=1
    )
    training_time = (time.time() - start_time) / 60  # in minutes

    # Evaluate on test set
    print("\nEvaluating on test set...")
    _, test_acc = model.evaluate(test_gen, verbose=0)

    # Get predictions
    # Rewind the generator so the prediction order starts at the first batch
    # and lines up with test_gen.classes.
    test_gen.reset()
    predictions = model.predict(test_gen)
    y_true = test_gen.classes
    if binary:
        y_pred = (predictions > 0.5).astype(int).flatten()
    else:
        y_pred = np.argmax(predictions, axis=1)

    # Calculate metrics
    # Passing `labels` keeps one entry per class (in CLASS_NAMES order) even
    # if a class never shows up in y_true/y_pred; without it the per-class
    # list and the confusion matrix would silently shrink.
    labels = list(range(NUM_CLASSES))
    if binary:
        # Binary classification: F1 with each class in turn as "positive"
        f1_per_class = [
            float(f1_score(y_true == 0, y_pred == 0)),
            float(f1_score(y_true == 1, y_pred == 1))
        ]
    else:
        # Multi-class
        f1_per_class = f1_score(y_true, y_pred, labels=labels, average=None).tolist()
    f1_weighted = f1_score(y_true, y_pred, labels=labels, average='weighted')

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Prepare results
    results = {
        'dataset_name': DATASET_NAME,
        'country_income': COUNTRY_INCOME_LEVEL,
        'model_name': model_name,
        'num_classes': NUM_CLASSES,
        'class_names': CLASS_NAMES,
        'f1_per_class': f1_per_class,
        'f1_weighted': float(f1_weighted),
        'confusion_matrix': cm.tolist(),
        'training_time_minutes': float(training_time),
        'num_images_train': train_gen.samples,
        'num_images_val': val_gen.samples,
        'num_images_test': test_gen.samples,
        'num_parameters': int(model.count_params()),
        'test_accuracy': float(test_acc),
        'epochs_trained': len(history.history['loss'])
    }

    # Print results
    print(f"\n{model_name} Results:")
    print(f"F1 Scores per class: {f1_per_class}")
    print(f"Weighted F1 Score: {f1_weighted:.4f}")
    print(f"Training time: {training_time:.2f} minutes")
    print(f"Parameters: {model.count_params():,}")

    # Save results
    # Create the folder here so a long training run is not lost to a
    # missing directory at the very last step.
    results_dir = '/content/drive/MyDrive/xray_research_results'
    os.makedirs(results_dir, exist_ok=True)
    filename = f'{results_dir}/{DATASET_NAME}_{COUNTRY_INCOME_LEVEL}_{model_name}_results.json'
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to: {filename}")

    # Clear memory
    # `del` only drops this function's reference; the caller's variable keeps
    # the model alive until it is rebound.
    del model
    tf.keras.backend.clear_session()
    gc.collect()

    return results

# STEP 8: TRAIN ALL MODELS
# Each model is built, trained and evaluated in turn on the same three
# generators; train_and_evaluate also writes one JSON file per model.

all_results = []

# Model 1: Baseline CNN
model = create_baseline_cnn(num_classes=NUM_CLASSES)
results = train_and_evaluate(model, 'BaselineCNN', train_generator, validation_generator, test_generator)
all_results.append(results)

# Model 2: MobileNetV2
model = create_transfer_model('MobileNetV2', num_classes=NUM_CLASSES)
results = train_and_evaluate(model, 'MobileNetV2', train_generator, validation_generator, test_generator)
all_results.append(results)

# Model 3: EfficientNetB0
model = create_transfer_model('EfficientNetB0', num_classes=NUM_CLASSES)
results = train_and_evaluate(model, 'EfficientNetB0', train_generator, validation_generator, test_generator)
all_results.append(results)

# Model 4: ResNet50
model = create_transfer_model('ResNet50', num_classes=NUM_CLASSES)
results = train_and_evaluate(model, 'ResNet50', train_generator, validation_generator, test_generator)
all_results.append(results)

# STEP 9: SUMMARY
# One short line group per trained model, read back from the result dicts.
print("\n" + "="*60)
print("TRAINING COMPLETE - SUMMARY")
print("="*60)

for result in all_results:
    print(f"\n{result['model_name']}:")
    print(f"  Weighted F1: {result['f1_weighted']:.4f}")
    print(f"  Training Time: {result['training_time_minutes']:.2f} min")
    print(f"  Parameters: {result['num_parameters']:,}")

print(f"\nAll results saved to: /content/drive/MyDrive/xray_research_results/")

Dataset: dataset3_nigeria_chest_xray
Income Level: LMIC
Classes: Normal, Pneumonia, Tuberculosis (COVID-19 dropped)
Mounted at /content/drive
Downloading dataset...
Downloading from https://www.kaggle.com/api/v1/datasets/download/aminumusa/nigeria-chest-x-ray-dataset?dataset_version_number=1...


100%|██████████| 241M/241M [00:07<00:00, 34.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/aminumusa/nigeria-chest-x-ray-dataset/versions/1

Dataset structure:
1/
  my_dataset/
    train_folder/
      PNEUMONIA/
        Viral Pneumonia-439.png
        Viral Pneumonia-329.png
        Viral Pneumonia-8.png
        Viral Pneumonia-334.png
        Viral Pneumonia-169.png
        ... and 495 more files
      COVID/
        COVID-463.png
        COVID-176.png
        COVID-354.png
        COVID-340.png
        COVID-467.png
        ... and 495 more files
      TB/
        Tuberculosis-251.png
        Tuberculosis-119.png
        Tuberculosis-451.png
        Tuberculosis-12.png
        Tuberculosis-327.png
        ... and 495 more files
      NORMAL/
        Normal-528.png
        Normal-308.png
        Normal-272.png
        Normal-430.png
        Normal-503.png
        ... and 495 more files
    test_folder/
      PNEUMONIA/
        Viral Pneumonia-579.png
        Viral Pneumonia-589.png
        Viral Pneumonia-517.png
      

In [None]:
# ==========================================
# TRAIN RESNET50 ONLY FOR DATASET 3
# NIGERIA CHEST X-RAY DATABASE - FULL SETUP
# ==========================================

# STEP 1: METADATA
# Redefined here so this cell can run on its own in a fresh runtime.
# DATASET_NAME and COUNTRY_INCOME_LEVEL end up in the results JSON and its
# file name; CLASS_NAMES labels the per-class metrics.
DATASET_NAME = "dataset3_nigeria_chest_xray"
COUNTRY_INCOME_LEVEL = "LMIC"
NUM_CLASSES = 3
CLASS_NAMES = ['Normal', 'Pneumonia', 'TB']

print(f"Training ResNet50 for {DATASET_NAME}")

# STEP 2: MOUNT DRIVE
# The results file is written under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# STEP 3: IMPORTS
import kagglehub
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import json
import time
import gc
import shutil
import warnings
warnings.filterwarnings('ignore')

# STEP 4: DOWNLOAD DATASET
# kagglehub returns the local directory that holds the dataset files.
print("\nDownloading dataset...")
path = kagglehub.dataset_download("aminumusa/nigeria-chest-x-ray-dataset")
print("Path to dataset files:", path)

# STEP 5: ORGANIZE DATA
print("\nOrganizing data into train/val/test splits...")
print("NOTE: Dropping all COVID-19 images as per project requirements")

# Destination layout: <base_dir>/<split>/<class>/; exist_ok=True makes
# re-running the cell safe.
base_dir = '/content/organized_data'
for split in ['train', 'val', 'test']:
    for class_name in ['Normal', 'Pneumonia', 'TB']:
        os.makedirs(os.path.join(base_dir, split, class_name), exist_ok=True)

def organize_dataset(source_path, base_dir, train_ratio=0.70, val_ratio=0.15, test_ratio=0.15):
    """Organize Nigeria dataset into train/val/test splits

    Image files (.png/.jpg/.jpeg) found anywhere under ``source_path`` are
    grouped into Normal / Pneumonia / TB by the lower-cased name of the
    folder that directly contains them; files in COVID folders are counted
    but not copied. Each class is shuffled with a fixed seed and copied to
    ``base_dir/<split>/<class>/<class>_<split>_<index><ext>``.

    Args:
        source_path: Root directory of the downloaded dataset.
        base_dir: Destination root. Split/class folders are created if
            missing. Files already present there are not removed.
        train_ratio: Fraction of each class copied to ``train``.
        val_ratio: Fraction of each class copied to ``val``.
        test_ratio: Accepted for interface compatibility only; the test
            split is the remainder after the train and validation slices.

    Returns:
        None. Files are copied on disk and a summary is printed.
    """
    class_images = {
        'Normal': [],
        'Pneumonia': [],
        'TB': []
    }

    covid_count = 0

    for root, dirs, files in os.walk(source_path):
        # The class is decided by the folder that directly contains the file.
        parent_folder = os.path.basename(root).lower()
        for file in files:
            if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
                continue
            file_path = os.path.join(root, file)

            if 'normal' in parent_folder:
                class_images['Normal'].append(file_path)
            elif 'pneumonia' in parent_folder:
                class_images['Pneumonia'].append(file_path)
            elif 'tb' in parent_folder or 'tuberculosis' in parent_folder:
                class_images['TB'].append(file_path)
            elif 'covid' in parent_folder:
                # Counted for the report only; COVID-19 images are dropped.
                covid_count += 1

    print(f"\nFound images:")
    print(f"  Normal: {len(class_images['Normal'])}")
    print(f"  Pneumonia: {len(class_images['Pneumonia'])}")
    print(f"  TB: {len(class_images['TB'])}")
    print(f"  COVID-19 (dropped): {covid_count}")

    for class_name, images in class_images.items():
        if not images:
            print(f"WARNING: No images found for class {class_name}")
            continue

        # os.walk order depends on the filesystem, so sort before shuffling
        # to make the seed-42 split reproducible. A private RandomState gives
        # the same permutation as seeding the global RNG, without touching it.
        images.sort()
        np.random.RandomState(42).shuffle(images)

        n = len(images)
        train_end = int(n * train_ratio)
        val_end = train_end + int(n * val_ratio)

        # Test receives the remainder, including rounding leftovers.
        splits = [
            (images[:train_end], 'train'),
            (images[train_end:val_end], 'val'),
            (images[val_end:], 'test'),
        ]
        train_files, val_files, test_files = (part for part, _ in splits)

        print(f"\n{class_name} split: Train={len(train_files)}, Val={len(val_files)}, Test={len(test_files)}")

        for split_files, split in splits:
            dest_dir = os.path.join(base_dir, split, class_name)
            # Do not rely on the caller having created the folder tree.
            os.makedirs(dest_dir, exist_ok=True)
            for i, src in enumerate(split_files):
                dest = os.path.join(dest_dir, f"{class_name}_{split}_{i}{os.path.splitext(src)[1]}")
                shutil.copy2(src, dest)

organize_dataset(path, base_dir)

# STEP 6: SETUP DATA PREPROCESSING
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
EPOCHS = 20

# Light augmentation for training images; pixel values are scaled to [0, 1].
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    zoom_range=0.1,
    brightness_range=[0.9, 1.1],
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1
)

# Validation and test images are only rescaled.
val_test_datagen = ImageDataGenerator(rescale=1./255)

train_dir = os.path.join(base_dir, 'train')
val_dir = os.path.join(base_dir, 'val')
test_dir = os.path.join(base_dir, 'test')

# `classes=CLASS_NAMES` pins label index i to CLASS_NAMES[i] instead of
# relying on alphabetical folder order; `seed` makes the training shuffle
# and augmentation stream repeatable between runs.
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES,
    shuffle=True,
    seed=42
)

# shuffle=False keeps file order so predictions can be compared
# position-by-position with `generator.classes`.
validation_generator = val_test_datagen.flow_from_directory(
    val_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES,
    shuffle=False
)

test_generator = val_test_datagen.flow_from_directory(
    test_dir,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
    classes=CLASS_NAMES,
    shuffle=False
)

print(f"\nData loaded:")
print(f"Training samples: {train_generator.samples}")
print(f"Validation samples: {validation_generator.samples}")
print(f"Test samples: {test_generator.samples}")
print(f"Classes found: {train_generator.class_indices}")

# STEP 7: COMPUTE CLASS WEIGHTS
# 'balanced' weights are inversely proportional to class frequency.
y_train = train_generator.classes
class_weights_array = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
# Key each weight by the label it was computed for (not by its position in
# the array, which would misalign if a label were absent from y_train) and
# store plain Python numbers.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(np.unique(y_train), class_weights_array)
}

print(f"\nClass distribution in training:")
for i, class_name in enumerate(CLASS_NAMES):
    print(f"  {class_name} ({i}): {np.sum(y_train == i)} samples")
print(f"Class weights computed: {class_weights}")

# STEP 8: CREATE MODEL
def create_resnet50(input_shape=(224, 224, 3), num_classes=3):
    """Create ResNet50 transfer learning model

    Frozen ImageNet ResNet50 backbone followed by
    GlobalAveragePooling -> Dense(128) -> Dropout(0.5) -> softmax.

    The model takes images scaled to [0, 1], which is what this notebook's
    ``ImageDataGenerator(rescale=1./255)`` produces. The pretrained ResNet50
    weights expect ``resnet50.preprocess_input``-processed 0-255 pixels, so
    the model restores the 0-255 range and applies that preprocessing before
    the backbone.

    Args:
        input_shape: Shape of one input image, (height, width, channels).
        num_classes: Number of softmax output units.

    Returns:
        An uncompiled ``keras.Model``.
    """
    base = tf.keras.applications.ResNet50(
        input_shape=input_shape,
        include_top=False,
        weights='imagenet'
    )

    # Only the new head is trained.
    base.trainable = False

    inputs = keras.Input(shape=input_shape)
    # Undo the generators' 1/255 rescale, then apply ResNet50's own
    # preprocessing. Neither step adds weights.
    x = layers.Rescaling(255.0)(inputs)
    x = tf.keras.applications.resnet50.preprocess_input(x)
    # training=False keeps the frozen BatchNorm layers in inference mode.
    x = base(x, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    model = keras.Model(inputs, outputs)
    return model

# STEP 9: TRAIN AND EVALUATE
def train_and_evaluate(model, model_name, train_gen, val_gen, test_gen, class_weights):
    """Train model and return results

    Compiles ``model`` with Adam and categorical cross-entropy, trains for up
    to ``EPOCHS`` epochs with early stopping and learning-rate reduction on
    ``val_loss``, evaluates on ``test_gen``, writes the metrics to a JSON
    file on Drive and returns them.

    Args:
        model: Uncompiled Keras model.
        model_name: Label used in log output and in the results file name.
        train_gen: Training data generator.
        val_gen: Validation data generator.
        test_gen: Test data generator; must not shuffle, because predictions
            are compared position-by-position with ``test_gen.classes``.
        class_weights: Mapping of class index to loss weight, passed to
            ``model.fit``.

    Returns:
        dict with dataset/model identifiers, per-class and weighted F1,
        confusion matrix, training time in minutes, sample counts, parameter
        count, test accuracy and number of epochs actually trained.
    """
    print(f"\n{'='*50}")
    print(f"Training {model_name}")
    print(f"{'='*50}")

    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    early_stop = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    )

    # Halve the learning rate after 3 epochs without val_loss improvement.
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    )

    start_time = time.time()
    history = model.fit(
        train_gen,
        epochs=EPOCHS,
        validation_data=val_gen,
        callbacks=[early_stop, reduce_lr],
        class_weight=class_weights,
        verbose=1
    )
    training_time = (time.time() - start_time) / 60  # in minutes

    print("\nEvaluating on test set...")
    _, test_acc = model.evaluate(test_gen, verbose=0)

    # Rewind the generator so the prediction order starts at the first batch
    # and lines up with test_gen.classes.
    test_gen.reset()
    predictions = model.predict(test_gen)
    y_pred = np.argmax(predictions, axis=1)
    y_true = test_gen.classes

    # Passing `labels` keeps one entry per class (in CLASS_NAMES order) even
    # if a class never shows up in y_true/y_pred; without it the per-class
    # list and the confusion matrix would silently shrink.
    labels = list(range(NUM_CLASSES))
    f1_per_class = f1_score(y_true, y_pred, labels=labels, average=None).tolist()
    f1_weighted = f1_score(y_true, y_pred, labels=labels, average='weighted')
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    results = {
        'dataset_name': DATASET_NAME,
        'country_income': COUNTRY_INCOME_LEVEL,
        'model_name': model_name,
        'num_classes': NUM_CLASSES,
        'class_names': CLASS_NAMES,
        'f1_per_class': f1_per_class,
        'f1_weighted': float(f1_weighted),
        'confusion_matrix': cm.tolist(),
        'training_time_minutes': float(training_time),
        'num_images_train': train_gen.samples,
        'num_images_val': val_gen.samples,
        'num_images_test': test_gen.samples,
        'num_parameters': int(model.count_params()),
        'test_accuracy': float(test_acc),
        'epochs_trained': len(history.history['loss'])
    }

    print(f"\n{model_name} Results:")
    print(f"F1 Scores per class: {f1_per_class}")
    print(f"Weighted F1 Score: {f1_weighted:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Training time: {training_time:.2f} minutes")
    print(f"Parameters: {model.count_params():,}")

    # This cell never creates the results folder itself, so make sure it
    # exists before writing; otherwise a finished training run could be lost
    # at the final step.
    results_dir = '/content/drive/MyDrive/xray_research_results'
    os.makedirs(results_dir, exist_ok=True)
    filename = f'{results_dir}/{DATASET_NAME}_{COUNTRY_INCOME_LEVEL}_{model_name}_results.json'
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to: {filename}")

    # `del` only drops this function's reference; the caller's variable keeps
    # the model alive until it is rebound.
    del model
    tf.keras.backend.clear_session()
    gc.collect()

    return results

# STEP 10: TRAIN RESNET50
print("\n" + "="*60)
print("STARTING RESNET50 TRAINING")
print("="*60)

# Build the model and run the full train/evaluate/save cycle with the
# class weights computed in STEP 7.
model = create_resnet50(num_classes=NUM_CLASSES)
results = train_and_evaluate(
    model,
    'ResNet50',
    train_generator,
    validation_generator,
    test_generator,
    class_weights
)

# STEP 11: SUMMARY
# Values are read back from the dict returned by train_and_evaluate.
print("\n" + "="*60)
print("TRAINING COMPLETE")
print("="*60)
print(f"\nResNet50:")
print(f"  Weighted F1: {results['f1_weighted']:.4f}")
print(f"  F1 per class: {results['f1_per_class']}")
print(f"  Training Time: {results['training_time_minutes']:.2f} min")
print(f"  Parameters: {results['num_parameters']:,}")
print(f"\nResults saved to: /content/drive/MyDrive/xray_research_results/")

Training ResNet50 for dataset3_nigeria_chest_xray
Mounted at /content/drive

Downloading dataset...
Using Colab cache for faster access to the 'nigeria-chest-x-ray-dataset' dataset.
Path to dataset files: /kaggle/input/nigeria-chest-x-ray-dataset

Organizing data into train/val/test splits...
NOTE: Dropping all COVID-19 images as per project requirements

Found images:
  Normal: 650
  Pneumonia: 650
  TB: 650
  COVID-19 (dropped): 650

Normal split: Train=454, Val=97, Test=99

Pneumonia split: Train=454, Val=97, Test=99

TB split: Train=454, Val=97, Test=99
Found 1362 images belonging to 3 classes.
Found 291 images belonging to 3 classes.
Found 297 images belonging to 3 classes.

Data loaded:
Training samples: 1362
Validation samples: 291
Test samples: 297
Classes found: {'Normal': 0, 'Pneumonia': 1, 'TB': 2}

Class distribution in training:
  Normal (0): 454 samples
  Pneumonia (1): 454 samples
  TB (2): 454 samples
Class weights computed: {0: np.float64(1.0), 1: np.float64(1.0), 2: n