<a href="https://colab.research.google.com/github/10710arnav/Noesis/blob/main/Aryan%20Basnet%2C%20Arnav%20Maharjan%20and%20Ashila%20A%20M%20Ardiyansyah/00_initial_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NOTE: This Colab notebook is a **test run**.
# The RSUA dataset is relatively small, and this script is used primarily to demonstrate the workflow for training different models (Baseline CNN, MobileNetV2, EfficientNetB0, ResNet50) on a dataset. It mirrors the approach we took for the six main datasets in our research. The results here are not intended to be publication-quality; the purpose is to validate the pipeline and ensure that each model type is handled properly.


# **NOTE ON RUNTIME AND OUTPUTS**

# Google Colab may have disconnected or reset the runtime during long training sessions, after crashes, or when memory ran out. When this occurred, some previously displayed outputs in the notebook were no longer visible. However, all results were still saved and logged correctly: each model’s complete metrics and metadata were stored as JSON files in our shared Google Drive folder:

# [https://drive.google.com/drive/folders/1ejlJaZhHEBm-1khLBJ--mbG2pg5TZHoJ?usp=sharing](https://drive.google.com/drive/folders/1ejlJaZhHEBm-1khLBJ--mbG2pg5TZHoJ?usp=sharing)

# These JSON files contain the full and reliable outputs for all models across all datasets, even if certain notebook outputs were lost due to runtime resets.

# ==============================
# SETUP: Freeze all package versions
# ==============================
To ensure reproducibility, install the exact package versions used in these notebooks, including the packages that come pre-installed in Colab.

The packages and versions used are:

- numpy==1.25.2
- pandas==2.1.1
- matplotlib==3.8.0
- seaborn==0.12.2
- scikit-learn==1.3.2
- tensorflow==2.15.0
- keras==2.15.0
- scipy==1.11.2
- opencv-python==4.9.0.73
- Pillow==10.0.1
- h5py==3.9.0
- google-colab==2.0.0

In [None]:
import zipfile, os, shutil, time, gc, json, warnings
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

# ==========================
# STEP 0: PATHS AND EXTRACTION
# ==========================
# Archive uploaded to the Colab runtime, and the folder it is unpacked into.
zip_path = '/content/Data Chest X-Ray RSUA (Annotated)-20230618T030427Z-001.zip'
extract_path = '/content/RSUA_dataset'

# Unpack only once: an existing target folder is treated as "already extracted".
if os.path.exists(extract_path):
    print(f"Dataset already extracted at: {extract_path}")
else:
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_path)
    print(f"Dataset extracted to: {extract_path}")

# Source image folders inside the extracted archive.
# The "Non_Covid" folder supplies the Normal class and "Non_Covid_Pneumonia"
# supplies the Pneumonia class.
DATASET_ROOT = "/content/RSUA_dataset/Data Chest X-Ray RSUA (Annotated)"
normal_dir = os.path.join(DATASET_ROOT, "Non_Covid", "images")
pneumonia_dir = os.path.join(DATASET_ROOT, "Non_Covid_Pneumonia", "images")

# ==========================
# STEP 1: METADATA
# ==========================
# Descriptive fields that are written into every results JSON file.
DATASET_NAME = "RSUA_chest_xray"
COUNTRY_INCOME_LEVEL = "LMIC"
NUM_CLASSES = 2
CLASS_NAMES = ['Normal', 'Pneumonia']

print(f"Dataset: {DATASET_NAME}, Income Level: {COUNTRY_INCOME_LEVEL}, Classes: {CLASS_NAMES}")

# ==========================
# STEP 2: MOUNT DRIVE
# ==========================
# Results are persisted to Google Drive so they survive runtime resets.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
results_dir = '/content/drive/MyDrive/xray_research_results'
os.makedirs(results_dir, exist_ok=True)

# ==========================
# STEP 3: SPLIT FUNCTION
# ==========================
# Layout: <base_dir>/<split>/<class name>/, one sub-folder per class so that
# flow_from_directory can infer the labels.
base_dir = "/content/RSUA_split"
train_dir, val_dir, test_dir = (
    os.path.join(base_dir, split_name) for split_name in ("train", "val", "test")
)

for split_root in (train_dir, val_dir, test_dir):
    for label in CLASS_NAMES:
        os.makedirs(os.path.join(split_root, label), exist_ok=True)

def split_copy(src_dir, class_name):
    """Split the images under ``src_dir`` 70/15/15 and copy them into the split folders.

    Every file below ``src_dir`` (searched recursively) whose name ends in a
    known image extension is assigned to train, validation or test with two
    seeded ``train_test_split`` calls, then copied to
    ``<train_dir|val_dir|test_dir>/<class_name>/``.

    Args:
        src_dir: Folder that holds the source images of one class.
        class_name: Name of the class sub-folder inside each split folder.

    Raises:
        ValueError: If no image file is found under ``src_dir`` (or, from
            scikit-learn, if there are too few files to form all three splits).
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp')

    # os.walk yields entries in filesystem order, which is not guaranteed to be
    # stable. Sorting makes the seeded split pick the same files on every run.
    all_files = sorted(
        os.path.join(root, name)
        for root, _dirs, names in os.walk(src_dir)
        for name in names
        if name.lower().endswith(image_extensions)
    )
    if not all_files:
        raise ValueError(f"No image files found under: {src_dir}")

    # 70% train, then the remaining 30% is halved into validation and test.
    train_files, temp_files = train_test_split(all_files, test_size=0.3, random_state=42)
    val_files, test_files = train_test_split(temp_files, test_size=0.5, random_state=42)

    assignments = (
        (train_dir, train_files),
        (val_dir, val_files),
        (test_dir, test_files),
    )
    for split_dir, split_files in assignments:
        dest = os.path.join(split_dir, class_name)
        # Start from an empty class folder: files left over from an earlier run
        # (possibly with a different split) would otherwise accumulate and
        # could end up in more than one split.
        shutil.rmtree(dest, ignore_errors=True)
        os.makedirs(dest, exist_ok=True)
        # NOTE: files are copied flat, so two source files with the same
        # basename in different sub-folders overwrite one another.
        for fpath in split_files:
            shutil.copy(fpath, dest)

# Populate the train/val/test folders, one call per class.
split_copy(normal_dir, 'Normal')
split_copy(pneumonia_dir, 'Pneumonia')

# ==========================
# STEP 4: DATA AUGMENTATION
# ==========================
IMG_SIZE = (224, 224)
BATCH_SIZE = 8
EPOCHS = 20

# Training images are rescaled to [0, 1] and randomly augmented on the fly.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=25,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.2,
    brightness_range=[0.8,1.2],
    horizontal_flip=True,
    fill_mode='nearest'
)
# Validation and test images are only rescaled, never augmented.
val_test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    train_dir, target_size=IMG_SIZE, batch_size=BATCH_SIZE,
    class_mode='categorical', shuffle=True
)
# shuffle=False keeps the sample order fixed so that predictions can be
# compared against ``generator.classes`` afterwards.
validation_generator = val_test_datagen.flow_from_directory(
    val_dir, target_size=IMG_SIZE, batch_size=BATCH_SIZE,
    class_mode='categorical', shuffle=False
)
test_generator = val_test_datagen.flow_from_directory(
    test_dir, target_size=IMG_SIZE, batch_size=BATCH_SIZE,
    class_mode='categorical', shuffle=False
)

# ==========================
# STEP 5: CLASS WEIGHTS
# ==========================
# "balanced" weights compensate for the unequal number of images per class.
y_train = train_generator.classes
present_labels = np.unique(y_train)
class_weights_array = compute_class_weight('balanced', classes=present_labels, y=y_train)
# Key each weight by the label it was computed for (not by its position in the
# array) and store plain Python numbers instead of NumPy scalars.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(present_labels, class_weights_array)
}
print(f"\nClass weights: {class_weights}")

# ==========================
# STEP 6: MODEL DEFINITIONS
# ==========================
def create_baseline_cnn(input_shape=(224,224,3), num_classes=2):
    """Build the small from-scratch CNN used as the baseline model.

    Args:
        input_shape: Shape of one input image as (height, width, channels).
        num_classes: Number of units in the final softmax layer.

    Returns:
        An uncompiled ``keras.Sequential`` model: three convolution layers
        (the first two each followed by max pooling), then a 64-unit dense
        layer with dropout and a softmax output.
    """
    # Convolutional feature extractor.
    conv_stack = [
        layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2,2)),
        layers.Conv2D(64, (3,3), activation='relu'),
        layers.MaxPooling2D((2,2)),
        layers.Conv2D(64, (3,3), activation='relu'),
    ]
    # Dense classification head.
    head = [
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax'),
    ]
    return keras.Sequential(conv_stack + head)

def create_transfer_model(base_model_name, input_shape=(224,224,3), num_classes=2):
    """Build a fine-tuning model on top of an ImageNet-pretrained backbone.

    The model expects images scaled to [0, 1], which is what the data
    generators in this notebook produce (``rescale=1./255``). Inside the model
    the pixels are scaled back to [0, 255] and passed through the backbone's
    own ``preprocess_input``, so every backbone receives the input range it was
    pretrained with.

    Args:
        base_model_name: One of 'MobileNetV2', 'EfficientNetB0' or 'ResNet50'.
        input_shape: Shape of one input image as (height, width, channels).
        num_classes: Number of units in the final softmax layer.

    Returns:
        An uncompiled ``keras.Model`` whose backbone has its first 85% of
        layers frozen, followed by global average pooling, a 128-unit dense
        layer with dropout and a softmax output.

    Raises:
        ValueError: If ``base_model_name`` is not a supported backbone.
    """
    applications = tf.keras.applications
    # Backbone constructor paired with the preprocessing it was trained with.
    backbones = {
        'MobileNetV2': (applications.MobileNetV2, applications.mobilenet_v2.preprocess_input),
        'EfficientNetB0': (applications.EfficientNetB0, applications.efficientnet.preprocess_input),
        'ResNet50': (applications.ResNet50, applications.resnet50.preprocess_input),
    }
    if base_model_name not in backbones:
        raise ValueError(f"Unknown model: {base_model_name}")
    build_backbone, preprocess_input = backbones[base_model_name]

    base = build_backbone(input_shape=input_shape, include_top=False, weights='imagenet')

    # Fine-tune only the last 15% of the backbone layers.
    base.trainable = True
    freeze_until = int(len(base.layers) * 0.85)
    for layer in base.layers[:freeze_until]:
        layer.trainable = False

    inputs = keras.Input(shape=input_shape)
    # Undo the generators' 1/255 rescaling, then apply the backbone-specific
    # preprocessing, which is defined on [0, 255] pixel values.
    x = layers.Rescaling(255.0)(inputs)
    x = preprocess_input(x)
    # training=False keeps the backbone (e.g. its batch-norm layers) in
    # inference mode even though some of its layers are trainable.
    x = base(x, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    return keras.Model(inputs, outputs)

# ==========================
# STEP 7: TRAINING FUNCTION
# ==========================
def train_and_evaluate(model, model_name):
    """Train ``model``, evaluate it on the test split and save the metrics as JSON.

    Uses the module-level generators (``train_generator``,
    ``validation_generator``, ``test_generator``), ``class_weights`` and
    ``EPOCHS``. The results are written to
    ``/content/drive/MyDrive/xray_research_results/<DATASET_NAME>_<model_name>_results.json``.

    Args:
        model: An uncompiled Keras model with a softmax output.
        model_name: Label used in log messages and in the results file name.

    Returns:
        A dict with dataset metadata, per-class and weighted F1 scores, the
        confusion matrix, training time in minutes, image counts, parameter
        count and the number of epochs actually trained.
    """
    print(f"\n{'='*50}\nTraining {model_name}\n{'='*50}")

    model.compile(
        optimizer=keras.optimizers.Adam(1e-4),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    # Stop once validation loss stalls (restoring the best weights), and halve
    # the learning rate after two epochs without improvement.
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
    reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-7)

    start_time = time.time()
    history = model.fit(
        train_generator,
        epochs=EPOCHS,
        validation_data=validation_generator,
        class_weight=class_weights,
        callbacks=[early_stop, reduce_lr],
        verbose=1
    )
    training_time = (time.time() - start_time)/60

    # test_generator is created with shuffle=False, so the prediction order
    # matches test_generator.classes.
    predictions = model.predict(test_generator)
    y_pred = np.argmax(predictions, axis=1)
    y_true = test_generator.classes

    # Pass the full label set explicitly so the per-class scores and the
    # confusion matrix always have one entry per class, in CLASS_NAMES order,
    # even when a class is missing from both the labels and the predictions.
    labels = list(range(NUM_CLASSES))
    f1_per_class = f1_score(y_true, y_pred, labels=labels, average=None, zero_division=0).tolist()
    f1_weighted = f1_score(y_true, y_pred, labels=labels, average='weighted', zero_division=0)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    num_parameters = int(model.count_params())

    results = {
        'dataset_name': DATASET_NAME,
        'country_income': COUNTRY_INCOME_LEVEL,
        'model_name': model_name,
        'num_classes': NUM_CLASSES,
        'class_names': CLASS_NAMES,
        'f1_per_class': f1_per_class,
        'f1_weighted': float(f1_weighted),
        'confusion_matrix': cm.tolist(),
        'training_time_minutes': float(training_time),
        'num_images_train': train_generator.samples,
        'num_images_val': validation_generator.samples,
        'num_images_test': test_generator.samples,
        'num_parameters': num_parameters,
        'epochs_trained': len(history.history['loss'])
    }

    filename = f'/content/drive/MyDrive/xray_research_results/{DATASET_NAME}_{model_name}_results.json'
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"{model_name} Weighted F1: {f1_weighted:.4f}, F1 per class: {f1_per_class}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Training time: {training_time:.2f} min, Params: {num_parameters:,}")

    # Drop this function's reference to the model and reset the Keras session
    # to release memory before the next model is built. Any reference the
    # caller still holds is unaffected by ``del``.
    del model
    tf.keras.backend.clear_session()
    gc.collect()

    return results

# ==========================
# STEP 8: TRAIN ALL 4 MODELS
# ==========================
# Train the four architectures one after another and collect their metrics.
all_results = []
for model_name in ['BaselineCNN', 'MobileNetV2', 'EfficientNetB0', 'ResNet50']:
    # The baseline is built from scratch; every other name is a pretrained backbone.
    model = (
        create_baseline_cnn(num_classes=NUM_CLASSES)
        if model_name == 'BaselineCNN'
        else create_transfer_model(model_name, num_classes=NUM_CLASSES)
    )
    results = train_and_evaluate(model, model_name)
    all_results.append(results)

# ==========================
# STEP 9: SUMMARY
# ==========================
# One line per model with its headline numbers.
print("\n" + "=" * 60)
print("ALL TRAINING COMPLETE")
for r in all_results:
    summary_line = (
        f"{r['model_name']}: Weighted F1: {r['f1_weighted']:.4f}, "
        f"Epochs: {r['epochs_trained']}, Params: {r['num_parameters']:,}"
    )
    print(summary_line)
print("\nAll results saved to: /content/drive/MyDrive/xray_research_results/")

Dataset already extracted at: /content/RSUA_dataset
Dataset: RSUA_chest_xray, Income Level: LMIC, Classes: ['Normal', 'Pneumonia']
Mounted at /content/drive
Found 98 images belonging to 2 classes.
Found 54 images belonging to 2 classes.
Found 54 images belonging to 2 classes.

Class weights: {0: np.float64(1.2564102564102564), 1: np.float64(0.8305084745762712)}

Training BaselineCNN
Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1s/step - accuracy: 0.4394 - loss: 0.8327 - val_accuracy: 0.5741 - val_loss: 0.6833 - learning_rate: 1.0000e-04
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 993ms/step - accuracy: 0.5595 - loss: 0.7244 - val_accuracy: 0.4815 - val_loss: 0.6932 - learning_rate: 1.0000e-04
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 970ms/step - accuracy: 0.5513 - loss: 0.6893 - val_accuracy: 0.4444 - val_loss: 0.7046 - learning_rate: 1.0000e-04
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 993ms/step
EfficientNetB0 Weighted F1: 0.4865, F1 per class: [0.0, 0.7727272727272727]
Confusion Matrix:
[[ 0 20]
 [ 0 34]]
Training time: 3.07 min, Params: 4,213,797
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step

Training ResNet50
Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 3s/step - accuracy: 0.3735 - loss: 1.0599 - val_accuracy: 0.6296 - val_loss: 0.6605 - learning_rate: 1.0000e-04
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2s/step - accuracy: 0.5531 - loss: 0.7701 - val_accuracy: 0.6296 - val_loss: 0.6642 - learning_rate: 1.0000e-04
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2s/step - accuracy: 0.4620 - loss: 0.7267 - val_accuracy: 0.6296 - val_l



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/step
ResNet50 Weighted F1: 0.4865, F1 per class: [0.0, 0.7727272727272727]
Confusion Matrix:
[[ 0 20]
 [ 0 34]]
Training time: 5.12 min, Params: 23,850,242

ALL TRAINING COMPLETE
BaselineCNN: Weighted F1: 0.5774, Epochs: 5, Params: 11,132,098
MobileNetV2: Weighted F1: 0.6537, Epochs: 20, Params: 2,422,210
EfficientNetB0: Weighted F1: 0.4865, Epochs: 13, Params: 4,213,797
ResNet50: Weighted F1: 0.4865, Epochs: 9, Params: 23,850,242

All results saved to: /content/drive/MyDrive/xray_research_results/
