In [None]:
!pip install -q kagglehub tensorflow numpy pandas matplotlib scikit-learn seaborn
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_curve, auc
)

# Globally ignore every warning issued through the `warnings` module for the
# rest of the session. NOTE(review): this also hides deprecation and numerical
# warnings, so remove or narrow it when debugging.
warnings.filterwarnings('ignore')

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Side length (in pixels) images are resized to, mini-batch size, and the
# default upper bound on training epochs.
IMG_SIZE = 224
BATCH_SIZE = 32
EPOCHS = 30

# Output folders used by the checkpoint callback and by the saved metrics/plots.
for _output_dir in ('models', 'results'):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# ============================================================================
# GOOGLE COLAB & GPU DETECTION
# ============================================================================

def check_colab_and_gpu():
    """Report the runtime environment and request GPU memory growth.

    Prints whether the ``google.colab`` package is importable, which GPUs
    TensorFlow can see, and the TensorFlow version. For every detected GPU,
    memory growth is requested via ``tf.config.experimental.set_memory_growth``.

    TensorFlow raises ``RuntimeError`` from ``set_memory_growth`` when the
    physical devices have already been initialised (for example if an earlier
    cell already ran work on the GPU). That situation is reported as a warning
    instead of aborting the rest of the pipeline.

    Returns:
        None. All information is written to stdout.
    """

    print("\n" + "=" * 80)
    print("GOOGLE COLAB ENVIRONMENT CHECK")
    print("=" * 80)

    try:
        # Imported only to probe whether the Colab package is available.
        import google.colab  # noqa: F401
        print("\n✅ Running in Google Colab")
    except ImportError:
        print("\n⚠️  Not running in Google Colab (local environment)")

    # Check GPU
    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        print(f"\n✅ GPU DETECTED: {len(gpus)} GPU(s) available")
        for gpu in gpus:
            print(f"   {gpu}")

        # Memory growth has to be configured before the devices are
        # initialised; if that already happened, keep going with the
        # current allocation strategy rather than crashing.
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            print(f"\n⚠️  Could not enable GPU memory growth: {err}")
        else:
            print("\n✅ GPU memory growth enabled")
    else:
        print("\n⚠️  No GPU detected. Training will be slow.")

    print(f"\nTensorFlow version: {tf.__version__}")
    print("=" * 80)

In [None]:
# ============================================================================
# DOWNLOAD KAGGLE DATASET
# ============================================================================

def download_kaggle_dataset():
    """Fetch the bone-fracture X-ray dataset through ``kagglehub``.

    Returns:
        The value returned by ``kagglehub.dataset_download`` for the dataset
        handle, which is printed as the download location.
    """
    dataset_handle = "vuppalaadithyasairam/bone-fracture-detection-using-xrays"
    rule = "=" * 80

    print("\n" + rule)
    print("DOWNLOADING KAGGLE DATASET")
    print(rule)

    # Imported inside the function, so kagglehub is only required once this
    # function is actually called.
    import kagglehub

    print(f"\nDataset: {dataset_handle}")
    download_location = kagglehub.dataset_download(dataset_handle)

    print(f"\n✅ Dataset downloaded to: {download_location}")

    return download_location

In [None]:
# ============================================================================
# FIND DATASET LOCATION
# ============================================================================

def _print_split_contents(title, split_path):
    """Print, for each sub-folder of ``split_path``, how many entries it holds."""
    print(f"\n{title} folder contents:")
    for item in os.listdir(split_path):
        item_path = os.path.join(split_path, item)
        if os.path.isdir(item_path):
            # Counts every directory entry, not only image files.
            count = len(os.listdir(item_path))
            print(f"  {item}/: {count} images")


def find_dataset_path(kaggle_path):
    """Locate the directory that contains both ``train/`` and ``val/``.

    The known archive layouts are tried first (``<path>/1/archive (6)``,
    ``<path>/archive (6)`` and ``<path>`` itself). If none of them matches,
    the whole tree below ``kaggle_path`` is walked top-down, in sorted order,
    and the first directory holding both sub-folders is used. This keeps the
    lookup working when the archive is extracted under a different folder name.

    Args:
        kaggle_path: Root directory of the downloaded dataset.

    Returns:
        Path of the directory that directly contains ``train`` and ``val``.

    Raises:
        RuntimeError: If no directory below ``kaggle_path`` contains both
            a ``train`` and a ``val`` sub-folder.
    """

    print("\n" + "=" * 80)
    print("LOCATING DATASET STRUCTURE")
    print("=" * 80)

    def candidate_dirs():
        # Preferred, previously hard-coded locations.
        yield os.path.join(kaggle_path, '1', 'archive (6)')
        yield os.path.join(kaggle_path, 'archive (6)')
        yield kaggle_path
        # Fallback: any directory in the tree, visited deterministically.
        for dirpath, dirnames, _ in os.walk(kaggle_path):
            dirnames.sort()
            yield dirpath

    for candidate in candidate_dirs():
        train_path = os.path.join(candidate, 'train')
        val_path = os.path.join(candidate, 'val')

        # isdir() is False for missing paths, so a missing candidate is skipped.
        if not (os.path.isdir(train_path) and os.path.isdir(val_path)):
            continue

        print(f"\n✅ Found dataset at: {candidate}")
        print(f"   train/ folder: {train_path}")
        print(f"   val/ folder: {val_path}")

        _print_split_contents("Train", train_path)
        _print_split_contents("Validation", val_path)

        return candidate

    raise RuntimeError(f"Could not find train/val folders in {kaggle_path}")

In [None]:
# ============================================================================
# DATA LOADING
# ============================================================================

def load_data(data_path):
    """Create the training and validation image generators.

    Images are read from ``<data_path>/train`` and ``<data_path>/val`` as they
    are on disk. Training images are rescaled to [0, 1] and randomly augmented
    (rotation, shifts, flips, zoom, brightness, shear); validation images are
    only rescaled and are served in a fixed order.

    Args:
        data_path: Directory that contains the ``train`` and ``val`` folders.

    Returns:
        Tuple ``(train_gen, val_gen)`` of directory iterators with binary labels.
    """

    print("\n[1/5] Loading and preprocessing data...")

    def report_distribution(heading, generator):
        # Per-class sample count and share for one generator.
        print(f"\n{heading} class distribution:")
        n_total = len(generator.classes)
        for class_name, class_idx in generator.class_indices.items():
            count = np.count_nonzero(generator.classes == class_idx)
            pct = (count / n_total) * 100
            print(f"  {class_name}: {count} ({pct:.1f}%)")

    # Strong augmentation for training, plain rescaling for validation.
    augmentation = dict(
        rotation_range=30,
        width_shift_range=0.3,
        height_shift_range=0.3,
        horizontal_flip=True,
        vertical_flip=True,
        zoom_range=0.3,
        brightness_range=[0.8, 1.2],
        shear_range=0.2,
        fill_mode='nearest',
    )
    train_datagen = ImageDataGenerator(rescale=1./255, **augmentation)
    val_datagen = ImageDataGenerator(rescale=1./255)

    # Options shared by both splits; each split folder holds the class
    # sub-folders ("fractured" and "not fractured").
    shared_flow_options = dict(
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode='binary',
        seed=42,
    )

    train_gen = train_datagen.flow_from_directory(
        os.path.join(data_path, 'train'),
        shuffle=True,
        **shared_flow_options
    )
    # Validation order must stay fixed so predictions line up with labels.
    val_gen = val_datagen.flow_from_directory(
        os.path.join(data_path, 'val'),
        shuffle=False,
        **shared_flow_options
    )

    print(f"\nFound {train_gen.samples} training images belonging to 2 classes.")
    print(f"Found {val_gen.samples} validation images belonging to 2 classes.")
    print(f"\nClass labels mapping:")
    print(f"  {train_gen.class_indices}")

    report_distribution("Training", train_gen)
    report_distribution("Validation", val_gen)

    return train_gen, val_gen

In [None]:
# ============================================================================
# MODEL BUILDING
# ============================================================================

def build_model():
    """Assemble the convolutional classifier for the binary X-ray task.

    The network is a ``Sequential`` stack of four convolutional stages with
    32, 64, 128 and 256 filters; the first three stages hold two 3x3
    convolutions each, the last one holds a single convolution. Every
    convolution is followed by batch normalisation and ReLU, and every stage
    ends with 2x2 max-pooling and 25% dropout. Global average pooling then
    feeds two dense stages (256 and 128 units, each with batch normalisation,
    ReLU and 50% dropout) and a single sigmoid output unit.

    Returns:
        The un-compiled model. Its summary is printed as a side effect.
    """

    print("\n[2/5] Building model architecture...")

    def conv_unit(filters, **conv_kwargs):
        # Conv -> BatchNorm -> ReLU
        return [
            layers.Conv2D(filters, (3, 3), padding='same', **conv_kwargs),
            layers.BatchNormalization(),
            layers.Activation('relu'),
        ]

    def dense_unit(units):
        # Dense (linear) -> BatchNorm -> ReLU -> Dropout
        return [
            layers.Dense(units, activation=None),
            layers.BatchNormalization(),
            layers.Activation('relu'),
            layers.Dropout(0.5),
        ]

    # (filters, number of conv units) for blocks 1 to 4.
    stage_specs = [(32, 2), (64, 2), (128, 2), (256, 1)]

    stack = []
    for filters, n_units in stage_specs:
        for _ in range(n_units):
            # Only the very first layer of the network declares the input shape.
            first_layer_kwargs = (
                {} if stack else {'input_shape': (IMG_SIZE, IMG_SIZE, 3)}
            )
            stack.extend(conv_unit(filters, **first_layer_kwargs))
        stack.append(layers.MaxPooling2D((2, 2)))
        stack.append(layers.Dropout(0.25))

    # Global pooling
    stack.append(layers.GlobalAveragePooling2D())

    # Dense layers
    for units in (256, 128):
        stack.extend(dense_unit(units))

    # Output
    stack.append(layers.Dense(1, activation='sigmoid'))

    model = models.Sequential(stack)
    model.summary()

    return model

In [None]:
# ============================================================================
# MODEL TRAINING
# ============================================================================

def train_model(model, train_gen, val_gen, epochs=EPOCHS):
    """Compile and fit ``model`` using balanced class weights.

    Class weights are ``total_samples / (n_classes * samples_in_class)``,
    computed from ``train_gen.classes``. The model is compiled with Adam
    (learning rate 1e-3) and binary cross-entropy, tracking accuracy and AUC.
    Training uses early stopping and best-model checkpointing on ``val_auc``
    and halves the learning rate when ``val_loss`` plateaus.

    Args:
        model: Un-compiled Keras model with a single sigmoid output.
        train_gen: Training data iterator exposing ``classes``.
        val_gen: Validation data iterator.
        epochs: Maximum number of epochs to train for.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """

    print("\n[3/5] Training model...")

    # Balanced class weights from the training label counts.
    class_counts = np.bincount(train_gen.classes)
    n_classes = len(class_counts)
    total_samples = class_counts.sum()
    class_weights = {
        label: total_samples / (n_classes * n_in_class)
        for label, n_in_class in enumerate(class_counts)
    }

    print(f"\nClass Weights:")
    for label, weight in class_weights.items():
        print(f"  Class {label}: {class_counts[label]:5d} samples, weight: {weight:.4f}")

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=['accuracy', keras.metrics.AUC(name='auc')]
    )

    # Stop when validation AUC has not improved for 10 epochs and roll back
    # to the best weights seen.
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_auc',
        patience=10,
        restore_best_weights=True,
        verbose=1,
        mode='max'
    )
    # Keep only the checkpoint with the highest validation AUC on disk.
    checkpoint = keras.callbacks.ModelCheckpoint(
        'models/best_model.h5',
        monitor='val_auc',
        save_best_only=True,
        verbose=1,
        mode='max'
    )
    # Halve the learning rate after 5 epochs without val_loss improvement.
    lr_schedule = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    )

    return model.fit(
        train_gen,
        epochs=epochs,
        validation_data=val_gen,
        callbacks=[early_stopping, checkpoint, lr_schedule],
        class_weight=class_weights,
        verbose=1
    )

In [None]:
# ============================================================================
# MODEL EVALUATION
# ============================================================================

def evaluate_model(model, val_gen, train_gen, positive_class='fractured'):
    """Evaluate a binary classifier on the validation set.

    All reported numbers are derived from a single prediction pass and are
    expressed with respect to one positive class, so precision/recall, the
    TN/FP/FN/TP printout and the ROC curve agree with each other. The model's
    sigmoid output is the score of label 1; when the positive class has label
    0 the score is flipped (``1 - p``) for the ROC computation.

    Args:
        model: Trained Keras model whose output is one sigmoid probability
            per sample.
        val_gen: Validation iterator; ``val_gen.classes`` supplies the true
            labels, so it must yield samples in a fixed (unshuffled) order.
        train_gen: Iterator whose ``class_indices`` maps class names to labels.
        positive_class: Class name treated as the positive class in
            precision/recall/F1, the confusion-matrix printout and the ROC
            curve. Defaults to ``'fractured'``.

    Returns:
        Dict with keys ``history`` (always ``None`` here), ``accuracy``,
        ``precision``, ``recall``, ``f1_score``, ``auc_roc``, ``cm`` (2x2
        confusion matrix indexed by label: rows true, columns predicted),
        ``fpr``, ``tpr``, ``y_true``, ``y_pred``, ``y_pred_prob`` and
        ``class_names`` (ordered by label).

    Raises:
        KeyError: If ``positive_class`` is not a key of ``train_gen.class_indices``.
        ValueError: If ``train_gen.class_indices`` does not hold exactly two classes.

    Side effects:
        Prints the results and writes ``results/metrics_summary.csv``.
    """

    print("\n[4/5] Evaluating model...")

    class_indices = train_gen.class_indices
    # Order names by their label so they line up with report/matrix rows.
    class_names = sorted(class_indices, key=class_indices.get)
    if len(class_names) != 2:
        raise ValueError(
            f"evaluate_model expects exactly 2 classes, got {len(class_names)}"
        )
    if positive_class not in class_indices:
        raise KeyError(
            f"positive_class {positive_class!r} not found in {class_names}"
        )
    pos_idx = class_indices[positive_class]
    neg_idx = 1 - pos_idx
    labels = [0, 1]

    # Get predictions on validation set
    y_true = val_gen.classes
    y_pred_prob = model.predict(val_gen, verbose=0)
    y_score = np.asarray(y_pred_prob).ravel()  # probability of label 1
    y_pred = (y_score > 0.5).astype(int)

    # Accuracy from the same predictions as every other metric; this avoids a
    # second pass over the data and any dependence on the compiled metric order.
    test_accuracy = float(np.mean(y_pred == y_true))

    # Classification report (labels fixed so both classes always appear)
    report = classification_report(
        y_true, y_pred,
        labels=labels,
        target_names=class_names,
        output_dict=True
    )
    pos_metrics = report[positive_class]

    # Confusion matrix, always 2x2 and indexed by label
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    tp, fn = cm[pos_idx, pos_idx], cm[pos_idx, neg_idx]
    fp, tn = cm[neg_idx, pos_idx], cm[neg_idx, neg_idx]

    # ROC-AUC with respect to the positive class
    pos_score = y_score if pos_idx == 1 else 1.0 - y_score
    fpr, tpr, _ = roc_curve(y_true, pos_score, pos_label=pos_idx)
    roc_auc = auc(fpr, tpr)

    print("\n" + "=" * 80)
    print("VALIDATION RESULTS")
    print("=" * 80)

    print(f"\nValidation Accuracy:  {test_accuracy:.4f}")
    print(f"Validation Precision: {pos_metrics['precision']:.4f}")
    print(f"Validation Recall:    {pos_metrics['recall']:.4f}")
    print(f"Validation F1-Score:  {pos_metrics['f1-score']:.4f}")
    print(f"Validation AUC-ROC:   {roc_auc:.4f}")

    print("\n" + classification_report(
        y_true, y_pred,
        labels=labels,
        target_names=class_names
    ))

    print(f"\nConfusion Matrix (positive class: '{positive_class}'):")
    print(f"  TN: {tn:3d}  FP: {fp:3d}")
    print(f"  FN: {fn:3d}  TP: {tp:3d}")

    # Save metrics
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC'],
        'Value': [
            test_accuracy,
            pos_metrics['precision'],
            pos_metrics['recall'],
            pos_metrics['f1-score'],
            roc_auc
        ]
    })

    metrics_df.to_csv('results/metrics_summary.csv', index=False)
    print("\n✓ Saved: results/metrics_summary.csv")

    return {
        'history': None,
        'accuracy': test_accuracy,
        'precision': pos_metrics['precision'],
        'recall': pos_metrics['recall'],
        'f1_score': pos_metrics['f1-score'],
        'auc_roc': roc_auc,
        'cm': cm,
        'fpr': fpr,
        'tpr': tpr,
        'y_true': y_true,
        'y_pred': y_pred,
        'y_pred_prob': y_pred_prob,
        'class_names': class_names
    }


In [None]:
# ============================================================================
# VISUALIZATION
# ============================================================================

def create_visualizations(history, eval_results):
    """Render the training curves, confusion matrix and ROC curve to PNG files.

    Args:
        history: Keras ``History`` from training. When falsy, the training
            curves figure is still saved but its two panels stay empty.
        eval_results: Dict providing ``cm``, ``class_names``, ``accuracy``,
            ``fpr``, ``tpr`` and ``auc_roc``.

    Side effects:
        Writes ``training_curves.png``, ``confusion_matrix.png`` and
        ``roc_curve.png`` into ``results/`` and prints one line per file.
    """

    print("\n[5/5] Creating visualizations...")

    # --- Training curves: loss on the left, accuracy on the right ----------
    curves_fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 5))

    if history:
        panels = (
            (loss_ax, 'loss', 'Loss'),
            (acc_ax, 'accuracy', 'Accuracy'),
        )
        for ax, key, label in panels:
            ax.plot(history.history[key], label=f'Train {label}', linewidth=2)
            ax.plot(history.history[f'val_{key}'], label=f'Val {label}', linewidth=2)
            ax.set_xlabel('Epoch', fontsize=12)
            ax.set_ylabel(label, fontsize=12)
            ax.set_title(f'Model {label} Over Epochs', fontsize=14, fontweight='bold')
            ax.legend(fontsize=11)
            ax.grid(True, alpha=0.3)

    curves_fig.tight_layout()
    curves_fig.savefig('results/training_curves.png', dpi=300, bbox_inches='tight')
    plt.close(curves_fig)
    print("✓ Saved: results/training_curves.png")

    # --- Confusion matrix heatmap ------------------------------------------
    tick_labels = eval_results['class_names']
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        eval_results['cm'],
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
        cbar_kws={'label': 'Count'},
        ax=cm_ax,
    )
    cm_ax.set_title(
        f"Confusion Matrix (Accuracy: {eval_results['accuracy']*100:.2f}%)",
        fontsize=14, fontweight='bold'
    )
    cm_ax.set_ylabel('True Label', fontsize=12)
    cm_ax.set_xlabel('Predicted Label', fontsize=12)
    cm_fig.tight_layout()
    cm_fig.savefig('results/confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.close(cm_fig)
    print("✓ Saved: results/confusion_matrix.png")

    # --- ROC curve with the chance diagonal for reference ------------------
    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(
        eval_results['fpr'], eval_results['tpr'],
        color='darkorange', lw=2,
        label=f"ROC Curve (AUC = {eval_results['auc_roc']:.3f})"
    )
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
                label='Random Classifier')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate', fontsize=12)
    roc_ax.set_ylabel('True Positive Rate', fontsize=12)
    roc_ax.set_title('ROC Curve', fontsize=14, fontweight='bold')
    roc_ax.legend(loc="lower right", fontsize=11)
    roc_ax.grid(True, alpha=0.3)
    roc_fig.tight_layout()
    roc_fig.savefig('results/roc_curve.png', dpi=300, bbox_inches='tight')
    plt.close(roc_fig)
    print("✓ Saved: results/roc_curve.png")


In [None]:
# ============================================================================
# MAIN
# ============================================================================

def main():
    """Run the whole pipeline end to end.

    Steps, in order: environment/GPU check, dataset download, dataset
    location, generator creation, model construction, training, evaluation
    on the validation set, and plot generation. A summary of the files that
    were written is printed at the end.
    """

    # Check environment
    check_colab_and_gpu()

    # Download dataset
    kaggle_path = download_kaggle_dataset()

    # Find dataset structure
    data_path = find_dataset_path(kaggle_path)

    # Load data (NO reorganization - uses as-is!)
    train_gen, val_gen = load_data(data_path)

    # Build model
    model = build_model()

    # Train model
    history = train_model(model, train_gen, val_gen)

    # Evaluate
    eval_results = evaluate_model(model, val_gen, train_gen)
    eval_results['history'] = history

    # Visualize
    create_visualizations(history, eval_results)

    print("\n" + "=" * 80)
    print("TRAINING COMPLETE!")
    print("=" * 80)
    # Only list files that are actually written to ./results/; the model
    # checkpoint lives in ./models/ and is reported separately below.
    print("\n✅ All results saved to ./results/")
    print("   - metrics_summary.csv")
    print("   - training_curves.png")
    print("   - confusion_matrix.png")
    print("   - roc_curve.png")
    print("\n✅ Model saved to ./models/best_model.h5")

if __name__ == "__main__":
    main()



GOOGLE COLAB ENVIRONMENT CHECK

✅ Running in Google Colab

✅ GPU DETECTED: 1 GPU(s) available
   PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

✅ GPU memory growth enabled

TensorFlow version: 2.19.0

DOWNLOADING KAGGLE DATASET

Dataset: vuppalaadithyasairam/bone-fracture-detection-using-xrays
Downloading from https://www.kaggle.com/api/v1/datasets/download/vuppalaadithyasairam/bone-fracture-detection-using-xrays?dataset_version_number=1...


100%|██████████| 172M/172M [00:09<00:00, 19.7MB/s]

Extracting files...






✅ Dataset downloaded to: /root/.cache/kagglehub/datasets/vuppalaadithyasairam/bone-fracture-detection-using-xrays/versions/1

LOCATING DATASET STRUCTURE

✅ Found dataset at: /root/.cache/kagglehub/datasets/vuppalaadithyasairam/bone-fracture-detection-using-xrays/versions/1/archive (6)
   train/ folder: /root/.cache/kagglehub/datasets/vuppalaadithyasairam/bone-fracture-detection-using-xrays/versions/1/archive (6)/train
   val/ folder: /root/.cache/kagglehub/datasets/vuppalaadithyasairam/bone-fracture-detection-using-xrays/versions/1/archive (6)/val

Train folder contents:
  not fractured/: 4383 images
  fractured/: 4480 images

Validation folder contents:
  not fractured/: 240 images
  fractured/: 360 images

[1/5] Loading and preprocessing data...
Found 8863 images belonging to 2 classes.
Found 600 images belonging to 2 classes.

Found 8863 training images belonging to 2 classes.
Found 600 validation images belonging to 2 classes.

Class labels mapping:
  {'fractured': 0, 'not fractur


[3/5] Training model...

Class Weights:
  Class 0:  4480 samples, weight: 0.9892
  Class 1:  4383 samples, weight: 1.0111
Epoch 1/30
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 596ms/step - accuracy: 0.5129 - auc: 0.5147 - loss: 0.8114
Epoch 1: val_auc improved from -inf to 0.80371, saving model to models/best_model.h5




[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 623ms/step - accuracy: 0.5129 - auc: 0.5147 - loss: 0.8113 - val_accuracy: 0.6600 - val_auc: 0.8037 - val_loss: 0.6708 - learning_rate: 0.0010
Epoch 2/30
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 490ms/step - accuracy: 0.5342 - auc: 0.5520 - loss: 0.7241
Epoch 2: val_auc did not improve from 0.80371
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 494ms/step - accuracy: 0.5343 - auc: 0.5520 - loss: 0.7240 - val_accuracy: 0.5700 - val_auc: 0.7472 - val_loss: 0.5879 - learning_rate: 0.0010
Epoch 3/30
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 486ms/step - accuracy: 0.5607 - auc: 0.5893 - loss: 0.6883
Epoch 3: val_auc did not improve from 0.80371
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 491ms/step - accuracy: 0.5607 - auc: 0.5893 - loss: 0.6