In [63]:
from huggingface_hub import hf_hub_download
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import random

# 1. Download and extract dataset
repo_id = "BetoRivers/nhi_chest_xray"
hf_token = "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"  # Consider using environment variables for security

# Download the ZIP file
try:
    zip_path = hf_hub_download(
        repo_id=repo_id,
        filename="data.zip",
        repo_type="dataset",
        token=hf_token
    )
except Exception as e:
    raise Exception(f"Failed to download dataset: {str(e)}")

# Extract the zip
extract_dir = "/content/data"  # Base extraction directory
os.makedirs(extract_dir, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"✅ Extracted to: {extract_dir}")

# 2. Process the dataset
def process_dataset(base_path):
    """
    Processes the dataset and returns a list of (image_path, label) tuples
    """
    samples = []

    # Read labels.csv if it exists
    labels_path = os.path.join(base_path, "labels.csv")
    if os.path.exists(labels_path):
        df_labels = pd.read_csv(labels_path)
        # Create a mapping from filename to label
        label_map = dict(zip(df_labels['filename'], df_labels['outcome']))
    else:
        label_map = {}
        print("⚠️ labels.csv not found - using folder structure for labels")

    # Walk through all image files
    for root, _, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                file_path = os.path.join(root, file)

                # Try to get label from CSV first
                label = label_map.get(file, None)

                # If not in CSV, try to infer from folder structure
                if label is None:
                    if "NORMAL" in root.upper():
                        label = "NORMAL"
                    elif "PNEUMONIA" in root.upper():
                        if "VIRUS" in file.upper():
                            label = "P_VIRUS"
                        elif "BACTERIA" in file.upper():
                            label = "P_BAC"
                        else:
                            label = "PNEUMONIA"  # generic pneumonia label

                if label is not None:
                    samples.append((file_path, label))
                else:
                    print(f"⚠️ Could not determine label for: {file_path}")

    return samples

# Process all images
valid_samples = process_dataset(extract_dir)

if not valid_samples:
    raise ValueError("No valid samples found in the dataset!")

# 3. Create splits
# Convert to DataFrame
df_full = pd.DataFrame(valid_samples, columns=["path", "label"])

# Shuffle first
df_full = df_full.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into prediction (10%) and rest (90%)
df_pred, df_rest = train_test_split(df_full, test_size=0.10, random_state=42)

# Split rest into train (80%) and val (20%) of the remaining 90%
# So final distribution: 72% train, 18% val, 10% pred
df_train, df_val = train_test_split(df_rest, test_size=0.20, random_state=42)

# 4. Save splits
#splits_dir = os.path.join(extract_dir, "splits")
#os.makedirs(splits_dir, exist_ok=True)

#df_train.to_csv(os.path.join(splits_dir, "train.csv"), index=False)
#df_val.to_csv(os.path.join(splits_dir, "val.csv"), index=False)
#df_pred.to_csv(os.path.join(splits_dir, "pred.csv"), index=False)

print("\n✅ Dataset splits created:")
print(f"Train: {len(df_train)} samples ({len(df_train)/len(df_full):.1%})")
print(f"Validation: {len(df_val)} samples ({len(df_val)/len(df_full):.1%})")
print(f"Prediction: {len(df_pred)} samples ({len(df_pred)/len(df_full):.1%})")
#print(f"\nSplits saved to: {splits_dir}")

✅ Extracted to: /content/data

✅ Dataset splits created:
Train: 937 samples (8.0%)
Validation: 235 samples (2.0%)
Prediction: 10540 samples (90.0%)


In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms
import numpy as np
import pandas as pd
from PIL import Image
import random

class ChestXrayDataset(Dataset):
    def __init__(self, dataframe, transform=None, is_train=False):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform or self.get_default_transform()
        self.is_train = is_train

        self.label_mapping = {'NORMAL': 0, 'P_BAC': 1, 'P_VIRUS': 2}
        self.minority_classes = ['NORMAL', 'P_VIRUS']

        self.minority_transform = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(20),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def get_default_transform(self):
        return transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(row['path']).convert('RGB')

        # Convert label to tensor
        label = torch.tensor(self.label_mapping[row['label']], dtype=torch.long)

        if self.is_train and row['label'] in self.minority_classes:
            if random.random() < 0.5:
                img = self.minority_transform(img)
                return img, label

        img = self.transform(img)
        return img, label

    def __len__(self):
        return len(self.df)

In [65]:
# Create dataloaders
from torch.utils.data import DataLoader

# 2. Create weighted sampler
from torch.utils.data import WeightedRandomSampler

# Create datasets with augmentation JUST FOR TRAINING
train_dataset = ChestXrayDataset(df_train, is_train=True)  # Modified class
val_dataset = ChestXrayDataset(df_val)
test_dataset = ChestXrayDataset(df_pred)

# Calculate weights (inverse of class frequency)
class_counts = df_train['label'].value_counts().sort_index()
weights = 1. / class_counts[df_train['label']].values
sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

# 3. Create dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    sampler=sampler,  # replaces shuffle
    num_workers=2,    # reduced for stability
    pin_memory=True
)

val_loader = DataLoader(val_dataset, batch_size=64, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=64, num_workers=2)

# 4. Use with Focal Loss
alpha = torch.tensor([1/0.27, 1/0.475, 1/0.255]).to(device)  # Your class ratios
criterion = FocalLoss(alpha=alpha, gamma=2.0)

In [66]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm  # For progress bars

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train_and_validate(model, train_loader, val_loader, optimizer, criterion, num_epochs=10, patience=3):
    best_f1 = 0.0
    best_model_path = "chest-xray-resnet.pth"
    patience_counter = 0
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_f1': [],
        'val_f1': [],
        'train_acc': [],
        'val_acc': []
    }

    for epoch in range(num_epochs):
        # === Training Phase ===
        model.train()
        train_losses = []
        train_preds = []
        train_targets = []

        # Wrap train_loader with tqdm for progress bar
        train_iter = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]", leave=False)
        for imgs, labels in train_iter:
            imgs = imgs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(imgs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

            # Get predicted class (multi-class)
            _, preds = torch.max(outputs, 1)
            train_preds.append(preds.cpu().numpy())
            train_targets.append(labels.cpu().numpy())

            # Update progress bar
            train_iter.set_postfix(loss=loss.item())

        # === Validation Phase ===
        model.eval()
        val_losses = []
        val_preds = []
        val_targets = []

        with torch.no_grad():
            val_iter = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]", leave=False)
            for imgs, labels in val_iter:
                imgs = imgs.to(device)
                labels = labels.to(device)

                outputs = model(imgs)
                loss = criterion(outputs, labels)

                val_losses.append(loss.item())

                # Get predicted class (multi-class)
                _, preds = torch.max(outputs, 1)
                val_preds.append(preds.cpu().numpy())
                val_targets.append(labels.cpu().numpy())

                # Update progress bar
                val_iter.set_postfix(loss=loss.item())

        # === Compute Metrics ===
        # Convert lists to numpy arrays
        y_train_pred = np.concatenate(train_preds)
        y_train_true = np.concatenate(train_targets)
        y_val_pred = np.concatenate(val_preds)
        y_val_true = np.concatenate(val_targets)

        # Calculate metrics
        train_f1 = f1_score(y_train_true, y_train_pred, average='macro', zero_division=0)
        train_acc = accuracy_score(y_train_true, y_train_pred)
        val_f1 = f1_score(y_val_true, y_val_pred, average='macro', zero_division=0)
        val_acc = accuracy_score(y_val_true, y_val_pred)

        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)

        # Store metrics in history
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)
        history['train_f1'].append(train_f1)
        history['val_f1'].append(val_f1)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)

        # === Print Detailed Report ===
        print(f"\n📍 Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
        print(f"Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")
        print(f"Train F1: {train_f1:.4f} | Val F1: {val_f1:.4f}")

        #print("\nValidation Classification Report:")
        #print(classification_report(y_val_true, y_val_pred, target_names=['NORMAL', 'P_BAC', 'P_VIRUS']))

        # === Early Stopping Check ===
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save({
                'epoch': epoch+1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_val_loss,
                'f1': val_f1,
            }, best_model_path)
            patience_counter = 0
            print("✅ New best model saved.")
        else:
            patience_counter += 1
            print(f"⏳ Patience {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("⛔ Early stopping triggered.")
            break

    print(f"\n🏆 Best Validation F1-macro achieved: {best_f1:.4f}")
    return best_model_path, history

In [67]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pretrained ResNet50
model = models.resnet50(pretrained=True)

# Modify the final fully connected layer for 3-class classification
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_ftrs, 128),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 3) # 3 output classes: NORMAL, P_BAC, P_VIRUS
)

# nn.ReLU(), nn.Linear(64, 3) PRIMERO QUE AUMENTA EL NUMERO DE NODOS Y LUEGO YA VEMOS SI SE METEN MÁS LAYERS
model = model.to(device)

# Loss and optimizer - using CrossEntropyLoss for multi-class classification
criterion = nn.CrossEntropyLoss()  # Changed from BCEWithLogitsLoss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# If you have class imbalance, consider weighted loss
# class_weights = torch.tensor([1.0, 2.0, 2.0]).to(device)  # Adjust weights as needed
# criterion = nn.CrossEntropyLoss(weight=class_weights)

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.1, verbose=True)

# Train and validate
best_model_path, history = train_and_validate(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    num_epochs=20,  # Increased epochs
    patience=5      # Increased patience
)




📍 Epoch 1/20
Train Loss: 0.6818 | Val Loss: 2.0284
Train Acc: 0.7097 | Val Acc: 0.5660
Train F1: 0.7115 | Val F1: 0.5607
✅ New best model saved.





📍 Epoch 2/20
Train Loss: 0.4659 | Val Loss: 1.1837
Train Acc: 0.7940 | Val Acc: 0.6596
Train F1: 0.7940 | Val F1: 0.6326
✅ New best model saved.





📍 Epoch 3/20
Train Loss: 0.3887 | Val Loss: 1.1791
Train Acc: 0.8335 | Val Acc: 0.6723
Train F1: 0.8308 | Val F1: 0.6371
✅ New best model saved.





📍 Epoch 4/20
Train Loss: 0.3864 | Val Loss: 2.6922
Train Acc: 0.8378 | Val Acc: 0.6298
Train F1: 0.8359 | Val F1: 0.4944
⏳ Patience 1/5





📍 Epoch 5/20
Train Loss: 0.3520 | Val Loss: 0.5719
Train Acc: 0.8517 | Val Acc: 0.7532
Train F1: 0.8548 | Val F1: 0.7256
✅ New best model saved.





📍 Epoch 6/20
Train Loss: 0.3112 | Val Loss: 0.5939
Train Acc: 0.8815 | Val Acc: 0.7745
Train F1: 0.8811 | Val F1: 0.7243
⏳ Patience 1/5





📍 Epoch 7/20
Train Loss: 0.2575 | Val Loss: 1.6126
Train Acc: 0.9007 | Val Acc: 0.6043
Train F1: 0.8981 | Val F1: 0.5920
⏳ Patience 2/5





📍 Epoch 8/20
Train Loss: 0.2951 | Val Loss: 1.1182
Train Acc: 0.8815 | Val Acc: 0.7234
Train F1: 0.8827 | Val F1: 0.6207
⏳ Patience 3/5





📍 Epoch 9/20
Train Loss: 0.2822 | Val Loss: 0.7828
Train Acc: 0.8815 | Val Acc: 0.8128
Train F1: 0.8791 | Val F1: 0.7800
✅ New best model saved.





📍 Epoch 10/20
Train Loss: 0.2844 | Val Loss: 0.8288
Train Acc: 0.8901 | Val Acc: 0.6681
Train F1: 0.8890 | Val F1: 0.6532
⏳ Patience 1/5





📍 Epoch 11/20
Train Loss: 0.2450 | Val Loss: 0.8656
Train Acc: 0.9114 | Val Acc: 0.7745
Train F1: 0.9133 | Val F1: 0.7477
⏳ Patience 2/5





📍 Epoch 12/20
Train Loss: 0.1913 | Val Loss: 1.8884
Train Acc: 0.9264 | Val Acc: 0.6511
Train F1: 0.9263 | Val F1: 0.6438
⏳ Patience 3/5





📍 Epoch 13/20
Train Loss: 0.1861 | Val Loss: 1.2851
Train Acc: 0.9285 | Val Acc: 0.7021
Train F1: 0.9264 | Val F1: 0.6443
⏳ Patience 4/5


                                                                           


📍 Epoch 14/20
Train Loss: 0.1949 | Val Loss: 1.7240
Train Acc: 0.9328 | Val Acc: 0.6894
Train F1: 0.9316 | Val F1: 0.5505
⏳ Patience 5/5
⛔ Early stopping triggered.

🏆 Best Validation F1-macro achieved: 0.7800




In [68]:
from huggingface_hub import upload_file

repo_id = "BetoRivers/chest-xray-resnet"

upload_file(
    path_or_fileobj=best_model_path,
    path_in_repo="chest-xray-resnet.pth",
    repo_id=repo_id,
    token=hf_token,
    repo_type="model"
)


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  chest-xray-resnet.pth                 :   0%|          |  558kB /  286MB            

CommitInfo(commit_url='https://huggingface.co/BetoRivers/chest-xray-resnet/commit/1219fae29137ebea29c90addd91444b59f6d2090', commit_message='Upload chest-xray-resnet.pth with huggingface_hub', commit_description='', oid='1219fae29137ebea29c90addd91444b59f6d2090', pr_url=None, repo_url=RepoUrl('https://huggingface.co/BetoRivers/chest-xray-resnet', endpoint='https://huggingface.co', repo_type='model', repo_id='BetoRivers/chest-xray-resnet'), pr_revision=None, pr_num=None)

In [69]:
# After training completes, get the best metrics from history
best_epoch = np.argmax(history['val_f1'])  # Find epoch with highest validation F1
best_f1 = history['val_f1'][best_epoch]
best_acc = history['val_acc'][best_epoch]
best_loss = history['val_loss'][best_epoch]

# Create README with the best metrics
readme_content = f"""# Chest X-ray ResNet Model
Fine-tuned ResNet50 for pneumonia classification (3 classes: NORMAL, P_BAC, P_VIRUS)

## Model Info
- Architecture: ResNet50
- Pretrained: ImageNet
- Final Layers: 128 -> ReLU -> Dropout(0.3) -> 3
- Best Epoch: {best_epoch+1}/{len(history['train_loss'])}

## Best Validation Metrics
- F1-macro: {best_f1:.4f}
- Accuracy: {best_acc:.4f}
- Loss: {best_loss:.4f}

## Training Summary
- Total Epochs Trained: {len(history['train_loss'])}
- Final Training F1: {history['train_f1'][-1]:.4f}
- Final Training Accuracy: {history['train_acc'][-1]:.4f}

## Classes
0: NORMAL
1: P_BAC (Bacterial Pneumonia)
2: P_VIRUS (Viral Pneumonia)
"""

# Save and upload README
with open("README.md", "w") as f:
    f.write(readme_content)

from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",
    repo_id=repo_id,
    token=hf_token,
    repo_type="model"
)

# Also upload the model weights
upload_file(
    path_or_fileobj=best_model_path,
    path_in_repo="pytorch_model.bin",
    repo_id=repo_id,
    token=hf_token,
    repo_type="model"
)

- empty or missing yaml metadata in repo card


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  chest-xray-resnet.pth                 :  15%|#4        | 41.9MB /  286MB            

CommitInfo(commit_url='https://huggingface.co/BetoRivers/chest-xray-resnet/commit/087f8dce5c72364abe0feecc2deacf8491ed9de6', commit_message='Upload pytorch_model.bin with huggingface_hub', commit_description='', oid='087f8dce5c72364abe0feecc2deacf8491ed9de6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/BetoRivers/chest-xray-resnet', endpoint='https://huggingface.co', repo_type='model', repo_id='BetoRivers/chest-xray-resnet'), pr_revision=None, pr_num=None)

In [70]:
from huggingface_hub import hf_hub_download
import torch
from torchvision import models
import torch.nn as nn


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Match repo + file name
model_path = hf_hub_download(
    repo_id="BetoRivers/chest-xray-resnet",  # ✅ match repo
    filename="chest-xray-resnet.pth",        # ✅ match uploaded model
    repo_type="model"
)

# Rebuild the model architecture
model = models.resnet50(pretrained=True)
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 64),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(64, 3)  # 15 classes
)

# Load weights and move to device
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()


chest-xray-resnet.pth:   0%|          | 0.00/286M [00:00<?, ?B/s]



UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy._core.multiarray.scalar was not an allowed global by default. Please use `torch.serialization.add_safe_globals([scalar])` or the `torch.serialization.safe_globals([scalar])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import numpy as np

# 3. Evaluation on test set
all_preds = []
all_true = []
image_paths = []

with torch.no_grad():
    for i, (imgs, labels) in enumerate(test_loader):
        imgs = imgs.to(device)
        labels = labels.to(device)

        outputs = model(imgs)
        _, preds = torch.max(outputs, 1)  # Get class indices

        all_preds.extend(preds.cpu().numpy())
        all_true.extend(labels.cpu().numpy())

        # Get corresponding image paths
        batch_size = imgs.size(0)
        start_idx = i * test_loader.batch_size
        end_idx = start_idx + batch_size
        batch_paths = df_pred.iloc[start_idx:end_idx]["path"].tolist()
        image_paths.extend(batch_paths)

# Convert to numpy arrays
all_preds = np.array(all_preds)
all_true = np.array(all_true)

# Compute metrics
micro_f1 = f1_score(all_true, all_preds, average='micro')
macro_f1 = f1_score(all_true, all_preds, average='macro')
acc = accuracy_score(all_true, all_preds)

print("\n📊 Evaluation Results:")
print(f"Micro F1-score: {micro_f1:.4f}")
print(f"Macro F1-score: {macro_f1:.4f}")
print(f"Accuracy: {acc:.4f}")

print("\nClassification Report:")
print(classification_report(all_true, all_preds, target_names=['NORMAL', 'P_BAC', 'P_VIRUS']))

# Save predictions to CSV
results_df = pd.DataFrame({
    'image_path': image_paths,
    'true_label': all_true,
    'pred_label': all_preds
})
results_df.to_csv("test_predictions.csv", index=False)


## ESTO ES PARA COMPILAR CON KAGGLE NO CON COLAB.

In [None]:
from datetime import datetime
import pandas as pd

# Decode multi-hot vectors back to label strings
inverse_transform = mlb.inverse_transform(all_preds)
true_labels = mlb.inverse_transform(all_true)

# Create DataFrame with results
df_results = pd.DataFrame({
    "Image": image_paths,
    "Predicted": [" | ".join(p) if p else "None" for p in inverse_transform],
    "Actual": [" | ".join(t) if t else "None" for t in true_labels],
})

# Show first 10 predictions
print(df_results.head(10))

# Generate timestamp
now = datetime.now()
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")

# Create filenames
results_filename = f"predictions_vs_truth_{timestamp}.csv"
errors_filename = f"prediction_errors_{timestamp}.csv"

# Save CSV files to the working directory (Kaggle output)
df_results.to_csv(results_filename, index=False)
print(f"📁 Results saved as: {results_filename}")

# Keep only mismatched predictions
df_errors = df_results[df_results["Predicted"] != df_results["Actual"]]
df_errors.to_csv(errors_filename, index=False)
print(f"❌ Misclassifications saved as: {errors_filename} ({len(df_errors)} samples)")
