In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import pandas as pd
from sklearn.metrics import roc_auc_score

In [3]:
def get_transforms():
    """Build the default image preprocessing pipeline.

    The pipeline applies, in order: a resize to 224x224, conversion to a
    tensor, and per-channel normalization with the fixed mean / std values
    below (the values conventionally used with ImageNet-pretrained
    torchvision models).

    Returns:
        transforms.Compose: the three steps chained in that order.
    """
    target_size = (224, 224)
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize(target_size),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    return transforms.Compose(steps)


In [4]:
# 2. Dataset class
class CheXpertDataset(Dataset):
    """Multi-label image dataset backed by a CheXpert-style CSV file.

    The CSV must contain a ``Path`` column (image path relative to
    ``img_dir``) plus every column named in ``label_cols``. Missing (NaN)
    and uncertain (-1) label values are both mapped to 0 when the CSV is
    loaded.

    Args:
        img_dir: Directory that each ``Path`` value is joined onto.
        csv_file: Location of the label CSV (anything ``pd.read_csv`` accepts).
        label_cols: List of label column names; their order is the order of
            the entries in each returned label tensor.
        transform: Callable applied to the RGB image. When ``None`` the
            pipeline from ``get_transforms()`` is used.

    Raises:
        KeyError: if the CSV has no ``Path`` column or lacks a label column.

    Each item is an ``(image, labels)`` pair: ``image`` is whatever
    ``transform`` returns and ``labels`` is a float32 tensor with one entry
    per label column.
    """

    def __init__(self, img_dir, csv_file, label_cols, transform=None):
        self.img_dir = img_dir
        self.df = pd.read_csv(csv_file)
        # Fail at construction time rather than on the first __getitem__
        # (which typically happens inside a DataLoader worker).
        if 'Path' not in self.df.columns:
            raise KeyError(f"CSV file {csv_file!r} has no 'Path' column")
        # Replace missing labels (NaN) and uncertain labels (-1) with 0
        self.df[label_cols] = self.df[label_cols].fillna(0).replace(-1, 0)
        self.label_cols = label_cols
        # Compare against None explicitly: a valid transform may be falsy
        # (e.g. an empty nn.Sequential) and must not be silently replaced.
        self.transform = transform if transform is not None else get_transforms()

    def __len__(self):
        """Return the number of rows in the label table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, convert and transform the image at position ``idx``."""
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, row['Path'])
        # The context manager closes the underlying file even if decoding
        # fails; convert() returns an independent, fully loaded image.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        image = self.transform(image)
        labels = torch.tensor(row[self.label_cols].values.astype(float), dtype=torch.float32)
        return image, labels

In [5]:
# 3. Model builder

def build_model(num_classes=14, pretrained=True):
    """Create a ResNet-50 whose final layer outputs ``num_classes`` logits.

    Args:
        num_classes: Number of output units of the replacement ``fc`` layer.
        pretrained: When truthy, start from the ImageNet weights
            (``IMAGENET1K_V1``, the set the legacy ``pretrained=True`` flag
            selects); otherwise use randomly initialised weights.

    Returns:
        The torchvision ResNet-50 module with ``fc`` swapped for a fresh
        ``nn.Linear(in_features, num_classes)``.
    """
    # torchvision >= 0.13 deprecates ``pretrained=`` in favour of ``weights=``.
    # Use the new API when it exists and fall back for older installations.
    weights_enum = getattr(models, "ResNet50_Weights", None)
    if weights_enum is not None:
        weights = weights_enum.IMAGENET1K_V1 if pretrained else None
        model = models.resnet50(weights=weights)
    else:
        model = models.resnet50(pretrained=pretrained)

    # Replace the 1000-way ImageNet head with one sized for our label set.
    in_features = model.fc.in_features
    model.fc = nn.Linear(in_features, num_classes)
    return model

In [6]:
from tqdm import tqdm

In [7]:
# 4. Training function

def train_tag_predictor(
    img_dir,
    csv_file,
    label_cols,
    epochs=5,
    batch_size=32,
    lr=1e-4,
    device='cuda',
    num_workers=4
):
    """Train a multi-label classifier on a CheXpert-style CSV and return it.

    Builds the dataset and model from ``CheXpertDataset`` / ``build_model``,
    optimises ``BCEWithLogitsLoss`` with Adam, and prints the average loss and
    mean per-label ROC AUC after every epoch.

    The reported AUC is computed from the outputs produced during the
    training pass itself (model in train mode, weights changing between
    batches), so it is a training-set progress indicator, not a validation
    score.

    Args:
        img_dir: Directory the CSV's image paths are relative to.
        csv_file: Path of the label CSV.
        label_cols: Names of the label columns to predict.
        epochs: Number of passes over the dataset.
        batch_size: Samples per optimisation step.
        lr: Adam learning rate.
        device: Device string the model and batches are moved to.
        num_workers: Worker processes used by the ``DataLoader``.

    Returns:
        The trained model (still on ``device``).

    Raises:
        ValueError: if the dataset contains no samples.
    """
    transform = get_transforms()
    dataset = CheXpertDataset(img_dir, csv_file, label_cols, transform)
    if len(dataset) == 0:
        raise ValueError(f"No samples found in {csv_file!r}; nothing to train on.")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

    model = build_model(num_classes=len(label_cols)).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, epochs+1):
        model.train()
        running_loss = 0.0
        all_labels = []
        all_preds = []

        for images, labels in tqdm(dataloader, desc=f"Epoch {epoch}/{epochs}"):
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight it by the batch size so
            # a smaller final batch does not skew the epoch average.
            running_loss += loss.item() * images.size(0)
            all_labels.append(labels.detach().cpu())
            all_preds.append(torch.sigmoid(outputs).detach().cpu())

        epoch_loss = running_loss / len(dataset)
        y_true = torch.cat(all_labels).numpy()
        y_pred = torch.cat(all_preds).numpy()

        aucs = []
        for i in range(len(label_cols)):
            try:
                auc = roc_auc_score(y_true[:, i], y_pred[:, i])
            except ValueError:
                # roc_auc_score rejects a column with a single class present.
                auc = float('nan')
            aucs.append(auc)

        # Average only over labels whose AUC is defined. Dividing by the total
        # number of labels would count every undefined AUC as 0.
        valid_aucs = [a for a in aucs if not pd.isna(a)]
        mean_auc = sum(valid_aucs) / len(valid_aucs) if valid_aucs else float('nan')

        print(f"Epoch {epoch}/{epochs} | Loss: {epoch_loss:.4f} | Mean AUC: {mean_auc:.4f}")

    return model

In [8]:
# Load the CheXpert training label table so it can be inspected below.
df_train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/chexpert/CheXpert-v1.0-small/train.csv"
)

In [9]:
# Preview the first five raw rows (labels still contain NaN and -1 here).
df_train.head(n=5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,,,,,,,,,0.0,,,,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,Female,87,Frontal,AP,,,-1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,,,,1.0,,,-1.0,,,,,,1.0,
3,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,,,,1.0,,,-1.0,,,,,,1.0,
4,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,,,,,,1.0,,,,0.0,,,,


In [10]:
# Names of the 14 CheXpert label columns to predict. The order of this list
# is the order of the entries in each label tensor and of the model outputs.
LABEL_COLS = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Pleural Effusion',
    'Pleural Other',
    'Pneumonia',
    'Pneumothorax',
    'Fracture',
    'Support Devices',
    'Enlarged Cardiomediastinum',
    'Lung Lesion',
    'Lung Opacity',
    'No Finding',
]

In [11]:
# Treat both missing (NaN) and uncertain (-1) labels as negative (0).
# Re-running this cell is harmless: the cleaned columns contain neither value.
df_train[LABEL_COLS] = (
    df_train[LABEL_COLS]
    .fillna(0)
    .replace(to_replace=-1, value=0)
)

In [13]:
# Preview the first five rows again to confirm the label clean-up took effect.
df_train.head(n=5)

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
0,CheXpert-v1.0-small/train/patient00001/study1/...,Female,68,Frontal,AP,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,Female,87,Frontal,AP,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Frontal,AP,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,CheXpert-v1.0-small/train/patient00002/study1/...,Female,83,Lateral,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,CheXpert-v1.0-small/train/patient00003/study1/...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Root that the CSV's "Path" values (which start with "CheXpert-v1.0-small/")
# are joined onto when images are loaded.
img_dir = "/kaggle/input/chexpert"
# Despite its name, csv_dir is the path of the training CSV *file*.
csv_dir = f"{img_dir}/CheXpert-v1.0-small/train.csv"

In [15]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [16]:
# Display the selected device so the cell output records what training ran on.
device

'cuda'

In [17]:
# Fine-tune the classifier on the full training CSV. This is the expensive
# cell of the notebook: the recorded run took roughly 23 minutes per epoch.
trained_model = train_tag_predictor(
    img_dir,
    csv_dir,
    LABEL_COLS,
    epochs=5,
    batch_size=16,
    lr=1e-4,
    device=device,
)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 229MB/s]
Epoch 1/5: 100%|██████████| 13964/13964 [23:22<00:00,  9.96it/s]


Epoch 1/5 | Loss: 0.2928 | Mean AUC: 0.7505


Epoch 2/5: 100%|██████████| 13964/13964 [23:25<00:00,  9.93it/s]


Epoch 2/5 | Loss: 0.2778 | Mean AUC: 0.7835


Epoch 3/5: 100%|██████████| 13964/13964 [23:25<00:00,  9.93it/s]


Epoch 3/5 | Loss: 0.2688 | Mean AUC: 0.8013


Epoch 4/5: 100%|██████████| 13964/13964 [23:25<00:00,  9.93it/s]


Epoch 4/5 | Loss: 0.2585 | Mean AUC: 0.8203


Epoch 5/5: 100%|██████████| 13964/13964 [23:24<00:00,  9.95it/s]


Epoch 5/5 | Loss: 0.2444 | Mean AUC: 0.8438


In [18]:
# Persist only the learned weights (state dict), not the whole module object;
# reloading requires rebuilding the same architecture first.
torch.save(
    obj=trained_model.state_dict(),
    f='chexpert_tag_model1.pth',
)