In [1]:
import os
import shutil
import pandas as pd
import torch
import random
from torch.utils.data import Dataset, random_split, DataLoader, Subset
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from PIL import Image
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/kaggle/input/data/Data_Entry_2017.csv')

mass_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Mass' for label in labels.split('|')))
nodule_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Nodule' for label in labels.split('|')))
atelectasis_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Atelectasis' for label in labels.split('|')))
consolidation_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Consolidation' for label in labels.split('|')))
infiltration_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Infiltration' for label in labels.split('|')))
pneumothorax_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Pneumothorax' for label in labels.split('|')))
edema_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Edema' for label in labels.split('|')))
emphysema_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Emphysema' for label in labels.split('|')))
effusion_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Effusion' for label in labels.split('|')))
pneumonia_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Pneumonia' for label in labels.split('|')))
pleural_thickening_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Pleural_thickening' for label in labels.split('|')))
cardiomegaly_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Cardiomegaly' for label in labels.split('|')))
hernia_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Hernia' for label in labels.split('|')))
fibrosis_mask = df['Finding Labels'].apply(lambda labels: any(label == 'Fibrosis' for label in labels.split('|')))
no_finding_mask = df['Finding Labels'] == 'No Finding'

mass_df = df[mass_mask]
nodule_df = df[nodule_mask]
atelectasis_mask_df = df[atelectasis_mask]
consolidation_mask_df = df[consolidation_mask]
infiltration_mask_df = df[infiltration_mask]
pneumothorax_mask_df = df[pneumothorax_mask]
edema_mask_df = df[edema_mask]
emphysema_mask_df = df[emphysema_mask]
effusion_mask_df = df[effusion_mask]
pneumonia_mask_df = df[pneumonia_mask]
pleural_thickening_mask_df = df[pleural_thickening_mask]
cardiomegaly_mask_df = df[cardiomegaly_mask]
hernia_mask_df = df[hernia_mask]
fibrosis_mask_df = df[fibrosis_mask]
no_finding_df = df[no_finding_mask]

num_mass = len(mass_df)
num_nodule = len(nodule_df)
num_atelectasis_mask_df = len(atelectasis_mask_df)
num_consolidation_mask_df = len(consolidation_mask_df)
num_infiltration_mask_df = len(infiltration_mask_df)
num_pneumothorax_mask_df = len(pneumothorax_mask_df)
num_edema_mask_df = len(edema_mask_df)
num_emphysema_mask_df = len(emphysema_mask_df)
num_effusion_mask_df = len(effusion_mask_df)
num_pneumonia_mask_df = len(pneumonia_mask_df)
num_pleural_thickening_mask_df = len(pleural_thickening_mask_df)
num_cardiomegaly_mask_df = len(cardiomegaly_mask_df)
num_hernia_mask_df = len(hernia_mask_df)
num_fibrosis_mask_df = len(fibrosis_mask_df)
num_no_finding = len(no_finding_df)
num_all = len(df)

print(num_mass, num_nodule, num_atelectasis_mask_df, num_consolidation_mask_df)
print(num_infiltration_mask_df, num_pneumothorax_mask_df, num_edema_mask_df, num_emphysema_mask_df)
print(num_effusion_mask_df, num_pneumonia_mask_df, num_pleural_thickening_mask_df, num_cardiomegaly_mask_df)
print(num_hernia_mask_df, num_fibrosis_mask_df, num_no_finding, num_all)

need_to_add = int((num_all - num_nodule) / num_nodule)

print(need_to_add)

5782 6331 11559 4667
19894 5302 2303 2516
13317 1431 0 2776
227 1686 60361 112120
16


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
BATCH_SIZE = 16
LR = 0.001
EPOCHS = 10

folder_ranges = [
    (1335, 6, 'images_001'),
    (3923, 13, 'images_002'),
    (6585, 6, 'images_003'),
    (9232, 3, 'images_004'),
    (11558, 7, 'images_005'),
    (13774, 25, 'images_006'),
    (16051, 9, 'images_007'),
    (18387, 34, 'images_008'),
    (20945, 49, 'images_009'),
    (24717, 0, 'images_010'),
    (28173, 2, 'images_011'),
    (30805, 0, 'images_012')
]

source_base_dir = '/kaggle/input/data/'
main_dest_dir = '/kaggle/working/'

class ChestXRayDataset(Dataset):
    def __init__(self):
        self.image_paths = []
        self.labels = []
        self.normalize = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
                     
        self.transform_positive = transforms.Compose([
            transforms.Resize(224),
            transforms.ToTensor(),
            self.normalize
        ])
        
        self.transform_negative = transforms.Compose([
            transforms.Resize(224),
            transforms.ToTensor(),
            self.normalize
        ])
        
        with open('/kaggle/input/data/Data_Entry_2017.csv', 'r') as f:
            next(f)
            for line in f:
                items = line.strip().split(',')
                image_filename = items[0].strip('"')
                pathologies = items[1].strip('"').split('|')
                label = 1.0 if 'Nodule' in pathologies else 0.0
                
                base_name = os.path.splitext(image_filename)[0]
                part1_str, part2_str = base_name.split('_')
                part1 = int(part1_str)
                part2 = int(part2_str)
                source_folder = None
                
                for f_part1, f_part2, f_name in folder_ranges:
                    if part1 < f_part1 or (part1 == f_part1 and part2 <= f_part2):
                        source_folder = f_name
                        break
                        
                if not source_folder:
                    continue 
                    
                img_path = os.path.join(source_base_dir, source_folder, 'images', image_filename)
                
                self.image_paths.append(img_path)
                self.labels.append(label)

    def __getitem__(self, index):
        img_path = self.image_paths[index]
        label = self.labels[index]
        image = Image.open(img_path).convert('RGB')
        
        if label == 1.0:
            image = self.transform_positive(image)
        else:
            image = self.transform_negative(image)
        
        return image, torch.tensor(label, dtype=torch.float32)

    def __len__(self):
        return len(self.image_paths)
        
    def tackle_idxs(self, idxs):
        image_paths_temp = []
        labels_temp = []
        
        for i in idxs:
            label = self.labels[i]
            img_path = self.image_paths[i]
            
            image_paths_temp.append(img_path)
            labels_temp.append(label)
        
        combined = list(zip(image_paths_temp, labels_temp))
        random.shuffle(combined)
        self.image_paths, self.labels = map(list, zip(*combined))
        
    def tackle_train(self, idxs):
        image_paths_temp = []
        labels_temp = []
        
        for i in idxs:
            label = self.labels[i]
            img_path = self.image_paths[i]
            
            image_paths_temp.append(img_path)
            labels_temp.append(label)
            
            if label == 1.0:
                for _ in range(need_to_add - 1):
                    image_paths_temp.append(img_path)
                    labels_temp.append(label)
                    
        self.transform_positive = transforms.Compose([
            transforms.Resize(224),
            transforms.RandomHorizontalFlip(p=0.6),
            transforms.RandomRotation(degrees=15),
            transforms.RandomAffine(degrees=0, translate=(0.05, 0.05)),
            transforms.ToTensor(),
            self.normalize
        ])
        
        combined = list(zip(image_paths_temp, labels_temp))
        random.shuffle(combined)
        self.image_paths, self.labels = map(list, zip(*combined))
        
class DenseNet121(nn.Module):
    def __init__(self):
        super().__init__()
        self.densenet = torchvision.models.densenet121(weights="IMAGENET1K_V1")

        num_features = self.densenet.classifier.in_features
        #self.densenet.classifier = nn.Linear(num_features, 1)
        
        self.densenet.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(num_features, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 1)
        )
        
        for param in self.densenet.parameters():
            param.requires_grad = False

        to_unfreeze = [
            "features.denseblock2",
            "features.transition2",
            "features.denseblock3",
            "features.transition3",
            "features.denseblock4",
            "features.norm5",
            "classifier",
        ]
        
        for name, param in self.densenet.named_parameters():
            if any(name.startswith(layer) for layer in to_unfreeze):
                param.requires_grad = True
    
    def forward(self, x):
        return self.densenet(x)
    
def train_and_evaluate(model, model_name, train_loader, val_loader, test_loader, pos_weight):
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.999))
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=1)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
    
    print(f"\n----- Training {model_name} -----")
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0.0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs.view(-1), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * images.size(0)
            print(train_loss)

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                loss = criterion(outputs.view(-1), labels)
                val_loss += loss.item() * images.size(0)
                print(val_loss)
        
        avg_val_loss = val_loss / len(val_loader.dataset)
        scheduler.step(avg_val_loss)
        
        print(f'Epoch {epoch+1}/{EPOCHS}')
        print(f'Train Loss: {train_loss/len(train_loader.dataset):.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}\n')
    
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = torch.sigmoid(model(images))
            probs = outputs.view(-1).cpu().numpy()
            predicted = (probs >= 0.5).astype(float)
            all_preds.extend(predicted)
            all_probs.extend(probs)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    auc = roc_auc_score(all_labels, all_probs)
    fpr, tpr, _ = roc_curve(all_labels, all_probs)
    
    print(f'----- {model_name} Test Metrics -----')
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')
    print(f'ROC AUC: {auc:.4f}\n')
    
    plt.figure()
    plt.plot(fpr, tpr, label=f'ROC curve (area = {auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver Operating Characteristic - {model_name}')
    plt.legend(loc="lower right")
    plt.savefig(f"{model_name}_roc_curve_new1.png")
    plt.close()
    
    torch.save(model.state_dict(), f"{model_name}_final_model_new1.pth")
    
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc}

def main():
    dataset = ChestXRayDataset()
    n = len(dataset)
    labels = dataset.labels
    
    idxs = list(range(n))
    train_idxs, testval_idxs, train_labels, testval_labels = train_test_split(
        idxs,
        labels,
        test_size=0.30,
        stratify=labels,
        random_state=42
    )

    relative_val_size = 0.15 / 0.30
    test_idxs, val_idxs, test_labels, val_labels = train_test_split(
        testval_idxs,
        testval_labels,
        test_size=relative_val_size,
        stratify=testval_labels,
        random_state=42
    )
    
    train_dataset = ChestXRayDataset()
    train_dataset.tackle_train(train_idxs)
    
    val_dataset = ChestXRayDataset()
    val_dataset.tackle_idxs(val_idxs)
    
    test_dataset = ChestXRayDataset()
    test_dataset.tackle_idxs(test_idxs)
    
    pos_count = sum(train_dataset.labels)
    neg_count = len(train_dataset) - pos_count
    pos_weight = torch.tensor([neg_count / pos_count])
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    
    model = DenseNet121()
    
    result = train_and_evaluate(model, 'DenseNet121', train_loader, val_loader, test_loader, pos_weight)
    print("----- Overall Results -----")
    print(f"DenseNet121: {result}")
        
#if __name__ == '__main__':
main()