In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and echo its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# What are the pixel dimensions of these images?
from PIL import Image
import pydicom

# Open a single training study and report the dimensions of its pixel data.
dicom_path = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images/0004cfab-14fd-4e49-80ba-63a80b6bddd6.dcm"
ds = pydicom.dcmread(dicom_path)
image_array = ds.pixel_array

# The first two axes are taken as height and width; any further axes are ignored.
height, width, *_extra_axes = image_array.shape
print(f"DICOM image size: {width}x{height}")

In [None]:
# Confirm that Target = 1 corresponds to the "Lung Opacity" class and Target = 0 to every other class.
import pandas

# csv1: detailed class info table, csv2: training labels table.
csv1, csv2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv",
        "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv",
    )
)

In [None]:
# Preview the first rows of the detailed class info table.
csv1.head(n=5)

In [None]:
# Preview the first rows of the training labels table.
csv2.head(n=5)

In [None]:
# Both CSVs repeat a patientId once per bounding box, so joining them as-is is a
# many-to-many merge: a patient with k boxes would yield k*k rows and inflate every
# count in the crosstab that follows. Collapsing exact duplicate rows of the class
# table first keeps the join at one output row per row of the labels table
# (assuming each patient carries a single class — TODO confirm).
class_info_unique = csv1.drop_duplicates()
combined_df = pd.merge(class_info_unique, csv2, on='patientId', how='inner')
combined_df.head()

In [None]:
# Count rows for every (class, Target) combination, including row/column totals.
pd.crosstab(index=combined_df['class'], columns=combined_df['Target'], margins=True)

In [None]:
!pip install -q transformers datasets accelerate scikit-learn

In [None]:
!pip install -q pydicom joblib

In [None]:
import os
import glob
import pydicom
import cv2
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Input Paths
# Folder globbed for "*.dcm" files by the conversion step below.
DICOM_DIR = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images"
# Table providing patientId / Target (read into train_labels).
LABELS_CSV = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv"
# Table providing patientId / class (read into class_info).
CLASS_INFO_CSV = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv"

# Output Directory
# Root of the generated image-folder dataset: one sub-folder per class name.
OUTPUT_DIR = "/kaggle/working/dataset_384"
# Side length (pixels) every converted image is resized to.
IMG_SIZE = 384 

# ==========================================
# 2. MERGE & MAP LABELS
# ==========================================
print("Reading CSVs...")
train_labels, class_info = pd.read_csv(LABELS_CSV), pd.read_csv(CLASS_INFO_CSV)

# Attach each patient's 'class' (from class_info) to its 'Target' (from train_labels),
# then keep a single row per patientId: the CSVs hold one row per bounding box, but
# classification only needs one row per image.
combined_df = (
    train_labels
    .merge(class_info, on='patientId', how='left')
    .drop_duplicates(subset=['patientId'])
)

print(f"Total Unique Images to Process: {len(combined_df)}")

# Lookup table used by the conversion worker: {'patient_id': 'Lung Opacity'}
id_to_class = {
    patient: label
    for patient, label in zip(combined_df['patientId'], combined_df['class'])
}

# One output sub-folder per class; spaces and slashes are not kept in folder names.
unique_classes = combined_df['class'].unique()
for class_label in unique_classes:
    class_folder = class_label.replace(" ", "_").replace("/", "_")
    os.makedirs(os.path.join(OUTPUT_DIR, class_folder), exist_ok=True)

print(f"Created folders for: {unique_classes}")

# ==========================================
# 3. CONVERSION WORKER
# ==========================================
def _scale_to_uint8(pixels):
    """Linearly rescale an array to 0-255 and return it as uint8.

    A constant image (max == min) has no dynamic range; it maps to all zeros
    instead of dividing by zero and producing NaNs.
    """
    pixels = pixels.astype(float)
    lowest = np.min(pixels)
    span = np.max(pixels) - lowest
    if span == 0:
        return np.zeros(pixels.shape, dtype=np.uint8)
    return ((pixels - lowest) / span * 255.0).astype(np.uint8)


def process_dicom(dcm_path):
    """Convert one DICOM file into a resized greyscale JPEG inside its class folder.

    The patient id is the file name without its extension. Files whose id is not in
    ``id_to_class`` and files whose JPEG already exists are skipped, so the step can
    be re-run safely.

    Args:
        dcm_path: Path to a ``.dcm`` file.

    Returns:
        None. Failures are printed and swallowed so one bad file does not abort the
        whole parallel run.
    """
    # Derived before the try block so the error message below can always name the file.
    # splitext only strips the extension, unlike replace('.dcm', '') which also removes
    # that substring from the middle of a name.
    patient_id = os.path.splitext(os.path.basename(dcm_path))[0]
    try:
        # Look up class
        class_name = id_to_class.get(patient_id)
        if class_name is None:
            # If an image is in the folder but not the CSV, we skip it
            return

        safe_class_name = class_name.replace(" ", "_").replace("/", "_")
        dest_path = os.path.join(OUTPUT_DIR, safe_class_name, f"{patient_id}.jpg")

        # Skip if already done
        if os.path.exists(dest_path):
            return

        # Read DICOM and normalize (scale to 0-255)
        dcm = pydicom.dcmread(dcm_path)
        img = _scale_to_uint8(dcm.pixel_array)

        # Resize (speed up training)
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(dest_path, img):
            raise IOError(f"could not write {dest_path}")

    except Exception as e:
        print(f"Error processing {patient_id}: {e}")

# ==========================================
# 4. RUN PARALLEL PROCESSING
# ==========================================
dcm_files = glob.glob(os.path.join(DICOM_DIR, "*.dcm"))
print(f"Starting conversion of {len(dcm_files)} files...")

# One delayed job per DICOM file, wrapped in tqdm for a progress bar and
# dispatched across all CPU cores (n_jobs=-1).
conversion_jobs = (delayed(process_dicom)(dcm_file) for dcm_file in tqdm(dcm_files))
Parallel(n_jobs=-1)(conversion_jobs)

print("Processing Complete. Data ready at:", OUTPUT_DIR)

In [None]:
# ====================================================
# CELL 2: ViT TRAINING (FIXED & OPTIMIZED)
# ====================================================
import torch
import torch.nn as nn
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from torchvision import transforms
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

# --- CONFIG ---
MODEL_CHECKPOINT = "google/vit-base-patch16-384"
DATA_DIR = "/kaggle/working/dataset_384"
BATCH_SIZE = 24
NUM_EPOCHS = 15

# --- 1. LOAD DATA ---
# The image-folder loader names its target column 'label'; it is renamed to 'labels'
# so that the Trainer detects it correctly.
dataset = load_dataset("imagefolder", data_dir=DATA_DIR).rename_column("label", "labels")

# Hold out 15% of the images for validation (fixed seed).
splits = dataset["train"].train_test_split(test_size=0.15, seed=42)
train_ds, val_ds = splits["train"], splits["test"]
labels_list = train_ds.features["labels"].names

print(f"Classes: {labels_list}")

# --- 2. CLASS WEIGHTS ---
# 'balanced' weights computed from the training split only, placed on the GPU for the loss.
y_train = np.array(train_ds["labels"])
weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(weights, dtype=torch.float32).to("cuda")
print(f"Active Class Weights: {class_weights}")

# --- 3. TRANSFORMS ---
# Normalisation statistics are taken from the checkpoint's own image processor.
image_processor = AutoImageProcessor.from_pretrained(MODEL_CHECKPOINT)
normalize = transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# Steps shared by both pipelines: PIL image -> tensor -> normalised tensor.
tensor_steps = [transforms.ToTensor(), normalize]

# Training adds random flip / rotation / brightness-contrast jitter in front of them.
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    *tensor_steps,
])
val_transforms = transforms.Compose(list(tensor_steps))

def _images_to_pixel_values(batch, transform):
    """Swap the batch's raw "image" column for transformed "pixel_values".

    Every image is converted to RGB and passed through ``transform``. The raw
    'image' column is then deleted so the collator doesn't crash on it.
    The batch is modified in place and returned.
    """
    pixel_values = []
    for pil_image in batch["image"]:
        pixel_values.append(transform(pil_image.convert("RGB")))
    batch["pixel_values"] = pixel_values
    del batch["image"]
    return batch


def preprocess_train(batch):
    """Batch transform for the training split (uses ``train_transforms``)."""
    return _images_to_pixel_values(batch, train_transforms)


def preprocess_val(batch):
    """Batch transform for the validation split (uses ``val_transforms``)."""
    return _images_to_pixel_values(batch, val_transforms)

# Attach the matching preprocessing callback to each split.
for split_ds, split_transform in ((train_ds, preprocess_train), (val_ds, preprocess_val)):
    split_ds.set_transform(split_transform)

# --- 4. CUSTOM TRAINER & METRICS ---
def compute_metrics(pred):
    """Print a per-class report and return the overall accuracy.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (scores whose argmax over the last axis is the predicted class id).

    Returns:
        dict with a single key, ``'accuracy'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    accuracy = (y_true == y_pred).mean()

    report = classification_report(y_true, y_pred, target_names=labels_list, digits=4)
    print("\n" + report)

    return {'accuracy': accuracy}

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain "labels" next to the model inputs.
            return_outputs: When True, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for Trainer API compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        # Forward without the labels: otherwise the model also computes its own
        # unweighted loss, which is never used. ``inputs`` itself is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Label maps in both directions, in the order reported by the dataset features.
id2label = dict(enumerate(labels_list))
label2id = {name: idx for idx, name in enumerate(labels_list)}

# ignore_mismatched_sizes=True lets the classifier head be sized for our label count
# even though the checkpoint's head has a different shape.
model = AutoModelForImageClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# --- 5. RUN TRAINING ---
args = TrainingArguments(
    output_dir="/kaggle/working/vit-384-final",
    report_to="none",
    remove_unused_columns=False,

    # Optimisation schedule
    num_train_epochs=NUM_EPOCHS,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    fp16=True,

    # Data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,

    # Evaluate, checkpoint and log once per epoch, then restore the best checkpoint.
    # `eval_strategy` is the argument name used by newer Transformers versions.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=DefaultDataCollator(),
    compute_metrics=compute_metrics,
)

print("Starting Optimized Training...")
trainer.train()
trainer.save_model("/kaggle/working/final_model_p100")

In [None]:
import shutil
import os

# List of folders we used in previous steps
# Both folders are written by the first ViT training cell.
folders_to_delete = [
    "/kaggle/working/final_model_p100",         # model exported by trainer.save_model()
    "/kaggle/working/vit-384-final",      # Trainer output_dir (per-epoch checkpoints)
]

print("üßπ Cleaning up old checkpoints...")

for folder in folders_to_delete:
    # Nothing to do when the folder was never created or is already gone.
    if not os.path.exists(folder):
        print(f"‚ö™ Not found (already clean): {folder}")
        continue
    # Best effort: a failed delete is reported but does not stop the loop.
    try:
        shutil.rmtree(folder)
        print(f"‚úÖ Deleted: {folder}")
    except Exception as e:
        print(f"‚ùå Error deleting {folder}: {e}")

print("‚ú® Ready for fresh training!")

In [None]:
import os
import gc
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoImageProcessor, AutoModelForImageClassification, 
    TrainingArguments, Trainer, DefaultDataCollator
)
from torchvision import transforms
from sklearn.metrics import classification_report, recall_score
from sklearn.utils.class_weight import compute_class_weight

# ====================================================
# CONFIGURATION
# ====================================================
DATA_DIR = "/kaggle/working/dataset_384"
# NOTE(review): this rebinds the OUTPUT_DIR global that the data-preparation cell's
# process_dicom() reads; re-run that cell's configuration before converting again.
OUTPUT_DIR = "/kaggle/working/final_model_vit_recall"

# --- SAFETY CHECK ---
# The generated dataset lives in the working directory and may be gone after a
# session restart; fail early with instructions instead of deep inside load_dataset.
dataset_present = os.path.exists(DATA_DIR)
if dataset_present:
    print(f"‚úÖ Data found at {DATA_DIR}. Proceeding to training...")
else:
    raise FileNotFoundError(
        f"‚ùå ERROR: The folder {DATA_DIR} is missing!\n"
        "Your Kaggle session may have reset.\n"
        "Please re-run the 'Data Preparation' script to generate the 384px images again."
    )

# ====================================================
# ViT TRAINING (Max Macro Recall)
# ====================================================
print("üöÄ Starting ViT Training...")

# 1. Load Data
# The loader's 'label' column is renamed to 'labels'; 15% is held out for validation.
dataset = load_dataset("imagefolder", data_dir=DATA_DIR).rename_column("label", "labels")
splits = dataset["train"].train_test_split(test_size=0.15, seed=42)
train_ds, val_ds = splits["train"], splits["test"]
labels_list = train_ds.features["labels"].names

# 2. Weights & Transforms
# 'balanced' class weights from the training split only, placed on the GPU for the loss.
y_train = np.array(train_ds["labels"])
weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(weights, dtype=torch.float32).to("cuda")

model_ckpt = "google/vit-base-patch16-384"
# Normalisation statistics come from the checkpoint's own image processor.
processor = AutoImageProcessor.from_pretrained(model_ckpt)
norm = transforms.Normalize(mean=processor.image_mean, std=processor.image_std)

# Training pipeline: random flip / rotation / brightness-contrast jitter, then tensor + normalise.
train_tf = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    norm,
])
# Validation pipeline: tensor + normalise only.
val_tf = transforms.Compose([
    transforms.ToTensor(),
    norm,
])

def _images_to_pixel_values(batch, transform):
    """Swap the batch's raw "image" column for transformed "pixel_values".

    Every image is converted to RGB and passed through ``transform``; the raw
    "image" column is then removed. The batch is modified in place and returned.
    """
    pixel_values = []
    for pil_image in batch["image"]:
        pixel_values.append(transform(pil_image.convert("RGB")))
    batch["pixel_values"] = pixel_values
    del batch["image"]
    return batch


def preprocess_train(batch):
    """Batch transform for the training split (uses ``train_tf``)."""
    return _images_to_pixel_values(batch, train_tf)


def preprocess_val(batch):
    """Batch transform for the validation split (uses ``val_tf``)."""
    return _images_to_pixel_values(batch, val_tf)

# Attach the matching preprocessing callback to each split.
for split_ds, split_transform in ((train_ds, preprocess_train), (val_ds, preprocess_val)):
    split_ds.set_transform(split_transform)

# 3. Custom Metrics (MACRO RECALL TARGET)
def compute_metrics_recall(pred):
    """Print a per-class report and return accuracy plus macro-averaged recall.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (scores whose argmax over the last axis is the predicted class id).

    Returns:
        dict with keys ``'accuracy'`` and ``'eval_macro_recall'``; the latter is
        the name referenced by ``metric_for_best_model`` in the training arguments.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Unweighted mean of the per-class recalls.
    macro_recall = recall_score(y_true, y_pred, average='macro')

    report = classification_report(y_true, y_pred, target_names=labels_list, digits=4)
    print("\n" + report)

    metrics = {'accuracy': (y_true == y_pred).mean()}
    metrics['eval_macro_recall'] = macro_recall
    return metrics

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain "labels" next to the model inputs.
            return_outputs: When True, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for Trainer API compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        # Forward without the labels: otherwise the model also computes its own
        # unweighted loss, which is never used. ``inputs`` itself is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# ignore_mismatched_sizes=True lets the classifier head be re-created for our label count.
model = AutoModelForImageClassification.from_pretrained(
    model_ckpt,
    # Sized from the dataset's label list (rather than a hard-coded 3) so the head
    # always agrees with the id2label / label2id maps built from the same list.
    num_labels=len(labels_list),
    id2label={i: l for i, l in enumerate(labels_list)},
    label2id={l: i for i, l in enumerate(labels_list)},
    ignore_mismatched_sizes=True,
)

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="none",
    remove_unused_columns=False,

    # Optimisation schedule
    num_train_epochs=15,
    learning_rate=2e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=2,
    fp16=True,

    # Data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,

    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",

    # --- MAXIMIZE MACRO RECALL ---
    # The checkpoint with the highest 'eval_macro_recall' (as reported by
    # compute_metrics_recall) is restored at the end; at most two checkpoints are kept.
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_recall",
    greater_is_better=True,
    save_total_limit=2,
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=DefaultDataCollator(),
    compute_metrics=compute_metrics_recall,
)

# Start training
# Skip training when a saved model is already present, to prevent accidental
# overwrites if this cell is run twice. Depending on the Transformers version and
# the `save_safetensors` setting, save_model() writes either "model.safetensors" or
# the legacy "pytorch_model.bin", so both names are checked.
saved_weight_names = ("model.safetensors", "pytorch_model.bin")
model_already_saved = any(
    os.path.exists(os.path.join(OUTPUT_DIR, weight_name))
    for weight_name in saved_weight_names
)
if not model_already_saved:
    trainer.train()
    trainer.save_model(OUTPUT_DIR)
    print("‚úÖ ViT Training Complete.")
else:
    print("‚úÖ Output already exists. Skipping training.")

In [None]:
import os
# Show what the training cell left in the model output folder.
saved_model_files = os.listdir("/kaggle/working/final_model_vit_recall")
print(saved_model_files)

In [None]:
import shutil
# Bundle the saved model folder into vit_best_model.zip in the current directory.
shutil.make_archive(
    base_name="vit_best_model",
    format='zip',
    root_dir="/kaggle/working/final_model_vit_recall",
)

In [None]:
from IPython.display import FileLink

# This creates a clickable link to download the file directly
# Render a clickable download link for the archive.
FileLink(path=r'vit_best_model.zip')

In [None]:
import os
import gc
import torch
import glob
import cv2
import shutil
import pandas as pd
import numpy as np
import pydicom
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from datasets import load_dataset
from transformers import (
    AutoImageProcessor, AutoModelForImageClassification, 
    TrainingArguments, Trainer, DefaultDataCollator
)
from torchvision import transforms
from sklearn.metrics import classification_report, recall_score
from sklearn.utils.class_weight import compute_class_weight

# ====================================================
# CONFIGURATION
# ====================================================
# Trainer output_dir and final save location for the ResNet run.
RESNET_OUTPUT_DIR = "/kaggle/working/final_model_resnet_recall"
# Root of the 640x640 image-folder dataset generated by prepare_data_640().
DATA_DIR_640 = "/kaggle/working/dataset_640"
# Checkpoint id used for both the image processor and the model.
MODEL_CHECKPOINT = "microsoft/resnet-101"

# ====================================================
# 1. DATA GENERATION (Run if 640px folder is missing)
# ====================================================
def prepare_data_640():
    """Convert the RSNA training DICOMs into 640x640 JPEGs, one sub-folder per class.

    Does nothing when ``DATA_DIR_640`` already exists. Otherwise each patientId is
    mapped to its class, the class folders are created, and every ``.dcm`` file with
    a known class is min-max scaled to 0-255, resized and written as
    ``<DATA_DIR_640>/<class>/<patientId>.jpg`` using all CPU cores.

    Returns:
        None.
    """
    if os.path.exists(DATA_DIR_640):
        # NOTE(review): an interrupted earlier run also leaves this folder behind, so
        # a partial dataset is treated as complete here; delete the folder to resume.
        print(f"‚úÖ Data found at {DATA_DIR_640}. Skipping generation.")
        return

    print("‚öôÔ∏è Generating 640px Dataset (ResNet requires High Res)...")
    DICOM_DIR = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_images"
    LABELS_CSV = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv"
    CLASS_INFO_CSV = "/kaggle/input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv"

    # Merge & Map: one row per patientId, then {patientId: class}
    train_labels = pd.read_csv(LABELS_CSV)
    class_info = pd.read_csv(CLASS_INFO_CSV)
    combined = pd.merge(train_labels, class_info, on='patientId', how='left').drop_duplicates(subset=['patientId'])
    id_to_class = dict(zip(combined.patientId, combined['class']))

    # Create Folders
    for c in combined['class'].unique():
        safe_name = c.replace(" ", "_").replace("/", "_")
        os.makedirs(os.path.join(DATA_DIR_640, safe_name), exist_ok=True)

    # Parallel Convert
    def process(dcm_path):
        """Convert one DICOM file; a failure is reported and the file is skipped."""
        try:
            pid = os.path.splitext(os.path.basename(dcm_path))[0]
            cname = id_to_class.get(pid)
            if not cname:
                return
            safe_cname = cname.replace(" ", "_").replace("/", "_")
            save_path = os.path.join(DATA_DIR_640, safe_cname, f"{pid}.jpg")
            if os.path.exists(save_path):
                return

            dcm = pydicom.dcmread(dcm_path)
            img = dcm.pixel_array.astype(float)
            lowest = np.min(img)
            span = np.max(img) - lowest
            if span == 0:
                # Constant image: no dynamic range, avoid dividing by zero.
                img = np.zeros(img.shape, dtype=np.uint8)
            else:
                img = ((img - lowest) / span * 255.0).astype(np.uint8)
            img = cv2.resize(img, (640, 640))
            # cv2.imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(save_path, img):
                raise IOError(f"could not write {save_path}")
        except Exception as exc:
            # Still best effort, but no longer silent: missing images would otherwise
            # only show up as a smaller dataset.
            print(f"Error processing {dcm_path}: {exc}")

    files = glob.glob(os.path.join(DICOM_DIR, "*.dcm"))
    Parallel(n_jobs=-1)(delayed(process)(f) for f in tqdm(files))
    print("‚úÖ Generated 640px Data.")

prepare_data_640()

# ====================================================
# 2. RESNET TRAINING (Max Macro Recall)
# ====================================================
print("\nüöÄ Starting ResNet-101 Training...")

# Load Data
# The loader's 'label' column is renamed to 'labels'; 15% is held out for validation.
dataset = load_dataset("imagefolder", data_dir=DATA_DIR_640).rename_column("label", "labels")
splits = dataset["train"].train_test_split(test_size=0.15, seed=42)
train_ds, val_ds = splits["train"], splits["test"]
labels_list = train_ds.features["labels"].names

# Class Weights
# 'balanced' weights from the training split only, placed on the GPU for the loss.
y_train = np.array(train_ds["labels"])
weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(weights, dtype=torch.float32).to("cuda")

# Transforms (ResNet Standard)
processor = AutoImageProcessor.from_pretrained(MODEL_CHECKPOINT)
# Prefer the processor's own statistics; fall back to the ImageNet values when the
# processor does not expose them.
mean = getattr(processor, 'image_mean', [0.485, 0.456, 0.406])
std = getattr(processor, 'image_std', [0.229, 0.224, 0.225])
norm = transforms.Normalize(mean=mean, std=std)

# Training pipeline: random flip / rotation / brightness-contrast jitter, then tensor + normalise.
train_tf = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.1, contrast=0.1),
    transforms.ToTensor(),
    norm,
])
# Validation pipeline: tensor + normalise only.
val_tf = transforms.Compose([
    transforms.ToTensor(),
    norm,
])

def _images_to_pixel_values(batch, transform):
    """Swap the batch's raw "image" column for transformed "pixel_values".

    Every image is converted to RGB and passed through ``transform``; the raw
    "image" column is then removed. The batch is modified in place and returned.
    """
    pixel_values = []
    for pil_image in batch["image"]:
        pixel_values.append(transform(pil_image.convert("RGB")))
    batch["pixel_values"] = pixel_values
    del batch["image"]
    return batch


def preprocess_train(batch):
    """Batch transform for the training split (uses ``train_tf``)."""
    return _images_to_pixel_values(batch, train_tf)


def preprocess_val(batch):
    """Batch transform for the validation split (uses ``val_tf``)."""
    return _images_to_pixel_values(batch, val_tf)

# Attach the matching preprocessing callback to each split.
for split_ds, split_transform in ((train_ds, preprocess_train), (val_ds, preprocess_val)):
    split_ds.set_transform(split_transform)

# Metrics (Macro Recall)
def compute_metrics_recall(pred):
    """Print a per-class report and return accuracy plus macro-averaged recall.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and ``predictions``
            (scores whose argmax over the last axis is the predicted class id).

    Returns:
        dict with keys ``'accuracy'`` and ``'eval_macro_recall'``; the latter is
        the name referenced by ``metric_for_best_model`` in the training arguments.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Unweighted mean of the per-class recalls.
    macro_recall = recall_score(y_true, y_pred, average='macro')

    report = classification_report(y_true, y_pred, target_names=labels_list, digits=4)
    print("\n" + report)

    metrics = {'accuracy': (y_true == y_pred).mean()}
    metrics['eval_macro_recall'] = macro_recall
    return metrics

# Custom Trainer
class WeightedTrainerResNet(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain "labels" next to the model inputs.
            return_outputs: When True, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for Trainer API compatibility; unused.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        # Forward without the labels: otherwise the model also computes its own
        # unweighted loss, which is never used. ``inputs`` itself is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# ignore_mismatched_sizes=True lets the classifier head be re-created for our label count.
model = AutoModelForImageClassification.from_pretrained(
    MODEL_CHECKPOINT,
    # Sized from the dataset's label list (rather than a hard-coded 3) so the head
    # always agrees with the id2label / label2id maps built from the same list.
    num_labels=len(labels_list),
    id2label={i: l for i, l in enumerate(labels_list)},
    label2id={l: i for i, l in enumerate(labels_list)},
    ignore_mismatched_sizes=True,
)

# Arguments (Optimized for P100 @ 640px)
args = TrainingArguments(
    output_dir=RESNET_OUTPUT_DIR,
    report_to="none",
    remove_unused_columns=False,

    # Optimisation schedule: a smaller per-device batch for 640px images, with
    # gradient accumulation to reach an effective batch of ~48.
    num_train_epochs=15,
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=4,
    fp16=True,

    # Data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,

    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",

    # MAXIMIZE RECALL: restore the checkpoint with the highest 'eval_macro_recall'
    # at the end and keep at most two checkpoints on disk.
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_recall",
    greater_is_better=True,
    save_total_limit=2,
)

trainer = WeightedTrainerResNet(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=DefaultDataCollator(),
    compute_metrics=compute_metrics_recall,
)

# Train, then persist the resulting model next to its checkpoints.
trainer.train()
trainer.save_model(RESNET_OUTPUT_DIR)

# Archive the output folder right away so it can be downloaded as a single file.
print("‚úÖ Training Complete. Zipping model...")
shutil.make_archive(
    base_name="resnet_best_model",
    format='zip',
    root_dir=RESNET_OUTPUT_DIR,
)
print("üéâ Done! Download 'resnet_best_model.zip' from Output.")

In [None]:
from IPython.display import FileLink

# This creates a clickable link to download the file directly
# Render a clickable download link for the archive.
FileLink(path=r'resnet_best_model.zip')