## 0 Server and Directory Checks

In [1]:
import sys
import os

# Report which interpreter and working directory this kernel is using, so that
# environment or relative-path problems are visible before anything else runs.
print(f"Python executable: {sys.executable}")
print(f"Current working directory: {os.getcwd()}")

Python executable: /anaconda/envs/azureml_py310_sdkv2/bin/python
Current working directory: /home/student/ActiveScanLab


## 1 Imports

In [2]:
import os
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet50
import random
from tqdm import tqdm
from classifier_models import Resnet50Model, Resnet18Model, Densenet121Model, BaseResnetModel
from active_learning_models import *
import numpy as np
from sklearn.cluster import KMeans
from costume_dataset import ChestXrayDataset

Total image files found: 112120
Filtered dataset size: 112120
Label distribution:
 label
0    60361
1    51759
Name: count, dtype: int64
                 Finding Labels  Follow-up #  Patient ID  Patient Age  \
Image Index                                                             
00001720_001.jpg     No Finding            1        1720           13   
00002510_002.jpg           Mass            2        2510           48   
00008261_000.jpg     No Finding            0        8261           39   
00004990_000.jpg     No Finding            0        4990           58   
00012075_000.jpg     No Finding            0       12075           47   

                 Patient Gender View Position  OriginalImage[Width  Height]  \
Image Index                                                                   
00001720_001.jpg              M            PA                 2048     2500   
00002510_002.jpg              M            PA                 2500     2048   
00008261_000.jpg              M    

## 2 Run Parameters


In [3]:
# Tunable run parameters, kept together in one cell.
# Dataset root: passed to ChestXrayDataset and, as root_dir, to the
# active-learning pipeline further down.
dataset_path: str = "nih_chest_xrays_light"
# NOTE(review): no visible cell passes batch_size anywhere; the loaders come
# from dataset.get_dataloader(...), which presumably has its own default.
batch_size: int = 32
# Number of training epochs.
epochs: int = 3

In [4]:
# Choose the compute device in priority order: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print(device)

cuda


## Create Datasets and Loaders

In [5]:
# Build the chest X-ray dataset rooted at `dataset_path`.
# split_type='from_files' is forwarded unchanged (presumably selects a
# file-defined train/test split — confirm in costume_dataset).
dataset = ChestXrayDataset(
    dataset_path,
    split_type='from_files',
)

# Show the first rows of the underlying metadata frame as a sanity check.
preview_rows = dataset.df.head()
print("Sample data:\n", preview_rows)

Total image files found: 112120
Filtered dataset size: 112120
Label distribution:
 label
0    60361
1    51759
Name: count, dtype: int64
Sample data:
                           Finding Labels  Follow-up #  Patient ID  \
Image Index                                                         
00000001_000.jpg            Cardiomegaly            0           1   
00000001_001.jpg  Cardiomegaly|Emphysema            1           1   
00000001_002.jpg   Cardiomegaly|Effusion            2           1   
00000002_000.jpg              No Finding            0           2   
00000003_000.jpg                  Hernia            0           3   

                  Patient Age Patient Gender View Position  \
Image Index                                                  
00000001_000.jpg           58              M            PA   
00000001_001.jpg           58              M            PA   
00000001_002.jpg           58              M            PA   
00000002_000.jpg           81              M           

# Explore models

## ResNet-18 Model

In [6]:
# ResNet-18 classifier wrapper starting from pretrained weights.
# freeze=False presumably leaves every layer trainable — confirm in classifier_models.
resnet18_model = Resnet18Model(
    optimizer='Adam',
    loss_function='BCEWithLogitsLoss',
    freeze=False,
    pretrained=True,
)

Loading pretrained ResNet18 model...


In [None]:
# Train ResNet-18 on the train split, then evaluate it on the test split.
train_loader = dataset.get_dataloader(from_split='train')
test_loader = dataset.get_dataloader(from_split='test')

# Use the shared `epochs` run parameter (defined in the Run Parameters cell)
# rather than a second hard-coded literal, so the epoch count is tuned in one place.
resnet18_model.train_model(device, train_loader, epochs=epochs)
# Last expression: whatever evaluate() returns is rendered as the cell output.
resnet18_model.evaluate(device, test_loader)

## ResNet-50 Model

In [9]:
# ResNet-50 classifier wrapper starting from pretrained weights.
# freeze=True presumably keeps the backbone fixed and trains only the head —
# confirm in classifier_models.
resnet50_model = Resnet50Model(
    optimizer='Adam',
    loss_function='BCEWithLogitsLoss',
    freeze=True,
    pretrained=True,
)

Loading pretrained ResNet50 model...


In [None]:
# Train and evaluate ResNet-50 on the full train/test splits.
# The loaders are rebuilt here, so this cell does not rely on loaders left
# over from another cell.
train_loader = dataset.get_dataloader(from_split='train')
test_loader = dataset.get_dataloader(from_split='test')

# NOTE(review): uses a literal 4 epochs, not the `epochs` run parameter (3).
resnet50_model.train_model(device, train_loader, epochs=4)
# Last expression: whatever evaluate() returns is rendered as the cell output.
resnet50_model.evaluate(device, test_loader)

In [None]:
# Train on a 10000-sample subset of the train split to iterate faster.
# This replaces the earlier manual approach (train_df.sample(10000, random_state=42)
# wrapped in a ChestXrayDataset and DataLoader); sampling is now delegated to
# get_dataloader(sample_size=...).
# NOTE(review): this trains the same `resnet50_model` object as the full-dataset
# cell. If both cells run, the results are not independent unless train_model
# re-initialises the weights (not visible here) — re-create the model first.

small_train_loader = dataset.get_dataloader(from_split='train', sample_size=10000)
test_loader = dataset.get_dataloader(from_split='test')
# Train on the subset (4 epochs, hard-coded), then evaluate on the test split.
resnet50_model.train_model(device, small_train_loader, epochs=4)
resnet50_model.evaluate(device, test_loader)

# Active Learning (AL) pipeline


In [7]:
# Settings for the random-sampling active-learning run, collected in one dict
# so they can be inspected or tweaked before the pipeline is built.
# The pipeline receives the dataset root (root_dir), not the existing `dataset`
# object. iterations / budget_per_iter / epochs_per_iter are presumably the
# number of AL rounds, samples labelled per round and training epochs per round
# — confirm in active_learning_models.
al_pipeline_kwargs = dict(
    device=device,
    iterations=10,
    root_dir=dataset_path,
    epochs_per_iter=3,
    budget_per_iter=100,
    model_name='resnet18',
    objective_function_name='BCEWithLogitsLoss',
    optimizer_name='Adam',
    seed=42,
)
active_learning_pipeline = RandomSamplingActiveLearning(**al_pipeline_kwargs)



Total image files found: 112120
Filtered dataset size: 112120
Label distribution:
 label
0    60361
1    51759
Name: count, dtype: int64


In [8]:
# Run every active-learning iteration, then plot the collected results.
# NOTE(review): the recorded output for this cell stops in iteration 1 with
# "ValueError: num_samples should be a positive integer value, but got
# num_samples=0", so plot_results was never reached. That message suggests a
# sampler was built over an empty set inside the pipeline — check how the
# initial training pool is populated in active_learning_models.
active_learning_pipeline.run_pipeline()
plot_results(active_learning_pipeline)

Iteration 1/10
Loading pretrained ResNet18 model...


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [6]:
# Debug leftover: displays the `dataset` object created in the
# "Create Datasets and Loaders" cell, which must have run in the current kernel.
# NOTE(review): the recorded NameError and the out-of-order execution count
# show this was run in a session where that cell had not been executed.
# Consider deleting this cell.
dataset

NameError: name 'dataset' is not defined