# Breast Cancer Classification

In [2]:
import torch
import torch.nn.functional
import torch.optim
import numpy as np
import matplotlib.pyplot
import torchvision
import torchvision.transforms
import torchvision.models
import torchvision.datasets
import os
from collections import Counter
# https://www.kaggle.com/datasets/paultimothymooney/breast-histopathology-images/data

## Locate Data

In [3]:
# Root folder of the image data; each sub-directory is loaded as its own ImageFolder.
data_dir = "data_small"

In [4]:
# Display the entries found directly under the data root.
os.listdir(path=data_dir)

['breast_cancer']

## Transform and Create Dataset

In [5]:
# Preprocessing applied to every image as it is loaded.
transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((50, 50)),  # resize to the 50x50 patch size described on Kaggle
    torchvision.transforms.ToTensor(),  # convert the image to a tensor
])

# Build one ImageFolder per sub-directory of data_dir.
# Entries are sorted because os.listdir() returns them in arbitrary order, and
# the order of the concatenated dataset determines what the seeded random
# split produces.
all_images = []
for folder_name in sorted(os.listdir(data_dir)):
    folder_path = os.path.join(data_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue  # ignore stray files; ImageFolder only accepts directories
    folder_dataset = torchvision.datasets.ImageFolder(folder_path, transform=transform)
    # Each ImageFolder derives its own class -> index mapping from the class
    # folders it finds. If one folder lacked a class, its labels would be
    # numbered differently and silently corrupt the concatenated targets.
    if all_images and folder_dataset.class_to_idx != all_images[0].class_to_idx:
        raise ValueError(
            "Inconsistent class folders in {!r}: {} (expected {})".format(
                folder_path, folder_dataset.class_to_idx, all_images[0].class_to_idx))
    all_images.append(folder_dataset)

# Single dataset view over all folder datasets (samples, labels, transforms).
datasets = torch.utils.data.ConcatDataset(all_images)

## Count Negative and Positive Cases

In [6]:
# Tally the labels of every folder dataset held by the ConcatDataset.
# Starting from an empty Counter means `result` is always defined by this cell
# itself, even when there is nothing to count, instead of relying on a
# first-iteration flag.
label_counts = Counter()
for dataset in datasets.datasets:  # one ImageFolder per sub-directory of data_dir
    label_counts.update(dataset.targets)  # targets holds one class index per image

result = dict(label_counts)  # plain dict: class index -> number of images
# Make sure both classes are present so the lookups below cannot raise KeyError.
result.setdefault(0, 0)
result.setdefault(1, 0)
print("""Total Number of Images in files '0' and '1':
    number of images in file '0' (Negative for Breast Cancer): {} 
    number of images in file '1' (Positive for Breasr Cancer): {}""".format(result[0], result[1]))
# images in file '0' are negative
# images in file '1' are positive

Total Number of Images in files '0' and '1':
    number of images in file '0' (Negative for Breast Cancer): 4268 
    number of images in file '1' (Positive for Breasr Cancer): 1872


In [7]:
# Fix the global PyTorch RNG seed so the random split and shuffling are repeatable.
SEED = 2
torch.manual_seed(SEED)

<torch._C.Generator at 0x1999e5746d0>

## Train and Test Splits

In [8]:
# Size the split from the dataset itself: random_split requires the two lengths
# to add up to len(datasets), which the sum of two class counts only matches
# when no other label is present.
total_result = len(datasets)  # total number of images
train_size = int(0.8 * total_result)  # 80% training set
test_size = total_result - train_size  # remaining ~20% test set

In [9]:
# Randomly partition the full dataset into the 80% training / 20% test subsets.
split_lengths = [train_size, test_size]
train_dataset, test_dataset = torch.utils.data.random_split(datasets, split_lengths)

In [10]:
# Options shared by both loaders: batches of 128 images, loaded by 2 worker processes.
loader_options = dict(batch_size=128, num_workers=2)

# Training batches are reshuffled every epoch for better generalization;
# test batches keep a fixed order.
trainloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_options)
testloader = torch.utils.data.DataLoader(test_dataset, shuffle=False, **loader_options)

## Use the GPU if CUDA Is Available

In [11]:
def set_device():
    """Return the torch device to compute on.

    Returns:
        torch.device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
    """
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)

## Training and Evaluation Functions

In [12]:
def train_nn(model, train_loader, test_loader, criterion, optimizer, n_epochs):
    """Train ``model`` for ``n_epochs`` epochs, evaluating on the test set after each one.

    Per-epoch training accuracy and mean batch loss are printed, followed by
    the test-set accuracy printed by ``evaluate_model_on_test_set``.

    Args:
        model: network to train. It must already be on the device returned by
            ``set_device()``; this function moves only the batches.
        train_loader: iterable of ``(images, labels)`` training batches.
        test_loader: iterable of ``(images, labels)`` test batches.
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimizer that updates the parameters of ``model``.
        n_epochs: number of passes over ``train_loader``.

    Returns:
        The same ``model`` object, after training.
    """
    device = set_device()

    for epoch in range(n_epochs):
        print("Epoch number %d " % (epoch + 1))
        model.train()
        running_loss = 0.0
        running_correct = 0
        total = 0

        # Iterate over the loader that was passed in, not a notebook global,
        # so the function trains on whatever data the caller supplies.
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)
            total += labels.size(0)

            optimizer.zero_grad()

            outputs = model(images)
            # Predicted class = index of the largest output; detached because
            # this is only used for the accuracy count, not for gradients.
            _, predicted = torch.max(outputs.detach(), 1)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_correct += (labels == predicted).sum().item()

        epoch_loss = running_loss / len(train_loader)  # mean loss per batch
        epoch_accuracy = 100 * running_correct / total

        print(" - Training Dataset. Got %d out of %d images correctly (%.3f%%). Epoch loss: %.3f"
             % (running_correct, total, epoch_accuracy, epoch_loss))

        evaluate_model_on_test_set(model, test_loader)

    print("Finished")
    return model
                          

In [13]:
def evaluate_model_on_test_set(model, testloader):
    """Measure and print the classification accuracy of ``model`` on ``testloader``.

    The model is switched to evaluation mode and run without gradient tracking.

    Args:
        model: network to evaluate.
        testloader: iterable of ``(images, labels)`` batches.

    Returns:
        float: percentage of correctly classified images (``0.0`` when the
        loader yields no images).
    """
    model.eval()
    predicted_correctly_on_epoch = 0
    total = 0

    # Run on the device the model's weights are on, so batches and weights can
    # never end up on different devices; fall back to set_device() only for a
    # model that has no parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else set_device()

    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)
            total += labels.size(0)

            outputs = model(images)

            # Predicted class = index of the largest output per image.
            _, predicted = torch.max(outputs, 1)

            predicted_correctly_on_epoch += (predicted == labels).sum().item()

    # Guard against an empty loader instead of dividing by zero.
    epoch_accuracy = 100.0 * predicted_correctly_on_epoch / total if total else 0.0
    print(" - Testing Dataset. Got %d out of %d images correctly (%.3f%%)"
         % (predicted_correctly_on_epoch, total, epoch_accuracy))
    return epoch_accuracy

In [16]:
# Start from a pretrained ResNet-18.
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour
# of the `weights=` argument; kept as-is for compatibility with the installed
# version. Confirm before upgrading.
resnet18_model = torchvision.models.resnet18(pretrained=True)

# Replace the final fully connected layer with a 2-output head.
num_features = resnet18_model.fc.in_features  # input width of the original head
num_of_classifiers = 2  # number of output classes: 0 and 1
resnet18_model.fc = torch.nn.Linear(num_features, num_of_classifiers)

# Move the model to the GPU when available, otherwise keep it on the CPU.
# The result is bound back to the same name; the original assigned it to a
# misspelled, never-used `resnet_18_model`.
device = set_device()
resnet18_model = resnet18_model.to(device)

# Cross-entropy between the 2-way outputs and the integer class labels.
loss_fn = torch.nn.CrossEntropyLoss()

# Stochastic gradient descent over all model parameters, created after the
# model was moved to its device.
# momentum helps point the gradient vectors in the right direction
# weight decay helps to prevent overfitting
optimizer = torch.optim.SGD(resnet18_model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.01)

In [None]:
# Train for 20 epochs. train_nn returns the model it was given, so binding the
# result keeps the same object while stopping the notebook from echoing the
# full network repr as the cell output after the training log.
resnet18_model = train_nn(
    model=resnet18_model,
    train_loader=trainloader,
    test_loader=testloader,
    criterion=loss_fn,
    optimizer=optimizer,
    n_epochs=20,
)

Epoch number 1 
 - Training Dataset. Got 4037 out of 4912 images correctly (82.186%). Epoch loss: 0.410
 - Testing Dataset. Got 1051 out of 1228 images correctly (85.586%)
Epoch number 2 
 - Training Dataset. Got 4452 out of 4912 images correctly (90.635%). Epoch loss: 0.222
 - Testing Dataset. Got 1054 out of 1228 images correctly (85.831%)
Epoch number 3 
 - Training Dataset. Got 4683 out of 4912 images correctly (95.338%). Epoch loss: 0.119
 - Testing Dataset. Got 1068 out of 1228 images correctly (86.971%)
Epoch number 4 
 - Training Dataset. Got 4754 out of 4912 images correctly (96.783%). Epoch loss: 0.080
 - Testing Dataset. Got 1044 out of 1228 images correctly (85.016%)
Epoch number 5 
 - Training Dataset. Got 4836 out of 4912 images correctly (98.453%). Epoch loss: 0.047
 - Testing Dataset. Got 1066 out of 1228 images correctly (86.808%)
Epoch number 6 
 - Training Dataset. Got 4792 out of 4912 images correctly (97.557%). Epoch loss: 0.064
 - Testing Dataset. Got 1062 out of 

In [25]:
# 89.938% best testing score using batch_size = 64

In [None]:
# 90.065% 128 lr = 0.01

In [None]:
# 86.564% 128 lr = 0.01 w/o momentum or decay

In [None]:
# 88.925% 128 lr = 0.005

In [None]:
# 87.866% 128 lr = 0.001

In [None]:
# 88.76% 256

In [None]:
# 87.948% 32

In [55]:
# 87.134% 64