In [32]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import pandas as pd

In [35]:
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader

# get the dataset and preprocessing it
path = r'/kaggle/input/breast-histopathology-images'
class Histopathology(Dataset):
    def __init__(self, root_path, transform = None):
        self.transform = transform 
        self.image_path = []
        self.labels = []

        for folders in (os.listdir(path)):
            sub_folder = os.path.join(root_path, folders)

            for internal in (os.listdir(sub_folder)):
                internal_folders = os.path.join(sub_folder, internal)
                # self.image_path.append(internal_folders)
                for class_folders in (os.listdir(internal_folders)):
                    class_folder = os.path.join(internal_folders, class_folders)
                    if class_folder.endswith('png' or 'jpg' or 'jpeg'):
                        self.image_path.append(class_folder)

                        # take the labels from the images.
                        if "class1" in class_folder:
                            label = 1
                        else:
                            label = 0
                        self.labels.append(label)
    

        print("Length : ", len(self.image_path))
        print("Labels : ", len(self.labels))

    def __len__(self):
        return len(self.image_path)

    def __getitem__(self, index):
        image_path = self.image_path[index]
        image = Image.open(image_path).convert("RGB")
        labels = self.labels[index]
       
        if self.transform:
            image = self.transform(image)
        return image, labels

In [36]:
# split the dataset into train and test
# all_images, all_labels =  Histopathology(path, transform = transform) - won't work
# But Histopathology(...) returns a Dataset object, not two variables (all_images, all_labels).

# calling the class and it will automatically call only the _init_ constructor not the len and getitem
dataset =  Histopathology(path)
all_images = dataset.image_path
all_labels = dataset.labels

Length :  277524
Labels :  277524


In [10]:
class Histopathology_dataset(Dataset):
    def __init__(self, images, labels , transform = None):
        self.transform = transform 
        self.image_path = images
        self.labels = labels

        # This is the check of mapping the image to the respective label
        if "class1" in self.image_path:
            labels = 1
        else:
            labels = 0
            
    def __len__(self):
        return len(self.image_path)

    def __getitem__(self, index):
        image_path = self.image_path[index]
        image = Image.open(image_path).convert("RGB")
        labels = self.labels[index]
       
        if self.transform:
            image = self.transform(image)
        return image, labels

In [28]:
from sklearn.model_selection import train_test_split

# tranform the images and prepare the dataset
transform = transforms.Compose([transforms.Resize((50,50)), transforms.ToTensor()])

train_images, test_images, train_labels, test_labels = train_test_split(all_images, all_labels, test_size = 0.3)
print("Train images : ", len(train_images))
print("Train labels : ", len(train_labels))

print(f"Test images : {len(test_images)}, Test_labels : {len(test_labels)}")

train_dataset = Histopathology_dataset(train_images, train_labels, transform = transform)
test_dataset = Histopathology_dataset(test_images, test_labels, transform = transform)


# load the data will call len and getitem and retrieve the data
train_dataloader= DataLoader(dataset=train_dataset, shuffle = True, batch_size = 32)
test_dataloader= DataLoader(dataset=test_dataset, shuffle = False, batch_size = 32)

batch = iter(train_dataloader)
samples, labels = next(batch)
print(f"Samples shape : {samples.shape}, labels shape : {labels.shape}")

batch1 = iter(test_dataloader)
samples1, labels1 = next(batch)
print(f"Samples shape : {samples1.shape}, labels shape : {labels1.shape}")

Train images :  194266
Train labels :  194266
Test images : 83258, Test_labels : 83258
Samples shape : torch.Size([32, 3, 50, 50]), labels shape : torch.Size([32])
Samples shape : torch.Size([32, 3, 50, 50]), labels shape : torch.Size([32])


In [29]:
class ConvoNetwork(nn.Module):
    def __init__(self):
        super(ConvoNetwork, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2,2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(16*9*9, 100)
        self.fc2 = nn.Linear(100, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x1 = self.relu(self.conv1(x))
        x1 = self.pool(x1)
        x2 = self.relu(self.conv2(x1))
        x2 = self.pool(x2)

        # flatten the layer
        x2 = x2.view(-1, 16*9*9)

        # Pass to the fully connected layer
        output = self.relu(self.fc1(x2))
        output = self.relu(self.fc2(output))
        output = self.relu(self.fc3(output))
        # Give the sigmoid layer
        output = torch.sigmoid(output)

        return output

model = ConvoNetwork()
# Loss as BinaryCorssEntropy
loss = nn.BCELoss()
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

In [30]:
total_steps = len(train_dataloader)
print(total_steps)

6071


In [None]:
# train your model
total_epoch = 3
for epoch in range(total_epoch):
    for i, (images, labels) in enumerate(train_dataloader):
        output = model(images)
        # make the label size as same size of output
        labels = labels.view(-1, 1).float()
        criterion = loss(output, labels)
        optimizer.zero_grad()
        criterion.backward()
        optimizer.step()

        if (i+1) % 1000 == 0:
            print(f"epoch : {epoch + 1}, steps : {i+1}/ {total_steps}, loss = {criterion.item():.5f}")

epoch : 1, steps : 1000/ 6071, loss = 0.69315


In [261]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_dataloader:
        output = model(images)
        _, predictions = torch.max(output, 1)
        correct += (predictions == labels).sum().item()
        total += labels.shape[0]

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
        

Test Accuracy: 71.65%
