In [1]:
# importing necessary libraries 

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision 
from torchvision import transforms, datasets, models 
from torch.nn import functional as F 
from PIL import Image
import pandas as pd 
import numpy as np
import tensorflow as tf
import os
import sys
import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt 
import cv2
import json 
from tqdm import tqdm
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler
from skimage.transform import rotate, AffineTransform
import random
from scipy import ndimage
import openslide
import matplotlib.patches as patches
from matplotlib.patches import Polygon
import xml.etree.ElementTree as ET 

2024-08-22 14:13:52.412004: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-22 14:13:52.412137: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-22 14:13:52.569969: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


**EXTRACTING PATCH EMBEDDINGS** 

In [2]:
# feature extractor for patches 

class Resnet50(nn.Module):
    """Frozen, ImageNet-pretrained ResNet-50 used as a fixed patch encoder.

    The torchvision backbone is used unmodified, so ``forward`` returns whatever
    ``torchvision.models.resnet50`` returns (its stock classification head is
    left in place; ``num_classes`` does not resize it).

    Parameters
    ----------
    num_classes : int
        Stored on the instance for interface compatibility only.
    """

    def __init__(self,num_classes):
        super().__init__()
        self.num_classes = num_classes

        # Explicit weights enum instead of the deprecated `weights=True`;
        # IMAGENET1K_V1 is the checkpoint the legacy flag resolves to.
        self.model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

        # Freeze model weights: the backbone is only used for inference.
        self.model.requires_grad_(False)

        # Inference mode: BatchNorm must use its stored running statistics.
        # In training mode it would normalise each single-patch batch with that
        # patch's own statistics and silently update the running averages.
        self.eval()

    def forward(self,X):
        """Run the backbone on a batch ``X`` of shape (N, 3, H, W)."""
        return self.model(X)


model = Resnet50(1)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:07<00:00, 13.8MB/s]


In [3]:
# transform function for patches 

# Per-channel standardisation using the mean / std that torchvision documents
# for its ImageNet-pretrained models. Normalize only computes (x - mean) / std;
# it does not rescale the value range of its input.
transform = transforms.Compose([
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

In [4]:
# function to obtain feature embedding for a given patch 

def get_feature_vector(img, net=None, preprocess=None):
    """Embed a single channels-last image patch.

    Parameters
    ----------
    img : array-like, shape (H, W, 3)
        Image patch with channels last. Integer-typed input (e.g. ``uint8``)
        is treated as 0-255 and rescaled to [0, 1] before ``preprocess`` is
        applied, because the ImageNet mean/std normalisation expects inputs in
        [0, 1]. Floating-point input is passed through unscaled and is assumed
        to be in [0, 1] already.
    net : callable, optional
        Maps a float32 tensor of shape (1, 3, H, W) to an embedding.
        Defaults to the notebook-level ``model``.
    preprocess : callable, optional
        Maps a float32 tensor of shape (3, H, W) to a tensor of the same shape.
        Defaults to the notebook-level ``transform``.

    Returns
    -------
    torch.Tensor
        Output of ``net`` for a batch containing only this patch.

    Notes
    -----
    Embeddings produced before the [0, 1] rescaling was added are not
    comparable with the ones produced by this version.
    """
    if net is None:
        net = model
    if preprocess is None:
        preprocess = transform

    pixels = np.asarray(img)
    needs_rescale = np.issubdtype(pixels.dtype, np.integer)

    # HWC -> CHW, directly in float32 (the dtype the network weights use).
    chw = torch.from_numpy(np.ascontiguousarray(pixels, dtype=np.float32)).permute(2, 0, 1)
    if needs_rescale:
        chw = chw / 255.0

    batch = preprocess(chw).unsqueeze(0)

    # Pure inference: no autograd graph is needed for stored embeddings.
    with torch.no_grad():
        return net(batch)

In [5]:
# function to obtain the percentage of white area in a given patch 

# threshold (on the 0-255 luma scale) above which a pixel is labelled white 
thresh = 200 

# mask for visualization of white area in a patch 
mask = np.zeros((224, 224, 3), dtype = np.uint8)

# function to get percentage of white area in a patch 
def get_perc_white_area(img, threshold=None):
    """Return the percentage of pixels in ``img`` that count as white.

    A pixel is white when its luma, ``0.299*R + 0.587*G + 0.114*B``
    (BT.601 weights), is strictly greater than the threshold.

    Parameters
    ----------
    img : array-like, shape (H, W, C) with C >= 3
        Channels-last image; only the first three channels are read.
    threshold : float, optional
        Luma cut-off. Defaults to the notebook-level ``thresh``, looked up at
        call time.

    Returns
    -------
    float
        Share of white pixels in percent (0.0 - 100.0). An image with no
        pixels yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if threshold is None:
        threshold = thresh

    # float64 keeps the arithmetic identical to a per-pixel scalar evaluation
    # and avoids uint8 overflow.
    pixels = np.asarray(img, dtype=np.float64)
    luma = 0.299 * pixels[..., 0] + 0.587 * pixels[..., 1] + 0.114 * pixels[..., 2]

    total = int(luma.size)
    if total == 0:
        return 0.0

    white = int(np.count_nonzero(luma > threshold))
    return (white / total) * 100

In [6]:
# function to return feature vectors along with labels for patches 

# Directory holding the BACH whole-slide images (<id>.svs) and their annotations (<id>.xml)
BACH_WSI_DIR = '/kaggle/input/bach-breast-cancer-histology-images/ICIAR2018_BACH_Challenge/ICIAR2018_BACH_Challenge/WSI/'


def _region_label(region):
    """Return the class label of an annotation ``<Region>`` element."""
    # Prefer the Value of a nested <Attribute>; otherwise fall back to the
    # Region's own Text attribute.
    attribute = region.find('.//Attribute')
    if attribute is not None:
        return attribute.get('Value')
    return region.get('Text')


def _region_bounds(region):
    """Return ``(min_x, max_x, min_y, max_y)`` over a region's vertices, or None if it has none."""
    vertices = region.findall('.//Vertex')
    if not vertices:
        return None
    xs = [float(vertex.get('X')) for vertex in vertices]
    ys = [float(vertex.get('Y')) for vertex in vertices]
    return min(xs), max(xs), min(ys), max(ys)


def get_patch_embeddings(wsi_nos, wsi_dir=BACH_WSI_DIR, patch_size=224):
    """Tile every annotated region of the given slides and embed each tile.

    For each slide id, ``<wsi_dir>/<id>.svs`` and ``<wsi_dir>/<id>.xml`` are
    read. The axis-aligned bounding box of every ``<Region>`` in the XML is
    covered with a non-overlapping grid of ``patch_size`` x ``patch_size``
    tiles read at level 0. Tiles start inside the bounding box, but the last
    row / column may extend past it. Each tile gets the label of the region
    whose bounding box produced it.

    Parameters
    ----------
    wsi_nos : sequence of str
        Slide ids (file names without extension), e.g. ``['A03', 'A04']``.
    wsi_dir : str, optional
        Directory containing the ``.svs`` / ``.xml`` pairs.
    patch_size : int, optional
        Side length of the square tiles, also used as the grid stride.

    Returns
    -------
    tuple of 7 parallel lists, one entry per tile
        ``(wsi_ids, ann_ids, feature_vectors, white_area_percentages,
        x_pixels, y_pixels, labels)``.

        * ``wsi_ids`` - position of the slide within ``wsi_nos``.
          NOTE(review): this is relative to the list passed in, so tables
          built from different slices of the id list reuse 0, 1, ...;
          confirm how downstream code maps these back to slides.
        * ``ann_ids`` - 1-based index of the region within the slide's XML.
        * ``feature_vectors`` - ``get_feature_vector`` output, squeezed.
        * ``white_area_percentages`` - ``get_perc_white_area`` output.
        * ``x_pixels`` / ``y_pixels`` - top-left corner of the tile.
        * ``labels`` - region label (may be None if the XML carries none).
    """
    wsi_ids = []
    ann_ids = []
    feature_vectors = []
    labels = []
    white_area_percentages = []
    x_pixels = []
    y_pixels = []

    # Highest resolution. Tile origins are stepped in annotation coordinates,
    # which are presumably level-0 pixels - TODO confirm against the XML spec.
    level = 0

    for k, wsi_no in enumerate(wsi_nos):

        print(wsi_no)

        # Paths to the SVS and XML files
        img_path = os.path.join(wsi_dir, wsi_no + '.svs')
        ann_path = os.path.join(wsi_dir, wsi_no + '.xml')

        # Parse the annotations first so a malformed XML never leaves a slide handle open
        root = ET.parse(ann_path).getroot()

        slide = openslide.OpenSlide(img_path)
        try:
            for ann_no, region in enumerate(root.findall('.//Region'), start=1):
                region_value = _region_label(region)
                bounds = _region_bounds(region)

                if bounds is None:
                    # Without vertices there is no bounding box to tile
                    # (int(float('inf')) would raise OverflowError).
                    print('annotation', ann_no, 'skipped (no vertices)')
                    continue

                min_x, max_x, min_y, max_y = bounds

                # slide vertically, then horizontally, over the bounding box
                for i in range(int(min_y), int(max_y), patch_size):
                    for j in range(int(min_x), int(max_x), patch_size):

                        # Read the current patch (RGBA) and drop the alpha channel.
                        # The patch stays uint8 on the 0-255 scale, which the
                        # white-area threshold relies on.
                        curr_patch = slide.read_region((j, i), level, (patch_size, patch_size))
                        curr_patch = np.array(curr_patch)[:, :, :3]

                        white_area_percentages.append(get_perc_white_area(curr_patch))
                        feature_vectors.append(np.squeeze(get_feature_vector(curr_patch)))
                        labels.append(region_value)
                        wsi_ids.append(k)
                        ann_ids.append(ann_no)
                        x_pixels.append(j)
                        y_pixels.append(i)

                print('annotation', ann_no, 'done')
        finally:
            # Always release the slide handle, even if a patch fails mid-way
            slide.close()

    return wsi_ids, ann_ids, feature_vectors, white_area_percentages, x_pixels, y_pixels, labels

In [7]:
# unique ids for annotated wsis 

# 'A01' ... 'A10': zero-padded two-digit slide numbers with an 'A' prefix
ids = [f'A{number:02d}' for number in range(1, 11)]

In [8]:
# obtaining patch embeddings for all wsis 

# Half-open slice [start, stop) of `ids` handled in this run
start, stop = 2, 4

(
    wsi_ids,
    ann_ids,
    patch_embeddings,
    white_area_percentages,
    x_loc,
    y_loc,
    patch_labels,
) = get_patch_embeddings(ids[start:stop])

A03
annotation 1 done
annotation 2 done
annotation 3 done
annotation 4 done
annotation 5 done
annotation 6 done
annotation 7 done
annotation 8 done
annotation 9 done
annotation 10 done
A04
annotation 1 done


**SAVING PATCH LEVEL DATA & EMBEDDINGS** 

In [9]:
# storing patch info in a pandas dataframe 

# One row per patch; column order follows the name list below
patch_info_df = pd.DataFrame(
    dict(
        zip(
            ['WSI Id', 'Annotation No.', 'X Start Pixel', 'Y Start Pixel', 'White Area %', 'Label'],
            [wsi_ids, ann_ids, x_loc, y_loc, white_area_percentages, patch_labels],
        )
    )
)

In [10]:
# Preview the first rows of the patch table as a sanity check before saving
patch_info_df.head() 

Unnamed: 0,WSI Id,Annotation No.,X Start Pixel,Y Start Pixel,White Area %,Label
0,0,1,6920,8112,34.757653,Carcinoma in situ
1,0,1,7144,8112,27.377631,Carcinoma in situ
2,0,1,7368,8112,29.771205,Carcinoma in situ
3,0,1,7592,8112,18.351403,Carcinoma in situ
4,0,1,7816,8112,11.882175,Carcinoma in situ


In [11]:
# saving csv file 

# The file name encodes the [start, stop) slice of `ids` processed above, so
# changing the slice can never silently overwrite another run's table.
csv_file_path = f'/kaggle/working/bach_patch_info_{start}_{stop}.csv'

# Save the DataFrame to a CSV file 
patch_info_df.to_csv(csv_file_path, index=False) 

In [12]:
# storing patch embeddings in pth file 

# Saved as a list of tensors in patch order. The file name is derived from the
# processed [start, stop) slice of `ids` rather than hard-coded, so it stays in
# step with the extraction run.
torch.save(patch_embeddings, f'bach_patch_info_{start}_{stop}.pth') 

In [13]:
# merging 3 train csvs together 

#path_1 = 'somepath1' 
#path_2 = 'somepath2' 
#path_3 = 'somepath3' 

#df1 = pd.read_csv(path_1) 
#df2 = pd.read_csv(path_2) 
#df3 = pd.read_csv(path_3) 

#df_all = pd.concat([df1, df2, df3], axis=0).reset_index(drop = True) 

#df_all.to_csv(kaggle_directory1 + 'train_annotations.csv', index = False) 