# Code Description 
## Exploratory Data Analysis Code 
- This notebook contains the dataset preprocessing code: it selects a reference image and applies Macenko stain normalization to every image in the dataset.

## Part 0 : Code Imports

In [None]:
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import cupy as cp
import staintools
from tqdm import tqdm



## Part 1 : Code for Macenko Stain Normalization

In [1]:
def compute_stain_variance(img, od_threshold=0.15):
    """Score how strongly stained an image is.

    The image is converted to optical density (OD), pixels whose OD is below
    ``od_threshold`` in any channel are discarded, the remaining pixels are
    projected onto the first two right singular vectors of the OD matrix, and
    the standard deviation of all projected values is returned.

    The SVD is attempted on the GPU through CuPy first. If that fails for any
    reason, the same computation is repeated on the CPU with NumPy, so that a
    missing or broken GPU does not silently score every image as 0.

    Args:
        img: Array of shape (H, W, C); the mask reduces over axis 2. Callers
            in this notebook pass 3-channel RGB images loaded with OpenCV.
        od_threshold: Minimum OD every channel of a pixel must reach for the
            pixel to be kept. Defaults to 0.15.

    Returns:
        float: The standard deviation of the projected OD values, or 0.0 when
        no pixel passes the threshold or when both GPU and CPU paths fail.
    """
    # The +1 offset keeps the logarithm finite for zero-valued pixels.
    od = -np.log((img.astype(np.float32) + 1.0) / 255.0)

    # Keep only pixels that are dense enough in every channel.
    od_pixels = od[np.all(od >= od_threshold, axis=2)]
    if od_pixels.size == 0:
        return 0.0

    def _projected_std(xp, pixels):
        # Backend-agnostic core: works with either NumPy or CuPy as ``xp``.
        _, _, vh = xp.linalg.svd(pixels, full_matrices=False)
        stain_matrix = vh[:2].T
        # NOTE(review): the std is taken over both projected columns at once,
        # so it depends on the sign of the singular vectors; GPU and CPU
        # scores may therefore differ slightly - confirm if that matters.
        return xp.std(pixels @ stain_matrix)

    try:
        return float(_projected_std(cp, cp.asarray(od_pixels)).get())
    except Exception as e:
        print(f"⚠️ GPU error: {e}; falling back to CPU (NumPy)")

    try:
        return float(_projected_std(np, od_pixels))
    except Exception as e:
        print(f"⚠️ CPU error: {e}")
        return 0.0

def find_best_reference_image(folder):
    """Return the path of the image in ``folder`` with the highest stain score.

    Every ``.jpg``/``.jpeg``/``.png`` file directly inside ``folder`` is
    loaded with OpenCV, converted to RGB and scored with
    ``compute_stain_variance``; the highest-scoring path wins. Files that
    cannot be loaded or scored are reported and skipped.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        The path of the best-scoring image, or ``None`` when the folder holds
        no image that could be loaded and scored.
    """
    best_image = None
    max_variance = -1

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # reproducible when several images share the same score.
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        fpath = os.path.join(folder, fname)
        img = cv2.imread(fpath)
        if img is None:
            print(f"⚠️ Failed to load: {fpath}")
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Guard ndim first so a 2-D array cannot raise on shape[2].
        if img.ndim != 3 or img.shape[2] != 3:
            print(f"⚠️ Image {fname} does not have 3 channels.")
            continue

        try:
            var = compute_stain_variance(img)
        except Exception as e:
            print(f"⚠️ Skipping {fname}: {e}")
            continue

        if var > max_variance:
            max_variance = var
            best_image = fpath

    if best_image is None:
        # Do not announce a "selected" image when nothing was usable.
        print(f"\n⚠️ No usable reference image found in: {folder}")
        return None

    print("\n✅ Best reference image selected:")
    print(f"Path: {best_image}")
    print(f"Stain variance score: {max_variance:.4f}")
    return best_image

def normalize_dataset(dataset_dir, output_dir, reference_dir="Dataset/Margin Positive"):
    """Macenko-normalize every image of a class-per-folder dataset.

    A reference image is chosen from ``reference_dir`` with
    ``find_best_reference_image``, luminosity-standardized, and used to fit a
    staintools Macenko normalizer. Each image found in the immediate
    sub-directories of ``dataset_dir`` is then luminosity-standardized,
    transformed, and written under the same class folder and file name in
    ``output_dir``. Images that fail to load, transform, or save are reported
    and skipped.

    Args:
        dataset_dir: Root directory containing one sub-directory per class.
        output_dir: Root directory for the normalized images; class
            sub-directories are created as needed.
        reference_dir: Directory searched for the reference image. Defaults
            to ``"Dataset/Margin Positive"``.

    Raises:
        FileNotFoundError: If no reference image can be selected or the
            selected reference image cannot be read.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')

    reference_image_path = find_best_reference_image(reference_dir)
    if reference_image_path is None:
        # Fail with a clear message instead of passing None to cv2.imread.
        raise FileNotFoundError(f"No usable reference image found in: {reference_dir}")

    # --- Load reference image ---
    ref_img = cv2.imread(reference_image_path)
    if ref_img is None:
        raise FileNotFoundError(f"Reference image not found at: {reference_image_path}")
    ref_img = cv2.cvtColor(ref_img, cv2.COLOR_BGR2RGB)
    ref_img = staintools.LuminosityStandardizer.standardize(ref_img)

    # --- Initialize normalizer with reference ---
    normalizer = staintools.StainNormalizer(method='macenko')
    normalizer.fit(ref_img)

    # --- Traverse dataset and normalize each image ---
    for class_name in sorted(os.listdir(dataset_dir)):
        class_path = os.path.join(dataset_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        out_class_path = os.path.join(output_dir, class_name)
        os.makedirs(out_class_path, exist_ok=True)

        # Filter up front so the progress bar counts only image files.
        image_names = [
            fname for fname in sorted(os.listdir(class_path))
            if fname.lower().endswith(image_suffixes)
        ]

        for fname in tqdm(image_names, desc=f"Normalizing {class_name}"):
            in_path = os.path.join(class_path, fname)
            out_path = os.path.join(out_class_path, fname)

            try:
                img = cv2.imread(in_path)
                if img is None:
                    print(f"⚠️ Failed to load: {in_path}")
                    continue
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = staintools.LuminosityStandardizer.standardize(img)

                # --- Transform with Macenko ---
                norm_img = normalizer.transform(img)
                norm_img = cv2.cvtColor(norm_img, cv2.COLOR_RGB2BGR)

                # cv2.imwrite signals failure through its return value.
                if not cv2.imwrite(out_path, norm_img):
                    print(f"⚠️ Failed to write: {out_path}")

            except Exception as e:
                print(f"⚠️ Skipping {fname}: {e}")

## Part 2 : Use Case for Dataset Preprocessing

In [None]:
# Normalize every image under "Dataset" and store the results in "Normalized_Dataset".
normalize_dataset(
    dataset_dir="Dataset",            # input dataset directory
    output_dir="Normalized_Dataset",  # output directory for normalized images
)