In [1]:
import pandas as pd
import numpy as np
import os
import zipfile
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import matplotlib.pyplot as plt
import cv2
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import seaborn as sns
from skimage.feature import local_binary_pattern
from skimage.feature import graycomatrix, graycoprops
from sklearn.model_selection import train_test_split

# Set seaborn's global plotting style to "whitegrid".
sns.set(style="whitegrid")
# NOTE(review): warning suppression is left disabled; enabling the line below
# would also require `import warnings`, which this cell does not do.
# warnings.filterwarnings("ignore")

In [2]:
# Mount Google Drive so files under /content/drive become readable.
# NOTE(review): `google.colab` is Colab-specific — this cell presumably fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Let PIL load image files whose data is truncated instead of raising an error.
# NOTE(review): this can hide corrupted images in the dataset — confirm it is needed.
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [4]:
# Load the label table and preview its first rows.
# NOTE(review): despite its name, `excel_path` points to a CSV file.
excel_path = '/content/drive/My Drive/unzipped_folder/public.csv'
df = pd.read_csv(excel_path)
print(df.head())

                                                name  ground truth
0  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
1  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
2  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
3  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
4  S-2006-005094_PAS_1of2_64552732435c92704a3d37d...             0


In [5]:
# Show column dtypes and non-null counts (this runs BEFORE the cast below,
# so the report still lists `ground truth` with its original dtype).
df.info()
# Replace the label column with its string form, in place on `df`.
# NOTE(review): presumably needed by a Keras generator that expects string
# class labels — confirm; re-running this cell is harmless but hides the original dtype.
df['ground truth'] = df['ground truth'].astype(str)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5758 entries, 0 to 5757
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          5758 non-null   object
 1   ground truth  5758 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.1+ KB


In [6]:
folder1_path = '/content/drive/My Drive/unzipped_folder/globally_sclerotic_glomeruli/'
folder2_path = '/content/drive/My Drive/unzipped_folder/non_globally_sclerotic_glomeruli/'
img_paths = [folder1_path, folder2_path]


def _find_image_path(name, folders):
    """Return the full path of `name` inside the first folder that contains it.

    Parameters
    ----------
    name : str
        File name as listed in the `name` column of the label table.
    folders : iterable of str
        Candidate directories, searched in order.

    Returns
    -------
    str or None
        The joined path of the first existing match, or None when the file is
        in none of the folders.
    """
    for folder in folders:
        candidate = os.path.join(folder, name)
        # One existence check per folder: each check is a filesystem round-trip.
        if os.path.exists(candidate):
            return candidate
    return None


# Resolve every file name against the class folders (folder1 wins over folder2).
df['image_path'] = df['name'].apply(_find_image_path, args=(img_paths,))

# Make unresolved files visible instead of leaving silent None entries behind.
n_missing = int(df['image_path'].isna().sum())
if n_missing:
    print(f"Warning: {n_missing} of {len(df)} images were not found in any folder.")


In [7]:
# Display the first 20 rows, including the newly added `image_path` column.
df.head(20)

Unnamed: 0,name,ground truth,image_path
0,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/content/drive/My Drive/unzipped_folder/non_gl...
1,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/content/drive/My Drive/unzipped_folder/non_gl...
2,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/content/drive/My Drive/unzipped_folder/non_gl...
3,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/content/drive/My Drive/unzipped_folder/non_gl...
4,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0,/content/drive/My Drive/unzipped_folder/non_gl...
5,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0,/content/drive/My Drive/unzipped_folder/non_gl...
6,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0,/content/drive/My Drive/unzipped_folder/non_gl...
7,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0,/content/drive/My Drive/unzipped_folder/non_gl...
8,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0,/content/drive/My Drive/unzipped_folder/non_gl...
9,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0,/content/drive/My Drive/unzipped_folder/non_gl...


In [8]:
def custom_data_generator(df, batch_size, target_size=(224, 224), augment=False):
    """Yield an endless stream of random `(images, labels)` batches drawn from `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an 'image_path' column (one file path per image) and a
        'ground truth' column (one label per image). Rows whose 'image_path'
        is missing are ignored.
    batch_size : int
        Number of rows sampled for every batch.
    target_size : tuple of int, default (224, 224)
        Passed to `load_img`, so every image is resized while loading.
    augment : bool, default False
        When truthy, the `ImageDataGenerator` is configured with
        `horizontal_flip=True`, `rotation_range=30` and `zoom_range=0.2`.
        Pixel values are rescaled by 1/255 in either case.

    Yields
    ------
    tuple
        The `(images, labels)` pair produced by `ImageDataGenerator.flow`
        for the sampled batch.

    Raises
    ------
    ValueError
        On first iteration, if `df` has no row with a usable image path.
    """
    datagen = ImageDataGenerator(
        rescale=1.0 / 255,
        horizontal_flip=bool(augment),
        rotation_range=30 if augment else 0,
        zoom_range=0.2 if augment else 0.0,
    )

    # Rows whose file could not be located carry a missing path; `load_img`
    # would fail on them, so they are excluded up front. The filtered frame is
    # computed once, when the generator is first advanced.
    usable = df[df['image_path'].notna()]
    if usable.empty:
        raise ValueError("custom_data_generator: no rows with a valid 'image_path'")

    # Sampling without replacement needs at least `batch_size` rows; only fall
    # back to sampling with replacement when the frame is too small.
    with_replacement = len(usable) < batch_size

    while True:
        batch = usable.sample(n=batch_size, replace=with_replacement)

        images = np.array([
            img_to_array(load_img(path, target_size=target_size))
            for path in batch['image_path']
        ])
        labels = np.array(batch['ground truth'].tolist())

        # A one-shot flow over exactly this batch applies the rescale and any
        # configured augmentation, keeping images and labels paired.
        yield next(datagen.flow(images, labels, batch_size=batch_size))


In [9]:
# Parameters
batch_size = 32
target_size = (224, 224)

# Build a generator over the full label table with augmentation switched on.
train_generator = custom_data_generator(df, batch_size, target_size, augment=True)

# Testing the generator: pull a single batch and inspect its shape and labels.
# NOTE(review): `labels` is a generic module-level name; later cells that assign
# `labels` will overwrite this batch.
images, labels = next(train_generator)
print(f"Batch image shape: {images.shape}")  # (batch_size, 224, 224, 3)
print(f"Batch labels: {labels}")


Batch image shape: (32, 224, 224, 3)
Batch labels: ['0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '1' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']


In [11]:

# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import torch

# Check if GPU is available; fall back to the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define feature extraction functions
def extract_morphological_features(image):
    """Measure the shape of the largest dark region in a grayscale image.

    The image is inverse-thresholded at 128 (pixels at or below 128 become
    foreground), external contours are traced, and the contour with the
    greatest area is described.

    Returns
    -------
    list
        `[area, perimeter, aspect_ratio, extent]` of the largest contour, where
        `aspect_ratio` is bounding-box width / height and `extent` is contour
        area / bounding-box area. `[0, 0, 0, 0]` when no contour is found.
    """
    _, mask = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY_INV)
    found, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Guard clause: without any foreground contour there is nothing to measure.
    if not found:
        return [0, 0, 0, 0]

    biggest = max(found, key=cv2.contourArea)
    _, _, box_w, box_h = cv2.boundingRect(biggest)
    region_area = cv2.contourArea(biggest)
    region_perimeter = cv2.arcLength(biggest, True)  # True -> closed contour

    return [
        region_area,
        region_perimeter,
        float(box_w) / box_h,
        float(region_area) / (box_w * box_h),
    ]

def extract_textural_features(image):
    """Compute LBP and GLCM texture descriptors for a grayscale image.

    Parameters
    ----------
    image : torch.Tensor or array-like
        Grayscale image. Anything exposing `.cpu()` is treated as a tensor and
        copied to host memory once; any other input is converted with
        `np.asarray`.
        NOTE(review): the GLCM is built with `levels=256`, so integer pixel
        values below 256 are assumed — confirm against callers.

    Returns
    -------
    list
        `[lbp_mean, lbp_variance, contrast, dissimilarity, homogeneity,
        energy, correlation]`.
    """
    radius = 3
    n_points = 8 * radius

    # Convert to a NumPy array a single time; both feature extractors below
    # operate on host memory, so repeating the device-to-host copy is wasted work.
    pixels = image.cpu().numpy() if hasattr(image, 'cpu') else np.asarray(image)

    # Local Binary Pattern map ("uniform" variant).
    lbp = local_binary_pattern(pixels, n_points, radius, method='uniform')

    # Gray Level Co-occurrence Matrix for horizontally adjacent pixels
    # (distance 1, angle 0), symmetric and normalised.
    glcm = graycomatrix(pixels, distances=[1], angles=[0], levels=256, symmetric=True, normed=True)

    # With one distance and one angle each property is a 1x1 array; take its scalar.
    glcm_stats = [
        graycoprops(glcm, prop)[0][0]
        for prop in ('contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation')
    ]

    return [np.mean(lbp), np.var(lbp)] + glcm_stats

def _resolve_image_path(name):
    """Return the path of `name` in the first dataset folder that holds it, else None."""
    for folder in (folder1_path, folder2_path):
        candidate = os.path.join(folder, name)
        if os.path.exists(candidate):
            return candidate
    return None


def process_images_in_batches(csv_path, batch_size=32):
    """Extract morphological and textural features for every image listed in a CSV.

    Parameters
    ----------
    csv_path : str
        CSV file with a 'name' column (image file names) and a 'ground truth'
        column (labels). File names are looked up in the module-level
        `folder1_path` and `folder2_path` directories, in that order.
    batch_size : int, default 32
        Number of rows per progress-reporting batch. Batching only affects the
        progress messages, not the computed features.

    Returns
    -------
    tuple of numpy.ndarray
        `(features, labels)`. Each feature row is the 4 values from
        `extract_morphological_features` followed by the 7 values from
        `extract_textural_features`. Images that cannot be found or decoded
        are skipped with a warning, so the result may have fewer rows than the CSV.
    """
    data_df = pd.read_csv(csv_path)
    data_df['image_path'] = data_df['name'].apply(_resolve_image_path)

    features_list = []
    labels_list = []

    num_images = len(data_df)
    num_batches = (num_images + batch_size - 1) // batch_size  # ceiling division

    for batch_number, batch_start in enumerate(range(0, num_images, batch_size), start=1):
        # Slicing past the end is safe with iloc, so the last batch is simply shorter.
        batch_data = data_df.iloc[batch_start:batch_start + batch_size]

        print(f"Processing batch {batch_number}/{num_batches}...")

        for _, row in batch_data.iterrows():
            image_path = row['image_path']
            label = row['ground truth']

            # An unresolved file name has no path; it must be skipped before
            # cv2.imread, which only accepts a string path.
            if not isinstance(image_path, str):
                print(f"Warning: No image file found for {row['name']}. Skipping...")
                continue

            # Read image in grayscale; imread returns None when decoding fails.
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
            if image is None:
                print(f"Warning: Unable to load image at {image_path}. Skipping...")
                continue

            # extract_textural_features takes a tensor and converts it back to
            # NumPy itself, so the tensor only serves that interface.
            image_tensor = torch.tensor(image).to(device)

            morph_features = extract_morphological_features(image)
            textural_features = extract_textural_features(image_tensor)

            features_list.append(morph_features + textural_features)
            labels_list.append(label)

        print(f"Batch {batch_number} completed.")

    return np.array(features_list), np.array(labels_list)

# Example usage
csv_path = '/content/drive/My Drive/unzipped_folder/public.csv'

# Extract features for every image listed in the CSV. The feature computations
# themselves run with OpenCV / scikit-image on NumPy arrays, i.e. on the CPU.
# NOTE: this rebinds the module-level name `labels`.
features, labels = process_images_in_batches(csv_path)

# Save the full feature matrix and label vector for later use (optional)
np.save('features.npy', features)
np.save('labels.npy', labels)

# Split data into train/test sets (optional). The classes are imbalanced, so
# the split is stratified to keep the same class ratio in both parts.
# `train_test_split` is already imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Save the individual train/test splits for later use (optional)
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)
np.save('X_test.npy', X_test)
np.save('y_test.npy', y_test)

Using device: cpu
Processing batch 1/180...
Batch 1 completed.
Processing batch 2/180...
Batch 2 completed.
Processing batch 3/180...
Batch 3 completed.
Processing batch 4/180...
Batch 4 completed.
Processing batch 5/180...
Batch 5 completed.
Processing batch 6/180...
Batch 6 completed.
Processing batch 7/180...
Batch 7 completed.
Processing batch 8/180...
Batch 8 completed.
Processing batch 9/180...
Batch 9 completed.
Processing batch 10/180...
Batch 10 completed.
Processing batch 11/180...
Batch 11 completed.
Processing batch 12/180...
Batch 12 completed.
Processing batch 13/180...
Batch 13 completed.
Processing batch 14/180...
Batch 14 completed.
Processing batch 15/180...
Batch 15 completed.
Processing batch 16/180...
Batch 16 completed.
Processing batch 17/180...
Batch 17 completed.
Processing batch 18/180...
Batch 18 completed.
Processing batch 19/180...
Batch 19 completed.
Processing batch 20/180...
Batch 20 completed.
Processing batch 21/180...
Batch 21 completed.
Processing ba

In [12]:
from google.colab import files

# Hand the full feature matrix and the label vector to the browser, in that order.
for _npy_file in ('features.npy', 'labels.npy'):
    files.download(_npy_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# Download the four train/test split arrays.
# Relies on `files` (google.colab) having been imported by an earlier cell.
for _npy_file in ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy'):
    files.download(_npy_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>