In [1]:
import os
import sys
import torch
import torch.nn as nn
import pandas as pd
from PIL import Image
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import matplotlib.pyplot as plt
import cv2
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import seaborn as sns
from skimage.feature import local_binary_pattern
from skimage.feature import graycomatrix, graycoprops
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score
import numpy as np
from itertools import chain
import yaml

In [2]:
import yaml

# Load config.yaml
with open("./config.yaml", "r") as file:
    config = yaml.safe_load(file)

**Data preprocessing and visualization**

In [3]:
excel_path = config['csv_path']
df = pd.read_csv(excel_path)
print(df.head())

                                                name  ground truth
0  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
1  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
2  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
3  S-2006-005094_PAS_1of2_64552732435c92704a3d37c...             0
4  S-2006-005094_PAS_1of2_64552732435c92704a3d37d...             0


In [4]:
#path to both the folders
folder1_path = config['globally_sclerotic_folder']
folder2_path = config['non_globally_sclerotic_folder']
img_paths = [folder1_path, folder2_path]

#adding a new column to the dataframe that contains the path to the image and it's name
df['image_path'] = df['name'].apply(
        lambda x: os.path.join(folder1_path, x) if os.path.exists(os.path.join(folder1_path, x))
        else os.path.join(folder2_path, x) if os.path.exists(os.path.join(folder2_path, x))
        else None
    )


**Custom data generator for feature extraction process**

In [5]:
#custom data generator to prepare the image data for batches for feature extraction process, using ImageDataGenerator
def custom_data_generator(df, batch_size, target_size=(224, 224), augment=False):
    datagen = ImageDataGenerator(
        rescale=1.0 / 255,
        horizontal_flip=True if augment else False,
        rotation_range=30 if augment else 0,
        zoom_range=0.2 if augment else 0.0
    )

    while True:
        batch_data = df.sample(n=batch_size)
        images = []
        labels = []

        #iterating throught the batch
        for _, row in batch_data.iterrows():
            img = load_img(row['image_path'], target_size=target_size)
            img_array = img_to_array(img)
            images.append(img_array)
            labels.append(row['ground truth'])

        images = np.array(images)
        labels = np.array(labels)

        # Yield augmented images and their corresponding labels
        yield datagen.flow(images, labels, batch_size=batch_size).__next__()


In [6]:
# Parameters
batch_size = 32
target_size = (224, 224)

train_generator = custom_data_generator(df, batch_size, target_size, augment=True)

# Testing the generator
images, labels = next(train_generator)
print(f"Batch image shape: {images.shape}")  # (batch_size, 224, 224, 3)
print(f"Batch labels: {labels}")


Batch image shape: (32, 224, 224, 3)
Batch labels: [0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0]


**Defining the process of extraction of features from inages**

In [7]:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# feature extraction functions
def extract_morphological_features(image):
    _, binary_image = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        largest_contour = max(contours, key=cv2.contourArea)
        area = cv2.contourArea(largest_contour)
        perimeter = cv2.arcLength(largest_contour, True)
        x, y, w, h = cv2.boundingRect(largest_contour)
        aspect_ratio = float(w) / h
        rect_area = w * h
        extent = float(area) / rect_area
        return [area, perimeter, aspect_ratio, extent]
    else:
        return [0, 0, 0, 0]

def extract_textural_features(image):
    radius = 3
    n_points = 8 * radius

    # Compute LBP (Local Binary Pattern)
    lbp = local_binary_pattern(image.cpu().numpy(), n_points, radius, method='uniform')

    # Compute Haralick features using GLCM (Gray Level Co-occurrence Matrix)
    glcm = graycomatrix(image.cpu().numpy(), distances=[1], angles=[0], levels=256, symmetric=True, normed=True)

    contrast = graycoprops(glcm, 'contrast')[0][0]
    dissimilarity = graycoprops(glcm, 'dissimilarity')[0][0]
    homogeneity = graycoprops(glcm, 'homogeneity')[0][0]
    energy = graycoprops(glcm, 'energy')[0][0]
    correlation = graycoprops(glcm, 'correlation')[0][0]

    return [np.mean(lbp), np.var(lbp), contrast, dissimilarity, homogeneity, energy, correlation]

def process_images_in_batches(csv_path, batch_size=32):
    # Load CSV file
    data_df = pd.read_csv(csv_path)

    data_df['image_path'] = data_df['name'].apply(
        lambda x: os.path.join(folder1_path, x) if os.path.exists(os.path.join(folder1_path, x))
        else os.path.join(folder2_path, x) if os.path.exists(os.path.join(folder2_path, x))
        else None
    )
    # Initialize lists to store features and labels
    features_list = []
    labels_list = []

    # Split data into batches
    num_images = len(data_df)
    for batch_start in range(0, num_images, batch_size):
        batch_end = min(batch_start + batch_size, num_images)
        batch_data = data_df.iloc[batch_start:batch_end]

        print(f"Processing batch {batch_start // batch_size + 1}/{(num_images + batch_size - 1) // batch_size}...")

        for _, row in batch_data.iterrows():
            image_path = row['image_path']  #column contains full paths to images
            label = row['ground truth']  #column contains the class label

            # Read image in grayscale
            image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

            # Check if the image was loaded successfully
            if image is None:
                print(f"Warning: Unable to load image at {image_path}. Skipping...")
                continue

            # Move image to GPU as a tensor
            image_tensor = torch.tensor(image).to(device)

            # Extract features (morphological and textural)
            morph_features = extract_morphological_features(image)
            textural_features = extract_textural_features(image_tensor)

            # Combine features and append to list
            features_list.append(morph_features + textural_features)
            labels_list.append(label)

        print(f"Batch {batch_start // batch_size + 1} completed.")

    return np.array(features_list), np.array(labels_list)

csv_path = config['csv_path']

# Process images in batches and extract features
features, labels = process_images_in_batches(csv_path)

# Save extracted features for later use
np.save('features.npy', features)
np.save('labels.npy', labels)




Using device: cpu
Processing batch 1/180...
Batch 1 completed.
Processing batch 2/180...
Batch 2 completed.
Processing batch 3/180...
Batch 3 completed.
Processing batch 4/180...
Batch 4 completed.
Processing batch 5/180...
Batch 5 completed.
Processing batch 6/180...
Batch 6 completed.
Processing batch 7/180...
Batch 7 completed.
Processing batch 8/180...
Batch 8 completed.
Processing batch 9/180...
Batch 9 completed.
Processing batch 10/180...
Batch 10 completed.
Processing batch 11/180...
Batch 11 completed.
Processing batch 12/180...
Batch 12 completed.
Processing batch 13/180...
Batch 13 completed.
Processing batch 14/180...
Batch 14 completed.
Processing batch 15/180...
Batch 15 completed.
Processing batch 16/180...
Batch 16 completed.
Processing batch 17/180...
Batch 17 completed.
Processing batch 18/180...
Batch 18 completed.
Processing batch 19/180...
Batch 19 completed.
Processing batch 20/180...
Batch 20 completed.
Processing batch 21/180...
Batch 21 completed.
Processing ba

**Data processing on the extracted features**

In [8]:
df_array = pd.DataFrame(features, columns=[f'f{i}' for i in range(1,12)])   #converting the features.npy file into a dataframe
df = pd.concat([df, df_array], ignore_index=True,axis=1)                    #concatenating the df_array and df dataframes

print(df.shape)


(5758, 14)


In [9]:
#renaming the columns for better interpretability
df.rename(columns={0: 'name', 1: 'ground truth',2:'image_path'}, inplace=True)
df.rename(columns={3: 'f1', 4: 'f2',5:'f3',6:'f4',7:'f5',8:'f6',9:'f7',10:'f8',11:'f9',12:'f10',13:'f11'}, inplace=True)

#normalizing some of the columns
scaler = MinMaxScaler()
df[['f1', 'f2','f5','f6','f7']] = scaler.fit_transform(df[['f1', 'f2','f5','f6','f7']])
df.head()


Unnamed: 0,name,ground truth,image_path,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11
0,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/kaggle/input/glomeruloscelorosis/non_globally...,0.135396,0.17214,0.613532,0.460757,0.54081,0.67909,0.306279,7.813146,0.306785,0.07238,0.980402
1,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/kaggle/input/glomeruloscelorosis/non_globally...,0.106163,0.262284,0.52687,0.355202,0.375573,0.677076,0.302721,9.25342,0.155894,0.016113,0.967392
2,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/kaggle/input/glomeruloscelorosis/non_globally...,0.160521,0.065619,1.155963,0.742209,0.371169,0.65572,0.197254,7.287852,0.222909,0.036135,0.982127
3,S-2006-005094_PAS_1of2_64552732435c92704a3d37c...,0,/kaggle/input/glomeruloscelorosis/non_globally...,0.02714,0.088588,0.787671,0.285193,0.372979,0.657166,0.320763,9.17204,0.17585,0.021963,0.970531
4,S-2006-005094_PAS_1of2_64552732435c92704a3d37d...,0,/kaggle/input/glomeruloscelorosis/non_globally...,0.093153,0.319723,0.902439,0.243723,0.396571,0.670934,0.291144,8.801991,0.179107,0.022915,0.970858


In [10]:
nan_rows = df[df.isna().any(axis=1)]

# Display rows containing NaN values
print(nan_rows)

Empty DataFrame
Columns: [name, ground truth, image_path, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11]
Index: []


**Defining a custom data generator for loading both types of data**

In [11]:
# Custom Dataset Class
class CustomDataset(Dataset):
    def __init__(self, df, target_size=(224,224), augment=False):
        self.df=df
        self.target_size=target_size
        
        self.label_counts = self.df['ground truth'].value_counts().to_dict()
        self.label_glomeruli = self.label_counts.get(1, 0)  # Count of 1 instances
        self.label_non_glomeruli = self.label_counts.get(0, 0)

        #transformations
        self.transform=transforms.Compose([
            transforms.Resize(self.target_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229,0.224,0.225])
        ])
        
        #implementing data augmentation to increase the number of samples
        if augment==False:
            self.augment_transform= transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(30),
                self.random_zoom(),
                self.transform
            ])

    def random_zoom(self):
        return transforms.RandomAffine(
            degrees=0,           # No rotation
            translate=None,      # No translation
            scale=(0.8, 1.2)     # Random scaling (zoom in or out)
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row=self.df.iloc[idx]

        # Load image
        img_name= row['name']
        img=Image.open(row['image_path'])

        # Convert RGBA to RGB if necessary
        if img.mode=='RGBA':
            img=img.convert('RGB')
            
        # Apply augmentations if specified
        img=self.augment_transform(img)

        # Extract textual features
        feature_values = row[['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11']].values
        feature_values = np.array(feature_values, dtype=np.float32)
        
        # Extract label
        label=float(row['ground truth'])
      
        return img, feature_values, label, self.label_glomeruli, self.label_non_glomeruli, img_name

test_dataset = CustomDataset(df, target_size=(224, 224), augment=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

#loop over DataLoader
for images, features, labels,count_glomeruli, count_non_glomeruli, img_name in test_loader:
    print(f"Batch images shape: {images.shape}")
    print(f"Batch features shape: {features.shape}")
    print(f"Batch labels shape: {labels.shape}")
    print(count_glomeruli.dtype)
    break  

Batch images shape: torch.Size([32, 3, 224, 224])
Batch features shape: torch.Size([32, 11])
Batch labels shape: torch.Size([32])
torch.int64


In [12]:
#defining the model that will take both the images and features as input, by extending the nn.Module class
class MultiInputModel(nn.Module):
    def __init__(self, image_input_shape, feature_input_shape):
        super(MultiInputModel, self).__init__()

        # Image input branch 
        self.image_fc=nn.Sequential(
            nn.Flatten(),
        )

        # Feature input branch 
        self.feature_fc=nn.Sequential(
            nn.Linear(feature_input_shape, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

        # Combine both branches
        self.combined_fc=nn.Sequential(
            nn.Linear(224*224*3+64, 64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Output layer (for binary classification)
        self.output_fc=nn.Sequential(
            nn.Linear(64,2),
            nn.Sigmoid()
        )

    def forward(self, image_input, feature_input):
        x = self.image_fc(image_input)  # Process the image input
        y = self.feature_fc(feature_input)  # Process the feature input
        
        # Concatenate both branches
        combined = torch.cat((x, y), dim=1)
        
        # Forward pass through the combined layers
        z = self.combined_fc(combined)
        
        # Output prediction
        output = self.output_fc(z)
        
        return output


image_input_shape=(224, 224, 3)

criterion = nn.CrossEntropyLoss()

feature_input_shape=11

model=MultiInputModel(image_input_shape, feature_input_shape)
print(model)

MultiInputModel(
  (image_fc): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
  )
  (feature_fc): Sequential(
    (0): Linear(in_features=11, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (combined_fc): Sequential(
    (0): Linear(in_features=150592, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (output_fc): Sequential(
    (0): Linear(in_features=64, out_features=2, bias=True)
    (1): Sigmoid()
  )
)


In [13]:
model_path=config['model_path']       #loading the model parameters
model.load_state_dict(torch.load(model_path,map_location=torch.device('cpu')))
model.eval()  

  model.load_state_dict(torch.load(model_path,map_location=torch.device('cpu')))


MultiInputModel(
  (image_fc): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
  )
  (feature_fc): Sequential(
    (0): Linear(in_features=11, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (combined_fc): Sequential(
    (0): Linear(in_features=150592, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
  )
  (output_fc): Sequential(
    (0): Linear(in_features=64, out_features=2, bias=True)
    (1): Sigmoid()
  )
)

**Generating the CSV**

In [16]:
# Test evaluation
import tqdm
def calculate_metrics(y_true, y_pred):
    # Convert probabilities to binary predictions (class 0 or 1)
    y_pred_binary = torch.argmax(y_pred, dim=1).cpu().detach().numpy()  # Get the class with highest probability
    
    # Convert y_true to numpy
    y_true = y_true.cpu().numpy()
    
    # Calculate precision and recall
    precision = precision_score(y_true, y_pred_binary)
    recall = recall_score(y_true, y_pred_binary)
    accuracy=accuracy_score(y_true,y_pred_binary)
    
    return accuracy,precision, recall

test_loss = 0
all_labels = []
all_preds = []
img_names=[]

with torch.no_grad():
    for images, features, labels, count_glomeruli, count_non_glomeruli, img_name in tqdm.tqdm(test_loader, desc="Test Evaluation", leave=False):
        images, features, labels = images.to(device).float(), features.to(device).float(), labels.to(device).long()

        outputs = model(images, features)
        loss = criterion(outputs, labels)
        test_loss += loss.item()

        all_labels.append(labels)
        all_preds.append(outputs)
        img_names.append(img_name)
        
# Compute test loss and metrics
test_loss /= len(test_loader)
all_labels = torch.cat(all_labels)
all_preds = torch.cat(all_preds)
test_accuracy, test_precision, test_recall = calculate_metrics(all_labels, all_preds)



# Print test results
print(f"Test Loss: {test_loss:.3f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")


                                                                  

Test Loss: 0.647
Test Accuracy: 0.8227
Test Precision: 0.5192
Test Recall: 0.4231




In [17]:
binary_preds = torch.argmax(all_preds, dim=1).cpu().numpy()
img_names=list(chain.from_iterable(img_names))

# Create a DataFrame for the CSV file
results_df = pd.DataFrame({
    'Image Name': img_names,
    'Prediction': binary_preds
})

# Save the DataFrame to a CSV file
results_df.to_csv("evaluation.csv", index=False)