## Filter bad images

In [None]:
import pandas as pd
import numpy as np
import cv2
import torch
from torchvision.transforms import functional as F
from PIL import Image
import timm
import torch
import os
import joblib

  from .autonotebook import tqdm as notebook_tqdm


Dataset URL - https://www.kaggle.com/datasets/ascanipek/eyepacs-aptos-messidor-diabetic-retinopathy

## Create the Excel index from the dataset's folder structure

In [None]:
# Build a one-row-per-image index (file name, DR grade, split) from the
# on-disk layout <base_path>/<split>/<label>/<image> and save it to Excel.

# Define base path
base_path = "./data/dr_unified_v2"

# Folders to loop through
splits = ["train", "val", "test"]

# File extensions treated as images (matched case-insensitively)
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.gif')

# Prepare list to collect data
data = []

# Traverse through each split
for split in splits:
    split_path = os.path.join(base_path, split)
    # A missing split is skipped; isdir also guards against a stray file
    # with the split's name, which os.listdir could not enumerate
    if not os.path.isdir(split_path):
        continue

    # Loop through label directories (0, 1, 2, 3, 4); sorted so that the row
    # order is reproducible regardless of the file system's listing order
    for label in sorted(os.listdir(split_path)):
        label_path = os.path.join(split_path, label)
        if not os.path.isdir(label_path):
            continue

        # Folders whose name is not an integer grade (e.g. ".ipynb_checkpoints")
        # are ignored instead of aborting the whole scan with a ValueError
        try:
            grade = int(label)
        except ValueError:
            continue

        # Loop through images in each label directory
        for img_name in sorted(os.listdir(label_path)):
            if img_name.lower().endswith(image_extensions):
                data.append({
                    "image_name": img_name,
                    "label": grade,
                    "split": split
                })

# Create DataFrame; naming the columns keeps the header intact even when no
# image was found
df = pd.DataFrame(data, columns=["image_name", "label", "split"])

# Save to Excel (written to the current working directory)
df.to_excel("dr_unified_v2.xlsx", index=False)

print(f"Excel file created with {len(df)} entries.")


Excel file created with 92501 entries.


## Create the image-quality Excel file

In [None]:
# Reading the image excel and renaming the columns.
# The index is saved by to_excel("dr_unified_v2.xlsx") into the working
# directory, so it is read back from that same location; nothing in this
# notebook places a copy under "./data/".
df = pd.read_excel("dr_unified_v2.xlsx")
# Positional rename: (image_name, label, split) -> names used by the later cells
df.columns = ['Img_Path', 'Retinopathy_grade', 'Split']

In [None]:
# Function to load the images from the image path present in the excel
def load_image(image_path, label, split, base_dir="./data/dr_unified_v2"):
    """Read one dataset image from disk with OpenCV.

    The file is looked up at ``<base_dir>/<split>/<label>/<image_path>``.

    Args:
        image_path: File name of the image inside its label folder.
        label: Name of the label folder (the retinopathy grade); formatted
            into the path as-is.
        split: Name of the split folder.
        base_dir: Root folder of the dataset. Defaults to the location used
            by this notebook, so existing three-argument calls are unchanged.

    Returns:
        The value returned by ``cv2.imread`` for the file.

    Raises:
        FileNotFoundError: If nothing exists at the resolved path.
        ValueError: If the path exists but ``cv2.imread`` returns ``None``
            (it could not decode the file).
    """
    full_path = f"{base_dir}/{split}/{label}/{image_path}"
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"Image file not found: {full_path}")
    img = cv2.imread(full_path)
    # imread signals failure by returning None instead of raising
    if img is None:
        raise ValueError(f"Failed to read image: {full_path}")
    return img

In [None]:
# Using the DenseNet model to classify the image in Good, Usable or Bad Quality

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained DenseNet-121 used as a feature extractor (num_classes=0 asks timm
# for the model without a classification layer).
model = timm.create_model('densenet121.tv_in1k', pretrained=True, num_classes=0)
model = model.eval().to(device)

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load a classifier file obtained from a trusted source. A pickled
# scikit-learn estimator may also warn or misbehave when loaded under a
# different scikit-learn version than the one that saved it.
clf = joblib.load('./quickqual_dn121_512.pkl')

def getQual(img_path, label, split):
    """Classify the quality of one dataset image as "Good", "Usable" or "Bad".

    The image is loaded with ``load_image``, converted from BGR to RGB, resized
    with ``F.resize(..., 512)``, normalised with mean 0.5 and std 0.5 per
    channel, passed through ``model`` to obtain features, and those features
    are scored by ``clf``.

    Args:
        img_path: File name of the image, forwarded to ``load_image``.
        label: Label folder of the image, forwarded to ``load_image``.
        split: Split folder of the image, forwarded to ``load_image``.

    Returns:
        The name of the class with the highest predicted probability.

    Raises:
        FileNotFoundError, ValueError: Propagated from ``load_image``.
    """
    bgr = load_image(img_path, label, split)
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(rgb)

    tensor = F.to_tensor(F.resize(pil_img, 512))
    # Add the batch dimension and move the input to the same device as the model
    batch = F.normalize(tensor, [0.5] * 3, [0.5] * 3).to(device).unsqueeze(0)

    with torch.no_grad():
        # Hand scikit-learn an explicit (1, n_features) NumPy array
        features = model(batch).squeeze().cpu().numpy().reshape(1, -1)

    pred = clf.predict_proba(features)  # order of class probabilities: Good, Usable, Bad
    classes = ("Good", "Usable", "Bad")
    return classes[int(np.argmax(pred[0]))]

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Grade every image: pass each row's file name, retinopathy grade and split to
# getQual and store the returned class name in a new "Quality" column
def _row_quality(row):
    """Return getQual's verdict for the image described by one DataFrame row."""
    return getQual(row["Img_Path"], row["Retinopathy_grade"], row['Split'])

df["Quality"] = df.apply(_row_quality, axis=1)

In [10]:
# Tally how many images were assigned to each quality class
quality_counts = df["Quality"].value_counts()
quality_counts

Quality
Good      41299
Bad       37512
Usable    13690
Name: count, dtype: int64

In [11]:
# Save the full table, including the Quality column, without the row index
quality_xlsx = "dr_unified_trainLabel_Quality.xlsx"
df.to_excel(quality_xlsx, index=False)

In [12]:
# Keep only the rows whose quality is "Good" or "Usable"
keep_mask = df["Quality"].isin(["Good", "Usable"])
filtered_df = df[keep_mask]

In [13]:
# Write the filtered subset to its own workbook, without the row index
filtered_xlsx = "dr_unified_dataset_Good_Useable.xlsx"
filtered_df.to_excel(filtered_xlsx, index=False)