In [1]:
import pandas as pd
import numpy as np
nih_dataset = '../data/nih-dataset'
cleared_dataset = '../cleared-data/nih-dataset'

In [2]:
# Patients Age Clearing
# Reload the dataset
data_entry_file = f'{nih_dataset}/Data_Entry_2017.csv'
data_frame = pd.read_csv(data_entry_file, delimiter=',', nrows=None)

total_count = len(data_frame)

# Group by Patient ID and get the first age for each patient
unique_patients = data_frame.groupby('Patient ID')['Patient Age'].first()

# Calculate mean and standard deviation of ages
mean_unique_age = unique_patients.mean()
std_dev_unique_age = unique_patients.std()

# Define outlier thresholds - age
outlier_threshold_upper_std = round(mean_unique_age + 3 * std_dev_unique_age, 4)
outlier_threshold_lower_std = 0 # Can't be younger than 0 years 

# Patients in age between [0 and 3σ]
filtered_unique_patients = unique_patients[
    (unique_patients >= outlier_threshold_lower_std) & 
    (unique_patients <= outlier_threshold_upper_std)
]

filtered_unique_patients = filtered_unique_patients.reset_index()
valid_patient_ids = filtered_unique_patients['Patient ID'].tolist()
filtered_data = data_frame[data_frame['Patient ID'].isin(valid_patient_ids)]

In [3]:
# Undersampling Clearing
# Reload the dataset
total_size = len(filtered_data)

dominant_class = "No Finding"
dominant_class_data = filtered_data[filtered_data['Finding Labels'] == dominant_class]
minority_classes_data = filtered_data[filtered_data['Finding Labels'] != dominant_class]

assert total_size == len(dominant_class_data) + len(minority_classes_data)

undersample_rate = round(len(minority_classes_data) / len(dominant_class_data), 2)
undersample_size = int(len(dominant_class_data) * undersample_rate)

balanced_dominant_class = dominant_class_data.sample(n=undersample_size, random_state=42)
balanced_data = pd.concat([balanced_dominant_class, minority_classes_data], ignore_index=True)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# Store in cleated Dataset
cleared_data_enetry = f'{cleared_dataset}/Data_Entry_2017.csv'
balanced_data.to_csv(cleared_data_enetry, index=False)

In [5]:
# Prepare Bounding Box Data
data_entry_file = f'{nih_dataset}/BBox_List_2017.csv'
bbox_data = pd.read_csv(data_entry_file, delimiter=',', nrows=None)
renamed_bbox_data = bbox_data.dropna(axis=1, how='all').copy()
renamed_bbox_data.rename(columns={"Bbox [x": "x", "h]": "h"}, inplace=True)

# Calculage widh & heigh data bounds
mean_w = renamed_bbox_data['w'].mean()
std_w = renamed_bbox_data['w'].std()
mean_h = renamed_bbox_data['h'].mean()
std_h = renamed_bbox_data['h'].std()
w_outlier_threshold = round(mean_w + 3 * std_w, 2)
h_outlier_threshold = round(mean_h + 3 * std_h, 2)

print(f"W: {w_outlier_threshold}, H: {h_outlier_threshold}")

filtered_bbox_data = renamed_bbox_data[
    (renamed_bbox_data['w'] <= w_outlier_threshold) & 
    (renamed_bbox_data['h'] <= h_outlier_threshold)
]

print(f"Original dataset size: {len(renamed_bbox_data)}")
print(f"Filtered dataset size: {len(filtered_bbox_data)}")
print(f"Data lose: {len(renamed_bbox_data) - len(filtered_bbox_data)}")
print(f"Data lose ration: {round(100 - (len(filtered_bbox_data) / len(renamed_bbox_data)) * 100,2)} ")

# Store in cleated Dataset
cleared_data_enetry = f'{cleared_dataset}/BBox_List_2017.csv'
filtered_bbox_data.to_csv(cleared_data_enetry, index=False)

W: 759.22, H: 730.63
Original dataset size: 985
Filtered dataset size: 973
Data lose: 12
Data lose ration: 1.22 


In [6]:
# Images Data Clearing
from PIL import Image
import re
import os
import cv2
from tqdm import tqdm

pattern = re.compile(r'images_\d{3}')
matching_dirs = [
    os.path.join(nih_dataset, d)
    for d in os.listdir(nih_dataset)
    if os.path.isdir(os.path.join(nih_dataset, d)) and pattern.match(d)
]

image_files = set()
for directory in matching_dirs:
    nested_dir = f"{directory}/images"
    image_files_in_dir = os.listdir(nested_dir)
    for image in image_files_in_dir:
        image_path = os.path.join(nested_dir, image)
        image_files.add(image_path)

images_df = pd.DataFrame(list(image_files))
images_df.columns = ['image_path']
images_df['intensity'] = None
images_df['ssim'] = None
print(images_df.head())

                                          image_path intensity  ssim
0  ../data/nih-dataset/images_011/images/00026083...      None  None
1  ../data/nih-dataset/images_002/images/00003274...      None  None
2  ../data/nih-dataset/images_005/images/00009320...      None  None
3  ../data/nih-dataset/images_010/images/00022931...      None  None
4  ../data/nih-dataset/images_002/images/00003535...      None  None


In [7]:
# Filter by intensity

def calculate_pixel_intensity_stats(image_file):
    with Image.open(image_file) as img:
        img = img.convert('L')
        img_array = np.array(img, dtype=np.float32)  # Convert to NumPy array
        return img_array.mean()
            
intensities = []
for index, row in tqdm(images_df.iterrows(), total=len(images_df)):
    try:
        image_path = row['image_path']
        images_df.at[index, 'intensity'] = calculate_pixel_intensity_stats(image_path)
    except Exception as e:
        tqdm.write(f"Error processing {image}: {e}")
        continue

mean_intensity = np.mean(images_df['intensity'])
std_intensity = np.std(images_df['intensity'])

mean_intensity_upper_outlier = round(mean_intensity + 3 * std_intensity, 4)
mean_intensity_lower_outlier = round(mean_intensity - 3 * std_intensity, 4)

print("Upper Outlier Threshold:", mean_intensity_upper_outlier)
print("Lower Outlier Threshold:", mean_intensity_lower_outlier)

  1%|█▏                                                                                                                                  | 1059/112120 [00:07<13:56, 132.73it/s]


KeyboardInterrupt: 

In [None]:
filtered_images_df = images_df[
    (images_df['intensity'] >= mean_intensity_lower_outlier) &
    (images_df['intensity'] <= mean_intensity_upper_outlier)
]

print(f"Final dataset size before removing outliers: {len(images_df)}")
print(f"Final dataset size after removing outliers: {len(filtered_images_df)}")
print(f"Dataset size diff: {len(images_df) - len(filtered_images_df)}")

In [None]:
from skimage.metrics import structural_similarity as ssim
median_intensity = np.median(filtered_images_df['intensity'])
closest_image_index = np.argmin([abs(intensity - median_intensity) for intensity in filtered_images_df['intensity']])

print(f"Median intensity: {median_intensity}")
print(f"Closest image index: {closest_image_index}")
print(f"Closest image intensity: {filtered_images_df.iloc[closest_image_index]['intensity']}")

reference_image_path = filtered_images_df.iloc[closest_image_index]['images_path']
reference_image = Image.open(reference_image_path).convert("L")
reference_image = np.array(reference_image)  # Konwersja do NumPy

for index, row in tqdm(filtered_images_df.iterrows(), total=len(filtered_images_df)):
    try:
        comparison_image = Image.open(row['images_path']).convert("L")
        comparison_image = np.array(comparison_image)

        if reference_image.shape != comparison_image.shape:
            comparison_image = Image.fromarray(comparison_image).resize(
                (reference_image.shape[1], reference_image.shape[0]), Image.LANCZOS
            )
            comparison_image = np.array(comparison_image)

        ssim_value, _ = ssim(reference_image, comparison_image, full=True)
        filtered_images_df.at[index, 'ssim'] = ssim_value

    except Exception as e:
        print(f"Błąd przetwarzania {image_path}: {e}")

print(filtered_images_df.head())

In [8]:
# Ensure SSIM is numeric
filtered_images_df['ssim'] = pd.to_numeric(filtered_images_df['ssim'], errors='coerce')

# Compute mean and standard deviation
mean_ssim = np.mean(filtered_images_df['ssim'])
std_ssim = np.std(filtered_images_df['ssim'])

# Define outlier thresholds
mean_ssim_upper_outlier = round(mean_ssim + 3 * std_ssim, 4)
mean_ssim_lower_outlier = round(mean_ssim - 3 * std_ssim, 4)

print("Upper Outlier Threshold:", mean_ssim_upper_outlier)
print("Lower Outlier Threshold:", mean_ssim_lower_outlier)

# Corrected filtering condition (inside the range, not outside)
simm_images_df = filtered_images_df[
    (filtered_images_df['ssim'] <= mean_ssim_upper_outlier) &  # Inside upper limit
    (filtered_images_df['ssim'] >= mean_ssim_lower_outlier)    # Inside lower limit
]

print(f"Final dataset size before removing outliers: {len(filtered_images_df)}")
print(f"Final dataset size after removing outliers: {len(simm_images_df)}")
print(f"Dataset size diff: {len(filtered_images_df) - len(simm_images_df)}")

Upper Outlier Threshold: 0.8236
Lower Outlier Threshold: 0.4617
Final dataset size before removing outliers: 111822
Final dataset size after removing outliers: 110999
Dataset size diff: 823


In [9]:
# Save cleared data copy
import shutil
import os

# Define source and destination folders
images_dataset = f"{cleared_dataset}/images"
os.makedirs(images_dataset, exist_ok=True)

for index, row in tqdm(simm_images_df.iterrows(), total=len(simm_images_df)):
    try:
        image_path = row['images_path']
        source_path = os.path.abspath(image_path)  # If paths are absolute in the DataFrame
        destination_path = os.path.join(images_dataset, os.path.basename(image_path))

        if os.path.exists(source_path):
            shutil.copy2(source_path, images_dataset)  # Preserve metadata
        else:
            tqdm.warning(f"Warning: File not found - {source_path}")
    except Exception as e:
        print(f"Error copying {image_path}: {e}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110999/110999 [01:59<00:00, 932.07it/s]
