In [1]:
# Local Imports
from src.utils.consts import RAW_DATASET, CLEARED_DATASET
from src.utils.calculations import standard_deviation_bounds

from src.data.image_utils import get_image_array
from src.data.image_utils import calculate_ssim

# Packages Imports
import re
import os
import cv2
import numpy as np
import pandas as pd

from PIL import Image
from tqdm import tqdm

ModuleNotFoundError: No module named 'src.consts'

In [None]:
# Patients Age Clearing
# Read the raw data-entry table from disk.
data_entry_path = f'{RAW_DATASET}/Data_Entry_2017.csv'
data_entry_df = pd.read_csv(data_entry_path, sep=',')
total_count = len(data_entry_df)

# One age per patient: the first 'Patient Age' value found for each 'Patient ID'.
unique_patients = data_entry_df.groupby('Patient ID')['Patient Age'].first()

# Statistics of the per-patient age distribution.
mean_unique_age = unique_patients.mean()
std_dev_unique_age = unique_patients.std()

# Accepted age range: 0 (an age cannot be negative) up to mean + 3 standard deviations.
outlier_threshold_lower_std = 0
outlier_threshold_upper_std = round(mean_unique_age + 3 * std_dev_unique_age, 4)

# Keep the patients whose age lies inside the accepted range (both ends inclusive).
age_not_below_min = unique_patients >= outlier_threshold_lower_std
age_not_above_max = unique_patients <= outlier_threshold_upper_std
filtered_unique_patients = unique_patients[age_not_below_min & age_not_above_max].reset_index()

# Keep every record that belongs to one of the retained patients.
valid_patient_ids = filtered_unique_patients['Patient ID'].tolist()
filtered_data = data_entry_df[data_entry_df['Patient ID'].isin(valid_patient_ids)]

# Summary of the filtering step (the first two counts are record counts).
records_before = len(data_entry_df)
records_after = len(filtered_data)
print(f"Total Patients: {records_before}")
print(f"Filtered Patients: {records_after}")
print(f"Total Unique Patients: {len(unique_patients)}")
print(f"Filtered Unique Parients: {len(filtered_unique_patients)}")
print(f"Lose Ration: {round(100 - (records_after / records_before) * 100, 2)}%")

print(f"Data Entry Diff: {records_before - records_after}")
print(f"Max Age: {outlier_threshold_upper_std}")

In [None]:
# Undersampling Clearing
# Reduce the number of "No Finding" records so that it roughly matches the
# number of records carrying any other label, then shuffle the result.
filtered_data_copy = filtered_data.copy()
total_size = len(filtered_data)

dominant_class = "No Finding"
# Only rows whose label is exactly "No Finding" count as the dominant class;
# every other label value (including combined labels) is treated as minority.
is_dominant = filtered_data_copy['Finding Labels'] == dominant_class
dominant_class_data = filtered_data_copy[is_dominant]
minority_classes_data = filtered_data_copy[~is_dominant]

assert total_size == len(dominant_class_data) + len(minority_classes_data)

dominant_count = len(dominant_class_data)
minority_count = len(minority_classes_data)

# Guard against an empty dominant class (would otherwise divide by zero).
undersample_rate = round(minority_count / dominant_count, 2) if dominant_count else 0.0
# Sampling is done without replacement, so the sample can never be larger than
# the dominant class itself; cap it for the case minority_count > dominant_count.
undersample_size = min(int(dominant_count * undersample_rate), dominant_count)

# Fixed random_state keeps both the sample and the shuffle reproducible.
balanced_dominant_class = dominant_class_data.sample(n=undersample_size, random_state=42)
balanced_data = pd.concat([balanced_dominant_class, minority_classes_data], ignore_index=True)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Total Patients: {len(filtered_data_copy)}")
print(f"Filtered Patients: {len(balanced_data)}")
print(f"Lose Ration: {round(100 - (len(balanced_data) / len(filtered_data_copy)) * 100, 2)}%")

print(f"Total Lose: {(len(data_entry_df) - len(balanced_data))}")
print(f"Total Lose Ration: {round(100 - (len(balanced_data) / len(data_entry_df)) * 100, 2)}%")

In [None]:
# Store in cleared Dataset
# Create the target directory first so the export does not fail when the
# cleared-dataset folder has not been created yet.
os.makedirs(CLEARED_DATASET, exist_ok=True)
cleared_data_enetry = f'{CLEARED_DATASET}/Data_Entry_2017.csv'
balanced_data.to_csv(cleared_data_enetry, index=False)

In [None]:
# Prepare Bounding Box Data
data_entry_file = f'{RAW_DATASET}/BBox_List_2017.csv'
bbox_data_df = pd.read_csv(data_entry_file, sep=',')

# Drop columns that contain no values at all and give the first and last
# bounding-box columns plain names.
renamed_bbox_data = (
    bbox_data_df
    .dropna(axis=1, how='all')
    .rename(columns={"Bbox [x": "x", "h]": "h"})
)

# Upper bounds for width and height; only the second value returned by
# standard_deviation_bounds is used here (called with a factor of 3).
bbox_widths = renamed_bbox_data['w']
mean_w = bbox_widths.mean()
std_w = bbox_widths.std()
_, w_outlier_threshold = standard_deviation_bounds(mean_w, std_w, 3)

bbox_heights = renamed_bbox_data['h']
mean_h = bbox_heights.mean()
std_h = bbox_heights.std()
_lower_bound, h_outlier_threshold = standard_deviation_bounds(mean_h, std_h, 3)

# Keep boxes whose width and height are both at or below their thresholds.
width_ok = bbox_widths <= w_outlier_threshold
height_ok = bbox_heights <= h_outlier_threshold
filtered_bbox_data = renamed_bbox_data[width_ok & height_ok]

bbox_count_before = len(renamed_bbox_data)
bbox_count_after = len(filtered_bbox_data)
print(f"W: {w_outlier_threshold}, H: {h_outlier_threshold}")
print(f"Original dataset size: {bbox_count_before}")
print(f"Filtered dataset size: {bbox_count_after}")
print(f"Data lose: {bbox_count_before - bbox_count_after}")
print(f"Data lose ration: {round(100 - (bbox_count_after / bbox_count_before) * 100, 2)}%")

In [None]:
# Write the filtered bounding boxes into the cleared dataset (no index column).
cleared_data_enetry = '{}/BBox_List_2017.csv'.format(CLEARED_DATASET)
filtered_bbox_data.to_csv(path_or_buf=cleared_data_enetry, index=False)

In [None]:
# Images Data Clearing
# Collect the path of every file stored in RAW_DATASET/<images_NNN...>/images.
pattern = re.compile(r'images_\d{3}')
matching_dirs = [
    os.path.join(RAW_DATASET, d)
    for d in os.listdir(RAW_DATASET)
    if os.path.isdir(os.path.join(RAW_DATASET, d)) and pattern.match(d)
]

image_files = set()
for directory in matching_dirs:
    nested_dir = f"{directory}/images"
    image_files.update(
        os.path.join(nested_dir, image) for image in os.listdir(nested_dir)
    )

# Sort the paths before building the frame: iteration order of a set of
# strings can differ between interpreter runs, which would make the row order
# (and every positional index derived from it) change from run to run.
# Building from a dict also yields a valid empty frame when no file is found.
images_df = pd.DataFrame({'image_path': sorted(image_files)})
# Placeholder columns, filled in by the intensity and SSIM steps.
images_df['intensity'] = None
images_df['ssim'] = None

In [None]:
# Calculate Image Intensity
# Walk over the path column and store the mean of each image array
# in the 'intensity' cell of the same row.
for index, image_path in tqdm(images_df['image_path'].items(), total=len(images_df)):
    img_array = get_image_array(image_path)
    intensity = img_array.mean()
    images_df.at[index, 'intensity'] = intensity

In [None]:
# Filter By Intensity
# Keep images whose mean intensity lies inside the bounds returned by
# standard_deviation_bounds (called with a factor of 3), both ends inclusive.
intensity_column = images_df['intensity']
mean_intensity = np.mean(intensity_column)
std_intensity = np.std(intensity_column)
lower_bound, upper_bound = standard_deviation_bounds(mean_intensity, std_intensity, 3)

at_or_above_lower = intensity_column >= lower_bound
at_or_below_upper = intensity_column <= upper_bound
filtered_images_df = images_df[at_or_above_lower & at_or_below_upper]

images_before = len(images_df)
images_after = len(filtered_images_df)
print("Upper Outlier Threshold:", round(upper_bound, 2))
print("Lower Outlier Threshold:", round(lower_bound, 2))
print(f"Final dataset size before removing outliers: {images_before}")
print(f"Final dataset size after removing outliers: {images_after}")
print(f"Dataset size diff: {images_before - images_after}")

In [None]:
from skimage.metrics import structural_similarity as ssim
import traceback

# Compare every image against a reference image: the one whose mean intensity
# is closest to the median intensity of the filtered set.

# filtered_images_df was produced by boolean indexing; take an explicit copy so
# the per-row writes below target an independent frame.
filtered_images_df = filtered_images_df.copy()

# Compute median intensity
intensity_values = filtered_images_df['intensity'].astype(float).to_numpy()
median_intensity = np.median(intensity_values)
print(f"Median pixel intensity: {median_intensity:.2f}")

# Positional index (usable with .iloc) of the image closest to the median.
closest_image_index = int(np.argmin(np.abs(intensity_values - median_intensity)))
print(f"Closest image index: {closest_image_index}")
print(f"Closest image intensity: {filtered_images_df.iloc[closest_image_index]['intensity']}")

# Load the reference from the path selected above. The previous version passed
# `image_path`, a variable left over from an earlier cell's loop, so the
# reference was not the selected image.
reference_image_path = filtered_images_df.iloc[closest_image_index]['image_path']
reference_image = get_image_array(reference_image_path)

# TODO: Process images concurrently.
for index, row in tqdm(filtered_images_df.iterrows(), total=len(filtered_images_df)):
    current_image_path = row['image_path']
    try:
        comparison_image = get_image_array(current_image_path)
        ssim_value = calculate_ssim(comparison_image, reference_image)
        filtered_images_df.at[index, 'ssim'] = ssim_value

    except Exception as e:
        # Best effort: report the image that actually failed and continue;
        # its 'ssim' cell keeps the placeholder value.
        print(f"Błąd przetwarzania {current_image_path}: {e}")
        print(traceback.format_exc())

print(filtered_images_df.head())

In [None]:
# Filter by SSIM
# Keep images whose SSIM value lies inside the bounds returned by
# standard_deviation_bounds (called with a factor of 3), both ends inclusive.
ssim_column = filtered_images_df['ssim']
mean_ssim = np.mean(ssim_column)
std_ssim = np.std(ssim_column)

lower_bound, upper_bound = standard_deviation_bounds(mean_ssim, std_ssim, 3)
print("Upper Outlier Threshold:", round(upper_bound, 4))
print("Lower Outlier Threshold:", round(lower_bound, 4))

inside_upper_limit = ssim_column <= upper_bound
inside_lower_limit = ssim_column >= lower_bound
simm_images_df = filtered_images_df[inside_upper_limit & inside_lower_limit]

ssim_rows_before = len(filtered_images_df)
ssim_rows_after = len(simm_images_df)
print(f"Final dataset size before removing outliers: {ssim_rows_before}")
print(f"Final dataset size after removing outliers: {ssim_rows_after}")
print(f"Dataset size diff: {ssim_rows_before - ssim_rows_after}")

In [None]:
# Reload data entry
clean_data_entry_df = pd.read_csv(f"{CLEARED_DATASET}/Data_Entry_2017.csv")

# Derive "Image Index" from the file name (last component of image_path),
# stripped of surrounding whitespace. `assign` builds a new frame instead of
# writing a column into simm_images_df, which was produced by boolean indexing
# and would trigger pandas' SettingWithCopyWarning on direct assignment.
simm_images_df = simm_images_df.assign(
    **{"Image Index": simm_images_df["image_path"].map(os.path.basename).str.strip()}
)

# Keep only the images that are also listed in the cleared data-entry table.
existing_images_df = simm_images_df[simm_images_df["Image Index"].isin(clean_data_entry_df["Image Index"])]

print(f"Final dataset size before removing outliers: {len(simm_images_df)}")
print(f"Final dataset size after removing outliers: {len(existing_images_df)}")
print(f"Dataset size diff: {len(simm_images_df) - len(existing_images_df)}")
print(f"Init dataset size: {len(images_df)}")

print(f"Lose Ration: {round(100 - (len(existing_images_df) / len(images_df)) * 100, 4)}%")

In [None]:
# Save cleared data copy
import shutil

# Destination folder for the retained images; created if it does not exist.
images_dataset = f"{CLEARED_DATASET}/images"
os.makedirs(images_dataset, exist_ok=True)

for index, row in tqdm(existing_images_df.iterrows(), total=len(existing_images_df)):
    # Read the path before entering the try block so the error message below
    # always refers to the row being processed.
    image_path = row['image_path']
    try:
        source_path = os.path.abspath(image_path)
        destination_path = os.path.join(images_dataset, os.path.basename(image_path))

        if os.path.exists(source_path):
            shutil.copy2(source_path, destination_path)
        else:
            # tqdm has no `warning` method; `tqdm.write` prints a message
            # without breaking the progress bar.
            tqdm.write(f"Warning: File not found - {source_path}")
    except Exception as e:
        # Best effort: report the failure and continue with the next image.
        tqdm.write(f"Error copying {image_path}: {e}")