In [1]:
import pandas as pd
import numpy as np
# Directory the raw NIH CSV files (Data_Entry_2017.csv, BBox_List_2017.csv)
# are read from; relative to the notebook's working directory.
nih_dataset = '../data/nih-dataset'
# Directory the cleaned copies of those CSV files are written to.
cleared_dataset = '../cleared-data/nih-dataset'

In [2]:
# Patient age cleaning
# Load the image-level metadata table.
data_entry_file = f'{nih_dataset}/Data_Entry_2017.csv'
data_frame = pd.read_csv(data_entry_file, delimiter=',', nrows=None)
total_count = len(data_frame)

# Collapse to one age per patient (the first one recorded for each
# 'Patient ID') so the statistics below weigh every patient once.
unique_patients = data_frame.groupby('Patient ID')['Patient Age'].first()

# Per-patient age statistics.
mean_unique_age = unique_patients.mean()
std_dev_unique_age = unique_patients.std()

# Accepted age range: [0, mean + 3σ]. The lower bound is fixed at 0
# because an age cannot be negative.
outlier_threshold_lower_std = 0
outlier_threshold_upper_std = round(mean_unique_age + 3 * std_dev_unique_age, 4)

# Keep only the patients whose age lies inside the range (both ends inclusive).
age_in_range = unique_patients.between(
    outlier_threshold_lower_std, outlier_threshold_upper_std
)
filtered_unique_patients = unique_patients[age_in_range].reset_index()

# Keep every row that belongs to one of the accepted patients.
valid_patient_ids = filtered_unique_patients['Patient ID'].tolist()
filtered_data = data_frame[data_frame['Patient ID'].isin(valid_patient_ids)]

In [3]:
# Undersampling Clearing
# Operates on `filtered_data` from the age-cleaning cell (nothing is reloaded
# from disk): the dominant "No Finding" class is randomly undersampled so it
# is roughly the same size as all other rows combined.
total_size = len(filtered_data)

dominant_class = "No Finding"
is_dominant = filtered_data['Finding Labels'] == dominant_class
dominant_class_data = filtered_data[is_dominant]
minority_classes_data = filtered_data[~is_dominant]

# The two partitions must cover every row exactly once.
assert total_size == len(dominant_class_data) + len(minority_classes_data)

# Fraction of the dominant class to keep, rounded to two decimals.
# Guard the division so a dataset without any dominant-class rows does not
# raise ZeroDivisionError.
dominant_count = len(dominant_class_data)
undersample_rate = (
    round(len(minority_classes_data) / dominant_count, 2) if dominant_count else 0.0
)
# Never request more rows than exist: when the other classes outnumber the
# dominant one (rate > 1), sampling without replacement would raise ValueError.
undersample_size = min(int(dominant_count * undersample_rate), dominant_count)

# Fixed seeds keep both the sample and the shuffle reproducible.
balanced_dominant_class = dominant_class_data.sample(n=undersample_size, random_state=42)
balanced_data = pd.concat([balanced_dominant_class, minority_classes_data], ignore_index=True)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

0.86


In [4]:
# Store in the cleared dataset directory
# Writes `balanced_data` (built in the undersampling cell) as CSV, without
# the DataFrame index column.
cleared_data_enetry = f'{cleared_dataset}/Data_Entry_2017.csv'
balanced_data.to_csv(cleared_data_enetry, index=False)

In [24]:
# Prepare Bounding Box Data
data_entry_file = f'{nih_dataset}/BBox_List_2017.csv'
bbox_data = pd.read_csv(data_entry_file, delimiter=',', nrows=None)

# Drop columns that contain no values at all, then give the two bracketed
# coordinate headers plain names ("Bbox [x" -> "x", "h]" -> "h").
# Chained instead of inplace so re-running the cell is idempotent.
renamed_bbox_data = (
    bbox_data
    .dropna(axis=1, how='all')
    .rename(columns={"Bbox [x": "x", "h]": "h"})
)

# Calculate width & height upper bounds (mean + 3 standard deviations)
mean_w = renamed_bbox_data['w'].mean()
std_w = renamed_bbox_data['w'].std()
mean_h = renamed_bbox_data['h'].mean()
std_h = renamed_bbox_data['h'].std()
w_outlier_threshold = round(mean_w + 3 * std_w, 2)
h_outlier_threshold = round(mean_h + 3 * std_h, 2)

print(f"W: {w_outlier_threshold}, H: {h_outlier_threshold}")

# Keep only boxes whose width AND height are within the thresholds.
# Filters the frame built above; the cell previously read a
# `cleaned_bbox_data` variable that no cell defines, which fails with
# NameError on a fresh kernel.
filtered_bbox_data = renamed_bbox_data[
    (renamed_bbox_data['w'] <= w_outlier_threshold) &
    (renamed_bbox_data['h'] <= h_outlier_threshold)
]

original_size = len(renamed_bbox_data)
filtered_size = len(filtered_bbox_data)
# Percentage of rows removed; guarded so an empty input does not divide by zero.
loss_ratio = round(100 - (filtered_size / original_size) * 100, 2) if original_size else 0.0

print(f"Original dataset size: {original_size}")
print(f"Filtered dataset size: {filtered_size}")
print(f"Data lose: {original_size - filtered_size}")
print(f"Data lose ration: {loss_ratio} ")

# Store in the cleared dataset directory
cleared_data_enetry = f'{cleared_dataset}/BBox_List_2017.csv'
filtered_bbox_data.to_csv(cleared_data_enetry, index=False)

W: 759.22, H: 730.63
Original dataset size: 984
Filtered dataset size: 973
Data lose: 11
Data lose ration: 1.12 


In [None]:
# 