In [1]:
import os

import numpy as np
import pandas as pd
# Directory the raw Data_Entry_2017.csv is read from (relative to the notebook).
nih_dataset = '../data/nih-dataset'
# Directory the cleaned and balanced Data_Entry_2017.csv is written to.
cleared_dataset = '../cleared-data/nih-dataset'

In [2]:


# Patient age cleaning
# Load the metadata and drop every patient whose recorded age is an outlier.
data_entry_file = f'{nih_dataset}/Data_Entry_2017.csv'
data_frame = pd.read_csv(data_entry_file)

total_count = len(data_frame)

# One age per patient: the age found on the patient's first row in the file.
# NOTE: only this first age is checked; later rows of a kept patient are not
# validated individually.
unique_patients = data_frame.groupby('Patient ID')['Patient Age'].first()

# Mean and standard deviation of the per-patient ages
mean_unique_age = unique_patients.mean()
std_dev_unique_age = unique_patients.std()

# Outlier thresholds - age
outlier_threshold_upper_std = round(mean_unique_age + 3 * std_dev_unique_age, 4)
outlier_threshold_lower_std = 0  # Can't be younger than 0 years

# Keep patients whose age lies in [0, mean + 3σ] (both bounds inclusive)
filtered_unique_patients = unique_patients[
    unique_patients.between(outlier_threshold_lower_std, outlier_threshold_upper_std)
].reset_index()

# Keep every row that belongs to one of the remaining patients
valid_patient_ids = filtered_unique_patients['Patient ID'].tolist()
filtered_data = data_frame[data_frame['Patient ID'].isin(valid_patient_ids)]

print(len(filtered_unique_patients))
print(len(filtered_data))
print(total_count)
# Number of rows removed, derived from the data instead of hard-coded counts
print(total_count - len(filtered_data))

30797
112068
112120
52


In [3]:
# Undersampling
# Shrink the dominant "No Finding" class of the age-filtered rows to roughly
# the combined size of all other rows, then shuffle the result.
total_size = len(filtered_data)

dominant_class = "No Finding"
is_dominant = filtered_data['Finding Labels'] == dominant_class
dominant_class_data = filtered_data[is_dominant]
minority_classes_data = filtered_data[~is_dominant]

# The two partitions must cover every row exactly once
assert total_size == len(dominant_class_data) + len(minority_classes_data)

# Fraction of dominant-class rows to keep, rounded to two decimals
undersample_rate = round(len(minority_classes_data) / len(dominant_class_data), 2)
print(undersample_rate)
# Clamp to the available rows: a rate above 1 would otherwise ask
# `.sample` for more rows than exist and raise a ValueError.
undersample_size = min(
    int(len(dominant_class_data) * undersample_rate),
    len(dominant_class_data),
)

balanced_dominant_class = dominant_class_data.sample(n=undersample_size, random_state=42)
balanced_data = pd.concat([balanced_dominant_class, minority_classes_data], ignore_index=True)
# Shuffle so the dominant rows are not grouped at the top of the frame
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Share of the originally loaded rows that did not make it into balanced_data,
# computed from the actual frame sizes rather than hard-coded counts.
lose_ratio = round(100.00 - (len(balanced_data) / total_count) * 100, 2)
print(f"Lose Ratio: {lose_ratio}%")

0.86
Lose Ratio: 7.58%


In [4]:
# Store the balanced data in the cleared dataset directory.
# `to_csv` does not create missing directories, so make sure the target exists.
os.makedirs(cleared_dataset, exist_ok=True)
# Variable name kept as-is (including its spelling) in case other cells use it.
cleared_data_enetry = f'{cleared_dataset}/Data_Entry_2017.csv'
balanced_data.to_csv(cleared_data_enetry, index=False)