In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from src.prePro import preprocess_metadata, calculate_balanced_label_statistics, stratified_split_by_individual_labels

# Load environment variables from the .env file so DATA_DIR can be configured
# per machine instead of being hard-coded in the notebook.
load_dotenv()
data_dir = os.getenv('DATA_DIR')

# os.getenv returns None when the variable is missing; without this guard the
# f-strings below would silently build paths such as '../None/raw/xraysMD.csv'
# and the failure would surface later as a confusing "file not found".
if data_dir is None:
    raise RuntimeError(
        "Environment variable DATA_DIR is not set. "
        "Define it in your .env file or in the shell before running this notebook."
    )

# All paths are resolved relative to the notebook's parent directory.
# NOTE(review): the three arguments look like (raw metadata CSV, image
# directory, processed-CSV output path) -- confirm against src.prePro.
filtered_df = preprocess_metadata(
    f'../{data_dir}/raw/xraysMD.csv',
    f'../{data_dir}/raw/xrays',
    f'../{data_dir}/processed/xraysMD.csv'
)

# Kept for interactive inspection; it is not consumed by the cells visible here.
filtered_df_stats = calculate_balanced_label_statistics(filtered_df)



In [2]:
# Split sizes are named so they can be tuned in one place.
TRAIN_SIZE = 7000
TEST_SIZE = 3000


def _label_distribution(df):
    """Count how often each label occurs across all rows of ``df``.

    Each entry of ``df['Labels']`` is iterated as a collection of labels, so a
    row carrying several labels contributes once to each of them.

    Returns a Series of counts indexed by label and sorted by label (not by
    frequency), so two distributions can be compared row by row.
    """
    flattened = [label for labels in df['Labels'] for label in labels]
    # value_counts() alone orders by descending frequency; sort_index() gives a
    # stable per-label ordering.
    return pd.Series(flattened).value_counts().sort_index()


# NOTE(review): no random seed is passed here; if the helper samples randomly,
# reproducibility depends on its internals -- confirm in src.prePro.
train_df, test_df = stratified_split_by_individual_labels(
    filtered_df, train_size=TRAIN_SIZE, test_size=TEST_SIZE
)

# Print sizes of the resulting DataFrames
print(f"Training set size: {len(train_df)}")
print(f"Test/Validation set size: {len(test_df)}")

# Optional: check the label distribution in the training and testing sets
train_distribution = _label_distribution(train_df)
test_distribution = _label_distribution(test_df)

print("\nTraining set label distribution:")
print(train_distribution)

print("\nTest/Validation set label distribution:")
print(test_distribution)

Training set size: 7000
Test/Validation set size: 3000

Training set label distribution:
0      678
1      186
2      257
3      105
4      706
5      137
6      139
7       20
8     1016
9      265
10    4083
11     331
12     202
13      72
14     270
dtype: int64

Test/Validation set label distribution:
0      279
1       81
2       99
3       41
4      284
5       62
6       74
7        7
8      407
9      122
10    1726
11     164
12     104
13      37
14     138
dtype: int64
