In [1]:
import os
import pandas as pd
import numpy as np
import shutil
import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def create_single_pathology_dataset(base_dir, output_dir, max_images_per_class=1000, seed=42):
    """
    Create a new dataset with only single-pathology images, organized by class

    Images listed in ``<base_dir>/Data_Entry_2017.csv`` are located under
    ``<base_dir>/images_*/images/*.png``. For every class (14 pathologies plus
    'No Finding') the images whose 'Finding Labels' contain that class and no
    other are selected, capped at ``max_images_per_class``, and copied to
    ``<output_dir>/single_pathology_dataset/images/<class>/``. A CSV describing
    the copied images is written, followed by train/valid/test CSVs
    (70% / 15% / 15% of the patients) split on 'Patient ID' so that a patient
    never appears in more than one split.

    Args:
        base_dir: Base directory containing the original NIH dataset
        output_dir: Output directory for the new dataset
        max_images_per_class: Maximum number of images per pathology class
        seed: Random seed for reproducibility

    Returns:
        Dictionary with the keys 'dataset_dir', 'csv_path', 'train_csv',
        'valid_csv', 'test_csv' (all paths) and 'stats' (per-class
        'available' / 'selected' image counts).

    Raises:
        ValueError: If no image was selected for any class (for example when
            none of the files listed in the CSV exist under base_dir).
    """
    # Seeds NumPy's global RNG as well; the sampling and splitting below
    # additionally receive `seed` explicitly.
    np.random.seed(seed)

    # Setup paths
    csv_path = os.path.join(base_dir, 'Data_Entry_2017.csv')
    new_dataset_dir = os.path.join(output_dir, 'single_pathology_dataset')
    new_images_dir = os.path.join(new_dataset_dir, 'images')
    new_csv_path = os.path.join(new_dataset_dir, 'single_pathology_dataset.csv')
    train_csv_path = os.path.join(new_dataset_dir, 'train.csv')
    valid_csv_path = os.path.join(new_dataset_dir, 'valid.csv')
    test_csv_path = os.path.join(new_dataset_dir, 'test.csv')

    # Create output directories
    os.makedirs(new_dataset_dir, exist_ok=True)
    os.makedirs(new_images_dir, exist_ok=True)

    # Find all images in the NIH dataset structure
    print("Finding all images in the dataset...")
    image_pattern = os.path.join(base_dir, 'images_*/images/*.png')
    image_paths = glob.glob(image_pattern)

    # Create a mapping from image filename to full path for quick lookup
    image_map = {os.path.basename(path): path for path in image_paths}
    print(f"Found {len(image_paths)} images")

    # Read the CSV file
    print("Reading and processing CSV data...")
    df = pd.read_csv(csv_path)

    # Classes of the new dataset: the pathologies plus 'No Finding'
    conditions = [
        'Cardiomegaly', 'Emphysema', 'Effusion', 'Hernia', 'Infiltration',
        'Mass', 'Nodule', 'Atelectasis', 'Pneumothorax', 'Pleural_Thickening',
        'Pneumonia', 'Fibrosis', 'Edema', 'Consolidation',
        'No Finding',
    ]

    # One 0/1 indicator column per class, derived from the '|'-separated
    # 'Finding Labels' in a single vectorised pass. reindex() guarantees a
    # column (all zeros) even for classes that never occur in the CSV.
    indicators = (
        df['Finding Labels']
        .str.get_dummies(sep='|')
        .reindex(columns=conditions, fill_value=0)
    )
    for condition in conditions:
        df[condition] = indicators[condition]

    # Add path information and keep only rows whose image file was found
    # (filenames missing from image_map map to NaN)
    df['path'] = df['Image Index'].map(image_map)
    df = df[df['path'].notna()]

    # Statistics for reporting
    stats = {condition: {'available': 0, 'selected': 0} for condition in conditions}

    # Create directories for each class
    for condition in conditions:
        os.makedirs(os.path.join(new_images_dir, condition), exist_ok=True)

    # One frame of selected rows per class, concatenated after the loop
    selected_frames = []

    # Process each condition
    for condition in tqdm(conditions, desc="Processing conditions"):
        # Find single-pathology cases for this condition
        if condition == 'No Finding':
            condition_df = df[df['Finding Labels'] == 'No Finding']
        else:
            # Select rows where only this condition is present
            other_conditions = [c for c in conditions if c != condition]
            only_this_condition = (df[condition] == 1) & (df[other_conditions].sum(axis=1) == 0)
            condition_df = df[only_this_condition]

        stats[condition]['available'] = len(condition_df)

        # Sample up to max_images_per_class images
        if len(condition_df) > max_images_per_class:
            condition_df = condition_df.sample(max_images_per_class, random_state=seed)

        stats[condition]['selected'] = len(condition_df)

        # Copy images to the class directory, remembering each destination
        dest_paths = []
        for source_path, image_filename in tqdm(
                zip(condition_df['path'], condition_df['Image Index']),
                desc=f"Copying {condition} images",
                total=len(condition_df),
                leave=False):
            dest_path = os.path.join(new_images_dir, condition, image_filename)
            shutil.copy2(source_path, dest_path)
            dest_paths.append(dest_path)

        # Skip classes with nothing selected so concat never sees empty frames
        if dest_paths:
            selected_frames.append(
                condition_df.assign(pathology=condition, new_path=dest_paths)
            )

    if not selected_frames:
        raise ValueError(
            f"No single-pathology images were selected: none of the images listed in "
            f"{csv_path} matched a file under {image_pattern} "
            f"(max_images_per_class={max_images_per_class})"
        )

    # Combine the per-class selections into the dataset table
    new_df = pd.concat(selected_frames)

    # Save the new CSV file
    new_df.to_csv(new_csv_path, index=False)

    # Create train/validation/test splits based on patient ID
    print("Creating train/validation/test splits...")

    # Get unique patient IDs
    patient_ids = new_df['Patient ID'].unique()

    # Split patient IDs into train (70%), then halve the rest into
    # validation and test
    train_ids, temp_ids = train_test_split(patient_ids, test_size=0.3, random_state=seed)
    valid_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=seed)

    # Create dataframes for each split
    train_df = new_df[new_df['Patient ID'].isin(train_ids)].copy()
    valid_df = new_df[new_df['Patient ID'].isin(valid_ids)].copy()
    test_df = new_df[new_df['Patient ID'].isin(test_ids)].copy()

    # Save split CSVs
    train_df.to_csv(train_csv_path, index=False)
    valid_df.to_csv(valid_csv_path, index=False)
    test_df.to_csv(test_csv_path, index=False)

    # Print statistics
    print("\nDataset creation complete!")
    print("\nStatistics by condition:")
    print("-" * 50)
    print(f"{'Condition':<20} {'Available':<10} {'Selected':<10}")
    print("-" * 50)
    for condition in conditions:
        print(f"{condition:<20} {stats[condition]['available']:<10} {stats[condition]['selected']:<10}")
    print("-" * 50)

    print(f"\nTotal images in new dataset: {len(new_df)}")
    print(f"Training set: {len(train_df)} images")
    print(f"Validation set: {len(valid_df)} images")
    print(f"Test set: {len(test_df)} images")

    return {
        'dataset_dir': new_dataset_dir,
        'csv_path': new_csv_path,
        'train_csv': train_csv_path,
        'valid_csv': valid_csv_path,
        'test_csv': test_csv_path,
        'stats': stats
    }

def explore_new_dataset(dataset_info):
    """
    Print and return summary statistics for a dataset written to disk

    Reads the full CSV and the train/valid/test CSVs named in dataset_info,
    then reports the number of images per class, the number of patients
    shared between each pair of splits, the gender breakdown and basic age
    statistics of the full dataset.

    Args:
        dataset_info: Dictionary with dataset information from
            create_single_pathology_dataset; the keys 'csv_path',
            'train_csv', 'valid_csv' and 'test_csv' are read.

    Returns:
        Dictionary with 'class_counts' and 'gender_counts' (value counts of
        the 'pathology' and 'Patient Gender' columns) and 'age_stats'
        (mean, median, min and max of the 'Patient Age' column).
    """
    # Load the four CSV files in one pass
    full_df, train_df, valid_df, test_df = (
        pd.read_csv(dataset_info[key])
        for key in ('csv_path', 'train_csv', 'valid_csv', 'test_csv')
    )

    # Images per class
    print("\nClass distribution in the dataset:")
    class_counts = full_df['pathology'].value_counts()
    for label, n_images in class_counts.items():
        print(f"{label}: {n_images} images")

    # Patients per split; any non-empty pairwise intersection means leakage
    train_patients, valid_patients, test_patients = (
        set(split['Patient ID'].unique())
        for split in (train_df, valid_df, test_df)
    )

    print("\nChecking for patient overlap between splits:")
    print(f"Train-Valid overlap: {len(train_patients & valid_patients)} patients")
    print(f"Train-Test overlap: {len(train_patients & test_patients)} patients")
    print(f"Valid-Test overlap: {len(valid_patients & test_patients)} patients")

    # Gender breakdown, with each share expressed as a percentage of all images
    print("\nGender distribution:")
    gender_counts = full_df['Patient Gender'].value_counts()
    total_images = len(full_df)
    for gender, n_images in gender_counts.items():
        print(f"{gender}: {n_images} images ({n_images/total_images*100:.1f}%)")

    # Age statistics are collected first and printed afterwards
    ages = full_df['Patient Age']
    age_stats = {
        'mean': ages.mean(),
        'median': ages.median(),
        'min': ages.min(),
        'max': ages.max(),
    }

    print("\nAge distribution:")
    print(f"Mean age: {age_stats['mean']:.1f} years")
    print(f"Median age: {age_stats['median']:.1f} years")
    print(f"Range: {age_stats['min']} to {age_stats['max']} years")

    return {
        'class_counts': class_counts,
        'gender_counts': gender_counts,
        'age_stats': age_stats,
    }

# Example usage
if __name__ == "__main__":
    # Root of the original NIH download; change this to your dataset path
    BASE_DIR = 'D:/healthcare'
    # Where the processed dataset is written; change this to your desired output path
    OUTPUT_DIR = 'D:/CV_project/processed'

    # Build the single-pathology dataset (at most 1000 images per class) ...
    dataset_info = create_single_pathology_dataset(
        BASE_DIR,
        OUTPUT_DIR,
        max_images_per_class=1000,
    )

    # ... then print a summary of what was written
    explore_new_dataset(dataset_info)

Finding all images in the dataset...
Found 112120 images
Reading and processing CSV data...


Processing conditions: 100%|██████████| 15/15 [13:53<00:00, 55.55s/it]


Creating train/validation/test splits...

Dataset creation complete!

Statistics by condition:
--------------------------------------------------
Condition            Available  Selected  
--------------------------------------------------
Cardiomegaly         1093       1000      
Emphysema            892        892       
Effusion             3955       1000      
Hernia               110        110       
Infiltration         9547       1000      
Mass                 2139       1000      
Nodule               2705       1000      
Atelectasis          4215       1000      
Pneumothorax         2194       1000      
Pleural_Thickening   1126       1000      
Pneumonia            322        322       
Fibrosis             727        727       
Edema                628        628       
Consolidation        1310       1000      
No Finding           60361      1000      
--------------------------------------------------

Total images in new dataset: 12679
Training set: 8861 images
Va