In [58]:
import wsi_preprocessing as pp
import os
import json
import shutil

In [59]:
def _remove_stale_outputs(image_name, work_dir):
    """Delete every file or directory in work_dir whose name contains image_name."""
    for entry in os.listdir(work_dir):
        if image_name not in entry:
            continue
        entry_path = os.path.join(work_dir, entry)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path, ignore_errors=True)
            print(f"Removed directory: {entry_path}")
        elif os.path.isfile(entry_path):
            os.remove(entry_path)
            print(f"Removed file: {entry_path}")


def _collect_primary_tumor_slides(case_metadata, biospecimen_dir, case_id):
    """Return (slide_paths, slide_names) for the "Primary Tumor" slides present on disk."""
    slide_paths = []
    slide_names = []

    biospecimen_data = case_metadata.get('biospecimen', {}).get('biospecimen_data', [])
    for sample in biospecimen_data:
        if sample.get('sample_type') != "Primary Tumor":
            continue
        for slide in sample.get('slides') or []:
            image_name = slide.get('image_file_name')
            if not image_name:
                # An empty name would match *every* entry in the substring
                # check of the stale-output sweep and wipe the working directory.
                print(f"Slide entry without image_file_name for case {case_id}, skipping...")
                continue

            slide_path = os.path.join(biospecimen_dir, image_name)
            if not os.path.exists(slide_path):
                print(f"Slide {image_name} not found for case {case_id}")
                continue

            slide_paths.append(slide_path)
            slide_names.append(image_name)
            # Clear leftovers of an earlier (interrupted) run for this slide.
            _remove_stale_outputs(image_name, os.getcwd())

    return slide_paths, slide_names


def preprocess_WSI_slides(base_dir="cases"):
    """Tile the "Primary Tumor" slides of every case directory under base_dir.

    For each case ``<base_dir>/<case_id>`` the function:

    1. reads ``aggregated_data/<case_id>_data.json`` (case skipped if missing);
    2. removes a leftover ``Biospecimen/Tiles.part`` directory;
    3. skips the case if ``Biospecimen/Tiles`` already exists;
    4. collects the slide files listed under samples whose ``sample_type`` is
       "Primary Tumor" and that exist in ``Biospecimen``;
    5. calls ``pp.save_slides_mpp_otsu``, ``pp.run_tiling`` and
       ``pp.calculate_filters``;
    6. moves every entry of the current working directory whose name contains
       one of the slide file names into ``Biospecimen/Tiles.part`` and finally
       renames that directory to ``Biospecimen/Tiles``.

    Args:
        base_dir: Directory containing one sub-directory per case. A
            ``GENERAL_METADATA`` directory is created inside it and is not
            treated as a case.

    Returns:
        None. Progress is reported with ``print``.
    """
    general_metadata_name = "GENERAL_METADATA"
    # Exclude the shared metadata folder: it lives inside base_dir and would
    # otherwise be reported as a case without metadata on every later run.
    case_dirs = sorted(
        d for d in os.listdir(base_dir)
        if d != general_metadata_name and os.path.isdir(os.path.join(base_dir, d))
    )
    general_metadata_dir = os.path.join(base_dir, general_metadata_name)
    os.makedirs(general_metadata_dir, exist_ok=True)
    consolidated_csv_path = os.path.join(general_metadata_dir, 'consolidated.csv')

    for case_id in case_dirs:
        case_dir = os.path.join(base_dir, case_id)
        biospecimen_dir = os.path.join(case_dir, "Biospecimen")
        metadata_path = os.path.join(case_dir, "aggregated_data", f'{case_id}_data.json')

        if not os.path.exists(metadata_path):
            print(f"No metadata found for case {case_id}, skipping...")
            continue

        with open(metadata_path, 'r') as f:
            case_metadata = json.load(f)

        # A Tiles.part directory means a previous run did not finish: start over.
        tiles_part_dir = os.path.join(biospecimen_dir, 'Tiles.part')
        if os.path.exists(tiles_part_dir):
            shutil.rmtree(tiles_part_dir)
            print(f"Partially downloaded directory {tiles_part_dir} and its contents removed successfully.")

        # A Tiles directory marks a completed case.
        tiles_dir = os.path.join(biospecimen_dir, 'Tiles')
        if os.path.exists(tiles_dir):
            print(f"Skipping case {case_id} because 'Tiles' directory already exists.")
            continue

        slides_to_process, slide_names_to_process = _collect_primary_tumor_slides(
            case_metadata, biospecimen_dir, case_id
        )

        if not slides_to_process:
            print(f"No Primary Tumor slides found for case {case_id}")
            continue

        slide_csv_path = os.path.join(biospecimen_dir, 'slides_mpp_otsu.csv')
        tiles_filter_path = os.path.join(biospecimen_dir, 'tiles_filter.csv')

        # Commence tile cutting for current case
        pp.save_slides_mpp_otsu(slides_to_process, slide_csv_path)

        try:
            pp.run_tiling(
                slide_csv=slide_csv_path,
                consolidated_csv=consolidated_csv_path
            )
        except Exception as e:
            # NOTE(review): a failed tiling run is still finalised as 'Tiles'
            # below, so the case is skipped on the next run - confirm that this
            # best-effort behaviour is intended.
            print(f"Exception during tiling for case {case_id}: {e}")

        pp.calculate_filters(
            slide_csv_path,
            "",
            tiles_filter_path
        )

        # Stage the outputs in Tiles.part. The directory must exist before the
        # first move: shutil.move() onto a missing destination *renames* the
        # source to that path instead of moving it inside.
        os.makedirs(tiles_part_dir, exist_ok=True)

        work_dir = os.getcwd()
        for file in os.listdir(work_dir):
            if not any(svs_filename in file for svs_filename in slide_names_to_process):
                continue
            old_path = os.path.join(work_dir, file)

            # Print paths for debugging
            print(f"Attempting to move: {old_path} to {tiles_part_dir}")

            try:
                shutil.move(old_path, tiles_part_dir)
                print(f"Moved {file} successfully.")
            except Exception as e:
                print(f"Failed to move {file}: {e}")

        # Publish the finished directory. 'Tiles' is deliberately not created
        # beforehand: os.rename() onto an existing directory fails on Windows.
        os.rename(tiles_part_dir, tiles_dir)
        print(f"Renamed {tiles_part_dir} to {tiles_dir}")

        print(f"Processing complete for case {case_id}")

In [61]:
# Run the tiling pipeline over the test case directory.
preprocess_WSI_slides(base_dir="cases_TEST")

Removed directory: c:\Users\davet\Desktop\Research\AssCancer_Code\TCGA-AA-3844-01A-01-BS1.1b127c9c-04a1-46ad-9cd6-fe61ef67719f.svs_mpp-0.5_crop24-44_files
Removed file: c:\Users\davet\Desktop\Research\AssCancer_Code\TCGA-AA-3844-01A-01-BS1.1b127c9c-04a1-46ad-9cd6-fe61ef67719f.svs_mpp-0.5_crop24-44_files-tile_path.csv
Removed directory: c:\Users\davet\Desktop\Research\AssCancer_Code\TCGA-AA-3844-01A-01-TS1.6a6ad054-276a-49c4-a0f2-cc609b380157.svs_mpp-0.5_crop40-46_files
Removed file: c:\Users\davet\Desktop\Research\AssCancer_Code\TCGA-AA-3844-01A-01-TS1.6a6ad054-276a-49c4-a0f2-cc609b380157.svs_mpp-0.5_crop40-46_files-tile_path.csv
[1/2] cases_TEST\4f786107-3cf5-4ab3-bba4-f399dee23f0e\Biospecimen\TCGA-AA-3844-01A-01-BS1.1b127c9c-04a1-46ad-9cd6-fe61ef67719f.svs
- finding slide resolution
- calculating otsu threshold


  mean2 = np.divide(a2, w2[::-1], out=np.zeros_like(a2), where=w2 != 0)[::-1]


[2/2] cases_TEST\4f786107-3cf5-4ab3-bba4-f399dee23f0e\Biospecimen\TCGA-AA-3844-01A-01-TS1.6a6ad054-276a-49c4-a0f2-cc609b380157.svs
- finding slide resolution
- calculating otsu threshold
slides info (mpp, otsu_thres) saved to cases_TEST\4f786107-3cf5-4ab3-bba4-f399dee23f0e\Biospecimen\slides_mpp_otsu.csv
[1/2] cases_TEST\4f786107-3cf5-4ab3-bba4-f399dee23f0e\Biospecimen\TCGA-AA-3844-01A-01-BS1.1b127c9c-04a1-46ad-9cd6-fe61ef67719f.svs
- slide will NOT be rescaled | 0.5015 is within range of targeted mpp
- time elapsed: 0:00:30.831852
- directory TCGA-AA-3844-01A-01-BS1.1b127c9c-04a1-46ad-9cd6-fe61ef67719f.svs_mpp-0.5_crop24-44_files/ has 10976 .png tiles
- tile_path data saved to ./TCGA-AA-3844-01A-01-BS1.1b127c9c-04a1-46ad-9cd6-fe61ef67719f.svs_mpp-0.5_crop24-44_files-tile_path.csv
[2/2] cases_TEST\4f786107-3cf5-4ab3-bba4-f399dee23f0e\Biospecimen\TCGA-AA-3844-01A-01-TS1.6a6ad054-276a-49c4-a0f2-cc609b380157.svs
- slide will NOT be rescaled | 0.5015 is within range of targeted mpp
- time 

KeyboardInterrupt: 