In [7]:
%matplotlib inline

import os
import h5py
import numpy as np
from matplotlib import pyplot as plt


In [2]:
# Root folder of the preprocessed data; each split lives in its own sub-folder.
download_path = '/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/'
train_path, validation_path, test_path, fully_sampled_test_path = (
    os.path.join(download_path, split_folder)
    for split_folder in (
        'multicoil_train',
        'multicoil_val',
        'multicoil_test',
        'multicoil_test_full',
    )
)

## CHECKS FOR TRAIN/VAL/TEST HETEROGENEITY

In [None]:
import os
import h5py
from collections import Counter
from multiprocessing import Pool, cpu_count
from pathlib import Path

def get_acquisition_type(h5_path):
    """Return the ``acquisition`` attribute stored on the root of an HDF5 file.

    This is a best-effort reader: any exception raised while opening or
    reading the file is printed and ``None`` is returned instead of
    propagating. ``None`` is also returned when the file has no
    ``acquisition`` attribute.

    Args:
        h5_path: Path of the HDF5 file to inspect.

    Returns:
        The value of the ``acquisition`` attribute, or ``None``.
    """
    acquisition = None
    try:
        with h5py.File(h5_path, 'r') as h5_file:
            acquisition = h5_file.attrs.get('acquisition', None)
    except Exception as read_error:
        print(f"Error reading {h5_path}: {read_error}")
        # Also covers a failure while closing the file: report nothing usable.
        acquisition = None
    return acquisition

def count_acquisitions_in_folder(folder_path, num_workers, string='train'):
    """Count acquisition types over the brain and knee files of one split.

    Files are collected (non-recursively, ``*.h5`` only) from
    ``<folder_path>/Preprocessed/multicoil_<string>`` (reported as "brain")
    and ``<folder_path>/Knee/multicoil_<string>`` (reported as "knee"), and
    the ``acquisition`` attribute of each is read in a process pool.

    Args:
        folder_path: Root folder containing the ``Preprocessed`` and ``Knee``
            sub-folders.
        num_workers: Number of worker processes for the pool.
        string: Split suffix used in the folder name, e.g. ``'train'``,
            ``'val'`` or ``'test'``. The previous default expression
            ``'train' or 'val'`` always evaluated to ``'train'``, so the
            default is unchanged but now says what it means.

    Returns:
        collections.Counter mapping acquisition type to number of files.
        Files that could not be read are counted under ``None``.
    """
    folder = Path(folder_path)
    brain_folder = folder / "Preprocessed" / f"multicoil_{string}"
    knee_folder = folder / "Knee" / f"multicoil_{string}"

    brain_files = list(brain_folder.glob('*.h5'))
    knee_files = list(knee_folder.glob('*.h5'))
    h5_files = brain_files + knee_files
    print(f"Found {len(brain_files)} brain files and {len(knee_files)} knee files in {folder_path}.")
    print(f"Found {len(h5_files)} files in {folder_path} for {string} set.")

    # Nothing to read: skip spawning worker processes altogether.
    if not h5_files:
        return Counter()

    with Pool(processes=num_workers) as pool:
        acquisition_types = pool.map(get_acquisition_type, h5_files)

    return Counter(acquisition_types)

def count_acquisitions_in_CS_folder(CS_path, fastmri_path, num_workers):
    """Count acquisition types of the FastMRI ``.h5`` files matching the CS files.

    Every ``*.npy`` file directly inside ``CS_path`` is mapped to an ``.h5``
    file name by replacing ``_cs.npy`` with ``.h5``:

    * file names containing ``"brain"`` are looked up in
      ``<fastmri_path>/Preprocessed/<last component of CS_path>/``;
    * all other files are treated as knee files and looked up first in
      ``<fastmri_path>/Knee/multicoil_train/`` and then in
      ``<fastmri_path>/Knee/multicoil_val/``.

    CS files without an existing ``.h5`` counterpart are skipped, and a
    message reports how many were skipped.

    Args:
        CS_path: Folder holding the ``*_cs.npy`` files; its last path
            component (e.g. ``multicoil_train``) selects the brain split.
        fastmri_path: Root folder containing ``Preprocessed`` and ``Knee``.
        num_workers: Number of worker processes for the pool.

    Returns:
        collections.Counter mapping acquisition type to number of matched
        files. Files that could not be read are counted under ``None``.
    """
    # TODO (carried over from the original notebook): still have to change
    # the matching from _cs.npy to .h5.
    cs_dir = Path(CS_path)
    fastmri_root = Path(fastmri_path)
    # Derived from the argument; the previous code read the global CS_PATH
    # here, which silently ignored the CS_path that was passed in.
    split_name = cs_dir.name

    h5_files = []
    knee_train_count = 0
    knee_val_count = 0
    unmatched_count = 0
    for cs_file in cs_dir.glob('*.npy'):
        h5_fname = cs_file.name.replace('_cs.npy', '.h5')
        # Decide brain vs knee on the file name only, so a parent directory
        # that happens to contain "brain" cannot misclassify knee files.
        if "brain" in cs_file.name:
            candidates = [fastmri_root / "Preprocessed" / split_name / h5_fname]
        else:
            candidates = [
                fastmri_root / "Knee" / 'multicoil_train' / h5_fname,
                fastmri_root / "Knee" / 'multicoil_val' / h5_fname,
            ]

        h5_path = next((path for path in candidates if path.exists()), None)
        if h5_path is None:
            unmatched_count += 1
            continue

        # Only count knee files that really exist; previously every knee file
        # missing from the train folder was counted as "val", found or not.
        if h5_path.parent.name == 'multicoil_train' and h5_path.parent.parent.name == 'Knee':
            knee_train_count += 1
        elif h5_path.parent.parent.name == 'Knee':
            knee_val_count += 1
        h5_files.append(h5_path)

    print(f"Found {knee_train_count} knee train files and {knee_val_count} knee val files.")
    print(f"Found {len(h5_files)} h5 files corresponding to CS files.")
    if unmatched_count:
        print(f"Skipped {unmatched_count} CS files without a matching h5 file.")

    # Nothing to read: skip spawning worker processes altogether.
    if not h5_files:
        return Counter()

    with Pool(processes=num_workers) as pool:
        acquisition_types = pool.map(get_acquisition_type, h5_files)

    return Counter(acquisition_types)

# DEFINE PARAMETERS:
FASTMRI_PATH = '/DATASERVER/MIC/SHARED/NYU_FastMRI/'
CS_PATH = '/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Preprocessed_CS/multicoil_train/'
NUM_WORKERS = 10

# Tally acquisition types for the CS subset and for the whole FastMRI train split.
acquisition_counts_CS = count_acquisitions_in_CS_folder(CS_PATH, FASTMRI_PATH, NUM_WORKERS)
acquisition_counts = count_acquisitions_in_folder(FASTMRI_PATH, NUM_WORKERS, 'train')

# Print both tallies: a header line followed by one "<type>: <count>" line per type.
for report_header, report_counts in (
    ("Acquisition counts in CS train folder:", acquisition_counts_CS),
    ("\nAcquisition counts in training folder:", acquisition_counts),
):
    print(report_header)
    for acquisition_type, count in report_counts.items():
        print(f"{acquisition_type}: {count}")

Found 821 knee train files and 0 knee val files.
Found 2667 h5 files corresponding to CS files.
Found 4458 brain files and 973 knee files in /DATASERVER/MIC/SHARED/NYU_FastMRI/.
Found 5431 files in /DATASERVER/MIC/SHARED/NYU_FastMRI/ for train set.
Acquisition counts in CS train folder:
AXT2: 1085
AXT1: 110
CORPD_FBK: 396
AXT1POST: 390
AXFLAIR: 148
CORPDFS_FBK: 425
AXT1PRE: 113

Acquisition counts in training folder:
AXT2: 2670
AXT1POST: 947
AXFLAIR: 343
AXT1PRE: 250
AXT1: 248
CORPDFS_FBK: 489
CORPD_FBK: 484


In [76]:
# NOW CHECK FOR VAL DATA
FASTMRI_PATH = '/DATASERVER/MIC/SHARED/NYU_FastMRI/'
CS_PATH = '/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Preprocessed_CS/multicoil_val/' # changed to val
NUM_WORKERS = 1

# Tally acquisition types for the CS subset and for the whole FastMRI val split.
acquisition_counts_CS = count_acquisitions_in_CS_folder(CS_PATH, FASTMRI_PATH, NUM_WORKERS)
acquisition_counts = count_acquisitions_in_folder(FASTMRI_PATH, NUM_WORKERS, 'val')

# Print both tallies: a header line followed by one "<type>: <count>" line per type.
for report_header, report_counts in (
    ("Acquisition counts in CS val folder:", acquisition_counts_CS),
    ("\nAcquisition counts in val folder:", acquisition_counts),
):
    print(report_header)
    for acquisition_type, count in report_counts.items():
        print(f"{acquisition_type}: {count}")

Found 152 knee train files and 82 knee val files.
Found 761 h5 files corresponding to CS files.
Found 1378 brain files and 199 knee files in /DATASERVER/MIC/SHARED/NYU_FastMRI/.
Found 1577 files in /DATASERVER/MIC/SHARED/NYU_FastMRI/ for val set.
Acquisition counts in CS val folder:
AXT2: 325
AXT1POST: 103
AXFLAIR: 47
AXT1: 30
CORPDFS_FBK: 111
AXT1PRE: 22
CORPD_FBK: 123

Acquisition counts in val folder:
AXFLAIR: 107
AXT2: 815
AXT1POST: 287
AXT1: 92
AXT1PRE: 77
CORPD_FBK: 100
CORPDFS_FBK: 99


In [77]:
# now check for test data

# fake like we have actual files in TEST CS folders:
def count_acquisitions_in_CS_TEST_folder(CS_path, fastmri_path, num_workers,
                                         num_knee_files=116, num_brain_files=261):
    """Count acquisition types for a stand-in of the CS test set.

    No file is read from ``CS_path``; it is only echoed in the log message.
    Instead, the last ``num_knee_files`` entries of
    ``<fastmri_path>/Knee/multicoil_val`` and the first ``num_brain_files``
    entries of ``<fastmri_path>/Preprocessed/multicoil_test`` are used.

    Args:
        CS_path: Only used in the printed message.
        fastmri_path: Root folder containing ``Preprocessed`` and ``Knee``.
        num_workers: Number of worker processes for the pool.
        num_knee_files: How many knee val files to take from the end of the
            listing (default 116, the previously hard-coded value).
        num_brain_files: How many brain test files to take from the start of
            the listing (default 261, the previously hard-coded value).

    Returns:
        collections.Counter mapping acquisition type to number of files.
        Files that could not be read are counted under ``None``.
    """
    fastmri_root = Path(fastmri_path)
    knee_files = list((fastmri_root / "Knee" / 'multicoil_val').glob('*.h5'))
    brain_files = list((fastmri_root / "Preprocessed" / "multicoil_test").glob('*.h5'))

    # NOTE(review): glob() returns files in filesystem order, so which files
    # fall into these slices is only reproducible if that order is stable.
    # A plain [-0:] slice would select everything, hence the explicit guard.
    selected_knees = knee_files[-num_knee_files:] if num_knee_files > 0 else []
    selected_brains = brain_files[:max(num_brain_files, 0)]
    h5_files = selected_knees + selected_brains

    print(f"Found {len(selected_brains)} brain files and {len(selected_knees)} knee files in {CS_path}.")
    print(f"Found {len(h5_files)} h5 test files.")

    # Nothing to read: skip spawning worker processes altogether.
    if not h5_files:
        return Counter()

    with Pool(processes=num_workers) as pool:
        acquisition_types = pool.map(get_acquisition_type, h5_files)

    return Counter(acquisition_types)

FASTMRI_PATH = '/DATASERVER/MIC/SHARED/NYU_FastMRI/'
CS_PATH = '/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Preprocessed_CS/multicoil_test/' # changed to test
NUM_WORKERS = 10

# Tally acquisition types for the emulated CS test subset and the whole FastMRI test split.
acquisition_counts_CS = count_acquisitions_in_CS_TEST_folder(CS_PATH, FASTMRI_PATH, NUM_WORKERS)
acquisition_counts = count_acquisitions_in_folder(FASTMRI_PATH, NUM_WORKERS, 'test')

# Print both tallies: a header line followed by one "<type>: <count>" line per type.
for report_header, report_counts in (
    ("Acquisition counts in CS test folder:", acquisition_counts_CS),
    ("\nAcquisition counts in test folder:", acquisition_counts),
):
    print(report_header)
    for acquisition_type, count in report_counts.items():
        print(f"{acquisition_type}: {count}")

Found 261 brain files and 116 knee files in /DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Preprocessed_CS/multicoil_test/.
Found 377 h5 test files.
Found 558 brain files and 118 knee files in /DATASERVER/MIC/SHARED/NYU_FastMRI/.
Found 676 files in /DATASERVER/MIC/SHARED/NYU_FastMRI/ for test set.
Acquisition counts in CS test folder:
CORPD_FBK: 64
CORPDFS_FBK: 52
AXT2: 161
AXT1POST: 49
AXT1PRE: 22
AXFLAIR: 19
AXT1: 10

Acquisition counts in test folder:
AXT2: 322
AXT1POST: 122
AXT1PRE: 36
AXFLAIR: 49
AXT1: 29
CORPD_FBK: 59
CORPDFS_FBK: 59


In [42]:
# The percentages seem to be OK, so even though the files look ordered in the
# file system, the listing order seems to be random: OK to take the first xx files.
# Check whether the order is random for brain by printing the acquisition type
# of the first 1847 files of the recursive listing:
brain_dir = '/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/'
h5_files = list(Path(brain_dir).glob("**/*.h5"))[:1847]
for h5_file in h5_files:
    acquisition = get_acquisition_type(h5_file)
    print(acquisition)

AXT2
AXT2
AXT1POST
AXFLAIR
AXT1PRE
AXT2
AXT2
AXT2
AXT2
AXT2
AXT2
AXT1POST
AXT2
AXT2
AXT2
AXT2
AXT2
AXT1PRE
AXT1
AXT1POST
AXT2
AXT1POST
AXT2
AXT1
AXT2
AXT1POST
AXT2
AXT2
AXT1POST
AXFLAIR
AXFLAIR
AXT1
AXT2
AXT2
AXT2
AXT2
AXT1POST
AXT2
AXT2
AXT1POST
AXT1POST
AXT2
AXT1POST
AXT2
AXT1POST
AXT2
AXT2
AXT2
AXT2
AXT2
AXT2
AXT2
AXT2
AXT2
AXT1
AXT2
AXT2
AXT2
AXT2
AXT2
AXT1POST
AXT2
AXT1POST
AXT2
AXT1POST
AXT2
AXT2
AXT1POST
AXT2
AXT2
AXT1POST
AXT1POST
AXT1POST
AXT1POST
AXT2
AXT1
AXT1
AXT2
AXT1
AXT2
AXT2
AXT2
AXT1PRE
AXT2
AXT2
AXT1POST
AXT2
AXFLAIR
AXFLAIR
AXT2
AXT2
AXT1
AXT2
AXT2
AXT2
AXT1PRE
AXT2
AXFLAIR
AXT2
AXT1PRE
AXT1PRE
AXT2
AXT1POST
AXT1
AXT1POST
AXT2
AXT2
AXT2
AXT1POST
AXT1POST
AXT2
AXT1POST
AXT2
AXT2
AXT1
AXT1POST
AXT2
AXT1POST
AXT2
AXT2
AXT1
AXT1POST
AXT2
AXT2
AXFLAIR
AXT1POST
AXT1PRE
AXT2
AXT1POST
AXT2
AXT2
AXT1PRE
AXT2
AXT1
AXT2
AXT2
AXT2
AXT1POST
AXT1POST
AXT2
AXT2
AXT2
AXT1PRE
AXT2
AXT1
AXT2
AXFLAIR
AXFLAIR
AXT2
AXT1POST
AXT1POST
AXT2
AXT1POST
AXT1POST
AXT2
AXT1
AXT2
AXT2
AXT2
AXT2
AX

In [44]:
# Check whether the order is random for knee by printing the acquisition type
# of the first 821 files of the recursive listing:
knee_dir = '/DATASERVER/MIC/SHARED/NYU_FastMRI/Knee/multicoil_train/'
h5_files = list(Path(knee_dir).glob("**/*.h5"))[:821]
for h5_file in h5_files:
    acquisition = get_acquisition_type(h5_file)
    print(acquisition)

CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPD_FBK
CORPD_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPD_FBK
CORPD_FBK
CORPD_FBK
CORPD_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPDFS_FBK
CORPD_FBK


## How many of the brain test files are R=4 and how many are R=8?

In [8]:
import os
import h5py
from collections import Counter
from multiprocessing import Pool, cpu_count
from pathlib import Path
import numpy as np


def closer_to_4_or_8(float):
    """Snap a value to whichever of 4 and 8 is nearer.

    Args:
        float: The value to classify. NOTE(review): the parameter name shadows
            the builtin ``float`` inside this function; it is kept unchanged
            so existing keyword callers keep working.

    Returns:
        ``4`` if the value is strictly closer to 4, ``8`` if it is strictly
        closer to 8, and ``None`` when neither distance is strictly smaller
        (e.g. exactly 6, or NaN).
    """
    distance_to_4 = np.abs(float - 4)
    distance_to_8 = np.abs(float - 8)

    if distance_to_4 < distance_to_8:
        return 4
    if distance_to_8 < distance_to_4:
        return 8
    # Tie or non-comparable distances: no decision.
    return None

def get_R_factor(h5_path):
    """Classify the ``mask`` dataset of an HDF5 file as R=4 or R=8.

    The ratio ``len(mask) / sum(mask)`` is computed and passed to
    ``closer_to_4_or_8``. Presumably ``mask`` is a 1-D 0/1 sampling mask so
    that the ratio is the acceleration factor; verify against the data.

    This is a best-effort reader: any exception (missing file, missing
    ``mask`` dataset, ...) is printed and ``None`` is returned.

    Args:
        h5_path: Path of the HDF5 file to inspect.

    Returns:
        Whatever ``closer_to_4_or_8`` returns for the ratio, or ``None`` on error.
    """
    try:
        with h5py.File(h5_path, 'r') as hf:
            mask = hf['mask'][()]
            ratio = float(len(mask) / np.sum(mask))
            return closer_to_4_or_8(ratio)
    except Exception as read_error:
        print(f"Error reading {h5_path}: {read_error}")
        return None

## Set variables
NUM_WORKERS = 20
BRAIN_TEST_PATH = '/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_test/'
##

# Collect the .h5 files directly inside the brain test folder.
folder = Path(BRAIN_TEST_PATH)
h5_files = list(folder.glob('*.h5'))
amount_testfiles = len(h5_files)
print(f"Found {amount_testfiles} test files in {BRAIN_TEST_PATH}.")

# Classify every file in parallel, then tally the results.
with Pool(processes=NUM_WORKERS) as pool:
    R_factors = pool.map(get_R_factor, h5_files)
R_counts = Counter(R_factors)

print("R factors in test folder brain:")
for R, count in R_counts.items():
    print(f"{R}: {count}")


Found 558 test files in /DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_test/.
R factors in test folder brain:
4: 281
8: 277


## Explore generalized properties of the data using all training files

In [26]:
from pathlib import Path

# Cluster the training data based on number of coils used
# TAKES A LONG TIME TO RUN(113min): LOAD IF POSSIBLE
# NOTE(review): the 113 min were measured when every k-space array was loaded
# in full just to read its shape; presumably much faster now - re-time.
clustered_data = {}
# Only .h5 files: '**/*' also yields directories and any other file type.
files = Path(train_path).glob('**/*.h5')

for file in files:
    # Open read-only and close the handle again; previously it was never closed.
    with h5py.File(file, 'r') as hf_loop:
        # Dataset.shape does not load the array; `[()]` would read the whole
        # k-space into memory just to look at its shape.
        shape = hf_loop['kspace'].shape
    num_coils = shape[1]
    clustered_data.setdefault(num_coils, []).append(file)


In [None]:
# Save dictionary to .npy file (the matching load cell needs allow_pickle=True
# and .item() to get the dict back).
np.save(
    "/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Results/exploration/coil_clustered_fastmri_data.npy",
    clustered_data,
)

In [2]:

# To load: allow_pickle=True is needed because the saved object is a dict, and
# .item() unwraps it again. Only load pickled files from trusted locations.
clustered_data = np.load(
    "/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Results/exploration/coil_clustered_fastmri_data.npy",
    allow_pickle=True,
).item()
# np.shape of the unwrapped dict is (), so this is only a sanity print.
print(np.shape(clustered_data))

()


In [3]:
# Report how many scans were acquired with each number of coils.
# The loop variable used to be called `list`, which rebound the builtin for
# the rest of the kernel session and broke every later `list(...)` call.
for num_coils, scan_files in dict(clustered_data).items():
    print('Number of coils:'+str(num_coils))
    print('Number of scans:'+str(len(scan_files)))

# We can see that the most common number of coils are (in descending order) 16, 4 and 20

Number of coils:4
Number of scans:1212
Number of coils:12
Number of scans:244
Number of coils:16
Number of scans:1430
Number of coils:20
Number of scans:1170
Number of coils:14
Number of scans:263
Number of coils:6
Number of scans:61
Number of coils:5
Number of scans:22
Number of coils:8
Number of scans:10
Number of coils:10
Number of scans:3
Number of coils:22
Number of scans:1
Number of coils:18
Number of scans:26
Number of coils:2
Number of scans:7
Number of coils:24
Number of scans:7
Number of coils:28
Number of scans:2


In [31]:
# Cluster the 16-coil training data based on slice dimensions, as the 16-coil data corresponds to the most scans
clustered_data_2 = {}
for file in clustered_data[16]:
    # Open read-only and close the handle again; previously it was never closed.
    with h5py.File(file, 'r') as hf_loop:
        # Dataset.shape does not load the array; `[()]` would read the whole
        # k-space into memory just to look at its shape.
        shape = hf_loop['kspace'].shape
    slice_height = shape[2]
    slice_width = shape[3]
    key = (slice_height,slice_width)
    clustered_data_2.setdefault(key, []).append(file)


In [32]:
# Save dictionary to .npy file (the matching load cell needs allow_pickle=True
# and .item() to get the dict back).
np.save(
    "/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Results/exploration/16coil_slice_size_clustered_fastmri_data.npy",
    clustered_data_2,
)

In [4]:
# To load: allow_pickle=True is needed because the saved object is a dict, and
# .item() unwraps it again. Only load pickled files from trusted locations.
clustered_data_2 = np.load(
    "/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Results/exploration/16coil_slice_size_clustered_fastmri_data.npy",
    allow_pickle=True,
).item()

In [5]:
# Report how many scans exist for each slice size.
# The loop variable used to be called `list`, which rebound the builtin for
# the rest of the kernel session and broke every later `list(...)` call.
for slice_dims, scan_files in clustered_data_2.items():
    print('Slice dimensions:'+str(slice_dims))
    print('Number of scans:'+str(len(scan_files)))


Slice dimensions:(640, 320)
Number of scans:723
Slice dimensions:(768, 396)
Number of scans:629
Slice dimensions:(640, 272)
Number of scans:25
Slice dimensions:(640, 264)
Number of scans:27
Slice dimensions:(640, 262)
Number of scans:2
Slice dimensions:(768, 324)
Number of scans:4
Slice dimensions:(512, 234)
Number of scans:4
Slice dimensions:(768, 342)
Number of scans:2
Slice dimensions:(640, 260)
Number of scans:5
Slice dimensions:(640, 312)
Number of scans:1
Slice dimensions:(512, 214)
Number of scans:3
Slice dimensions:(640, 280)
Number of scans:1
Slice dimensions:(512, 320)
Number of scans:1
Slice dimensions:(640, 274)
Number of scans:1
Slice dimensions:(512, 256)
Number of scans:1
Slice dimensions:(640, 332)
Number of scans:1


In [7]:
# Select the files with k-space size (768, 396); an absent size yields an empty list
target_size = (768, 396)
selected_files = clustered_data_2.get(target_size, [])
for selected_file in selected_files:
    print(selected_file)

/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_209_2090111.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_200_2000507.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_200_6002445.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_200_2000469.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_201_2010349.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_201_2010029.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_210_2100332.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_200_6002214.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_210_6001756.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_210_6001849.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/mu

In [35]:
# Tally the (640, 320) scans by the contrast tag found in the file path.
# These are substring tests checked in order, so AXT1PRE / AXT1POST paths are
# counted under AXT1 as well.
count_AXFLAIR = count_AXT1 = count_AXT2 = count_others = 0

for filename in map(str, clustered_data_2[(640,320)]):
    if 'AXFLAIR' in filename:
        count_AXFLAIR += 1
        continue
    if 'AXT1' in filename:
        count_AXT1 += 1
        continue
    if 'AXT2' in filename:
        count_AXT2 += 1
        continue
    count_others += 1


In [36]:
# One line per contrast counter, in the same order as before.
print(
    f"AXFLAIR: {count_AXFLAIR}",
    f"AXT1: {count_AXT1}",
    f"AXT2: {count_AXT2}",
    f"Others: {count_others}",
    sep="\n",
)


AXFLAIR: 94
AXT1: 406
AXT2: 223
Others: 0


In [37]:
# Same tally as for all (640, 320) scans, restricted to the first 70 entries.
# These are substring tests checked in order, so AXT1PRE / AXT1POST paths are
# counted under AXT1 as well.
count_AXFLAIR = count_AXT1 = count_AXT2 = count_others = 0

for filename in map(str, clustered_data_2[(640,320)][:70]):
    if 'AXFLAIR' in filename:
        count_AXFLAIR += 1
        continue
    if 'AXT1' in filename:
        count_AXT1 += 1
        continue
    if 'AXT2' in filename:
        count_AXT2 += 1
        continue
    count_others += 1

In [38]:
# One line per contrast counter, in the same order as before.
print(
    f"AXFLAIR: {count_AXFLAIR}",
    f"AXT1: {count_AXT1}",
    f"AXT2: {count_AXT2}",
    f"Others: {count_others}",
    sep="\n",
)


AXFLAIR: 10
AXT1: 44
AXT2: 16
Others: 0


In [39]:
# List the first 70 (640, 320) scans; print() applies str() to each entry itself.
first_70_scans = clustered_data_2[(640,320)][:70]
for filename in first_70_scans:
    print(filename)


/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT1POST_210_6001620.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT1PRE_205_6000021.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_202_2020162.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT1POST_202_6000281.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT1POST_200_6001969.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT1POST_205_2050055.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_202_2020467.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_205_6000061.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT2_202_2020578.h5
/DATASERVER/MIC/SHARED/NYU_FastMRI/Preprocessed/multicoil_train/file_brain_AXT1POST_205_2050233.h5
/DATASERVER/MIC/SHARED/NYU_

NOW FOR KNEES:

In [5]:
%matplotlib inline

import os
import h5py
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path

In [6]:
# Knee data root. NOTE: this rebinds download_path / train_path /
# validation_path / test_path, so every later cell sees the knee folders.
download_path = '/DATASERVER/MIC/SHARED/NYU_FastMRI/Knee'
train_path, validation_path, test_path = (
    Path(download_path) / split_folder
    for split_folder in ("multicoil_train", "multicoil_val", "multicoil_test")
)

In [12]:
from pathlib import Path
# TAKES A LONG TIME TO RUN(113min): LOAD IF POSSIBLE
# NOTE(review): the 113 min were measured when every k-space array was loaded
# in full just to read its shape; presumably much faster now - re-time.
# Only .h5 files: '**/*' also yields directories and any other file type.
files = train_path.glob('**/*.h5')
# Cluster the training data based on slice dimensions
clustered_data = {}
for file in files:
    # Open read-only and close the handle again; previously it was never closed.
    with h5py.File(file, 'r') as hf_loop:
        # Dataset.shape does not load the array; `[()]` would read the whole
        # k-space into memory just to look at its shape.
        shape = hf_loop['kspace'].shape
    slice_height = shape[2]
    slice_width = shape[3]
    key = (slice_height,slice_width)
    clustered_data.setdefault(key, []).append(file)


In [13]:
# Save dictionary to .npy file (the matching load cell needs allow_pickle=True
# and .item() to get the dict back).
np.save(
    "/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Results/exploration/KNEE_kspace_size_clustered_fastmri_data.npy",
    clustered_data,
)

In [None]:
# To load: allow_pickle=True is needed because the saved object is a dict, and
# .item() unwraps it again. Only load pickled files from trusted locations.
clustered_data = np.load(
    "/DATASERVER/MIC/GENERAL/STUDENTS/aslock2/Results/exploration/KNEE_kspace_size_clustered_fastmri_data.npy",
    allow_pickle=True,
).item()

In [14]:
# Report how many scans exist for each slice size.
# The loop variable used to be called `list`, which rebound the builtin for
# the rest of the kernel session and broke every later `list(...)` call.
for slice_dims, scan_files in clustered_data.items():
    print('Slice dimensions:'+str(slice_dims))
    print('Number of scans:'+str(len(scan_files)))

Slice dimensions:(640, 368)
Number of scans:536
Slice dimensions:(640, 320)
Number of scans:9
Slice dimensions:(640, 372)
Number of scans:378
Slice dimensions:(640, 338)
Number of scans:7
Slice dimensions:(640, 322)
Number of scans:18
Slice dimensions:(640, 388)
Number of scans:9
Slice dimensions:(640, 356)
Number of scans:3
Slice dimensions:(640, 400)
Number of scans:4
Slice dimensions:(640, 550)
Number of scans:1
Slice dimensions:(640, 386)
Number of scans:4
Slice dimensions:(640, 480)
Number of scans:1
Slice dimensions:(640, 640)
Number of scans:1
Slice dimensions:(640, 370)
Number of scans:2


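## Average number of slices per file in `validation_path` (titled "for brain", but `validation_path` was last set to the Knee folder above and the 199 files match the knee val count — verify which dataset is intended)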

In [12]:
# Read the number of slices (first k-space dimension) of every entry in
# validation_path; only the dataset shape is accessed, not the array itself.
slices_per_file = []
for filename in os.listdir(validation_path):
    file_path = os.path.join(validation_path, filename)
    with h5py.File(file_path, 'r') as hf:
        slices = hf['kspace'].shape[0]
    slices_per_file.append(slices)

file_count = len(slices_per_file)
total_slices = sum(slices_per_file)

# Calculate the average number of slices per file (0 when the folder is empty)
average_slices = total_slices / file_count if file_count > 0 else 0
print(f"Average slices per file: {average_slices:.2f}")
print(f"Total number of files: {file_count}")
print(f"Total number of slices: {total_slices}")

Average slices per file: 35.85
Total number of files: 199
Total number of slices: 7135
