In [1]:
# Import packages
import os
import shutil
import pandas as pd
from PIL import Image
import pickle

## Obtain patient data

In [2]:
# Load the per-patient diagnosis table; later cells read its 'CODI' (patient code)
# and 'DENSITAT' (diagnosis, 'NEGATIVA' for negative patients) columns.
patient = pd.read_csv('PatientDiagnosis.csv')

In [3]:
# Split patient codes by diagnosis: patients[0] -> healthy (negative), patients[1] -> infected
# NOTE(review): a row with a missing DENSITAT also compares as "not NEGATIVA" and therefore
# ends up in the infected list -- confirm that this is intended.
is_negative = patient['DENSITAT'] == 'NEGATIVA'

patients = [
    patient.loc[is_negative, 'CODI'].tolist(),
    patient.loc[~is_negative, 'CODI'].tolist(),
]

In [4]:
# Persist the [healthy, infected] code lists so other notebooks can reload them
with open('patients.pkl', 'wb') as pkl_file:
    pickle.dump(patients, pkl_file)

## Extract information from datasets

In [5]:
# Read the workbook with the patch labels (training data).
# The workbook is opened in a `with` block so its file handle is released as soon as
# every sheet has been parsed, instead of staying open for the life of the kernel.
with pd.ExcelFile('HP_WSI-CoordAnnotatedPatches.xlsx') as xl_file:
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}

df = dfs['Sheet1']
df = df.loc[df['Presence'] != 0] # Drop uncertain cases (Presence == 0)

In [6]:
def extract_data(source_folder, destination_folder, patients, test=True, data=None):
    '''
    Copy every patient image into one flat folder, encoding its metadata in the file name.

    Each patient is looked up in ``<source_folder>/<code>_0`` and, failing that, in
    ``<source_folder>/<code>_1``. Patients with neither folder are reported and skipped.
    Only files ending in .png, .jpg or .jpeg are considered.

    Parameters
    ----------
    source_folder : str
        Folder that holds one sub-folder per patient.
    destination_folder : str
        Folder that receives the renamed copies. It is created if it does not exist.
    patients : pandas.DataFrame
        Table with a 'CODI' column containing the patient codes (strings).
    test : bool, default True
        If True, every image is copied as ``<code>__<original file name>``.
        If False, only images that have a row in ``data`` are copied, named
        ``<code>__<stem>__<Presence><ext>`` where ``stem`` is the original file name
        without extension and without leading zeros ('0' if nothing is left).
    data : pandas.DataFrame, optional
        Required when ``test`` is False. Must provide the columns 'Pat_ID', 'Window_ID'
        and 'Presence'. The window id of an image is the integer at the start of its
        file name (everything before the first '_' or '.').

    Raises
    ------
    ValueError
        If ``test`` is False and ``data`` is None.
    '''
    # Fail early with a clear message instead of an AttributeError deep inside the loop
    if not test and data is None:
        raise ValueError("A dataframe `data` with the labels is required when test=False")

    image_extensions = ('.png', '.jpg', '.jpeg')

    # Step 1: Create destination folder if it doesn't exist
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
        print(f"Created folder: {destination_folder}")

    # Step 2: Loop through each patient folder
    for patient in patients['CODI'].unique():
        pat_path = os.path.join(source_folder, patient + '_0')
        if not os.path.exists(pat_path):
            pat_path = os.path.join(source_folder, patient + '_1')
            if not os.path.exists(pat_path):
                print(f"Patient folder does not exist: {pat_path}")
                continue

        # Step 3: Get list of all image files in the patient folder
        image_files = [f for f in os.listdir(pat_path) if f.endswith(image_extensions)]

        # Step 4: Rename and save each image to the destination folder

        # Test set: only the patient code is added to the file name
        if test:
            for filename in image_files:
                src_path = os.path.join(pat_path, filename)
                dst_path = os.path.join(destination_folder, f"{patient}__{filename}")
                shutil.copy(src_path, dst_path)
            continue

        # Training set: the label rows of this patient are the same for all of its
        # images, so they are selected once instead of once per file
        patient_labels = data.loc[data['Pat_ID'] == patient]

        for filename in image_files:
            # The window id is the leading number of the file name; int() already
            # ignores leading zeros ('00012' -> 12, '000' -> 0)
            window_token = filename.split('_')[0].split('.')[0]
            try:
                window_id = int(window_token)
            except ValueError:
                # A stray image without a numeric prefix must not abort the whole run
                print(f"Skipping file without a numeric window id: {os.path.join(pat_path, filename)}")
                continue

            label = patient_labels.loc[patient_labels['Window_ID'] == window_id]
            if label.empty:
                # Unlabelled windows are not part of the training set
                continue

            stem, extension = os.path.splitext(filename)
            name = stem.lstrip('0') or '0'

            new_filename = f"{patient}__{name}__{label.iloc[0]['Presence']}{extension}"

            # Copy and rename the file
            src_path = os.path.join(pat_path, filename)
            dst_path = os.path.join(destination_folder, new_filename)
            shutil.copy(src_path, dst_path)

In [None]:
# Build the flat, renamed copies of both datasets (only the training set gets labels)
extract_data(source_folder="Raw_Train_Set", destination_folder="Training_Set",
             patients=patient, test=False, data=df)
extract_data(source_folder="Raw_Test_Set", destination_folder="Test_Set",
             patients=patient, test=True)

## Read images

In [12]:
def read_images(folder, n_patients=0):
    '''
    Load the images stored in a folder together with the metadata encoded in their names.

    Files are visited in sorted name order so that the result (and, when a limit is set,
    the selection of patients) is the same on every run. Only files ending in .png, .jpg
    or .jpeg are read.

    Parameters
    ----------
    folder : str
        Folder containing images named ``<patient>__<...>``.
    n_patients : int, default 0
        Maximum number of distinct patients to load; 0 means no limit. Every image of an
        admitted patient is kept; images of patients beyond the limit are skipped.

    Returns
    -------
    images : list of dict
        One dictionary per image with two keys:
        'img'  -> one-element list holding the image converted to RGB,
        'info' -> the file name without extension, split on '__'.
    patients : list
        The distinct first 'info' fields, in the order they were first seen.
        Both lists are empty if the folder does not exist.
    '''
    # Check if the folder exists; without it there is nothing to read
    if not os.path.exists(folder):
        print(f"Folder does not exist: {folder}")
        return [], []

    images = []
    patients = []
    admitted = set()  # same content as `patients`, for constant-time membership tests

    for img in sorted(os.listdir(folder)):
        if not img.endswith(('.png', '.jpg', '.jpeg')):
            continue

        info = os.path.splitext(img)[0].split('__')  # List with the image information
        patient_code = info[0]

        if patient_code not in admitted:
            # The limit only blocks NEW patients; images of patients that were already
            # admitted keep being loaded wherever they appear in the listing
            if n_patients != 0 and len(patients) >= n_patients:
                continue
            admitted.add(patient_code)
            patients.append(patient_code)

        # convert() returns an independent copy, so the source file can be closed right away
        with Image.open(os.path.join(folder, img)) as source_img:
            rgb_img = source_img.convert("RGB")  # Image in RGB ready to be processed

        images.append({'img': [rgb_img], 'info': info})

    return images, patients

In [None]:
# Load both image sets into memory
train_imgs, train_pat = read_images(folder="Training_Set")
# The test set is capped at 20 patients to keep the pickled file at a manageable size
test_imgs, test_pat = read_images(folder="Test_Set", n_patients=20)

n_train_imgs, n_test_imgs = len(train_imgs), len(test_imgs)
print(n_train_imgs, n_test_imgs)

2508 11636


In [16]:
# Inspect one training sample. 'info' is the file name split on '__': patient code,
# image name and label, where the label is
# 1 -> has bacteria // -1 -> does not have bacteria
train_imgs[69] 

{'img': [<PIL.Image.Image image mode=RGB size=256x256>],
 'info': ['B22-114', '112_Aug7', '1']}

In [17]:
# Inspect one test sample: test images carry no label, so 'info' only holds the
# patient code and the original file name (without extension)
test_imgs[69]

{'img': [<PIL.Image.Image image mode=RGB size=256x256>],
 'info': ['B22-01', '163']}

In [18]:
# Persist both image lists, one pickle file per set
for pkl_name, payload in (('train_imgs.pkl', train_imgs), ('test_imgs.pkl', test_imgs)):
    with open(pkl_name, 'wb') as pkl_file:
        pickle.dump(payload, pkl_file)