# Libraries

In [43]:
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Utilities

In [45]:
def get_scan_paths():
    #for MAC
    slices_path = r'/Volumes/fsmresfiles/Ophthalmology/Mirza_Images/AMD/dAMD_GA/all_slices_3'
    slices = os.listdir(slices_path)

    # For windows
    # slices_path = r'\\fsmresfiles.fsm.northwestern.edu\fsmresfiles\Ophthalmology\Mirza_Images\AMD\dAMD_GA\all_slices_3'
    # slices = os.listdir(all_slices_path)

    scan_paths = [item for item in slices if item.endswith('.jpg')]
    return scan_paths

In [46]:
def create_model_df(scan_paths):

    landmark_path = '/Users/shubhigupta/Documents/GitHub/OCT-GA-Detection/FoveaProgression/data/fovea_landmarks.csv'
    data_landmarks = pd.read_csv(landmark_path)

    scan_metadata_path = '/Users/shubhigupta/Documents/GitHub/OCT-GA-Detection/FoveaProgression/data/fovea_detection_data.csv'
    data_metadata = pd.read_csv(scan_metadata_path)

    filtered_data = data_metadata[data_metadata['scan_name'].isin(scan_paths)]
    
    filtered_data['label'] = filtered_data['status'].astype(int)

    sorted_filtered_data = filtered_data.sort_values(by=['patient_id','folder_name','scan_number'])

    return data_landmarks, sorted_filtered_data


In [47]:
def assign_sequence_threshold(landmarks_mapping,scan_number, patient_id, folder_name):
    # Retrieve the landmarks for the given patient and folder
    landmarks = landmarks_mapping.get((patient_id, folder_name), None)
    if landmarks is None:
        return 'unknown', -1  # Handle cases where no mapping is found

    if scan_number < landmarks['start'] or scan_number > landmarks['end']:
        return 'no fovea', 0
    elif landmarks['start'] <= scan_number <= landmarks['start_threshold']:
        return 'progression', 1
    elif landmarks['end_threshold'] <= scan_number <= landmarks['end']:
        return 'progression', 1
    else:
        return 'deepest', 2

In [48]:
def landmark_labels(fovea_landmarks,fovea_detection_data):

    # Initialize a dictionary to hold the landmarks for each patient and folder
    landmarks_mapping = {}
    for index, row in fovea_landmarks.iterrows():
        key = (row['patient_id'], row['folder_name'])
        landmarks_mapping[key] = {
            'start': row['start'],
            'deepest': row['deepest'],
            'end': row['end'],
            'start_threshold': row['deepest'] - (row['deepest'] - row['start'])/4,
            'end_threshold': row['deepest'] + (row['end'] - row['deepest'])/4
        }

    # Apply the function to each row
    results = fovea_detection_data.apply(lambda row: assign_sequence_threshold(landmarks_mapping,row['scan_number'], row['patient_id'], row['folder_name']), axis=1)
    fovea_detection_data['sequence'], fovea_detection_data['sequence_label'] = zip(*results)
    
    
    return fovea_detection_data

In [49]:
# Function to resample patient data
def resample_patient_data(patient_data, max_appointments):
    appointment_folders = patient_data['folder_name'].unique()
    resampled_folders = np.random.choice(appointment_folders, max_appointments, replace=True)
    resampled_data = pd.concat([patient_data[patient_data['folder_name'] == folder] for folder in resampled_folders])
    return resampled_data

In [50]:
# Function to resample patient data with renaming
def resample_patient_data(patient_data, max_appointments):
    appointment_folders = patient_data['folder_name'].unique()
    resampled_folders = np.random.choice(appointment_folders, max_appointments, replace=True)
    resampled_data = []
    for i, folder in enumerate(resampled_folders):
        folder_data = patient_data[patient_data['folder_name'] == folder].copy()
        if i >= len(appointment_folders):
            folder_data['folder_name'] = f"{folder}_resample_{i - len(appointment_folders) + 1}"
        resampled_data.append(folder_data)
    return pd.concat(resampled_data)

In [51]:
# Function to print count of patients, appointment folders, and scan counts
def print_set_details(df_set, set_name):
    patient_count = df_set['patient_id'].nunique()
    print(f"{set_name} Set: {patient_count} patients")
    for patient_id in df_set['patient_id'].unique():
        patient_data = df_set[df_set['patient_id'] == patient_id]
        appointment_folders = patient_data['folder_name'].unique()
        print(f"Patient ID: {patient_id}, Appointments: {len(appointment_folders)}")
        for folder in appointment_folders:
            scan_count = len(patient_data[patient_data['folder_name'] == folder])
            print(f"\tAppointment Folder: {folder}, Scan Count: {scan_count}")


# Pre-processing

In [53]:
scan_paths = get_scan_paths()
df_landmarks, df_metadata = create_model_df(scan_paths)
df_landmarks.info()
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   patient_id     41 non-null     int64 
 1   folder_name    41 non-null     object
 2   start          41 non-null     int64 
 3   deepest        41 non-null     int64 
 4   end            41 non-null     int64 
 5   start_slice    41 non-null     object
 6   deepest_slice  41 non-null     object
 7   end_slice      41 non-null     object
dtypes: int64(4), object(4)
memory usage: 2.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 21500 entries, 10202 to 12453
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   scan_name    21500 non-null  object
 1   scan_number  21500 non-null  int64 
 2   folder_name  21500 non-null  object
 3   patient_id   21500 non-null  int64 
 4   eye_side     21500 non-null  object
 5   status       21500 

In [54]:
df_landmarks.head()

Unnamed: 0,patient_id,folder_name,start,deepest,end,start_slice,deepest_slice,end_slice
0,47,047_OS_1_GA_6x6_SSOCT_1,198,253,355,047_OS_1_GA_6x6_SSOCT_1_198.jpg,047_OS_1_GA_6x6_SSOCT_1_253.jpg,047_OS_1_GA_6x6_SSOCT_1_355.jpg
1,47,047_OS_2_GA_6x6_SSOCT_1,194,258,368,047_OS_2_GA_6x6_SSOCT_1_194.jpg,047_OS_2_GA_6x6_SSOCT_1_258.jpg,047_OS_2_GA_6x6_SSOCT_1_368.jpg
2,64,064_OD_1_GA_6x6_SSOCT_1,209,287,369,064_OD_1_GA_6x6_SSOCT_1_209.jpg,064_OD_1_GA_6x6_SSOCT_1_287.jpg,064_OD_1_GA_6x6_SSOCT_1_369.jpg
3,64,064_OD_1_GA_6x6_SSOCT_2,215,289,368,064_OD_1_GA_6x6_SSOCT_2_215.jpg,064_OD_1_GA_6x6_SSOCT_2_289.jpg,064_OD_1_GA_6x6_SSOCT_2_368.jpg
4,64,064_OD_10_GA_6x6_SSOCT_1,155,258,348,064_OD_10_GA_6x6_SSOCT_1_155.jpg,064_OD_10_GA_6x6_SSOCT_1_258.jpg,064_OD_10_GA_6x6_SSOCT_1_348.jpg


In [55]:
df_metadata.head()

Unnamed: 0,scan_name,scan_number,folder_name,patient_id,eye_side,status,label
10202,047_OS_1_GA_6x6_SSOCT_1_1.jpg,1,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10439,047_OS_1_GA_6x6_SSOCT_1_2.jpg,2,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10021,047_OS_1_GA_6x6_SSOCT_1_3.jpg,3,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10301,047_OS_1_GA_6x6_SSOCT_1_4.jpg,4,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10164,047_OS_1_GA_6x6_SSOCT_1_5.jpg,5,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0


In [56]:
df_model = landmark_labels(df_landmarks, df_metadata)

In [57]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21500 entries, 10202 to 12453
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       21500 non-null  object
 1   scan_number     21500 non-null  int64 
 2   folder_name     21500 non-null  object
 3   patient_id      21500 non-null  int64 
 4   eye_side        21500 non-null  object
 5   status          21500 non-null  bool  
 6   label           21500 non-null  int64 
 7   sequence        21500 non-null  object
 8   sequence_label  21500 non-null  int64 
dtypes: bool(1), int64(4), object(4)
memory usage: 1.5+ MB


In [58]:
scan_img_path = r'/Volumes/fsmresfiles/Ophthalmology/Mirza_Images/AMD/dAMD_GA/all_slices_3'
df_model['image_path'] = df_model.apply(lambda row: os.path.join(scan_img_path, row['scan_name']), axis=1)

In [59]:
df_model.head()

Unnamed: 0,scan_name,scan_number,folder_name,patient_id,eye_side,status,label,sequence,sequence_label,image_path
10202,047_OS_1_GA_6x6_SSOCT_1_1.jpg,1,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10439,047_OS_1_GA_6x6_SSOCT_1_2.jpg,2,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10021,047_OS_1_GA_6x6_SSOCT_1_3.jpg,3,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10301,047_OS_1_GA_6x6_SSOCT_1_4.jpg,4,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10164,047_OS_1_GA_6x6_SSOCT_1_5.jpg,5,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...


In [36]:
df_model.to_csv('../data/fovea_progression_dataset.csv', index=False)

In [60]:
unknown_status_df = df_model[df_model['sequence'] == 'unknown']
unique_patient_ids = unknown_status_df['folder_name'].unique()

print(unique_patient_ids.tolist())

['190_OS_1_GA_6x6_SSOCT_1', '345_OS_1_GA_6x6_SSOCT_1']


In [61]:
df_model_ex_unknown = df_model[df_model['sequence']!= 'unknown']

In [None]:
# Write the filtered DataFrame to a CSV file
df_model_ex_unknown.to_csv('../data/fovea_progression_dataset_excluding_unknowns.csv', index=False)

# Generating Datasets


## Creating Sample dataset

In [77]:
df_sample_train = df_model_ex_unknown[df_model_ex_unknown['folder_name'] == '047_OS_1_GA_6x6_SSOCT_1']
df_sample_train.to_csv('../data/sample/train.csv')

df_sample_test = df_model_ex_unknown[df_model_ex_unknown['folder_name'] == '064_OD_1_GA_6x6_SSOCT_2']
df_sample_test.to_csv('../data/sample/test.csv')

df_sample_val = df_model_ex_unknown[df_model_ex_unknown['folder_name'] == '321_OD_1_GA_6x6_SSOCT_1']
df_sample_val.to_csv('../data/sample/val.csv')

## Creating Model dataset

### Random Shuffle

In [70]:
shuffled_folders = df_landmarks.sample(frac=1, random_state=42)['folder_name'].tolist()
# Determine split indices
train_size = int(0.8 * len(shuffled_folders))
val_size = int(0.15 * len(shuffled_folders))
test_size = len(shuffled_folders) - train_size - val_size

print(train_size)
print(val_size)
print(test_size)


32
6
3


In [71]:
# Split the folder names into train, test, val
train_folders = shuffled_folders[:train_size]
val_folders = shuffled_folders[train_size:train_size + val_size]
test_folders = shuffled_folders[train_size + val_size:]

# Filter the main DataFrame based on the splits
train_scans = df_model_ex_unknown[df_model_ex_unknown['folder_name'].isin(train_folders)]
val_scans = df_model_ex_unknown[df_model_ex_unknown['folder_name'].isin(val_folders)]
test_scans = df_model_ex_unknown[df_model_ex_unknown['folder_name'].isin(test_folders)]


In [72]:
print("Train set:")
print(train_scans.info())

Train set:
<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 10202 to 12453
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       16000 non-null  object
 1   scan_number     16000 non-null  int64 
 2   folder_name     16000 non-null  object
 3   patient_id      16000 non-null  int64 
 4   eye_side        16000 non-null  object
 5   status          16000 non-null  bool  
 6   label           16000 non-null  int64 
 7   sequence        16000 non-null  object
 8   sequence_label  16000 non-null  int64 
 9   image_path      16000 non-null  object
dtypes: bool(1), int64(4), object(5)
memory usage: 1.2+ MB
None


In [73]:
print("\nValidation set:")
print(val_scans.info())


Validation set:
<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 13270 to 8680
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       3000 non-null   object
 1   scan_number     3000 non-null   int64 
 2   folder_name     3000 non-null   object
 3   patient_id      3000 non-null   int64 
 4   eye_side        3000 non-null   object
 5   status          3000 non-null   bool  
 6   label           3000 non-null   int64 
 7   sequence        3000 non-null   object
 8   sequence_label  3000 non-null   int64 
 9   image_path      3000 non-null   object
dtypes: bool(1), int64(4), object(5)
memory usage: 237.3+ KB
None


In [74]:
print("\nTest set:")
print(test_scans.info())


Test set:
<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 1331 to 14580
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       1500 non-null   object
 1   scan_number     1500 non-null   int64 
 2   folder_name     1500 non-null   object
 3   patient_id      1500 non-null   int64 
 4   eye_side        1500 non-null   object
 5   status          1500 non-null   bool  
 6   label           1500 non-null   int64 
 7   sequence        1500 non-null   object
 8   sequence_label  1500 non-null   int64 
 9   image_path      1500 non-null   object
dtypes: bool(1), int64(4), object(5)
memory usage: 118.7+ KB
None


In [75]:
train_folders_set = set(train_scans['folder_name'])
val_folders_set = set(val_scans['folder_name'])
test_folders_set = set(test_scans['folder_name'])

# Check for overlaps
train_val_overlap = train_folders_set.intersection(val_folders_set)
train_test_overlap = train_folders_set.intersection(test_folders_set)
val_test_overlap = val_folders_set.intersection(test_folders_set)

# Print results
print("Overlap between train and val sets:", train_val_overlap)
print("Overlap between train and test sets:", train_test_overlap)
print("Overlap between val and test sets:", val_test_overlap)

# Verify if there are any overlaps
if not train_val_overlap and not train_test_overlap and not val_test_overlap:
    print("No overlaps found between the datasets. The split is valid.")
else:
    print("Overlaps found! Please check the dataset splits.")

Overlap between train and val sets: set()
Overlap between train and test sets: set()
Overlap between val and test sets: set()
No overlaps found between the datasets. The split is valid.


In [76]:
train_scans.to_csv('../data/model/train.csv')
val_scans.to_csv('../data/model/val.csv')
test_scans.to_csv('../data/model/test.csv')

### Split by Patient ID and Resample

In [23]:
# Step 1: Resample the data
# Get the number of appointments per patient
appointments_per_patient = df_model_ex_unknown.groupby('patient_id')['folder_name'].nunique()
max_appointments = appointments_per_patient.max()
print(max_appointments)

23


In [41]:
# Apply resampling to each patient
df_resampled = pd.concat([resample_patient_data(df_model_ex_unknown[df_model_ex_unknown['patient_id'] == patient_id], max_appointments)
                          for patient_id in df_model_ex_unknown['patient_id'].unique()])

In [42]:
# Step 2: Split the data into train, val, test sets (80/10/10)
patient_ids = df_resampled['patient_id'].unique()
train_ids, test_ids = train_test_split(patient_ids, test_size=0.2)
train_ids, val_ids = train_test_split(train_ids, test_size=0.2)
print(train_ids)
print(test_ids)
print(val_ids)

[578 321  64  47]
[190 341]
[345 326]


In [43]:
# Split the resampled DataFrame into train, val, test sets
df_train = df_resampled[df_resampled['patient_id'].isin(train_ids)]
df_val = df_resampled[df_resampled['patient_id'].isin(val_ids)]
df_test = df_resampled[df_resampled['patient_id'].isin(test_ids)]
display(df_train.head())
display(df_val.head())
display(df_test.head())


Unnamed: 0,scan_name,scan_number,folder_name,patient_id,eye_side,status,label,sequence,sequence_label,image_path
10202,047_OS_1_GA_6x6_SSOCT_1_1.jpg,1,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10439,047_OS_1_GA_6x6_SSOCT_1_2.jpg,2,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10021,047_OS_1_GA_6x6_SSOCT_1_3.jpg,3,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10301,047_OS_1_GA_6x6_SSOCT_1_4.jpg,4,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10164,047_OS_1_GA_6x6_SSOCT_1_5.jpg,5,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...


Unnamed: 0,scan_name,scan_number,folder_name,patient_id,eye_side,status,label,sequence,sequence_label,image_path
16096,326_OD_1_GA_6x6_SSOCT_1_1.jpg,1,326_OD_1_GA_6x6_SSOCT_1,326,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
16298,326_OD_1_GA_6x6_SSOCT_1_2.jpg,2,326_OD_1_GA_6x6_SSOCT_1,326,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
16256,326_OD_1_GA_6x6_SSOCT_1_3.jpg,3,326_OD_1_GA_6x6_SSOCT_1,326,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
16061,326_OD_1_GA_6x6_SSOCT_1_4.jpg,4,326_OD_1_GA_6x6_SSOCT_1,326,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
16097,326_OD_1_GA_6x6_SSOCT_1_5.jpg,5,326_OD_1_GA_6x6_SSOCT_1,326,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...


Unnamed: 0,scan_name,scan_number,folder_name,patient_id,eye_side,status,label,sequence,sequence_label,image_path
4521,190_OD_1_GA_6x6_SSOCT_1_1.jpg,1,190_OD_1_GA_6x6_SSOCT_1,190,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
4528,190_OD_1_GA_6x6_SSOCT_1_2.jpg,2,190_OD_1_GA_6x6_SSOCT_1,190,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
4942,190_OD_1_GA_6x6_SSOCT_1_3.jpg,3,190_OD_1_GA_6x6_SSOCT_1,190,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
4688,190_OD_1_GA_6x6_SSOCT_1_4.jpg,4,190_OD_1_GA_6x6_SSOCT_1,190,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
4995,190_OD_1_GA_6x6_SSOCT_1_5.jpg,5,190_OD_1_GA_6x6_SSOCT_1,190,right,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...


In [44]:
# Print details for each set
print_set_details(df_train, "Training")

Training Set: 4 patients
Patient ID: 47, Appointments: 23
	Appointment Folder: 047_OS_1_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_1, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_2, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_3, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_4, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_5, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_6, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_7, Scan Count: 500
	Appointment Folder: 047_OS_1_GA_6x6_SSOCT_1_resample_8, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_9, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_10, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1_resample_11, Scan Count: 500
	Appointment 

In [45]:
print_set_details(df_val, "Validation")

Validation Set: 2 patients
Patient ID: 326, Appointments: 23
	Appointment Folder: 326_OD_1_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 326_OD_3_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 326_OD_2_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 326_OD_3_GA_6x6_SSOCT_1_resample_1, Scan Count: 500
	Appointment Folder: 326_OD_2_GA_6x6_SSOCT_1_resample_2, Scan Count: 500
	Appointment Folder: 326_OD_3_GA_6x6_SSOCT_1_resample_3, Scan Count: 500
	Appointment Folder: 326_OD_2_GA_6x6_SSOCT_1_resample_4, Scan Count: 500
	Appointment Folder: 326_OD_3_GA_6x6_SSOCT_1_resample_5, Scan Count: 500
	Appointment Folder: 326_OD_2_GA_6x6_SSOCT_1_resample_6, Scan Count: 500
	Appointment Folder: 326_OD_3_GA_6x6_SSOCT_1_resample_7, Scan Count: 500
	Appointment Folder: 326_OD_2_GA_6x6_SSOCT_1_resample_8, Scan Count: 500
	Appointment Folder: 326_OD_3_GA_6x6_SSOCT_1_resample_9, Scan Count: 500
	Appointment Folder: 326_OD_3_GA_6x6_SSOCT_1_resample_10, Scan Count: 500
	Appointment Folder: 3

In [46]:
print_set_details(df_test, "Test")

Test Set: 2 patients
Patient ID: 190, Appointments: 23
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_1, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_2, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_3, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_4, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_5, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_6, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_7, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_8, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_9, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_10, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_11, Scan Count: 500
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1_resample_12, Scan Count: 500
	App

In [47]:
# Save the split dataframes for future use
df_train.to_csv('../data/model/02/train.csv', index=False)
df_val.to_csv('../data/model/02/val.csv', index=False)
df_test.to_csv('../data/model/02/test.csv', index=False)

### Model X - Split by Patient ID 

Validation Set: 345, 578,47
Test Set: 190, 341

In [35]:
# # Step 1: Limit Patient 64 to 4 Appointments
# patient_64_data = df_model_ex_unknown[df_model_ex_unknown['patient_id'] == 64]
# patient_64_appointments = patient_64_data['folder_name'].unique()
# selected_appointments = np.random.choice(patient_64_appointments, 6, replace=False)
# print(selected_appointments)

['064_OD_5_GA_6x6_SSOCT_2' '064_OS_4_GA_6x6_SSOCT_1'
 '064_OD_8_GA_6x6_SSOCT_1' '064_OS_2_GA_6x6_SSOCT_1'
 '064_OD_7_GA_6x6_SSOCT_3' '064_OD_1_GA_6x6_SSOCT_1']


In [36]:
# # Create a new DataFrame with the modified patient 64 data
# df_patient_64_limited = patient_64_data[patient_64_data['folder_name'].isin(selected_appointments)]
# # Remove original patient 64 data from the main DataFrame and add the limited data
# df_model_ex_unknown_limited = df_model_ex_unknown[df_model_ex_unknown['patient_id'] != 64]
# df_model_ex_unknown_limited = pd.concat([df_model_ex_unknown_limited, df_patient_64_limited])

In [67]:
# Step 2: Define patient IDs for validation and test sets
val_patient_ids = [345, 47]
test_patient_ids = [190, 341]

In [68]:
# Step 3: Split the DataFrame into train, val, test sets
df_train = df_model_ex_unknown[
    ~df_model_ex_unknown['patient_id'].isin(val_patient_ids + test_patient_ids)]
df_val = df_model_ex_unknown[df_model_ex_unknown['patient_id'].isin(val_patient_ids)]
df_test = df_model_ex_unknown[df_model_ex_unknown['patient_id'].isin(test_patient_ids)]

In [69]:
# Print details for each set
print_set_details(df_train, "Training")

Training Set: 4 patients
Patient ID: 64, Appointments: 23
	Appointment Folder: 064_OD_10_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_1_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_1_GA_6x6_SSOCT_2, Scan Count: 500
	Appointment Folder: 064_OD_2_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_3_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_4_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_5_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_5_GA_6x6_SSOCT_2, Scan Count: 500
	Appointment Folder: 064_OD_6_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_6_GA_6x6_SSOCT_2, Scan Count: 500
	Appointment Folder: 064_OD_7_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_7_GA_6x6_SSOCT_2, Scan Count: 500
	Appointment Folder: 064_OD_7_GA_6x6_SSOCT_3, Scan Count: 500
	Appointment Folder: 064_OD_8_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 064_OD_9_GA_6x6_SSOCT_1, Scan Count: 500
	Appointmen

In [70]:
print_set_details(df_val, "Validation")

Validation Set: 2 patients
Patient ID: 47, Appointments: 2
	Appointment Folder: 047_OS_1_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 047_OS_2_GA_6x6_SSOCT_1, Scan Count: 500
Patient ID: 345, Appointments: 5
	Appointment Folder: 345_OD_1_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 345_OD_2_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 345_OD_3_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 345_OS_2_GA_6x6_SSOCT_1, Scan Count: 500
	Appointment Folder: 345_OS_3_GA_6x6_SSOCT_1, Scan Count: 500


In [71]:
print_set_details(df_test, "Test")

Test Set: 2 patients
Patient ID: 190, Appointments: 1
	Appointment Folder: 190_OD_1_GA_6x6_SSOCT_1, Scan Count: 500
Patient ID: 341, Appointments: 1
	Appointment Folder: 341_OS_1_GA_6x6_SSOCT_1, Scan Count: 500


In [73]:
# Save the split dataframes for future use
df_train.to_csv('../data/experiment/X/train.csv', index=False)
df_val.to_csv('../data/experiment/X/val.csv', index=False)
df_test.to_csv('../data/experiment/X/test.csv', index=False)