# Libraries

In [1]:
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities

In [3]:
def get_scan_paths():
    #for MAC
    slices_path = r'/Volumes/fsmresfiles/Ophthalmology/Mirza_Images/AMD/dAMD_GA/all_slices_3'
    slices = os.listdir(slices_path)

    # For windows
    # slices_path = r'\\fsmresfiles.fsm.northwestern.edu\fsmresfiles\Ophthalmology\Mirza_Images\AMD\dAMD_GA\all_slices_3'
    # slices = os.listdir(all_slices_path)

    scan_paths = [item for item in slices if item.endswith('.jpg')]
    return scan_paths

In [7]:
def create_model_df(scan_paths):

    landmark_path = '/Users/shubhigupta/Documents/GitHub/OCT-GA-Detection/FoveaProgression/data/fovea_landmarks.csv'
    data_landmarks = pd.read_csv(landmark_path)

    scan_metadata_path = '/Users/shubhigupta/Documents/GitHub/OCT-GA-Detection/FoveaProgression/data/fovea_detection_data.csv'
    data_metadata = pd.read_csv(scan_metadata_path)

    filtered_data = data_metadata[data_metadata['scan_name'].isin(scan_paths)]
    
    filtered_data['label'] = filtered_data['status'].astype(int)

    sorted_filtered_data = filtered_data.sort_values(by=['patient_id','folder_name','scan_number'])

    return data_landmarks, sorted_filtered_data


In [16]:
def assign_sequence_threshold(landmarks_mapping,scan_number, patient_id, folder_name):
    # Retrieve the landmarks for the given patient and folder
    landmarks = landmarks_mapping.get((patient_id, folder_name), None)
    if landmarks is None:
        return 'unknown', -1  # Handle cases where no mapping is found

    if scan_number < landmarks['start'] or scan_number > landmarks['end']:
        return 'no fovea', 0
    elif landmarks['start'] <= scan_number <= landmarks['start_threshold']:
        return 'progression', 1
    elif landmarks['end_threshold'] <= scan_number <= landmarks['end']:
        return 'progression', 1
    else:
        return 'deepest', 2

In [26]:
def landmark_labels(fovea_landmarks,fovea_detection_data):

    # Initialize a dictionary to hold the landmarks for each patient and folder
    landmarks_mapping = {}
    for index, row in fovea_landmarks.iterrows():
        key = (row['patient_id'], row['folder_name'])
        landmarks_mapping[key] = {
            'start': row['start'],
            'deepest': row['deepest'],
            'end': row['end'],
            'start_threshold': row['deepest'] - (row['deepest'] - row['start'])/4,
            'end_threshold': row['deepest'] + (row['end'] - row['deepest'])/4
        }

    # Apply the function to each row
    results = fovea_detection_data.apply(lambda row: assign_sequence_threshold(landmarks_mapping,row['scan_number'], row['patient_id'], row['folder_name']), axis=1)
    fovea_detection_data['sequence'], fovea_detection_data['sequence_label'] = zip(*results)
    
    
    return fovea_detection_data

# Pre-processing

In [29]:
scan_paths = get_scan_paths()
df_landmarks, df_metadata = create_model_df(scan_paths)
df_landmarks.info()
df_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   patient_id     41 non-null     int64 
 1   folder_name    41 non-null     object
 2   start          41 non-null     int64 
 3   deepest        41 non-null     int64 
 4   end            41 non-null     int64 
 5   start_slice    41 non-null     object
 6   deepest_slice  41 non-null     object
 7   end_slice      41 non-null     object
dtypes: int64(4), object(4)
memory usage: 2.7+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 21500 entries, 10202 to 12453
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   scan_name    21500 non-null  object
 1   scan_number  21500 non-null  int64 
 2   folder_name  21500 non-null  object
 3   patient_id   21500 non-null  int64 
 4   eye_side     21500 non-null  object
 5   status       21500 

In [30]:
df_landmarks.head()

Unnamed: 0,patient_id,folder_name,start,deepest,end,start_slice,deepest_slice,end_slice
0,47,047_OS_1_GA_6x6_SSOCT_1,198,253,355,047_OS_1_GA_6x6_SSOCT_1_198.jpg,047_OS_1_GA_6x6_SSOCT_1_253.jpg,047_OS_1_GA_6x6_SSOCT_1_355.jpg
1,47,047_OS_2_GA_6x6_SSOCT_1,194,258,368,047_OS_2_GA_6x6_SSOCT_1_194.jpg,047_OS_2_GA_6x6_SSOCT_1_258.jpg,047_OS_2_GA_6x6_SSOCT_1_368.jpg
2,64,064_OD_1_GA_6x6_SSOCT_1,209,287,369,064_OD_1_GA_6x6_SSOCT_1_209.jpg,064_OD_1_GA_6x6_SSOCT_1_287.jpg,064_OD_1_GA_6x6_SSOCT_1_369.jpg
3,64,064_OD_1_GA_6x6_SSOCT_2,215,289,368,064_OD_1_GA_6x6_SSOCT_2_215.jpg,064_OD_1_GA_6x6_SSOCT_2_289.jpg,064_OD_1_GA_6x6_SSOCT_2_368.jpg
4,64,064_OD_10_GA_6x6_SSOCT_1,155,258,348,064_OD_10_GA_6x6_SSOCT_1_155.jpg,064_OD_10_GA_6x6_SSOCT_1_258.jpg,064_OD_10_GA_6x6_SSOCT_1_348.jpg


In [31]:
df_metadata.head()

Unnamed: 0,scan_name,scan_number,folder_name,patient_id,eye_side,status,label
10202,047_OS_1_GA_6x6_SSOCT_1_1.jpg,1,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10439,047_OS_1_GA_6x6_SSOCT_1_2.jpg,2,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10021,047_OS_1_GA_6x6_SSOCT_1_3.jpg,3,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10301,047_OS_1_GA_6x6_SSOCT_1_4.jpg,4,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0
10164,047_OS_1_GA_6x6_SSOCT_1_5.jpg,5,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0


In [32]:
df_model = landmark_labels(df_landmarks, df_metadata)

In [33]:
df_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21500 entries, 10202 to 12453
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       21500 non-null  object
 1   scan_number     21500 non-null  int64 
 2   folder_name     21500 non-null  object
 3   patient_id      21500 non-null  int64 
 4   eye_side        21500 non-null  object
 5   status          21500 non-null  bool  
 6   label           21500 non-null  int64 
 7   sequence        21500 non-null  object
 8   sequence_label  21500 non-null  int64 
dtypes: bool(1), int64(4), object(4)
memory usage: 1.5+ MB


In [34]:
scan_img_path = r'/Volumes/fsmresfiles/Ophthalmology/Mirza_Images/AMD/dAMD_GA/all_slices_3'
df_model['image_path'] = df_model.apply(lambda row: os.path.join(scan_img_path, row['scan_name']), axis=1)

In [35]:
df_model.head()

Unnamed: 0,scan_name,scan_number,folder_name,patient_id,eye_side,status,label,sequence,sequence_label,image_path
10202,047_OS_1_GA_6x6_SSOCT_1_1.jpg,1,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10439,047_OS_1_GA_6x6_SSOCT_1_2.jpg,2,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10021,047_OS_1_GA_6x6_SSOCT_1_3.jpg,3,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10301,047_OS_1_GA_6x6_SSOCT_1_4.jpg,4,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...
10164,047_OS_1_GA_6x6_SSOCT_1_5.jpg,5,047_OS_1_GA_6x6_SSOCT_1,47,left,False,0,no fovea,0,/Volumes/fsmresfiles/Ophthalmology/Mirza_Image...


In [36]:
df_model.to_csv('../data/fovea_progression_dataset.csv', index=False)

In [37]:
unknown_status_df = df_model[df_model['sequence'] == 'unknown']
unique_patient_ids = unknown_status_df['folder_name'].unique()

print(unique_patient_ids.tolist())

['190_OS_1_GA_6x6_SSOCT_1', '345_OS_1_GA_6x6_SSOCT_1']


In [38]:
df_model_ex_unknown = df_model[df_model['sequence']!= 'unknown']

# Write the filtered DataFrame to a CSV file
df_model_ex_unknown.to_csv('../data/fovea_progression_dataset_excluding_unknowns.csv', index=False)

# Generating Datasets


## Creating Sample dataset

In [77]:
df_sample_train = df_model_ex_unknown[df_model_ex_unknown['folder_name'] == '047_OS_1_GA_6x6_SSOCT_1']
df_sample_train.to_csv('../data/sample/train.csv')

df_sample_test = df_model_ex_unknown[df_model_ex_unknown['folder_name'] == '064_OD_1_GA_6x6_SSOCT_2']
df_sample_test.to_csv('../data/sample/test.csv')

df_sample_val = df_model_ex_unknown[df_model_ex_unknown['folder_name'] == '321_OD_1_GA_6x6_SSOCT_1']
df_sample_val.to_csv('../data/sample/val.csv')

## Creating Model dataset

In [70]:
shuffled_folders = df_landmarks.sample(frac=1, random_state=42)['folder_name'].tolist()
# Determine split indices
train_size = int(0.8 * len(shuffled_folders))
val_size = int(0.15 * len(shuffled_folders))
test_size = len(shuffled_folders) - train_size - val_size

print(train_size)
print(val_size)
print(test_size)


32
6
3


In [71]:
# Split the folder names into train, test, val
train_folders = shuffled_folders[:train_size]
val_folders = shuffled_folders[train_size:train_size + val_size]
test_folders = shuffled_folders[train_size + val_size:]

# Filter the main DataFrame based on the splits
train_scans = df_model_ex_unknown[df_model_ex_unknown['folder_name'].isin(train_folders)]
val_scans = df_model_ex_unknown[df_model_ex_unknown['folder_name'].isin(val_folders)]
test_scans = df_model_ex_unknown[df_model_ex_unknown['folder_name'].isin(test_folders)]


In [72]:
print("Train set:")
print(train_scans.info())

Train set:
<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 10202 to 12453
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       16000 non-null  object
 1   scan_number     16000 non-null  int64 
 2   folder_name     16000 non-null  object
 3   patient_id      16000 non-null  int64 
 4   eye_side        16000 non-null  object
 5   status          16000 non-null  bool  
 6   label           16000 non-null  int64 
 7   sequence        16000 non-null  object
 8   sequence_label  16000 non-null  int64 
 9   image_path      16000 non-null  object
dtypes: bool(1), int64(4), object(5)
memory usage: 1.2+ MB
None


In [73]:
print("\nValidation set:")
print(val_scans.info())


Validation set:
<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 13270 to 8680
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       3000 non-null   object
 1   scan_number     3000 non-null   int64 
 2   folder_name     3000 non-null   object
 3   patient_id      3000 non-null   int64 
 4   eye_side        3000 non-null   object
 5   status          3000 non-null   bool  
 6   label           3000 non-null   int64 
 7   sequence        3000 non-null   object
 8   sequence_label  3000 non-null   int64 
 9   image_path      3000 non-null   object
dtypes: bool(1), int64(4), object(5)
memory usage: 237.3+ KB
None


In [74]:
print("\nTest set:")
print(test_scans.info())


Test set:
<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 1331 to 14580
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   scan_name       1500 non-null   object
 1   scan_number     1500 non-null   int64 
 2   folder_name     1500 non-null   object
 3   patient_id      1500 non-null   int64 
 4   eye_side        1500 non-null   object
 5   status          1500 non-null   bool  
 6   label           1500 non-null   int64 
 7   sequence        1500 non-null   object
 8   sequence_label  1500 non-null   int64 
 9   image_path      1500 non-null   object
dtypes: bool(1), int64(4), object(5)
memory usage: 118.7+ KB
None


In [75]:
train_folders_set = set(train_scans['folder_name'])
val_folders_set = set(val_scans['folder_name'])
test_folders_set = set(test_scans['folder_name'])

# Check for overlaps
train_val_overlap = train_folders_set.intersection(val_folders_set)
train_test_overlap = train_folders_set.intersection(test_folders_set)
val_test_overlap = val_folders_set.intersection(test_folders_set)

# Print results
print("Overlap between train and val sets:", train_val_overlap)
print("Overlap between train and test sets:", train_test_overlap)
print("Overlap between val and test sets:", val_test_overlap)

# Verify if there are any overlaps
if not train_val_overlap and not train_test_overlap and not val_test_overlap:
    print("No overlaps found between the datasets. The split is valid.")
else:
    print("Overlaps found! Please check the dataset splits.")

Overlap between train and val sets: set()
Overlap between train and test sets: set()
Overlap between val and test sets: set()
No overlaps found between the datasets. The split is valid.


In [76]:
train_scans.to_csv('../data/model/train.csv')
val_scans.to_csv('../data/model/val.csv')
test_scans.to_csv('../data/model/test.csv')