# Relevant Stroke Statistics

### From "all_metadata.csv", this notebook will produce statistics for the following things:

1. Total number of patients in the study (based on unique values of Patient_MRN)
2. Total number of studies in the study (based on unique values of Study_id)
3. Total number of studies per patient_MRN
4. Display the updated combination of Structure/Scan_pattern/modality
5. Display the frequency of Structure/Scan_pattern/modality combination for each patient_MRN in the study
6. Display the frequency of Structure/Scan_pattern/modality combination for each study_id in the study

In [45]:
import pandas as pd
import os

In [46]:
# Absolute location of the eyescore data tree.
data_path = os.path.join('/', 'scratch', 'sas10092', 'eyescore', 'data')

# The combined metadata table lives below the data root; load it into a DataFrame.
metadata_relpath = os.path.join('intermediate', 'h5-metadata', 'all_metadata.csv')
metadata_path = os.path.join(data_path, metadata_relpath)
metadata = pd.read_csv(metadata_path)

In [47]:
# Preview the first ten rows to check that the file loaded as expected.
metadata.head(10)

Unnamed: 0.1,Unnamed: 0,Data_division,File_name,Patient_MRN,Sex,Acquisition_date,Study_id,Series_id,Structure,Laterality,Scan_pattern,Enface_modality,Oct_modality,en_face_scans_num,B_scans_num
0,0,S - 1,10025081,184850,M,3/28/2019,33970,0.0,Retina,OD,OCT B-Scan,Infra-Red,OCT,0.0,2.0
1,1,S - 1,10025081,184850,M,3/28/2019,33970,1.0,Retina,OD,OCT ART Volume,Infra-Red,OCT,1.0,49.0
2,2,S - 1,10025081,184850,M,3/28/2019,33970,2.0,Retina,OS,OCT B-Scan,Infra-Red,OCT,0.0,4.0
3,3,S - 1,10025081,184850,M,3/28/2019,33970,3.0,Retina,OS,OCT ART Volume,Infra-Red,OCT,1.0,49.0
4,4,S - 1,10060021,229051,M,5/13/2018,23043,0.0,Retina,OD,OCT B-Scan,Infra-Red,OCT,0.0,2.0
5,5,S - 1,10060021,229051,M,5/13/2018,23043,1.0,Retina,OS,OCT B-Scan,Infra-Red,OCT,0.0,2.0
6,6,S - 1,10060021,229051,M,5/13/2018,23043,2.0,Retina,OS,OCT ART Volume,Infra-Red,OCT,1.0,25.0
7,7,S - 1,10021031,177033,F,8/30/2020,50760,0.0,Retina,OD,OCT B-Scan,Infra-Red,OCT,0.0,2.0
8,8,S - 1,10021031,177033,F,8/30/2020,50760,1.0,Retina,OD,OCT ART Volume,Infra-Red,OCT,1.0,25.0
9,9,S - 1,10021031,177033,F,8/30/2020,50760,2.0,Retina,OS,OCT ART Volume,Infra-Red,OCT,1.0,25.0


In [48]:
# List the available column names; the statistics below rely on several of them.
metadata.columns

Index(['Unnamed: 0', 'Data_division', 'File_name', 'Patient_MRN', 'Sex',
       'Acquisition_date', 'Study_id', 'Series_id', 'Structure', 'Laterality',
       'Scan_pattern', 'Enface_modality', 'Oct_modality', 'en_face_scans_num',
       'B_scans_num'],
      dtype='object')

In [49]:
# Group the rows by patient; this GroupBy object is used for the per-patient preview below.
patient_mrn = metadata.groupby(['Patient_MRN'])

In [50]:
# GroupBy.head(10) keeps up to the first 10 rows of *each* patient's group,
# so the displayed frame is much longer than 10 rows.
patient_mrn.head(10)

Unnamed: 0.1,Unnamed: 0,Data_division,File_name,Patient_MRN,Sex,Acquisition_date,Study_id,Series_id,Structure,Laterality,Scan_pattern,Enface_modality,Oct_modality,en_face_scans_num,B_scans_num
0,0,S - 1,10025081,184850,M,3/28/2019,33970,0.0,Retina,OD,OCT B-Scan,Infra-Red,OCT,0.0,2.0
1,1,S - 1,10025081,184850,M,3/28/2019,33970,1.0,Retina,OD,OCT ART Volume,Infra-Red,OCT,1.0,49.0
2,2,S - 1,10025081,184850,M,3/28/2019,33970,2.0,Retina,OS,OCT B-Scan,Infra-Red,OCT,0.0,4.0
3,3,S - 1,10025081,184850,M,3/28/2019,33970,3.0,Retina,OS,OCT ART Volume,Infra-Red,OCT,1.0,49.0
4,4,S - 1,10060021,229051,M,5/13/2018,23043,0.0,Retina,OD,OCT B-Scan,Infra-Red,OCT,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130650,130650,N-O,10143011,47026,F,1/2/2018,18692,0.0,Retina,OS,OCT ART Volume,Infra-Red,OCT,1.0,25.0
130651,130651,N-O,10143011,47026,F,1/2/2018,18692,1.0,Retina,OS,OCT Star Scan,Infra-Red,OCT,1.0,12.0
130652,130652,N-O,10143011,47026,F,1/2/2018,18692,2.0,Retina,OS,OCT B-Scan,Infra-Red,OCT,0.0,2.0
130653,130653,N-O,10143011,47026,F,1/2/2018,18692,3.0,Retina,OD,OCT B-Scan,Infra-Red,OCT,0.0,3.0


In [51]:
# Distinct patient identifiers, in order of first appearance.
metadata['Patient_MRN'].unique()

array([184850, 229051, 177033, ..., 140270, 412743,  88438])

## Count the total number of Patients in the Study
There are 9056 unique patient_mrn in the study

In [52]:
# Count distinct patients with the vectorised pandas call instead of a manual loop.
# dropna=False keeps the result equal to the number of entries returned by
# .unique(), which treats a missing MRN as a value of its own.
num_patients = metadata['Patient_MRN'].nunique(dropna=False)
print("Total number of patients in the study: ", num_patients)

Total number of patients in the study:  9056


In [53]:
# Distinct study identifiers, in order of first appearance.
metadata['Study_id'].unique()

array([33970, 23043, 50760, ..., 14477, 18692, 26394])

## Count the total number of studies
There are 23708 unique studies

In [54]:
# Count distinct studies with the vectorised pandas call instead of a manual loop.
# dropna=False keeps the result equal to the number of entries returned by
# .unique(), which treats a missing Study_id as a value of its own.
num_studies = metadata['Study_id'].nunique(dropna=False)
print("Total number of studies in the study: ", num_studies)

Total number of studies in the study:  23708


## The following 2 cells produce a csv file that contains the number of studies per patient

Key point to note is that the highest number of studies per patient is 73 for MRN 33699

In [55]:
# One row per patient holding the number of distinct studies recorded for them.
study_count_per_patient = (
    metadata
    .groupby('Patient_MRN')['Study_id']
    .nunique()
    .reset_index(name='Number_of_Studies')
)

In [56]:
# Rank patients from most to fewest studies, then save the table in the working directory.
sorted_df = study_count_per_patient.sort_values('Number_of_Studies', ascending=False)
sorted_df.to_csv('sorted_study_count_per_patient.csv', index=False)

## Function that creates a csv file containing the general information for scans combinations

Key point to note is that the highest general combination is Retina/OCT B-Scan/OCT (count: 44390)

In [57]:
def combinations_to_csv(data, output_csv):
    """Write overall counts of Structure / Scan_pattern / modality combinations to a CSV.

    For every (Structure, Scan_pattern) pair found in ``data`` the output contains:

    * one row per en-face modality with the number of scans that use it; the
      modality is written as ``'N/A'`` unless the first matching scan has
      ``en_face_scans_num == 1``;
    * directly after it, one row per OCT modality occurring among those scans,
      with its count.

    Both kinds of row store the modality name in the ``Enface_modality`` column.
    Missing (NaN) modalities never compare equal, so they are reported with a
    count of 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Structure``, ``Scan_pattern``,
        ``Enface_modality``, ``Oct_modality`` and ``en_face_scans_num``.
    output_csv : str or path-like
        Destination of the CSV file (written without the index).
    """
    results = []

    for structure in data.Structure.unique():
        in_structure = data[data.Structure == structure]
        for scan_pattern in in_structure.Scan_pattern.unique():
            scan = in_structure[in_structure.Scan_pattern == scan_pattern]

            # Walk every en-face modality on its own. Zipping the en-face and
            # OCT unique values together pairs unrelated entries by position
            # and silently drops modalities when the two lists differ in length.
            for enface_modality in scan.Enface_modality.unique():
                enface = scan[scan.Enface_modality == enface_modality]

                first_en_face_num = enface.iloc[0]['en_face_scans_num'] if len(enface) > 0 else 0
                label = enface_modality if first_en_face_num == 1 else 'N/A'
                results.append([structure, scan_pattern, label, len(enface)])

                # OCT modalities that actually occur together with this en-face modality.
                for oct_modality in enface.Oct_modality.unique():
                    oct_count = len(enface[enface.Oct_modality == oct_modality])
                    results.append([structure, scan_pattern, oct_modality, oct_count])

    result_df = pd.DataFrame(results, columns=['Structure', 'Scan_pattern', 'Enface_modality', 'Count'])
    result_df.to_csv(output_csv, index=False)

In [58]:
# Write the overall combination counts to a CSV in the working directory.
output_csv = 'combinations.csv'
combinations_to_csv(data=metadata, output_csv=output_csv)

## Function that creates a csv file containing the scan combination information count per patient_MRN

Key point to note: the highest per-patient counts are for Retina/OCT ART Volume/Infra-Red and Retina/OCT ART Volume/OCT, each with a count of 134 for Patient_MRN 85345. These are also the most common scan combinations per patient overall.

In [59]:
def save_combinations_to_csv(data, output_csv):
    """Write per-patient counts of Structure / Scan_pattern / modality combinations to a CSV.

    ``data`` is grouped by ``Patient_MRN``. For every (Structure, Scan_pattern)
    pair of a patient the output contains:

    * one row per en-face modality with the number of scans that use it; the
      modality is written as ``'N/A'`` unless the first matching scan has
      ``en_face_scans_num == 1``;
    * directly after it, one row per OCT modality occurring among those scans,
      with its count.

    Both kinds of row store the modality name in the ``Enface_modality`` column.
    The identifier column is labelled ``Patient_MRN`` because that is the
    grouping key. Missing (NaN) modalities never compare equal, so they are
    reported with a count of 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Patient_MRN``, ``Structure``,
        ``Scan_pattern``, ``Enface_modality``, ``Oct_modality`` and
        ``en_face_scans_num``.
    output_csv : str or path-like
        Destination of the CSV file (written without the index).
    """
    results = []

    for patient_mrn, patient_group in data.groupby('Patient_MRN'):
        for structure in patient_group.Structure.unique():
            in_structure = patient_group[patient_group.Structure == structure]
            for scan_pattern in in_structure.Scan_pattern.unique():
                scan = in_structure[in_structure.Scan_pattern == scan_pattern]

                # Walk every en-face modality on its own. Zipping the en-face
                # and OCT unique values together pairs unrelated entries by
                # position and silently drops modalities when the two lists
                # differ in length.
                for enface_modality in scan.Enface_modality.unique():
                    enface = scan[scan.Enface_modality == enface_modality]

                    first_en_face_num = enface.iloc[0]['en_face_scans_num'] if len(enface) > 0 else 0
                    label = enface_modality if first_en_face_num == 1 else 'N/A'
                    results.append([patient_mrn, structure, scan_pattern, label, len(enface)])

                    # OCT modalities that actually occur together with this en-face modality.
                    for oct_modality in enface.Oct_modality.unique():
                        oct_count = len(enface[enface.Oct_modality == oct_modality])
                        results.append([patient_mrn, structure, scan_pattern, oct_modality, oct_count])

    # The rows are keyed by patient, so the first column is labelled Patient_MRN.
    result_df = pd.DataFrame(results, columns=['Patient_MRN', 'Structure', 'Scan_pattern', 'Enface_modality', 'Count'])
    result_df.to_csv(output_csv, index=False)

In [60]:
# Write the per-patient combination counts to a CSV in the working directory.
output_csv = 'new_output_combinations.csv'
save_combinations_to_csv(data=metadata, output_csv=output_csv)

## Function that creates a csv file containing the scan combination information count per study_id

Key point to note: the highest per-study count is for Retina/Images/ICG Angiography, with a count of 51 for study_id 52535. It is also the most frequent scan combination per study overall (this can be verified from the CSV file).

In [61]:
def save_to_csv(data, output_csv):
    """Write per-study counts of Structure / Scan_pattern / modality combinations to a CSV.

    ``data`` is grouped by ``Study_id``. For every (Structure, Scan_pattern)
    pair of a study the output contains:

    * one row per en-face modality with the number of scans that use it; the
      modality is written as ``'N/A'`` unless the first matching scan has
      ``en_face_scans_num == 1``;
    * directly after it, one row per OCT modality occurring among those scans,
      with its count.

    Both kinds of row store the modality name in the ``Enface_modality`` column.
    Missing (NaN) modalities never compare equal, so they are reported with a
    count of 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Study_id``, ``Structure``, ``Scan_pattern``,
        ``Enface_modality``, ``Oct_modality`` and ``en_face_scans_num``.
    output_csv : str or path-like
        Destination of the CSV file (written without the index).
    """
    results = []

    for study_id, study_group in data.groupby('Study_id'):
        for structure in study_group.Structure.unique():
            in_structure = study_group[study_group.Structure == structure]
            for scan_pattern in in_structure.Scan_pattern.unique():
                scan = in_structure[in_structure.Scan_pattern == scan_pattern]

                # Walk every en-face modality on its own. Zipping the en-face
                # and OCT unique values together pairs unrelated entries by
                # position and silently drops modalities when the two lists
                # differ in length.
                for enface_modality in scan.Enface_modality.unique():
                    enface = scan[scan.Enface_modality == enface_modality]

                    first_en_face_num = enface.iloc[0]['en_face_scans_num'] if len(enface) > 0 else 0
                    label = enface_modality if first_en_face_num == 1 else 'N/A'
                    results.append([study_id, structure, scan_pattern, label, len(enface)])

                    # OCT modalities that actually occur together with this en-face modality.
                    for oct_modality in enface.Oct_modality.unique():
                        oct_count = len(enface[enface.Oct_modality == oct_modality])
                        results.append([study_id, structure, scan_pattern, oct_modality, oct_count])

    result_df = pd.DataFrame(results, columns=['Study_id', 'Structure', 'Scan_pattern', 'Enface_modality', 'Count'])
    result_df.to_csv(output_csv, index=False)

In [None]:
# Write the per-study combination counts to a CSV in the working directory.
output_csv = 'final_combinations.csv'
save_to_csv(data=metadata, output_csv=output_csv)