In [74]:
import boto3
import pandas as pd
from pathlib import Path
from enum import Enum
import os
import re

In [2]:
# Where the dataset is mirrored locally, and where it lives on S3.
LOCAL_PATH = Path('/home/shared/datasets/defeatcovid19/sacco')
AWS_REGION = 'eu-west-1'
AWS_PROFILE = 'defeatcovid19'
AWS_BUCKET = 'dataset.sacco.defeatcovid19.org'
AWS_FOLDER = 'eco-scan-cardio'

# Quick look at the remote side: print every object found under AWS_FOLDER.
session = boto3.Session(profile_name=AWS_PROFILE, region_name=AWS_REGION)
s3 = session.resource('s3')
bucket = s3.Bucket(AWS_BUCKET)
remote_files = bucket.objects.filter(Prefix=AWS_FOLDER).all()
for o in list(remote_files):
    print(o)

In [52]:
def sync_s3_to_local(bucket, region, profile, remote_path, local_path):
    """Mirror every object under an S3 key prefix into a local directory tree.

    Each object is stored at ``local_path / key``. Objects whose local path
    already exists are not downloaded again. Keys ending in ``/`` (folder
    placeholders) only create the matching local directory; they are never
    passed to ``download_file``, because the target would be a directory.

    Args:
        bucket: Name of the S3 bucket.
        region: AWS region name given to ``boto3.Session``.
        profile: AWS credentials profile name given to ``boto3.Session``.
        remote_path: Key prefix to synchronise.
        local_path: ``pathlib.Path`` of the local mirror root (created if
            missing).

    Returns:
        A list of ``(key, local_path, filename, last_modified, size)`` tuples,
        one for every remote key whose suffix is ``.dcm``, regardless of
        whether it was downloaded by this call or was already present.
    """
    local_path.mkdir(parents=True, exist_ok=True)

    session = boto3.Session(region_name=region, profile_name=profile)
    s3_bucket = session.resource('s3').Bucket(bucket)

    results = []
    for obj in s3_bucket.objects.filter(Prefix=remote_path).all():
        key = obj.key
        target = local_path / key

        # Recorded before the existence check so that files downloaded by a
        # previous run are still part of the result.
        if target.suffix == '.dcm':
            results.append((key, target, target.name, obj.last_modified, obj.size))

        if target.exists():
            continue

        if key.endswith('/'):
            # Folder placeholder key: there is nothing to download, and
            # writing a file onto the directory path would fail.
            target.mkdir(parents=True, exist_ok=True)
            continue

        # Avoid skipping directory due to missing s3 folder keys
        target.parent.mkdir(parents=True, exist_ok=True)

        print('Downloading {} to {}'.format(key, target))
        s3_bucket.download_file(key, str(target))

    # Avoid umask: open up every directory of the mirrored tree (files are
    # left untouched).
    for root, _dirs, _files in os.walk(str(local_path / remote_path)):
        Path(root).chmod(0o777)

    return results

In [54]:
# Mirror the remote folder locally and keep the metadata of its .dcm files.
results = sync_s3_to_local(
    bucket=AWS_BUCKET,
    region=AWS_REGION,
    profile=AWS_PROFILE,
    remote_path=AWS_FOLDER,
    local_path=LOCAL_PATH,
)
print('...done!')

...done!


In [55]:
# One row per synchronised .dcm file, indexed by its S3 key.
df = pd.DataFrame(
    data=results,
    columns=['index', 'local_path', 'filename', 'last_modified', 'size'],
).set_index('index')

In [91]:
# Folders in which the first six characters of the entry name are the patient id.
_PATIENT_PREFIX_FOLDERS = frozenset([
    '2020-01-01-acinesia-p-inflat',
    '2020-06-15-acinesia-p-sean-meba-al',
    '2020-07-01-acinesia-inferiore',
    '2020-07-20-acinesia inferiore 2_MC',
    '2020-07-29-acinesia-ga',
    '2020-08-03-acinesia-ti',
    '2020-09-24-acinesia-inferiore-3',
    '2020-09-25-acinesia-validazione-ag',
])

# Folders in which the entry name is used as the patient id unchanged.
_PATIENT_NAME_FOLDERS = frozenset([
    '2020-07-17-pazienti-covid',
    '2020-07-24-sani',
    '2020-07-25-sani',
])

# Four word characters followed, somewhere later, by two digits. Because `.*`
# is greedy, the second group is the last pair of adjacent digits.
_PATIENT_ID_RE = re.compile(r'(\w\w\w\w).*(\d\d)')


def _patient_id_from_free_text(name, index):
    """Concatenate the two regex groups found in ``name`` into a patient id."""
    match = _PATIENT_ID_RE.search(name)
    if match is None:
        # Without this guard a miss surfaces as an AttributeError on None.
        raise ValueError(
            'Cannot extract patient id from "{}" (index "{}")'.format(name, index)
        )
    return ''.join(match.groups())


def extract_patient(index):
    """Derive the upper-cased patient id from the S3 key of a dataset entry.

    The naming scheme depends on the first folder below ``AWS_FOLDER``; every
    known folder has its own rule for locating the id in the path.

    Args:
        index: S3 key of the entry; must be located under ``AWS_FOLDER``.

    Returns:
        The patient id, upper-cased.

    Raises:
        Exception: If the first folder below ``AWS_FOLDER`` is not known.
        ValueError: If a regex-based rule finds no patient id in the name
            (also raised by ``Path.relative_to`` when ``index`` is not under
            ``AWS_FOLDER``).
    """
    path = Path(index).relative_to(AWS_FOLDER)
    main_folder = path.parts[0]

    if main_folder in _PATIENT_PREFIX_FOLDERS:
        patient = path.parts[1][:6]
    elif main_folder in _PATIENT_NAME_FOLDERS:
        patient = path.parts[1]
    elif main_folder == '2020-05-20-eco-normali-4c':
        entry = path.parts[1]
        # One file in this folder carries no id in its name.
        patient = 'DEGI02' if entry == 'dicom.dcm' else entry[2:8]
    elif main_folder == '2020-05-20-eco-normali-lax':
        patient = path.parts[1][3:9]
    elif main_folder == '2020-06-09-acinesia-apicale-ma-co':
        # todo: *al
        entry = path.parts[1].replace('dubbia', '')
        if entry.startswith(('ALCAAL', 'ROMARO')):
            # These names carry two extra leading characters before the id.
            patient = entry[2:8]
        else:
            patient = entry[0:6]
    elif main_folder == '2020-08-07-covid-normali':
        patient = path.parts[1]
        if patient == 'CAGI76 RIANIMAZIONE':
            patient = 'CAGI76'
    elif main_folder == '2020-09-30-eco-normali':
        name = path.parts[2]
        # Strip the leading view tag and the last four characters
        # (presumably the '.dcm' extension -- TODO confirm).
        if name[:2] in ['2C', '2c', '3c', '3C', '4C', '4c']:
            name = name[2:-4]
        elif name[:3] == 'LAX':
            name = name[3:-4]
        # Optimistically extract patient id - Shame!
        patient = _patient_id_from_free_text(name, index)
    elif main_folder == '2020-10-05-acinesia-validazione-ia-am':
        # Optimistically extract patient id - Shame!
        patient = _patient_id_from_free_text(path.parts[2].split('_')[0], index)
    else:
        raise Exception('Unknown folder "{}"'.format(main_folder))

    return patient.upper()

# The index holds the S3 key of each file; derive the patient id from it.
df['patient'] = list(map(extract_patient, df.index))

In [93]:
def extract_covid(index):
    """Tell whether an entry lives in one of the folders flagged as COVID.

    Args:
        index: S3 key of the entry; must be located under ``AWS_FOLDER``.

    Returns:
        True when the first folder below ``AWS_FOLDER`` is one of the two
        listed folders, False for any other folder.
    """
    covid_folders = (
        '2020-07-17-pazienti-covid',
        '2020-08-07-covid-normali',
    )
    top_level = Path(index).relative_to(AWS_FOLDER).parts[0]
    return top_level in covid_folders
   
    
def extract_akinetic(index):
    """Map the folder of an entry to its akinetic flag.

    Args:
        index: S3 key of the entry; must be located under ``AWS_FOLDER``.

    Returns:
        True for entries in the folders listed as akinetic, False for
        entries in the folders listed as non-akinetic.

    Raises:
        Exception: If the first folder below ``AWS_FOLDER`` is in neither
            list.
    """
    non_akinetic_folders = {
        '2020-05-20-eco-normali-4c',
        '2020-05-20-eco-normali-lax',
        '2020-07-24-sani',
        '2020-07-25-sani',
        '2020-08-07-covid-normali',
        '2020-09-30-eco-normali',
    }
    akinetic_folders = {
        '2020-01-01-acinesia-p-inflat',
        '2020-06-09-acinesia-apicale-ma-co',
        '2020-06-15-acinesia-p-sean-meba-al',
        '2020-07-01-acinesia-inferiore',
        '2020-07-17-pazienti-covid',
        '2020-07-20-acinesia inferiore 2_MC',
        '2020-07-29-acinesia-ga',
        '2020-08-03-acinesia-ti',
        '2020-09-24-acinesia-inferiore-3',
        '2020-09-25-acinesia-validazione-ag',
        '2020-10-05-acinesia-validazione-ia-am',
    }

    top_level = Path(index).relative_to(AWS_FOLDER).parts[0]
    if top_level in non_akinetic_folders:
        return False
    if top_level in akinetic_folders:
        return True
    raise Exception('Unknown folder "{}"'.format(top_level))


# Label every file from the folder its S3 key (the index) belongs to.
df['covid'] = list(map(extract_covid, df.index))
df['akinetic'] = list(map(extract_akinetic, df.index))

In [94]:
# Show the labelled table, then store it at the root of the local mirror.
csv_path = LOCAL_PATH / 's3sync.csv'
print(df)
df.to_csv(csv_path)

                                                                                           local_path  \
index                                                                                                   
eco-scan-cardio/2020-01-01-acinesia-p-inflat/BR...  /home/shared/datasets/defeatcovid19/sacco/eco-...   
eco-scan-cardio/2020-01-01-acinesia-p-inflat/DI...  /home/shared/datasets/defeatcovid19/sacco/eco-...   
eco-scan-cardio/2020-01-01-acinesia-p-inflat/GR...  /home/shared/datasets/defeatcovid19/sacco/eco-...   
eco-scan-cardio/2020-01-01-acinesia-p-inflat/GR...  /home/shared/datasets/defeatcovid19/sacco/eco-...   
eco-scan-cardio/2020-01-01-acinesia-p-inflat/MO...  /home/shared/datasets/defeatcovid19/sacco/eco-...   
...                                                                                               ...   
eco-scan-cardio/2020-10-05-acinesia-validazione...  /home/shared/datasets/defeatcovid19/sacco/eco-...   
eco-scan-cardio/2020-10-05-acinesia-validazione...  /ho