In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import os.path
from bs4 import BeautifulSoup
import requests

In [71]:
record_path_local = '../../../2_data/record_path_mimic3_matched.csv'
record_segment_file_name = '../../../2_data/' + 'record_segment.csv'
record_root_url = 'https://physionet.org/content/mimic3wdb/1.0/matched'
record_path_url = record_root_url + '/RECORDS-waveforms'

signal_extract_name = 'PLETH'
signal_extract_minimum_length = 5 # the minimum length of the signal segment, in minutes

#record_root_url = 'https://archive.physionet.org/physiobank/database/mimic3wdb'
#record_path_url = record_root_url + '/RECORDS'

print('record_path_local: {}'.format(record_path_local))
print('record_root_url: {}'.format(record_root_url))
print('record_path_url: {}'.format(record_path_url))

record_path_local: ../../../2_data/record_path_mimic3_matched.csv
record_root_url: https://physionet.org/content/mimic3wdb/1.0/matched
record_path_url: https://physionet.org/content/mimic3wdb/1.0/matched/RECORDS-waveforms


In [3]:
def get_url_content(url, tag=None):
    
    '''
    This function scrapes a list of useful information from a given PhysioNet URL.
    If the URL address points to an HTML document, the information to be extracted is define by a tag.
    I found this address (https://hackersandslackers.com/scraping-urls-with-beautifulsoup/) usefull
      for details on how to scrape a web page
    '''
    
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }    
    
    req = requests.get(url, headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    if '<!DOCTYPE html>' in str(soup):
        content = getattr(soup, str(tag)).getText()
        #content = soup.pre.getText()
    else:
        content = soup.getText()
    return content.splitlines()

In [8]:
tag = 'pre'
record_path_list = get_url_content(record_path_url, tag)
print(len(record_path_list))
for i in range(5):
    print('record_path: {}'.format(record_path_list[i]))

22317
record_path: p00/p000020/p000020-2183-04-28-17-47
record_path: p00/p000030/p000030-2172-10-16-12-22
record_path: p00/p000033/p000033-2116-12-24-12-35
record_path: p00/p000033/p000033-2116-12-25-13-11
record_path: p00/p000052/p000052-2191-01-10-02-21


In [72]:
# TO DO:
# we have to decide whether the index_of_last_record_processed is 0 or 1 based 
#  (i.e. it starts from zero or from one)

tag = 'pre'
index_of_last_record_processed = -1
for i in range(index_of_last_record_processed + 1, 20):
        
    #print(i)
    record_path = record_path_list[i]
    print('processing record #: {}, record_path: {}'.format(i, record_path))
    
    #directory p04/p044083.
    directory = str(record_path.split("/")[0]) + '/' + str(record_path.split("/")[1])
    #print('directory: {}'.format(directory))
    
    intermediate_directory = str(record_path.split("/")[0])
    #print('intermediate_directory: {}'.format(intermediate_directory))
    subject_id = record_path.split("/")[1]
    subject_id = int(subject_id[1:])
    #print('subject_id: {}'.format(subject_id))
        
    master_waveform_header = record_root_url + '/' + record_path + '.hea'
    #print('master_waveform_header: {}'.format(master_waveform_header))
    
    target_url = master_waveform_header
    master_waveform_content = get_url_content(target_url, tag)
    #print('master_waveform_content: {}'.format(master_waveform_content))
    
    #print('       ...........\n')
    
    master_waveform_content_first_line = master_waveform_content[0].split(' ')
    record_length = int(master_waveform_content_first_line[3]) / int(master_waveform_content_first_line[2])
    #print('record_length (s): {}'.format(record_length))
    
    #print('       ...........\n')    
        
    waveform_layout_header = master_waveform_content[1].split(' ')[0]
    waveform_layout_header = waveform_layout_header + '.hea'
    waveform_layout_header = record_root_url + '/' + directory + '/' + waveform_layout_header
    #print('waveform_layout_header: {}'.format(waveform_layout_header))    
    target_url = waveform_layout_header
    waveform_layout_content = get_url_content(target_url, tag)
    #print('waveform_layout_content: {}'.format(waveform_layout_content))
        
    signal_names = [x.split(' ')[-1] for x in waveform_layout_content[1:-1]]
    #print('signal_names: {}'.format(signal_names))
    
    if signal_extract_name in signal_names and record_length > 60 * signal_extract_minimum_length: # and len(record_segments_path_list):      
        
        # Open the record_segment_file_name file with access mode 'a'
        record_segment_file_object = open(record_segment_file_name, 'a')

        # get the waveform record name
        waveform_record_name = [i.split(' ')[0].split('_') for i in master_waveform_content if '3' == i[0]][0][0]
        
        # get the waveform_record_state
        waveform_record_state = [(waveform_record_name + '_' in x) and not (waveform_record_name + '_layout' in x) 
                for x in master_waveform_content]

        # get the waveform_record_segment
        waveform_record_segment = [master_waveform_content[i].split(' ') for i, j in enumerate(waveform_record_state) if j]

        
        for i in range(len(waveform_record_segment)):
            # number of samples: int(waveform_record_segment[i][1])
            # sampling frequency: int(master_waveform_content_first_line[2])
            # length (in seconds) of the record segment: number of samples / sampling frequency
            
            if int(waveform_record_segment[i][1]) / int(master_waveform_content_first_line[2]) > 60 * signal_extract_minimum_length:
                
                waveform_record_segment_good_length_root = directory + '/' +  waveform_record_segment[i][0]
                print(waveform_record_segment_good_length_root)
                
                # Append the waveform_record_segment_good_length_root at the end of file record_segment_file_name
                record_segment_file_object.write(waveform_record_segment_good_length_root + '\n')

        print('baubau_1')
        
        # Close the file record_segment_file_name
        record_segment_file_object.close()
        break
        
    
    
    print('       ...........\n')
    index_of_last_record_processed = index_of_last_record_processed + 1
    print('index_of_last_record_processed: {}'.format(index_of_last_record_processed))    
    print('--------------------------------------')

    
    


0
record_path: p00/p000020/p000020-2183-04-28-17-47
directory: p00/p000020
intermediate_directory: p00
subject_id: 20
master_waveform_header: https://physionet.org/content/mimic3wdb/1.0/matched/p00/p000020/p000020-2183-04-28-17-47.hea
master_waveform_content: ['p000020-2183-04-28-17-47/10 4 125 9862593 17:47:59.486 28/04/2183', '3544749_layout 0', '~ 93', '3544749_0001 3811', '3544749_0002 124', '3544749_0003 8', '3544749_0004 380', '3544749_0005 7098177', '3544749_0006 15000', '3544749_0007 30000', '3544749_0008 2715000']
       ...........

record_length (s): 78900.744
       ...........

waveform_layout_header: https://physionet.org/content/mimic3wdb/1.0/matched/p00/p000020/3544749_layout.hea
waveform_layout_content: ['3544749_layout 4 125 0 17:47:59.486', '~ 0 43/mV 14 0 -8192 0 0 II', '~ 0 50/mV 14 0 -8192 0 0 AVF', '~ 0 1.25/mmHg 9 0 -256 0 0 ABP', '~ 0 2.5/mmHg 10 0 -512 0 0 PAP']
signal_names: ['II', 'AVF', 'ABP']
       ...........

index_of_last_record_processed: 0
----------

master_waveform_content: ['p000085-2167-07-25-21-11/18 4 125 20392500 21:11:31.000 25/07/2167', '3647298_layout 0', '~ 2875', '3647298_0001 31561', '3647298_0002 7151064', '3647298_0003 750', '3647298_0004 125', '3647298_0005 6375', '3647298_0006 854', '3647298_0007 2642271', '3647298_0008 750', '3647298_0009 125', '3647298_0010 322375', '3647298_0011 2524125', '3647298_0012 750', '3647298_0013 125', '3647298_0014 7000', '3647298_0015 847', '3647298_0016 7700528', '# Location: micu']
       ...........

record_length (s): 163140.0
       ...........

waveform_layout_header: https://physionet.org/content/mimic3wdb/1.0/matched/p00/p000085/3647298_layout.hea
waveform_layout_content: ['3647298_layout 4 125 0 21:11:31', '~ 0 128(-65)/mV 8 0 -128 0 0 II', '~ 0 127(-64)/mV 8 0 -128 0 0 V', '~ 0 128/mV 9 0 -256 0 0 III', '~ 0 128/mV 9 0 -256 0 0 I']
signal_names: ['II', 'V', 'III']
       ...........

index_of_last_record_processed: 8
--------------------------------------
9
record_path: p00/p

waveform_layout_content: ['3860035_layout 5 125 0 21:01:58', '~ 0 128/mV 9 0 -256 0 0 II', '~ 0 128(-65)/mV 8 0 -128 0 0 V', '~ 0 255(-128)/NU 8 0 -128 0 0 PLETH', '~ 0 1.28(-109)/mmHg 8 0 -128 0 0 ABP', '~ 0 128/mV 9 0 -256 0 0 I']
signal_names: ['II', 'V', 'PLETH', 'ABP']
p00/p000107/3860035_0001
p00/p000107/3860035_0004
p00/p000107/3860035_0006
p00/p000107/3860035_0009
p00/p000107/3860035_0010
p00/p000107/3860035_0011
p00/p000107/3860035_0012
p00/p000107/3860035_0014
p00/p000107/3860035_0016
p00/p000107/3860035_0022
p00/p000107/3860035_0024
p00/p000107/3860035_0029
p00/p000107/3860035_0031
p00/p000107/3860035_0032
p00/p000107/3860035_0034
p00/p000107/3860035_0037
p00/p000107/3860035_0040
p00/p000107/3860035_0044
baubau_1
