In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import os.path
from bs4 import BeautifulSoup
import requests

In [2]:
# Half-open range of record indices handled by this run: first_record is
# processed, last_record is not.
first_record = 10
last_record = 11

# Output CSV path; the name embeds the first and the last *processed* index.
record_segment_file_name = ''.join([
    '../../../2_data/',
    'record_segment_good_length_',
    str(first_record),
    '_',
    str(last_record - 1),
    '.csv',
])

In [3]:
# Display the generated output path so it can be checked before the extraction loop appends to it.
record_segment_file_name

'../../../2_data/record_segment_good_length_10_10.csv'

In [4]:
# --- Signal selection criteria ---
signal_extract_name = 'PLETH'
sampling_frequency = 125 # Hz
signal_extract_minimum_length = 5 # the minimum length of the signal segment, in minutes

# --- Data locations ---
record_path_local = '../../../2_data/record_path_mimic3_matched.csv'
record_root_url = 'https://physionet.org/content/mimic3wdb/1.0/matched'
# Index file listing every waveform record path below record_root_url.
record_path_url = record_root_url + '/RECORDS-waveforms'

# Echo the locations, one per line, so the run log shows what was used.
print(
    'record_path_local: {}'.format(record_path_local),
    'record_root_url: {}'.format(record_root_url),
    'record_path_url: {}'.format(record_path_url),
    sep='\n',
)

record_path_local: ../../../2_data/record_path_mimic3_matched.csv
record_root_url: https://physionet.org/content/mimic3wdb/1.0/matched
record_path_url: https://physionet.org/content/mimic3wdb/1.0/matched/RECORDS-waveforms


In [5]:
def get_url_content(url, tag=None, timeout=60):
    
    '''
    Download a PhysioNet URL and return its text content as a list of lines.

    If the response is an HTML document, the text is taken from the first
    element named `tag` (for example 'pre'); when `tag` is None the text of
    the whole document is used. Any other response is returned as-is.
    See https://hackersandslackers.com/scraping-urls-with-beautifulsoup/ for
    background on scraping a web page with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page or file to download.
    tag : str or None
        Name of the HTML element holding the useful text.
    timeout : float
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of str
        The extracted text split into lines.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status (4xx / 5xx).
    ValueError
        If the page is HTML and contains no `tag` element.
    '''
    
    # Only the User-Agent is a meaningful *request* header. The
    # Access-Control-* entries previously listed here are response headers
    # and had no effect on the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }    
    
    # The second positional argument of requests.get() is `params`, so the
    # headers must be passed by keyword or they end up in the query string.
    req = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 404/500 instead of parsing an error page as if it were data.
    req.raise_for_status()

    soup = BeautifulSoup(req.content, 'html.parser')
    if '<!DOCTYPE html>' in str(soup) and tag is not None:
        element = soup.find(tag)
        if element is None:
            raise ValueError('no <{}> element found at {}'.format(tag, url))
        content = element.getText()
    else:
        # Plain-text response, or HTML with no tag requested: use all the text.
        content = soup.getText()
    return content.splitlines()

In [None]:
 ''' 
segment_header_content = get_url_content(segment_header, tag)
print('segment_header_content: {}\n'.format(segment_header_content))

segment_signal_names = [x.split(' ')[-1] for x in segment_header_content[1:]]
print('segment_signal_names: {}\n'.format(segment_signal_names))
 '''

In [31]:
def get_signal_name(url_content):
    '''
    Return the last space-separated field of every line after the first one.

    The first line of `url_content` is skipped. A line without any space is
    returned whole; a line ending in a space yields an empty string.
    The callers in this notebook pass the lines of a header file and treat
    the returned fields as signal names.
    '''
    last_fields = []
    for header_line in url_content[1:]:
        # rpartition(' ')[2] is whatever follows the final space
        # (the entire line when it contains no space at all).
        last_fields.append(header_line.rpartition(' ')[2])
    return last_fields

In [6]:
# Download the index of waveform records; on the HTML page the list lives in a <pre> element.
tag = 'pre'
record_path_list = get_url_content(record_path_url, tag)
print(len(record_path_list))
# Preview the first few entries. Slicing (rather than indexing 0..4) keeps
# the cell from raising IndexError when fewer than five records come back.
for preview_path in record_path_list[:5]:
    print('record_path: {}'.format(preview_path))

22317
record_path: p00/p000020/p000020-2183-04-28-17-47
record_path: p00/p000030/p000030-2172-10-16-12-22
record_path: p00/p000033/p000033-2116-12-24-12-35
record_path: p00/p000033/p000033-2116-12-25-13-11
record_path: p00/p000052/p000052-2191-01-10-02-21


In [43]:
# For every record index in [first_record, last_record):
#   1. download the record's master header and its layout header,
#   2. keep the record only if it lists `signal_extract_name` and is longer
#      than the minimum length,
#   3. for each of its segments that is long enough and also lists the
#      signal, append "index,record_path,segment_name" to the output CSV.
#
# NOTE: the file is opened in append mode, so re-running this cell for the
# same range adds duplicate rows to `record_segment_file_name`.
# NOTE: lengths are computed with the configured `sampling_frequency`, not
# with a value read from the header.
tag = 'pre'

# Records and segments must be strictly longer than this many seconds.
minimum_length_seconds = 60 * signal_extract_minimum_length

# range() excludes last_record; to process every record set
# last_record = len(record_path_list).
for process_record_index in range(first_record, last_record):

    record_path = record_path_list[process_record_index]
    # record_path is '<dir>/<subdir>/<record name>'; split it once and reuse the parts.
    record_path_parts = record_path.split('/')
    record_path_short = record_path_parts[2]
    print('processing record #: {}, record_path_short: {}'.format(process_record_index, record_path_short))

    directory = record_path_parts[0] + '/' + record_path_parts[1]

    # Master header of the record: its lines reference the layout header and every segment.
    master_waveform_header = record_root_url + '/' + record_path + '.hea'
    master_waveform_content = get_url_content(master_waveform_header, tag)

    # The 4th field of the first line is used as the total number of samples.
    master_waveform_content_first_line = master_waveform_content[0].split(' ')
    record_length = int(master_waveform_content_first_line[3]) / sampling_frequency  # seconds

    # The second line of the master header names the layout header, which
    # lists the signals available in the record.
    waveform_layout_header = (record_root_url + '/' + directory + '/'
                              + master_waveform_content[1].split(' ')[0] + '.hea')
    waveform_layout_content = get_url_content(waveform_layout_header, tag)
    record_signal_names = get_signal_name(waveform_layout_content)

    if signal_extract_name in record_signal_names and record_length > minimum_length_seconds:

        # Waveform record id: the part before '_' on the first line that starts with '3'.
        # startswith() is used instead of x[0] so an empty line cannot raise IndexError.
        waveform_record_id = [x.split(' ')[0].split('_')
                              for x in master_waveform_content if x.startswith('3')][0][0]

        # Flag the lines that describe real segments ('<id>_NNNN ...'), excluding the layout line.
        waveform_record_state = [(waveform_record_id + '_' in x) and not (waveform_record_id + '_layout' in x)
                                 for x in master_waveform_content]

        # Each entry is [segment name, number of samples].
        waveform_record_segment = [master_waveform_content[x].split(' ')
                                   for x, y in enumerate(waveform_record_state) if y]

        # The context manager closes the file even if a download or parse
        # fails half-way through the segments.
        with open(record_segment_file_name, 'a') as record_segment_file_object:

            for segment_index in range(len(waveform_record_segment)):
                segment_name = waveform_record_segment[segment_index][0]
                waveform_record_segment_length = int(waveform_record_segment[segment_index][1]) / sampling_frequency

                # Cheap test first: a segment that is too short can never be
                # selected, so do not spend an HTTP request on its header.
                if waveform_record_segment_length <= minimum_length_seconds:
                    continue

                # e.g. https://physionet.org/content/mimic3wdb/1.0/matched/p00/p000107/3168852_0001.hea
                segment_header = record_root_url + '/' + directory + '/' + segment_name + '.hea'
                segment_header_content = get_url_content(segment_header, tag)
                segment_signal_names = get_signal_name(segment_header_content)

                if signal_extract_name in segment_signal_names:
                    waveform_record_segment_good_length = segment_name
                    output_row = str(process_record_index) + ',' + record_path + ',' + waveform_record_segment_good_length
                    print('  ' + output_row)
                    record_segment_file_object.write(output_row + '\n')

    print('  \nindex of last record processed successfully: {}'.format(process_record_index))  
    print('--------------------------------------')

    
    


processing record #: 10, record_path_short: p000107-2122-05-14-21-01
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0001
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0004
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0006
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0010
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0012
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0014
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0016
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0022
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0024
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0029
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0031
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0032
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0034
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0037
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0040
  10,p00/p000107/p000107-2122-05-14-21-01,3860035_0044
  
index of last record processed successfully: 10


In [None]:
# Scratch inspection (depends on variables left over from the extraction loop):
# first field of the last segment entry visited, which the loop uses as the segment name.
waveform_record_segment[segment_index][0]

In [None]:
# Scratch inspection (depends on variables left over from the extraction loop):
# second field of the last segment entry visited; the loop converts it to int and
# divides it by sampling_frequency to obtain the segment length.
waveform_record_segment[segment_index][1]

In [None]:
# Scratch inspection: space-separated fields of the first line of the last master header
# downloaded by the extraction loop.
master_waveform_content_first_line

In [None]:
# Scratch inspection: third field of that first line (still a string here).
# NOTE(review): presumably the sampling frequency recorded in the header — confirm.
master_waveform_content_first_line[2]

In [None]:
# Scratch inspection: the same third field converted to an integer.
int(master_waveform_content_first_line[2])