In [7]:
import sys
from file_utils import *
import lazyjson
from bs4 import BeautifulSoup
import requests
import datetime
import multiprocessing

# Raise the interpreter's recursion limit far above the default.
# NOTE(review): the reason is not visible in this notebook — presumably deep
# recursion in HTML parsing or in pickling objects for multiprocessing; confirm.
sys.setrecursionlimit(100000)

In [9]:
# Base URL of the PhysioNet directory listing that the cells below scrape.
physioneturl = "https://physionet.org/physiobank/database/mimic2wdb/matched"
# JSON file recording, per folder, the hash and size of every downloaded file.
db_file = 'mimic2wdb.json'
# Local directory under which one sub-directory per remote folder is created.
output = 'data/mimic2wdb'

create_folder(output)

# On first run, seed the database file with an empty JSON object before opening it.
if not file_exists(db_file):
    create_file(db_file, contents='{}')
db = lazyjson.File(db_file)

In [29]:
def list_page(url, string_red=9999999, pre_idx=0, timeout=60):
    """Return the link texts found in one ``<pre>`` block of an HTML page.

    Args:
        url: Address of the page to fetch.
        string_red: Exclusive end index applied to every link text
            (``text[:string_red]``). The large default keeps the text whole;
            ``-1`` drops the last character.
        pre_idx: Index of the ``<pre>`` element whose links are listed.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        A list of strings, one per ``<a>`` element, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        IndexError: If the page has no ``<pre>`` element at ``pre_idx``.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    anchors = soup.find_all('pre')[pre_idx].find_all('a')
    # get_text() rather than .string: .string is None for an anchor with more
    # than one child node, and slicing None would raise TypeError.
    return [anchor.get_text()[:string_red] for anchor in anchors]

In [30]:
# Folder names are read from the <pre> block at index 3 of the index page;
# string_red=-1 drops the last character of every link text
# (presumably a trailing '/' — confirm against the live page).
# NOTE(review): the first 9 links are skipped, presumably because they are not
# record folders — verify, since this offset depends on the page layout.
folders = list_page(physioneturl, -1, 3)[9:]
print('Got {} folders'.format(len(folders)))

Got 2808 folders


In [31]:
# Sanity check: list the contents of the first folder only.
# NOTE(review): the first 5 links of a folder listing are skipped, presumably
# header/navigation links — confirm; download_folder uses the same offset.
files = list_page(physioneturl + '/' + folders[0])[5:]
print('Got {} files'.format(len(files)))
print(files)

Got 20 files
['3544749_0001.dat', '3544749_0001.hea', '3544749_0002.dat', '3544749_0002.hea', '3544749_0003.dat', '3544749_0003.hea', '3544749_0004.dat', '3544749_0004.hea', '3544749_0005.dat', '3544749_0005.hea', '3544749_0006.dat', '3544749_0006.hea', '3544749_0007.dat', '3544749_0007.hea', '3544749_0008.dat', '3544749_0008.hea', '3544749_layout.hea', '3544749n.dat', 's00020-2567-03-30-17-47.hea', 's00020-2567-03-30-17-47n.hea']


In [45]:
def process_url(f, file_url, file_path):
    """Download one file and describe the resulting local copy.

    Args:
        f: File name, passed through unchanged so the caller can key results.
        file_url: Remote address to download from.
        file_path: Local destination path.

    Returns:
        A tuple ``(f, checksum, size)`` where ``checksum`` comes from
        ``sha256_checksum(file_path)`` and ``size`` from ``file_size(file_path)``.
    """
    download_file(file_url, file_path)
    # Both values are computed from the file on disk, after the download.
    return f, sha256_checksum(file_path), file_size(file_path)

def download_folder(name, processes):
    """Download every missing ``.hea``/``.dat`` file of one remote folder.

    A file is skipped only when it exists locally AND is recorded in ``db``;
    anything else is deleted locally and fetched again. Successful downloads
    are recorded in ``db[name]`` with their hash and size.

    Args:
        name: Folder name, relative to ``physioneturl`` and to ``output``.
        processes: Number of worker processes used for parallel downloads.

    Returns:
        A tuple ``(total_size, total_dl)``: the bytes and the number of files
        downloaded by this call.

    Raises:
        Whatever listing or downloading raises. On a download failure every
        file scheduled by this call is deleted locally before re-raising, so
        no unrecorded partial file is left behind.
    """
    dir_url = physioneturl + '/' + name
    dir_path = output + '/' + name

    create_folder(dir_path)
    if name not in db:
        db[name] = {}

    files_to_dl = []
    for f in list_page(dir_url)[5:]:
        if not f.endswith(('hea', 'dat')):
            continue
        file_url = dir_url + '/' + f
        file_path = dir_path + '/' + f

        # Check local existence: keep only files that are on disk and recorded.
        if file_exists(file_path) and f in db[name]:
            continue

        # Drop any stale or unrecorded copy before scheduling a fresh download.
        delete_file(file_path)
        files_to_dl.append((f, file_url, file_path))

    # Nothing to fetch: do not pay for spawning worker processes.
    if not files_to_dl:
        return 0, 0

    try:
        # The context manager tears the workers down; without it each call
        # leaked a pool of processes.
        with multiprocessing.Pool(processes=processes) as pool:
            res = pool.starmap(process_url, files_to_dl)
    except BaseException:
        # Also covers KeyboardInterrupt: results are not recorded yet, so the
        # files on disk cannot be trusted and are removed before re-raising.
        for _, _, fp in files_to_dl:
            delete_file(fp)
        raise

    total_size = 0
    total_dl = 0
    for f, h, s in res:
        total_size += s
        db[name][f] = {
            'hash': h,
            'size': s
        }
        total_dl += 1
    return total_size, total_dl
    

In [46]:
# Stop after this many bytes have been downloaded in the current run (100 GiB).
max_bytes = 100 * 1024**3

size = 0
for k in folders:
    start = datetime.datetime.now()
    print('Downloading {}...'.format(k), end='')
    s, dl = download_folder(k, 4)
    elapsed = datetime.datetime.now() - start
    # total_seconds() also accounts for the days component of the timedelta.
    total_sec = elapsed.total_seconds()
    # Accumulate the running total; without this the cap below never triggers.
    size += s
    print(' Done! {}/{} files downloaded/updated ({} in {}s: {}/s)'.format(
        dl,
        len(db[k]),
        sizeof_fmt(s),
        total_sec,
        # Guard against a zero elapsed time on coarse clocks.
        sizeof_fmt(s / total_sec if total_sec else 0.0)
    ))

    if size > max_bytes:
        break

Downloading s00001... Done! 0/40 files downloaded/updated (0.0B in 0.74561s: 0.0B/s)
Downloading s00020... Done! 0/20 files downloaded/updated (0.0B in 0.707867s: 0.0B/s)
Downloading s00033... Done! 0/16 files downloaded/updated (0.0B in 0.674795s: 0.0B/s)
Downloading s00052... Done! 0/56 files downloaded/updated (0.0B in 0.711819s: 0.0B/s)
Downloading s00076... Done! 0/28 files downloaded/updated (0.0B in 0.698597s: 0.0B/s)
Downloading s00079... Done! 0/78 files downloaded/updated (0.0B in 0.75301s: 0.0B/s)
Downloading s00123... Done! 0/48 files downloaded/updated (0.0B in 0.759291s: 0.0B/s)
Downloading s00124... Done! 0/308 files downloaded/updated (0.0B in 0.957151s: 0.0B/s)
Downloading s00135... Done! 22/22 files downloaded/updated (14.7MiB in 8.144163s: 1.8MiB/s)
Downloading s00138... Done! 18/18 files downloaded/updated (25.8MiB in 40.477841s: 652.0KiB/s)
Downloading s00151... Done! 128/128 files downloaded/updated (195.5MiB in 145.757905s: 1.3MiB/s)
Downloading s00175... Done! 2

ProxyError: None: Max retries exceeded with url: /physiobank/database/mimic2wdb/matched/s00302/3967145_0044.hea (Caused by None)

In [None]:
# Count the .hea/.dat entries listed in every folder's SHA256SUMS file.
c = 0
for k in folders:
    resp = requests.get(physioneturl + '/' + k + '/SHA256SUMS')
    # Fail loudly on 4xx/5xx instead of parsing an HTML error page as checksums.
    resp.raise_for_status()
    sha256 = resp.text
    for line in sha256.splitlines():
        # Each entry is "<hash>  <file name>"; split once so a line without the
        # separator (e.g. a blank line) is skipped rather than raising ValueError.
        _, sep, f = line.partition('  ')
        if not sep:
            continue
        if f.endswith(('hea', 'dat')):
            c += 1
    # Carriage return keeps the running count on a single output line.
    print(c, end="\r")
print(c)