In [1]:
import sys
from file_utils import *
import lazyjson
from bs4 import BeautifulSoup
import requests
import datetime

# Raise the interpreter's recursion limit to 100000 for the whole kernel session.
# NOTE(review): the reason is not visible in this notebook -- presumably deep
# recursion in one of the imported libraries; confirm it is still required.
sys.setrecursionlimit(100000)

In [2]:
# Base URL of the remote database; used both for the folder listing and as the
# prefix of every per-folder download URL.
physioneturl = "https://physionet.org/physiobank/database/mimic2db"
# Path of the JSON file opened through lazyjson below; it records, per folder
# and per file, the hash and size of what has been downloaded.
db_file = 'db.json'
# Local root directory under which one sub-folder per remote folder is created.
output = 'data/mimic2'

create_folder(output)

# Seed the bookkeeping file with an empty JSON object on the first run so that
# it can be opened below.
if not file_exists(db_file):
    create_file(db_file, contents='{}')
db = lazyjson.File(db_file)

In [3]:
# Fetch the database index page and extract the list of folder names from it.
homepage = requests.get(physioneturl)
# Stop on an HTTP error (4xx/5xx) instead of scraping an error page, which
# would otherwise fail later with an opaque AttributeError on soup.find('pre').
homepage.raise_for_status()
soup = BeautifulSoup(homepage.text, 'html.parser')
# Take the links inside the listing's <pre> block, skipping the first 15 and
# the last one, and drop the final character of each link text.
# NOTE(review): presumably the skipped links are header/navigation entries and
# the dropped character is a trailing '/' -- confirm against the current page.
tmp = [e.string[:-1] for e in soup.find('pre').find_all('a')[15:-1]]
print('Got {} folders'.format(len(tmp)))

Got 4294 folders


In [9]:
def download_folder(name):
    """Download and verify the ``hea``/``dat`` files of one remote folder.

    The folder's ``SHA256SUMS`` listing is fetched from ``physioneturl/<name>``.
    Every listed file whose name ends in ``hea`` or ``dat`` is downloaded into
    ``output/<name>`` unless a local copy exists and ``db[name]`` already
    records the listed hash for it. After each download the file's SHA-256 is
    compared with the listed hash, and the hash and size are stored in
    ``db[name][<file>]``.

    Args:
        name: Folder name, relative to ``physioneturl``.

    Returns:
        The summed ``file_size`` of the files downloaded by this call; files
        skipped because they were already valid are not counted.

    Raises:
        requests.HTTPError: If the ``SHA256SUMS`` request returns an error
            status.
        AssertionError: If a downloaded file does not match its listed hash.
    """
    dir_url = physioneturl + '/' + name
    dir_path = output + '/' + name

    response = requests.get(dir_url + '/SHA256SUMS')
    # Fail loudly on 4xx/5xx rather than parsing an error page as a checksum
    # listing.
    response.raise_for_status()
    sha256 = response.text

    create_folder(dir_path)
    if name not in db:
        db[name] = {}

    total_size = 0
    for l in sha256.splitlines():
        # Tolerate blank lines in the listing.
        if not l.strip():
            continue
        # Split only on the first separator so that a filename containing two
        # consecutive spaces does not break the unpacking.
        h, f = l.split('  ', 1)

        if not f.endswith(('hea', 'dat')):
            continue

        file_url = dir_url + '/' + f
        file_path = dir_path + '/' + f

        # A local file is trusted only if it exists and the hash recorded for
        # it in db matches the listed one.
        # NOTE(review): this relies on the value read back from lazyjson
        # comparing equal to a plain str -- confirm for the installed version.
        valid = (
            file_exists(file_path)
            and f in db[name]
            and db[name][f]['hash'] == h
        )
        if valid:
            continue

        delete_file(file_path)
        download_file(file_url, file_path)

        # Explicit check instead of a bare `assert`, so that verification is
        # not stripped when Python runs with -O. AssertionError is kept so the
        # exception type stays the same as before.
        actual = sha256_checksum(file_path)
        if not h == actual:
            raise AssertionError(
                'SHA256 mismatch for {}: expected {}, got {}'.format(
                    file_path, h, actual))

        s = file_size(file_path)
        total_size += s
        db[name][f] = {
            'hash': h,
            'size': s
        }
    return total_size
    

In [None]:
# Download every folder in turn, printing per-folder throughput, and stop once
# the cumulative amount downloaded by this cell exceeds the cap below.
# The cap is checked after each folder, so the total may overshoot it by up to
# one folder's worth of data.
max_total_bytes = 100 * 1024**3

size = 0
for k in tmp:
    start = datetime.datetime.now()
    print('Downloading {}...'.format(k), end='')
    s = download_folder(k)
    # Keep the running total up to date; without this the cap check at the
    # bottom of the loop could never trigger.
    size += s
    elapsed = datetime.datetime.now() - start
    # total_seconds() also accounts for the `days` component of the timedelta.
    total_sec = elapsed.total_seconds()
    # Guard against a zero-length interval when computing the rate.
    rate = s / total_sec if total_sec > 0 else 0.0
    print(' Done! {} files downloaded/updated ({} in {}s: {}/s)'.format(
        len(db[k]),
        sizeof_fmt(s),
        total_sec,
        sizeof_fmt(rate)
    ))

    if size > max_total_bytes:
        break

In [None]:
# Count, across all folders, how many files listed in the remote SHA256SUMS
# files have a name ending in 'hea' or 'dat'. Nothing is downloaded besides the
# listings themselves.
c = 0
for k in tmp:
    response = requests.get(physioneturl + '/' + k + '/SHA256SUMS')
    # Fail loudly on 4xx/5xx rather than parsing an error page as a listing.
    response.raise_for_status()
    sha256 = response.text
    for l in sha256.splitlines():
        # Tolerate blank lines in the listing.
        if not l.strip():
            continue
        # Split only on the first separator so that a filename containing two
        # consecutive spaces does not break the unpacking.
        h, f = l.split('  ', 1)
        if f.endswith(('hea', 'dat')):
            c += 1
    # Overwrite the same output line with the running count after each folder.
    print(c, end="\r")
print(c)