In [None]:
import requests
import os
import tarfile
from tqdm import tqdm

### Downloading the [ChestX-ray8 Dataset](https://arxiv.org/abs/1705.02315)

In [None]:
# Box.com share IDs of the archives this notebook downloads, in download order.
_BOX_SHARE_IDS = (
    'vfk49d74nhbxq3nqjg0900w5nvkorp5c',
    'i28rlmbvmfjbl8p2n3ril0pptcmcu9d1',
    'f1t00wrtdk94satdfb9olcolqx20z2jp',
    '0aowwzs5lhjrceb3qp67ahp0rd1l1etg',
    'v5e3goj22zr6h8tzualxfsqlqaygfbsn',
    'asi7ikud9jwnkrnkj99jnpfkjdes7l6l',
    'jn1b4mw4n6lnh74ovmcjb8y48h8xj07n',
    'tvpxmn7qyrgl0w8wfh9kqfjskv6nmm1j',
    'upyy3ml7qdumlgk2rfcvlb9k6gvqq2pj',
    'l6nilvfa9cg3s28tqv1qc1olm3gnz54p',
    'hhq8fkdgvcari67vfhs7ppg2w6ni4jze',
    'ioqwiy20ihqwyr8pf4c24eazhh281pbu',
)

# Full download URL for each archive; every URL shares the same prefix and
# `.gz` suffix, so only the share ID varies.
links = [
    f'https://nihcc.box.com/shared/static/{share_id}.gz'
    for share_id in _BOX_SHARE_IDS
]

Create a directory for the data, download the compressed tar.gz files, extract them into the `data/images` folder, and remove the downloaded .gz files.

*Warning: This will occupy ~45 GB of disk space*

In [None]:
# Root folder for the downloaded archives and their extracted contents. The
# notebook text describes the result as `data/images`, so the previous value
# 'dataa' was a typo.
extract_dir = 'data'
os.makedirs(extract_dir, exist_ok=True)

# Download chunk size. The archives are large (the notebook warns of ~45 GB in
# total), so 1 MiB chunks avoid spending most of the time in per-chunk overhead.
block_size = 1024 * 1024

# Use the 'data' extraction filter where the running Python provides it, so
# archive members with absolute paths or `..` components cannot be written
# outside extract_dir. Older interpreters fall back to the default behaviour.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

for i, link in enumerate(links):
    file_name = link.split('/')[-1]
    gz_path = os.path.join(extract_dir, file_name)

    print(f"Downloading file {i+1}/{len(links)}: {file_name}...")
    # (connect, read) timeouts make a stalled connection raise instead of
    # hanging forever; the context manager releases the connection on any exit.
    with requests.get(link, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print(f"Failed to download {file_name}. Status code: {response.status_code}")
            continue

        total_size_in_bytes = int(response.headers.get('content-length', 0))
        # Content-Length counts bytes on the wire; it is only comparable to the
        # bytes written when the server applied no Content-Encoding.
        size_is_checkable = (
            total_size_in_bytes > 0 and 'content-encoding' not in response.headers
        )

        bytes_written = 0
        # unit_divisor=1024 makes the displayed kiB/MiB/GiB figures truly binary.
        with open(gz_path, 'wb') as file, tqdm(
            total=total_size_in_bytes, unit='iB', unit_scale=True, unit_divisor=1024
        ) as progress_bar:
            for data in response.iter_content(block_size):
                file.write(data)
                bytes_written += len(data)
                progress_bar.update(len(data))

    # A connection dropped mid-transfer leaves a truncated archive; discard it
    # rather than handing a corrupt file to tarfile.
    if size_is_checkable and bytes_written != total_size_in_bytes:
        os.remove(gz_path)
        print(f"Incomplete download of {file_name}: "
              f"received {bytes_written} of {total_size_in_bytes} bytes")
        continue

    print(f"Successfully downloaded file {i+1}/{len(links)}: {file_name}")

    print(f"Extracting file {i+1}/{len(links)}: {file_name}...")
    with tarfile.open(gz_path, 'r:gz') as tar:
        # Listing the members first gives the progress bar a total to count towards.
        members = tar.getmembers()
        for member in tqdm(members, unit='files', desc=f"Extracting {file_name}"):
            tar.extract(member, path=extract_dir, **extract_kwargs)
    print(f"Successfully extracted file {i+1}/{len(links)}: {file_name}")

    # The archive is no longer needed once its contents are on disk.
    os.remove(gz_path)
    print(f"Removed {file_name}\n\n")