# Setup

Import the modules used throughout this notebook.

In [32]:
import bs4
import pathlib
import urllib.request
import urllib.parse
import requests

# Constants

In [33]:
# Directory listing page that links to the dataset's CSV files; hrefs found on it are resolved against this URL.
DATASET_URL = 'https://physionet.org/files/auditory-eeg/1.0.0/Segmented_Data/'

# Utilities

In [34]:
def get_dataset() -> pathlib.Path:
    """
    Downloads (if necessary) the dataset and retrieves the path to the root of the dataset files directory.

    The download is staged in a sibling ``<name>.partial`` directory which is renamed to the final location only
    after every file has been fetched. An interrupted download therefore never leaves behind a directory that a
    later call would mistake for the complete dataset.

    :return: the path object pointing to the dataset directory.
    """
    data_directory = _get_data_directory()
    # An empty directory is the residue of an aborted download; only a populated one counts as the dataset.
    if data_directory.is_dir() and any(data_directory.iterdir()):
        return data_directory
    staging_directory = data_directory.with_name(data_directory.name + '.partial')
    # Reusing a leftover staging directory is fine: files are simply written again.
    staging_directory.mkdir(parents=True, exist_ok=True)
    _download_dataset(staging_directory)
    if data_directory.is_dir():
        # Only reached for an empty directory (see the check above), so rmdir cannot discard data.
        data_directory.rmdir()
    staging_directory.rename(data_directory)
    return data_directory


def _get_data_directory() -> pathlib.Path:
    """
    Builds the location of the data directory: an entry named ``data`` inside the parent of the resolved current
    working directory.

    :return: the data directory path object.
    """
    working_directory = pathlib.Path().resolve()
    parent_directory = working_directory.parent
    return parent_directory.joinpath('data')


def _download_dataset(target_path: pathlib.Path):
    """
    Initiates download of the dataset and saves all files into the given target path directory.

    :param target_path: the target path directory.
    :raises requests.HTTPError: if the listing page request returns an HTTP error status.
    """
    # A timeout keeps a stalled connection from blocking the caller indefinitely.
    with requests.get(DATASET_URL, timeout=60) as listing_page:
        # Fail loudly on 4xx/5xx: an error page parsed as a listing would silently yield no files.
        listing_page.raise_for_status()
        listing_soup = bs4.BeautifulSoup(
            listing_page.content,
            features='html.parser'
        )
        _download_files_in_listing(target_path, listing_soup)


def _download_files_in_listing(target_path: pathlib.Path, listing_soup: bs4.BeautifulSoup):
    """
    Helper function which iterates over all file links in the given BeautifulSoup object and downloads each CSV file
    into the target path's directory.

    The local file name is the final component of the resolved URL's path, never the raw href. Absolute hrefs and
    hrefs containing ``..`` or sub-directories therefore cannot place a file outside the target directory.

    :param target_path: the target path directory.
    :param listing_soup: the BeautifulSoup object to use to find download links.
    """
    for file_link in listing_soup.find_all('a'):
        file_href = file_link.get('href')
        if not file_href:
            continue
        file_url = urllib.parse.urljoin(DATASET_URL, str(file_href))
        # Looking only at the URL path ignores any query string or fragment attached to the link.
        remote_path = urllib.parse.unquote(urllib.parse.urlsplit(file_url).path)
        file_name = pathlib.PurePosixPath(remote_path).name
        if not file_name.endswith('.csv'):
            continue
        _download_url_to_file(target_path / file_name, file_url)


def _download_url_to_file(file_path: pathlib.Path, url: str):
    """
    Downloads the given URL's remote content to the given file path.

    The response body is streamed to disk in chunks instead of being buffered in memory first.

    :param file_path: the file path to download to.
    :param url: the URL to download from.
    :raises requests.HTTPError: if the request returns an HTTP error status.
    """
    # stream=True defers fetching the body until iter_content is consumed below.
    with requests.get(url, stream=True, timeout=60) as response:
        # Check the status before opening the file so an error page is never saved as data.
        response.raise_for_status()
        with open(file_path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                out_file.write(chunk)


# Setup Dataset

In [35]:
# Resolve the dataset directory (get_dataset downloads the files when the directory is not there yet) and report it.
dataset_path = get_dataset()
print(f'Dataset path: {dataset_path}')

KeyboardInterrupt: 