In [29]:
import requests
import os
import zipfile
from tqdm import tqdm

In [30]:
urls = [
    "https://www.hesa.ac.uk/data-and-analysis/graduates/table-1.csv",
    "https://www.hesa.ac.uk/data-and-analysis/graduates/table-2.csv",
    "https://www.hesa.ac.uk/data-and-analysis/graduates/table-3.csv",
    "https://www.hesa.ac.uk/data-and-analysis/graduates/table-4.csv",
    "https://www.hesa.ac.uk/data-and-analysis/graduates/table-6.csv",
    "https://www.hesa.ac.uk/data-and-analysis/graduates/table-8.csv"
]

In [31]:
def download_graduate_outcome_files(urls, dir_to_save_to):
    """Downloads files from the given URLs and also creates a dir for the given input

    ------------------------------

    Variables:

      urls: str
        The URLs to download files from. There should be from the graduate outcomes source files from HESA
      
      dir_to_save_to: str
        The directory you want to create and/ or store files to. If exists, OK.
    
    ------------------------------

    Returns:
      None
    """
    
    # make the dir if it does not exist; OK if it does
    os.makedirs(dir_to_save_to, exist_ok=True)

    for url in urls:
        r = requests.get(url, stream=True)
        download_file_name = r.headers['Etag'].replace('"','')

        with open(f'{dir_to_save_to}/{download_file_name}.zip', 'wb') as handle:
            for data in tqdm(r.iter_content(chunk_size=100)):
                handle.write(data)

        # extract zip files for processing
        for root, dirs, files in os.walk(dir_to_save_to):
            for file in files:
                if file.endswith('.zip'):
                    zip_file_name = os.path.join(root, file)
                    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
                        zip_ref.extractall(dir_to_save_to)
    
    # delete zips
    for root, dirs, files in os.walk(dir_to_save_to):
        for file in files:
            if file.endswith('.zip'):
                os.remove(os.path.join(root, file))

    print("Finished downloading files")

download_graduate_outcome_files(urls=urls, dir_to_save_to='extracted-tables')


327547it [00:01, 206542.41it/s]
432715it [00:03, 139030.44it/s]
305070it [00:01, 232916.19it/s]
208984it [00:00, 229632.57it/s]
128743it [00:00, 218726.54it/s]
133497it [00:00, 229798.58it/s]


Finished downloading files
