In [61]:
from pathlib import Path
import requests
import pandas as pd
import os
from html.parser import HTMLParser
from tqdm.auto import tqdm

In [63]:
# Locate the data directory relative to the kernel's working directory: it is
# expected to sit next to (one level above) the directory the notebook runs in.
# `__file__` is not defined inside a notebook, so the working directory is the
# only available anchor; Path.cwd() states that dependency explicitly.
data_dir = Path.cwd().resolve().parent / "data"
# Fail fast, and say where we looked, when the notebook is started elsewhere.
assert data_dir.exists(), "Expected data directory at {}".format(data_dir)

In [10]:
# Base URL of the HTTP directory listing that serves the Storm Events CSV
# archives. File names are appended to it directly, so the trailing slash matters.
http_server_uri = "https://www.ncei.noaa.gov/pub/data/swdi/stormevents/csvfiles/"

In [30]:
# Fetch the directory listing page. A timeout is mandatory here: without one,
# requests waits indefinitely on a stalled connection and the cell never returns.
test_response = requests.get(http_server_uri, timeout=30)
# Force UTF-8 decoding of `.text` instead of relying on the detected charset.
test_response.encoding = "utf-8"
# Last expression of the cell: display the HTTP status for a quick sanity check.
test_response.status_code

200

In [47]:
class StormDataLinkParser(HTMLParser):
    """Collect Storm Events archive names from an HTML directory listing.

    Every text node whose stripped content starts with ``StormEvents`` and ends
    with ``.csv.gz`` is appended, in document order, to ``download_file_list``.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``HTMLParser`` and start with an empty result list."""
        super().__init__(**kwargs)
        # File names found so far, in the order they appear in the fed HTML.
        self.download_file_list = []

    def handle_data(self, data):
        """Record ``data`` if, once stripped, it is a Storm Events archive name.

        Args:
            data: Raw text node handed over by ``HTMLParser``.
        """
        # The collected values are later used as URL suffixes and path
        # components, so surrounding whitespace must not leak into them and
        # free text that merely mentions both tokens must not be collected.
        candidate = data.strip()
        if candidate.startswith("StormEvents") and candidate.endswith(".csv.gz"):
            self.download_file_list.append(candidate)
        
        
# Parse the listing page fetched earlier; the parser accumulates matching
# file names in `download_file_list` as a side effect of `feed`.
html_parser = StormDataLinkParser()
html_parser.feed(test_response.text)
# Last expression of the cell: display how many archive names were found.
len(html_parser.download_file_list)

216

In [48]:
# Smoke test: download only the first listed archive before looping over all
# of them. The timeout keeps the cell from hanging on a stalled connection.
test_download_response = requests.get(
    http_server_uri + html_parser.download_file_list[0],
    timeout=60,
)
# Last expression of the cell: display the HTTP status of the test download.
test_download_response.status_code

200

In [65]:
# Directory that receives the gzip archives exactly as served.
raw_dir = data_dir / "raw/StormEventsGZIP"
# Create it (and any missing parents) up front so the writes below succeed on
# a fresh checkout; a no-op when the directory already exists.
raw_dir.mkdir(parents=True, exist_ok=True)

In [54]:
# Persist the body of the test download under the first listed file name,
# replacing any existing file at that path.
test_write_path = raw_dir.joinpath(html_parser.download_file_list[0])
# Trailing semicolon: keep the returned byte count out of the cell output.
test_write_path.write_bytes(test_download_response.content);

In [60]:
# Dry run over the first two listed files: report, without downloading
# anything, which ones would be skipped because they already exist in raw_dir.
for file_name in html_parser.download_file_list[:2]:
    file_name_path = raw_dir / file_name
    message_template = (
        "Will not download {}!" if file_name_path.exists() else "will download {}!"
    )
    print(message_template.format(file_name))

Will not download StormEvents_details-ftp_v1.0_d1950_c20210803.csv.gz!
will download StormEvents_details-ftp_v1.0_d1951_c20210803.csv.gz!


In [76]:
# Download every listed archive that is not already present in raw_dir.
failed_downloads = []  # (file name, HTTP status) for each non-200 response
for file_name in tqdm(html_parser.download_file_list):
    file_name_download_path = raw_dir / file_name
    if file_name_download_path.exists():
        continue  # already downloaded on a previous run

    # Stream into a temporary ".part" file and rename it only once the body is
    # complete. Writing straight to the final path would let an interrupted
    # download leave a truncated archive that the exists() check above then
    # treats as finished on every later run.
    partial_download_path = raw_dir / (file_name + ".part")
    try:
        # The context manager releases the streamed connection even when the
        # body is not fully consumed; the timeout prevents an indefinite hang.
        with requests.get(http_server_uri + file_name, stream=True, timeout=60) as download_response:
            if download_response.status_code != 200:
                failed_downloads.append((file_name, download_response.status_code))
                continue
            with partial_download_path.open('wb') as f:
                for chunk in download_response.iter_content(chunk_size=64 * 1024):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Only reached after the whole body was written successfully.
        partial_download_path.replace(file_name_download_path)
    finally:
        # Drop the leftover partial file on an error or a non-200 response.
        if partial_download_path.exists():
            partial_download_path.unlink()

# Surface skipped files instead of dropping non-200 responses silently.
if failed_downloads:
    print("Failed to download {} file(s): {}".format(len(failed_downloads), failed_downloads))

  0%|          | 0/216 [00:00<?, ?it/s]

In [77]:
# Snapshot every entry currently present in raw_dir (as Path objects) so the
# next cell can compare it against the server's file list.
already_downloaded_files = list(raw_dir.glob("*"))
# Last expression of the cell: display the number of entries found.
len(already_downloaded_files)

216

In [78]:
# Check that every file listed by the server is present in the directory
# snapshot. A set gives constant-time membership tests, and collecting the
# missing names lets a failure say exactly which files to retry.
downloaded_lookup = set(already_downloaded_files)
missing_files = [x for x in html_parser.download_file_list if raw_dir / x not in downloaded_lookup]
if not missing_files:
    print("All files from the HTTP server have been downloaded!")
else:
    print("Some files failed to download")
    for missing_name in missing_files:
        print("  " + missing_name)

All files from the HTTP server have been downloaded!
