In [1]:
import os
import zipfile
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

### NPI files
https://download.cms.gov/nppes/NPI_Files.html

In [2]:
# Folder (relative to the working directory) that receives the downloaded
# archives; creating it is a no-op when it already exists.
download_dir = "nppes_zip_files_v2"
os.makedirs(name=download_dir, exist_ok=True)

#### retrieve the NPI download page

In [3]:
url = "https://download.cms.gov/nppes/NPI_Files.html"

# A timeout keeps a stalled connection from hanging the kernel indefinitely
# (requests waits forever by default).
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# silently produce an empty list of links further down.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Every anchor tag on the page; filtered to the wanted archives in the next step.
links = soup.find_all("a")

#### select zip files of version 2

In [4]:
# Collect the absolute URL of every version-2 zip archive linked from the page.
zip_links = []
for link in links:
    href = link.get("href")
    if not href:
        continue  # anchor without an href attribute
    # Logical `and` (not bitwise `&`) so the test short-circuits and reads as
    # the boolean condition it is.
    if href.endswith(".zip") and ("V2" in href or "v2" in href):
        # urljoin resolves "/x.zip", "./x.zip", "../x.zip" and bare "x.zip"
        # against the page URL and leaves already-absolute URLs untouched.
        zip_links.append(urljoin(url, href))

In [10]:
# Display the collected archive URLs to sanity-check the filter.
zip_links

['https://download.cms.gov/nppes/NPPES_Data_Dissemination_May_2025_V2.zip',
 'https://download.cms.gov/nppes/NPPES_Deactivated_NPI_Report_051225_V2.zip',
 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_050525_051125_Weekly_V2.zip',
 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_051225_051825_Weekly_V2.zip',
 'https://download.cms.gov/nppes/NPPES_Data_Dissemination_051925_052525_Weekly_V2.zip']

#### download each zip file

In [5]:
# Stream every archive into `download_dir`.
# Each file is first written to "<name>.part" and only renamed to its final
# ".zip" name once the transfer has completed, so an interrupted download never
# leaves a truncated file that looks like a valid archive.
for i, zip_url in enumerate(zip_links, 1):
    filename = zip_url.split("/")[-1]
    filepath = os.path.join(download_dir, filename)
    partial_path = filepath + ".part"
    print(f"Downloading {i}/{len(zip_links)}: {filename}")
    # timeout=(connect, read): bounds the connection attempt and the wait
    # between received chunks; it is not a limit on the total download time.
    with requests.get(zip_url, stream=True, timeout=(10, 300)) as r:
        r.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    # Atomic rename; overwrites a file left over from an earlier run.
    os.replace(partial_path, filepath)

print("All files downloaded.")

Downloading 1/5: NPPES_Data_Dissemination_May_2025_V2.zip
Downloading 2/5: NPPES_Deactivated_NPI_Report_051225_V2.zip
Downloading 3/5: NPPES_Data_Dissemination_050525_051125_Weekly_V2.zip
Downloading 4/5: NPPES_Data_Dissemination_051225_051825_Weekly_V2.zip
Downloading 5/5: NPPES_Data_Dissemination_051925_052525_Weekly_V2.zip
All files downloaded.


#### unzip the files

In [6]:
# Extract every zip archive in `download_dir` into a sibling folder named after
# the archive, then delete the archive. Failures are reported and the archive
# is kept so the extraction can be retried.
for filename in sorted(os.listdir(download_dir)):  # sorted -> reproducible order
    if not filename.endswith(".zip"):
        continue
    zip_path = os.path.join(download_dir, filename)
    # splitext drops only the trailing extension; str.replace would also strip
    # any ".zip" occurring elsewhere in the name.
    extract_dir = os.path.join(download_dir, os.path.splitext(filename)[0])

    print(f"Extracting {filename}...")
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Create the target only after the archive opened successfully, so
            # a corrupt zip does not leave an empty folder behind.
            os.makedirs(extract_dir, exist_ok=True)
            zip_ref.extractall(extract_dir)
        os.remove(zip_path)
        print(f"Extracted and deleted {filename}")
    except zipfile.BadZipFile:
        print(f"Failed to extract {filename} (Bad zip file)")
    except Exception as e:
        print(f"Error extracting {filename}: {e}")

print("All zip files extracted.")

Extracting NPPES_Data_Dissemination_050525_051125_Weekly_V2.zip...
Extracted and deleted NPPES_Data_Dissemination_050525_051125_Weekly_V2.zip
Extracting NPPES_Data_Dissemination_051225_051825_Weekly_V2.zip...
Extracted and deleted NPPES_Data_Dissemination_051225_051825_Weekly_V2.zip
Extracting NPPES_Data_Dissemination_051925_052525_Weekly_V2.zip...
Extracted and deleted NPPES_Data_Dissemination_051925_052525_Weekly_V2.zip
Extracting NPPES_Data_Dissemination_May_2025_V2.zip...
Extracted and deleted NPPES_Data_Dissemination_May_2025_V2.zip
Extracting NPPES_Deactivated_NPI_Report_051225_V2.zip...
Extracted and deleted NPPES_Deactivated_NPI_Report_051225_V2.zip
All zip files extracted.


### taxonomy
https://www.nucc.org/index.php/code-sets-mainmenu-41/provider-taxonomy-mainmenu-40/csv-mainmenu-57


In [None]:
url_taxonomy = "https://www.nucc.org/index.php/code-sets-mainmenu-41/provider-taxonomy-mainmenu-40/csv-mainmenu-57"

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response_taxonomy = requests.get(url_taxonomy, timeout=30)
response_taxonomy.raise_for_status()

soup_taxonomy = BeautifulSoup(response_taxonomy.text, "html.parser")

# Deliberately does not rebind the global `links` (the anchors of the NPI page),
# so re-running the zip-selection cell after this one still filters the right page.
taxonomy_hrefs = (anchor.get("href") for anchor in soup_taxonomy.find_all("a"))

# hrefs of all CSV files linked from the page, in document order.
csv_links = [href for href in taxonomy_hrefs if href and href.endswith(".csv")]

In [17]:
# Download the first CSV linked from the taxonomy page into the working directory.
if csv_links:
    # The first link is taken to be the latest version.
    # NOTE(review): this relies on the page listing the newest file first — confirm.
    # urljoin resolves relative hrefs (with or without a leading "/") against
    # the page URL and leaves absolute URLs unchanged.
    csv_url = urljoin(url_taxonomy, csv_links[0])

    filename = os.path.basename(csv_url)

    print(f"Downloading {filename} from {csv_url}")
    # A timeout keeps a stalled connection from hanging the kernel indefinitely.
    csv_response = requests.get(csv_url, timeout=60)
    csv_response.raise_for_status()

    with open(filename, "wb") as f:
        f.write(csv_response.content)
    print(f"Downloaded and saved as {filename}")
else:
    print("No CSV links found on the page.")

Downloading nucc_taxonomy_250.csv from https://www.nucc.org/images/stories/CSV/nucc_taxonomy_250.csv
Downloaded and saved as nucc_taxonomy_250.csv
