In [2]:
import pandas as pd
import numpy as np

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

import re

import time
import datetime

## functions

In [12]:
def download_links(soup,url):
    """
    Find all links to CSV files on the page.
    """
    links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if any(href.lower().endswith(ext) for ext in ['.csv', '.pdf']):
            links.append(urljoin(url, link['href']))
    print(f"Found {len(links)} links to files.")
    return links

In [39]:
def download_files(links,path,csv = True, pdf = True):
    """
    Download all files from the links found on the page.
    """
    for url in links:
        if csv:
            if not url.endswith('.csv'):
                continue
        if pdf:
            if not url.endswith('.pdf'):
                continue
        filename = os.path.basename(url)
        file_path = os.path.join(path, filename)

        print(f"Downloading {filename} ...")
        try:
            file_response = requests.get(url)
            file_response.raise_for_status()
            with open(file_path, 'wb') as f:
                f.write(file_response.content)
            print(f"Saved to {file_path}")
        except Exception as e:
            print(f"Failed to download {url}: {e}")
        time.sleep(1)  
        
    print("All done.")

## montly exclude list

In [9]:
url_csv = "https://oig.hhs.gov/exclusions/supplements.asp"
dowload_dir_csv = "OIG_exclude_csv/raw"

os.makedirs(dowload_dir_csv, exist_ok=True)

In [10]:
response_csv = requests.get(url_csv)
response_csv.raise_for_status()

soup_csv = BeautifulSoup(response_csv.text, 'html.parser')

In [40]:
# download all links
csv_links = download_links(soup_csv, url_csv)
# download/update files
if len(os.listdir(dowload_dir_csv)) == 0:
    download_files(csv_links, dowload_dir_csv)
else:
    csv_list_last_name = "csv_list_last.csv"
    csv_list_last = pd.read_csv(csv_list_last_name)
    last_update_time = csv_list_last.columns[0]

    csv_list_new = pd.DataFrame(np.array(csv_links)).applymap(lambda x: x.lower())
    csv_update = csv_list_new.applymap(lambda x: x not in csv_list_last.values)
    csv_list_update = csv_list_new[csv_update[0] == True]

    if len(csv_list_update) == 0:
        print("No new files to download.")
    else:
        download_files(list(csv_list_update.iloc[:,0]), dowload_dir_csv, csv = True, pdf = False)

Found 33 links to files.
All done.


  csv_list_new = pd.DataFrame(np.array(csv_links)).applymap(lambda x: x.lower())
  csv_update = csv_list_new.applymap(lambda x: x not in csv_list_last.values)


In [38]:
list(csv_list_update.iloc[:,0])

['https://oig.hhs.gov/exclusions/files/leie_updated_information.pdf']

In [None]:
# store the list of csv files we already have
update_time = datetime.datetime.now().strftime("%Y-%m-%d")
pd.DataFrame(np.array(csv_links),columns = [str(update_time)]).applymap(lambda x: x.lower()).to_csv("OIG_exclude_csv/csv_list_last.csv", index=False)

## LEIE Downloadable Databases

In [14]:
url_LEIE = "https://oig.hhs.gov/exclusions/exclusions_list.asp"
dowload_dir_LEIE = "LEIE downloadable"
os.makedirs(dowload_dir_LEIE, exist_ok=True)

In [15]:
response_LEIE = requests.get(url_LEIE)
response_LEIE.raise_for_status()
soup_LEIE = BeautifulSoup(response_LEIE.text, 'html.parser')

If the last update of this webpage is later than that in our database, we need to download the raw data, 
otherwise create a new folder to store the original data.

In [16]:
# find the last update date in the page
match = re.search(r'\b(\d{2}-\d{2}-\d{4})\b', soup_LEIE.text)
if match:
    last_update_web = match.group(1)
    last_update_web = datetime.datetime.strptime(last_update_web, "%m-%d-%Y").date()
    print("Last Update is", last_update_web)
else:
    print("did not find last update date in the page.")    

Last Update is 2025-05-09


In [17]:
# need to update or not
last_update_local = os.listdir(dowload_dir_LEIE)[-1]
last_update_local = datetime.datetime.strptime(last_update_local, "%Y-%m-%d").date()

if last_update_web > last_update_local:
    print("Need to update")
    os.makedirs(os.path.join(dowload_dir_LEIE, last_update_web), exist_ok=True)
    path = os.path.join(dowload_dir_LEIE, str(last_update_web))
    update_status = True
else:
    print("No need to update")
    update_status = False

No need to update


In [None]:
if update_status:
    # find all links to CSV files on the page
    file_links = download_links(soup_LEIE)
    # download all files from the links found on the page
    download_files(file_links, path, csv = True, pdf = True)

## Georgia OIG

In [3]:
url_GA = "https://dch.georgia.gov/office-inspector-general/georgia-oig-exclusions-list"
dowload_dir_GA = "GA OIG exclude"
os.makedirs(dowload_dir_GA, exist_ok=True)

In [4]:
response_GA = requests.get(url_GA)
response_GA.raise_for_status()  # 如果请求失败，将引发异常

soup_GA = BeautifulSoup(response_GA.text, 'html.parser')

In [None]:
download_links = []
for a in soup_GA.find_all('a', href=True):
    href = a['href']
    if href.endswith('/download'):
        full_url = urljoin(url_GA, href)
        download_links.append(full_url)

In [17]:
download_links

['https://dch.georgia.gov/document/document/dch-oig-exclusions-list-05062025/download']

In [18]:
for i, url in enumerate(download_links):

    path_parts = url.strip('/').split('/')
    if len(path_parts) >= 2:
        filename_base = path_parts[-2]
    else:
        filename_base = f"file_{i}"

    filename = filename_base + ".xlsx"
    save_path = os.path.join(dowload_dir_GA, filename)

    print(f"Saving file: {url}")
    try:
        r = requests.get(url, stream=True)
        r.raise_for_status()
        with open(save_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Successfully saved: {save_path}")
    except Exception as e:
        print(f"fail to download: {url}\n Error: {e}")


Saving file: https://dch.georgia.gov/document/document/dch-oig-exclusions-list-05062025/download
Successfully saved: GA OIG exclude\dch-oig-exclusions-list-05062025.xlsx
