In [2]:
import numpy as np
import pandas as pd
import os

import datetime

import fitz  # PyMuPDF
import re

## LEIE

### List of all exclusions and reinstatements starting in 2024

In [41]:
path_OIG = "OIG_exclude_csv/raw"

In [42]:
df_excl = pd.DataFrame()
df_rein = pd.DataFrame()
for filename in os.listdir(path_OIG):
    df1 = pd.read_csv(os.path.join(path_OIG, filename))
    df1['Y&M'] = filename[:4]
    if filename[-8:-4].lower() == "rein":
        if df_rein.empty:
            df_rein = df1
        else:
            df_rein = pd.concat([df_rein, df1], ignore_index=True)
    elif filename[-8:-4].lower() == "excl":
        if df_excl.empty:
            df_excl = df1
        else:
            df_excl = pd.concat([df_excl, df1], ignore_index=True)

In [45]:
True in df_rein.duplicated(subset = df_rein.columns[:6])

False

In [46]:
True in df_excl.duplicated(subset = df_excl.columns[:6])

False

In [48]:
df_all = pd.merge(df_excl, df_rein, how='outer', on = df_excl.columns[:6].tolist(),
         suffixes=('_excl', '_rein'))

In [49]:
update_time = datetime.datetime.now().strftime("%Y-%m-%d")

In [51]:
df_all.to_csv("OIG_exclude_csv/OIG_all_{}.csv".format(update_time), index=False)
df_rein.to_csv("OIG_exclude_csv/OIG_rein_{}.csv".format(update_time), index=False)
df_excl.to_csv("OIG_exclude_csv/OIG_excl_{}.csv".format(update_time), index=False)

In [None]:
df_all.query("REINDATE_rein.isnull()")

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN_excl,NPI_excl,DOB_excl,ADDRESS_excl,...,ADDRESS_rein,CITY_rein,STATE_rein,ZIP_rein,EXCLTYPE_rein,EXCLDATE_rein,REINDATE_rein,WAIVERDATE_rein,WVRSTATE_rein,Y&M_rein
0,AAKER,DEBHANNA,,,EMPLOYEE - PRIVATE S,HOME HEALTH AGENCY,,0.000000e+00,19820311.0,2006 OAK ST,...,,,,,,,,,,
2,ABAD-SANTOS,CRISELDA,CALAYAN,,"PHYSICIAN (MD, DO)",PSYCHIATRY,,1.760462e+09,19631220.0,8506 N ADIR DR,...,,,,,,,,,,
3,ABANDA,JACOB,ATAMBILI,,IND- LIC HC SERV PRO,NURSE PRACTITIONER (,,0.000000e+00,19771212.0,12014 OSAGE PARK DRIVE,...,,,,,,,,,,
4,ABARE,RICKY,JOHN,,IND- LIC HC SERV PRO,THERAPIST,,0.000000e+00,19920129.0,4010 BOLTON VALLEY ACCESS ROAD,...,,,,,,,,,,
6,ABAZIED COGAR,JOY,GAYLE,,IND- LIC HC SERV PRO,NURSE/NURSES AIDE,,0.000000e+00,19670405.0,101 DEE ST,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4817,,,,THE HEALING TREE WELLNESS CENT,OTHER BUSINESS,MENTAL/BEHAVIORAL HE,,1.093162e+09,,"4270 S DECATUR BOULEVARD, SUIT",...,,,,,,,,,,
4818,,,,"TRUE FAMILY SERVICES, INC",OTHER BUSINESS,MENTAL/BEHAVIORAL HE,,1.124308e+09,,914 EAST SAHARA AVENUE,...,,,,,,,,,,
4820,,,,UNITED MEMORIAL MEDICAL CENTER,OTHER BUSINESS,HOSPITAL,,1.891741e+09,,510 W TIDWELL ROAD,...,,,,,,,,,,
4821,,,,"VDA OC, LLC",GOVERNMENT CONTRACTO,OTHER CONTRACTOR,,1.649610e+09,,C/O REGISTERED AGENT - COOLIDG,...,,,,,,,,,,


### leie_updated_information.pdf

In [None]:
pdf_path = "LEIE downloadable/2025-05-09/" 

file_name_update =  "leie_updated_information.pdf"
doc = fitz.open(pdf_path+file_name_update)

In [4]:
month_list = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

header_pos = {
    "LASTNAME": 2,
    "FIRSTNAME": 4,
    "MIDNAME": 6,
    "BUSNAME": 8,
    "GENERAL": 10,
    "SPECIALTY": 12,
    "UPIN": 14,
    "NPI": 16,
    "DOB": 18,
    "ADDRESS": 20,
    "CITY": 22,
    "STATE": 24,
    "ZIP": 26,
    "EXCLTYPE": 28,
    "EXCLDATE": 30
}

rename_dict = {
    0: "Update month",
    1: "Update",
    3: "LASTNAME",
    5: "FIRSTNAME",
    7: "MIDNAME",
    9: "BUSNAME",
    11: "GENERAL",
    13: "SPECIALTY",
    15: "UPIN",
    17: "NPI",
    19: "DOB",
    21: "ADDRESS",
    23: "CITY",
    25: "STATE",
    27: "ZIP",
    29: "EXCLTYPE",
    31: "EXCLDATE"   
}

In [5]:
pages = []
pages_abnormal = []
num_abnormal = []
time_update = ''

for num,page in enumerate(doc):
    text = page.get_text()
    lines = [line.strip() for line in text.split('\n')]
    if lines[0] == "Updated LEIE Information":
        lines = lines[1:]

    if lines[0] == "":
        lines = lines[1:]

    if lines[0][:3] in month_list:
        time_update = lines[0]
    else:
        lines.insert(0, time_update)

    for i in header_pos.keys():
        index = lines.index(i)      
        if index < header_pos[i]:  
            lines.insert(index, "")
        elif index > header_pos[i]:
            lines_before = lines[:header_pos[i]-1]
            lines_after = lines[index:]
            lines_between = [" ".join(lines[header_pos[i]-1:index])]
            if isinstance(lines_before, str):
                lines_before = [lines_before]
            lines = lines_before + lines_between + lines_after

    # print(lines)

    if len(lines) > 34:
        pages_abnormal.append(lines)
        num_abnormal.append(num)
    else:
        pages.append(lines)


#### most of the pages can be tranformed by the codes above

In [6]:
df_raw = pd.DataFrame(pages)
df_raw.rename(columns=rename_dict, inplace=True)
df_raw = df_raw[rename_dict.values()]
df_raw['Update'] = df_raw['Update'].str.replace("Update:","")

In [7]:
df_raw

Unnamed: 0,Update month,Update,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLTYPE,EXCLDATE
0,April 2025,Subject Sub-Type and Subject Type,,,,BELL LAB AND X-RAY SERVICES,OTHER BUSINESS,LABORATORY,,0000000000,,"17442 VIRGINIA AVENUE, APT 2",BELLFLOWER,CA,907060000,1128a1,19870724
1,April 2025,Subject Type and Subject Sub-Type,,,,COALFIELD PHARMACY,OTHER BUSINESS,PHARMACY,,0000000000,,109 HICKORY HOLLOW DRIVE,OAK RIDGE,TN,378300000,1128a1,19880624
2,April 2025,Subject Type and Subject Sub-Type,,,,HINTON PHARMACY,OTHER BUSINESS,PHARMACY,,0000000000,,C/O 1447 COUNTY FARM ROAD,RAYMOND,MS,391540000,1128b8,19950216
3,April 2025,Subject Type and Subject Sub-Type,,,,"JOHN'S MINI-BUS SERVICE, INC",OTHER BUSINESS,TRANSPORTATION CO,,0000000000,,148-14 LIBERTY AVENUE,JAMAICA,NY,114350000,1128a1,19870302
4,April 2025,Subject Sub-Type and Subject Type,,,,"RONALD W HIGGINS, D D S, P C",OTHER BUSINESS,DENTAL PRACTICE,,0000000000,,3400 SQUALICUM PARKWAY,BELLINGHAM,WA,982250000,1128a1,19881228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,March 2025,Date of Birth,PEREZ OYOLA,MAYRA,ELENA,,BUS OWNER/EXEC,RENAL FACILITY,,0000000000,19530909,A1 CALLE 1,SAN JUAN,PR,009240000,1128a2,20250320
113,March 2025,Date of Birth,TAYLOR,NANCY,LOCKWOOD,,IND- LIC HC SERV PRO,NURSE/NURSES AIDE,,0000000000,19530915,2509 LOCH HAVEN DRIVE,PLANO,TX,750235316,1128b4,20160720
114,February 2025,SSN,BOLDEN,DANTE,L,,EMPLOYEE - PRIVATE S,SKILLED NURSING FAC,,0000000000,19920127,"827 N MADISON ST, # 19801",WILMINGTON,DE,198011437,1128a2,20240118
115,February 2025,Date of Birth,HEMBREE,LENORA,JEAN,,NURSING PROFESSION,NURSE/NURSES AIDE,,0000000000,19671001,"1728 102ND STREET, UNIT B",LUBBOCK,TX,794230000,1128b4,19890623


#### Manual input of a PDF that cannot be deconstructed properly

In [None]:
pd.DataFrame(pages_abnormal).to_csv("LEIE downloadable/abnormal_pages.csv", index=False)

In [None]:
df_changed = pd.read_csv("LEIE downloadable/changed_abnormal_pages.csv")
df_raw = pd.concat([df_raw, df_changed], ignore_index=True)

#### store as csv

In [21]:
df_raw.to_csv(pdf_path + file_name_update[:-4] + ".csv", index=False)

### leie_record_layout

In [22]:
file_name_layout =  "leie_record_layout.pdf"
doc_layout = fitz.open(pdf_path+file_name_layout)

In [None]:
for num,page in enumerate(doc_layout):
    text = page.get_text()
    lines = [line.strip() for line in text.split('\n')]
    lines = [line for line in lines if line != ""]
    pd.DataFrame({lines[0]: lines[2::2], lines[1]: lines[3::2]}).to_csv(pdf_path + file_name_layout[:-4] + ".csv", index=False)

## Georgia OIG

In [7]:
xls = pd.ExcelFile("GA OIG exclude/dch-oig-exclusions-list-05062025.xlsx")

all_sheets = {}
for sheet_name in xls.sheet_names:
    df = xls.parse(sheet_name)
    if df.empty:
        continue
    all_sheets[sheet_name] = df

In [8]:
all_sheets.keys()

dict_keys(['Sheet1'])

In [17]:
df = pd.read_excel("GA OIG exclude/dch-oig-exclusions-list-05062025.xlsx", sheet_name="Sheet1",skiprows=2)

In [20]:
df.to_csv("GA OIG exclude/dch-oig-exclusions-list-05062025.csv", index=False)