<a href="https://colab.research.google.com/github/Colsai/scott_data606/blob/main/Scrapers/HHSOIG_WP_Scraper_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OIG Workplan & Reports Scraper (Step 0)
## Data Scraper for HHS data and Reports
To prepare for topic modeling, we need to scrape OIG's public website for its text data on current Audits and Evaluations. We'll use:
https://oig.hhs.gov/reports-and-publications/workplan/

- Scrape the summary page of every work plan item, for example:
  https://oig.hhs.gov/reports-and-publications/workplan/summary/wp-summary-0000668.asp
- Scrape the report page of every work plan item, for example:
  https://oig.hhs.gov/oei/reports/OEI-02-22-00310.asp

In [None]:
# Mount Google Drive at /content/drive so the CSV paths under
# /content/drive/MyDrive/ used later in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
########################################
# Import general packages for analysis #
########################################
import pandas as pd
import numpy as np
import os
import time
import requests
from bs4 import BeautifulSoup
from timeit import default_timer as timer

# Scrape the active work plan item table from the OIG website.
# [0] keeps the first table pd.read_html finds on the page; [0:-1] drops its
# last row (per the original note, that row is a duplicated header).
df_active_table = pd.read_html('https://oig.hhs.gov/reports-and-publications/workplan/active-item-table.asp')[0][0:-1]

###############################
# Summary Scraper             #
###############################
def scrape_all_summaries(last_scraped_page = 750, 
                         show_output = False):
    '''
    Scrape the table and the summary text of each OIG work plan summary page.

    Summary pages follow the pattern
    https://oig.hhs.gov/reports-and-publications/workplan/summary/wp-summary-0000XXX.asp
    where the trailing number is zero-padded to seven digits.

    Parameters
    ----------
    last_scraped_page : int
        Exclusive upper bound of the page numbers to try: pages
        1 .. last_scraped_page - 1 are requested.
    show_output : bool
        When True, print the number of every page whose table was read and
        the error for every page whose table could not be read.

    Returns
    -------
    pandas.DataFrame
        The first table of every readable page, concatenated, with the added
        columns "Website_Link" and "Summary". "Summary" is an empty string
        when the summary text could not be scraped. An empty DataFrame is
        returned when no page could be read at all.
    '''

    df_all_workplans = []

    for summary_num in range(1, last_scraped_page):
        #Zero-pad the summary number to seven digits, as used in the URL
        summ_num = str(summary_num).zfill(7)
        workplan_website = f"https://oig.hhs.gov/reports-and-publications/workplan/summary/wp-summary-{summ_num}.asp"

        try:
            df = pd.read_html(workplan_website)[0]
        except Exception as e:
            #Best effort: a page that cannot be read or holds no table is skipped
            if show_output:
                print(f"{summ_num}: {e}")
            continue

        df["Website_Link"] = workplan_website

        if show_output:
            print(summ_num)

        try:
            #Scrape work plan website with bs4
            #(timeout so a stalled connection cannot hang the whole run)
            response = requests.get(workplan_website, timeout = 30)
            soup = BeautifulSoup(response.text, 'html.parser')
            paragraphs = soup.find_all('p')

            #Keep everything from the fourth <p> element onwards; [1:-1]
            #strips the surrounding "[" and "]" of the list's string form
            wp_summary = str(paragraphs[3:]).replace("<p>", "").replace("</p>","")[1:-1]
        except Exception as e:
            #Keep the table row even when its summary text is unavailable
            print(e)
            wp_summary = ""

        df["Summary"] = wp_summary

        #Append df to list
        df_all_workplans.append(df)

    #pd.concat raises ValueError on an empty list, so handle "nothing scraped"
    if not df_all_workplans:
        return pd.DataFrame()

    return pd.concat(df_all_workplans)

def run_scraper(last_scraped_page = 750, show_output = True, output_file = False):
    '''
    Run scrape_all_summaries, print how long it took and optionally save the
    result.

    last_scraped_page and show_output are passed straight through to
    scrape_all_summaries. When output_file is True the scraped table is
    written (without its index) to HHS_OIG_workplans.csv on the mounted
    Google Drive. The scraped DataFrame is returned either way.
    '''
    started_at = timer()
    scraped_workplans = scrape_all_summaries(last_scraped_page, show_output)
    finished_at = timer()

    #Elapsed wall-clock time, converted from seconds to minutes
    elapsed_minutes = round((finished_at - started_at)/60,2)
    run_time = f"{elapsed_minutes} minutes"
    print(f"Total Run Time: {run_time}")

    #Output to CSV only when explicitly requested
    if output_file == True:
        scraped_workplans.to_csv('/content/drive/MyDrive/DATA_606/HHS_OIG_workplans.csv', index = False)

    return scraped_workplans

def _scrape_product_page(product_url):
    '''
    Fetch one report page and return [title, summary] scraped from it.

    Both entries are empty strings when the request fails or the page does
    not answer with HTTP status 200.
    '''
    wp_item_title = ''
    wp_item_summary = ''

    try:
        #Timeout so a stalled connection cannot hang the whole run
        response = requests.get(product_url, timeout = 30)
    except requests.RequestException as e:
        #One unreachable page should not discard everything scraped so far
        print(f"{product_url}: {e}")
        return [wp_item_title, wp_item_summary]

    #If the response is positive, scrape the page.
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        title_tags = soup.find_all('title')
        if title_tags:
            wp_item_title = str(title_tags[0]).replace('<title>','').replace('</title>','')

        #Keep everything from the sixth <p> element onwards
        wp_item_summary = str(soup.find_all('p')[5::]).replace("[","").replace("]","").replace("<p>","").replace("</p>","")

        #Pause after each successful page load (presumably to go easy on the server)
        time.sleep(1)

    return [wp_item_title, wp_item_summary]


def scrape_all_reports(df_workplans, file_output = True):
    '''
    Scrape the title and summary of every report referenced by the work plans.

    Parameters
    ----------
    df_workplans : pandas.DataFrame
        Work plan table with a 'Report Number(s)' column. Each string entry
        may hold several identifiers separated by "," or ";"; non-string
        entries (e.g. NaN) are treated as "no reports".
        NOTE: a 'Potential_products' column is added to this frame in place.
        It holds, per row, the list of identifiers that start with 'A' or
        'OEI', or '' for a non-string entry.
    file_output : bool
        When True, the result is also written (without its index) to
        HHS_OIG_Reports.csv on the mounted Google Drive; a failure to write
        is printed, not raised.

    Returns
    -------
    pandas.DataFrame
        One row per scraped identifier with the columns "Report Number(s)",
        "Workplan_Title" and "Workplan_Summary". Title and summary are empty
        strings when the report page could not be scraped.
    '''
    #Strip spaces, unify the separators to ";" and split into identifiers
    all_wps = [elem.replace(" ", "").replace(",",";").split(";") if isinstance(elem, str)
               else elem for elem in df_workplans['Report Number(s)']]

    #Set potential products as anything that starts with 'A' or 'OEI'
    all_products = []
    potential_products = []

    for report_numbers in all_wps:
        if isinstance(report_numbers, list):
            wp_item_reports = [item for item in report_numbers
                               if item.startswith(('A', 'OEI'))]
            all_products.extend(wp_item_reports)
        else:
            wp_item_reports = ''

        potential_products.append(wp_item_reports)

    df_workplans['Potential_products'] = potential_products

    ###############################
    # Products Scraper            #
    ###############################
    wp_item_stats = []

    for idx, prod in enumerate(all_products):
        #Progress indicator
        if idx % 5 == 0:
            print(idx, end = ' ')

        #"A" products are audits
        if prod.startswith('A'):
            #OAS' reports are defined within their websites as:
            #region<region number>/<last 8 characters of the id without dashes>
            wp_test_num = prod.replace('-','')[-8:]

            try:
                region_num = int(prod[2:4])
            except ValueError:
                #Not of the form "A-RR-...": no report URL can be built, so
                #record a blank entry to keep rows aligned with all_products
                wp_item_stats.append(['', ''])
                continue

            OAS_prod_website = f"https://oig.hhs.gov/oas/reports/region{region_num}/{wp_test_num}.asp"
            wp_item_stats.append(_scrape_product_page(OAS_prod_website))

        #OEI products are Evaluations and Inspections
        elif prod.startswith('OEI'):
            OEI_prod_website = f"https://oig.hhs.gov/oei/reports/{prod}.asp"
            wp_item_stats.append(_scrape_product_page(OEI_prod_website))

    #After scraping, combine the identifiers with what was scraped for them
    df_combined = pd.DataFrame({
        "Report Number(s)": all_products,
        "Workplan_Title": [stats[0] for stats in wp_item_stats],
        "Workplan_Summary": [stats[1] for stats in wp_item_stats],
    })

    if file_output == True:
        try:
            #index = False matches how the work plans CSV is written
            df_combined.to_csv('/content/drive/MyDrive/DATA_606/HHS_OIG_Reports.csv', index = False)
        except Exception as e:
            print(e)

    return df_combined

###############################
# Start from Previous         #
###############################
def start_from_previous_wps():
    '''
    Load the previously saved work plans CSV (HHS_OIG_workplans.csv) from the
    mounted Google Drive and return it as a DataFrame.
    '''
    return pd.read_csv('/content/drive/MyDrive/DATA_606/HHS_OIG_workplans.csv')

def start_from_previous_products():
    '''
    Load the previously saved reports CSV (HHS_OIG_Reports.csv) from the
    mounted Google Drive and return it as a DataFrame.
    '''
    return pd.read_csv('/content/drive/MyDrive/DATA_606/HHS_OIG_Reports.csv')

## Save summaries

In [2]:
# Scrape the summary pages with the default arguments
# (last_scraped_page = 750, show_output = False).
df_workplan_summaries = scrape_all_summaries()

In [10]:
#Remove problem column here, if it is present
try:
    df_workplan_summaries.drop(columns = 'Office of Evaluation and Inspections', inplace = True)
except KeyError as e:
    #drop() raises KeyError when the column is absent; report it and carry on.
    #Any other error (e.g. the frame not being defined) is left to surface.
    print(e)

"['Office of Evaluation and Inspections'] not found in axis"


In [None]:
# Save the scraped summaries as a CSV (relative path, so it lands in the
# current working directory); index = False leaves the row index out.
df_workplan_summaries.to_csv('HHS_workplan_summaries.csv', index = False)