## Scraping Oregon State Hospital Data
#### This notebook uses beautiful soup to scrape healthcare data from https://oregonhospitalguide.org

In [1]:
#import libraries
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests

### Get list of hospitals

In [2]:
#the list of hospital URLS
hospital_urls = ['mid-columbia-medical-center/',
                 'asante-three-rivers-medical-center/',
                 'providence-st-vincent-medical-ctr/',
                 'asante-ashland-community-hospital/',
                 'legacy-emanuel-medical-center/',
                 'ohsu-hospital/',
                 'good-samaritan-regional-medical-center/',
                 'legacy-good-samaritan-medical-center/',
                 'asante-rogue-regional-medical-center/',
                 'mckenzie-willamette-medical-center/',
                 'tuality-healthcare/',
                 'samaritan-albany-general-hospital/',
                 'legacy-mount-hood-medical-center/',
                 'mercy-medical-center/',
                 'legacy-silverton-medical-center/',
                 'peacehealth-sacred-heart-medical-center-university-district/',
                 'providence-newberg-medical-center/',
                 'providence-willamette-falls-medical-ctr/',
                 'st-charles-redmond/','st-charles-bend/',
                 'sky-lakes-medical-center/',
                 'salem-hospital/',
                 'saint-alphonsus-medical-center---ontario/'
                 ,'santiam-memorial-hospital/',
                 'adventist-health-portland/',
                 'providence-portland-medical-center/',
                 'willamette-valley-medical-center/',
                 'providence-medford-medical-center/',
                 'providence-milwaukie-hospital/',
                 'legacy-meridian-park-medical-center/',
                 'bay-area-hospital/',
                 'kaiser-sunnyside-medical-center/',
                 'peacehealth-sacred-heart-medical-center-at-riverbend/',
                 'kaiser-westside-medical-center/',
                 'peacehealth-cottage-grove-community-medical-center/',
                 'samaritan-north-lincoln-hospital/',
                 'providence-seaside-hospital/',
                 'southern-coos-hospital-health-center/',
                 'blue-mountain-hospital/',
                 'wallowa-memorial-hospital/',
                 'harney-district-hospital/',
                 'salem-health-west-valley/',
                 'lake-district-hospital/',
                 'pioneer-memorial-hospital-h/',
                 'lower-umpqua-hospital/',
                 'coquille-valley-hospital/',
                 'st-charles-prineville/',
                 'samaritan-pacific-communities-hospital/',
                 'saint-alphonsus-medical-center---baker-city/',
                 'peacehealth-peace-harbor-medical-center/',
                 'tillamook-regional-medical-center/',
                 'providence-hood-river-memorial-hospital/',
                 'st-anthony-hospital/',
                 'columbia-memorial-hospital/',
                 'grande-ronde-hospital/',
                 'curry-general-hospital/',
                 'samaritan-lebanon-community-hospital/',
                 'st-charles-madras/',
                 'good-shepherd-medical-center/']

### Create a list of procedures to scrape

For this exercise, I chose to list the procedures from Legacy Good Samaritan Medical Center. The reasons:
 - They had a lot of procedures listed.
 - They did not have any duplicates (procedures with the exact same name, but different prices).
   - I chose to just scrape 80 of the 100+ listed.

In [3]:
#Get a list of procedures
baseurl = 'https://oregonhospitalguide.org/hospitals/'
gsurl = requests.get(baseurl + 'legacy-good-samaritan-medical-center/#procedure-costs').text
gs_soup = BeautifulSoup(gsurl,'lxml')
#find all procedure listed for the hospital
all_procs = gs_soup.find_all('li',{'class':'cp-procedure'})
#remove duplicates if any
all_procs = set([all_procs[i].contents[1].text[:-2] for i in range(len(all_procs))]) 

In [4]:
#note the number of hospitals
len(all_procs)

80

### Create a lookup table to assign an integer to a given procedure

In [5]:
#create a lookup dictionary assigning an int to a procedure and a reverse lookup
proc_lookup = {i:proc for i,proc in enumerate(list(set(all_procs)))}
rev_lu = {v:k for k,v in proc_lookup.items()}

In [6]:
#create function to scrape the data
def get_hosp_data(hosp_list):
    #baseurl
    baseurl = 'https://oregonhospitalguide.org/hospitals/'
    #A list of hospital names
    hospital_names = []
    #a list of lists containing price information
    prices = []
    #iterate over list of hospitals
    for i in hosp_list:
        #get the page data for the selected hispital
        url = requests.get(baseurl + i).text
        #read and parse the page data
        hosp_soup = BeautifulSoup(url, "lxml")
        #include only data with procedure tag
        procedures = hosp_soup.find_all('li',{'class':'cp-procedure'})
        #get a list of procedures for the given hospital
        hprocs = [procedures[i].contents[1].text[:-2] for i in range(len(procedures))]
        #only store data if the hospital has less than 80 procedures(this is to limit duplicates)
        if len(hprocs)<81:
            #Append the hospital name
            hospital_names.append(hosp_soup.title.text[:-17])
            #create an empty array of 80 zeros
            proc_price = np.zeros(80)
            #iterate over list of procedure for a given hospital
            for i in range(len(hprocs)):
                #for each procedure, if in the lookup:
                if procedures[i].contents[1].text[:-2] in proc_lookup.values():
                    #insert the procedure price at the index of corresponding lookup value (this removes the '$' sign and ',')
                    proc_price[rev_lu[procedures[i].contents[1].text[:-2]]] = procedures[i].contents[5].text[1:].replace(',','')
            #append the list of prices to the prices
            prices.append(proc_price)
            
    return hospital_names, prices

In [13]:
#run the function on the list of hospitals
hospital_names , prices = get_hosp_data(hospital_urls)

In [14]:
#contrust a dataframe with the information
hospital_data = pd.DataFrame(prices,index=hospital_names, columns=proc_lookup.values())

In [15]:
#Convert 0s to NaN
hospital_data = hospital_data.replace(0, np.nan)

In [16]:
hospital_data.shape

(53, 80)

In [17]:
#preview the data
hospital_data.head(5)

Unnamed: 0,Breast Reconstruction,Radiation Treatment: Consult,Nerve Block,CT scan: Chest,Arthorcentesis,Cardiovascular: Electrophysiology,X-ray: Abdomen/GI,Cardiovascular: Echocardiography,Hysterectomy,Cesarean Section with complications,...,Newborn care without complications,Breast Biopsy,CT scan: Extremities,Newborn care with complications,ORIF,Abdominal Drainage,Cardiovascular: Electrocardiography,Knee Replacement,MRI with contrast: Head and Neck,Big Toe Surgery
Mid-Columbia Medical Center,,611.0,,,,,255.0,2078.0,,,...,8569.0,,,,,,77.0,,2705.0,
Asante Three Rivers Medical Center,,447.0,,468.0,931.0,,230.0,1595.0,15233.0,,...,10355.0,3864.0,,23361.0,,,231.0,34009.0,1891.0,
Asante Ashland Community Hospital,,,2887.0,1408.0,,,211.0,1635.0,,,...,10378.0,3573.0,,,,,248.0,,,
Legacy Emanuel Medical Center,,,,,,48813.0,186.0,1273.0,11985.0,15733.0,...,2360.0,,570.0,10511.0,30091.0,,133.0,,1387.0,7315.0
Good Samaritan Regional Medical Center,,432.0,,496.0,,40627.0,230.0,1312.0,16018.0,23488.0,...,9260.0,,496.0,4430.0,32983.0,760.0,208.0,36844.0,2004.0,


In [12]:
#Save the data to csv
hospital_data.to_csv('OR_hospital_data.csv',index=False)