## Web Scraping from data source

In [1]:
# dependencies
from io import StringIO

import pandas as pd
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# report generator url: the form page the browser opens; the per-year
# reports are requested from this page by the scraping loop
url = "http://www9.health.gov.au/cda/source/rpt_2_sel.cfm"

In [3]:
# start a visible (non-headless) Chrome session via splinter and open the
# report page; ChromeDriverManager().install() supplies the chromedriver path
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)
browser.visit(url)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\fernb\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [4]:
# years to request from the report generator (2015 through 2020 inclusive)
queryyears = list(range(2015, 2021))

# empty accumulators for the scraped results.
# NOTE(review): `dieaselist` (sic) is never filled by the scraping loop, which
# assigns to `diseaselist` instead -- confirm whether this name is still needed.
dieaselist, diseasedflist, diseasegroupings, eachgrouplist = [], [], [], []

In [5]:
# Request one report per year and reshape it into one DataFrame per disease
# group (one row per location and year, one column per disease).

# report columns that become the rows ("Location") of every output frame
LOCATION_COLUMNS = ["ACT","NSW","NT","QLD","SA","TAS","VIC","WA","Aust","Last 5yearsmean"]
# how many disease-group heading rows are picked out of the first year's report
N_DISEASE_GROUPS = 8


def _group_rows(table, starts, i):
    """Return the rows of `table` belonging to the i-th disease group.

    `starts` holds the index values of the group heading rows. A group runs
    from its own heading up to the bound derived from the next heading; the
    last group runs to the end of the table.

    NOTE(review): integer slices through `table[a:b]` are positional, while
    `starts` holds index labels taken after `dropna`. The two only coincide
    if no row above a heading was dropped, and with positional slicing the
    end bound `starts[i + 1] - 1` is exclusive. The original slicing is kept
    unchanged here -- confirm against the live table.
    """
    if i + 1 < len(starts):
        return table[starts[i]:starts[i + 1] - 1]
    return table[starts[i]:]


def _reshape_group(rows, year):
    """Transpose one group's rows so each location is a row tagged with `year`."""
    wide = rows.set_index('Unnamed: 0')[LOCATION_COLUMNS].T
    wide["Year"] = year
    return wide.reset_index().rename(columns={'index': "Location"})


for year in queryyears:

    # use splinter to select and submit the report form for this year
    browser.find_by_id('report_option_5').first.click()
    browser.find_by_id('sel_year').first.select(str(year))
    browser.find_by_id('CTIME1').click()

    # parse the first HTML table on the page and drop rows containing any NaN;
    # StringIO avoids pandas' deprecated literal-HTML-string input
    tables = pd.read_html(StringIO(browser.html))
    df = tables[0]
    dropped = df.dropna(axis=0)

    # the first year's report defines the disease groups and the empty frames
    if year == queryyears[0]:
        print("first")

        # reset the pairing lists so re-running this cell does not duplicate them
        diseasegroupings.clear()
        eachgrouplist.clear()

        # the rows that sort highest on "ACT" are taken as the group headings,
        # then put back into table order
        diseasegroups = dropped.sort_values(by=['ACT'], ascending=False)[:N_DISEASE_GROUPS]
        diseasegroups = diseasegroups["ACT"].sort_index()
        indexlist = diseasegroups.index
        diseaselist = diseasegroups.tolist()
        diseasedflist = []

        for i in range(len(indexlist)):
            dlist = _group_rows(dropped, indexlist, i)["Unnamed: 0"].values.tolist()

            # pair every disease with its group for the later database load
            diseasegroupings.extend([diseaselist[i]] * len(dlist))
            eachgrouplist.extend(dlist)

            # empty frame whose column order (diseases, Location, Year) is
            # inherited by everything concatenated onto it
            diseasedflist.append(pd.DataFrame(columns=dlist + ['Location', 'Year']))

    # slice this year's table per disease group and add it to that group's frame
    for i in range(len(diseasedflist)):
        item = _reshape_group(_group_rows(dropped, indexlist, i), year)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        diseasedflist[i] = pd.concat([diseasedflist[i], item], ignore_index=True)

    # close year report so loop can run another report
    browser.find_by_id('close').click()

        
        

first


In [9]:
## pair each disease with its disease group in a two-column lookup frame

dic = dict([("Disease Group", diseasegroupings), ("Disease", eachgrouplist)])
groupeddf = pd.DataFrame(dic)

In [10]:
# shut down the browser session; no later cell uses it
browser.quit()

In [11]:
## connect to the SQLite database file diseases.sqlite via SQLAlchemy

from sqlalchemy import create_engine

path = "sqlite:///diseases.sqlite"

# `connection` is the handle the later to_sql calls write through
engine = create_engine(path)
connection = engine.connect()



In [13]:
# add each dataframe into database, one table per disease group named after
# the group. Iterates over `diseaselist` (filled by the scraping loop); the
# previous loop ranged over the misspelled, always-empty `dieaselist`, so it
# never ran and no table was written.
for groupname, groupdf in zip(diseaselist, diseasedflist):
    groupdf.to_sql(groupname,connection,if_exists='replace',index=False)




In [14]:
## add disease groups to database
# writes the group/disease lookup frame as table "Disease Groups", replacing
# any existing table of that name and leaving out the DataFrame index
groupeddf.to_sql("Disease Groups",connection,if_exists='replace',index=False)