## Web Scraping from data source

In [1]:
# dependencies
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# report generator url
url = "http://www9.health.gov.au/cda/source/rpt_2_sel.cfm"

In [3]:
# use splinter to navigate to page
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Get LATEST driver version for 91.0.4472
Trying to download new driver from https://chromedriver.storage.googleapis.com/91.0.4472.101/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\fernb\.wdm\drivers\chromedriver\win32\91.0.4472.101]


In [4]:
# set query ranges and empty lists for data return
queryyears = [2015,2016,2017,2018,2019,2020]
diseaselist =[]
diseasedflist=[]
diseasegroupings = []
eachgrouplist = []

In [5]:
# loop through years for field options
for year in queryyears:
    
    # use splinter to select and submit form based on field options
    browser.find_by_id('report_option_5').first.click()
    browser.find_by_id('sel_year').first.select(str(year))
    browser.find_by_id('CTIME1').click() 
    
    #get html table data into dataframe using pandas
    tables = pd.read_html(browser.html)
    df = tables[0]
    dropped = df.dropna(axis=0)
    
    #form initial dataframes on first year
    if year == 2015:
        print("first")
        #get list of disease groups
        diseasegroups = dropped.sort_values(by=['ACT'],ascending=False)[:8]
        diseasegroups = diseasegroups["ACT"].sort_index()
        indexlist = diseasegroups.index
        diseaselist = diseasegroups.tolist()
        diseasedflist = diseasegroups.tolist()        

        # loop through dataframe and slice based on disease groupings
        for i in range(len(indexlist)):
            if i !=7:
                dlist = dropped[indexlist[i]:indexlist[i+1]-1]["Unnamed: 0"]
            else:
                dlist = dropped[indexlist[i]:]["Unnamed: 0"]
            
            # get group and diseases paired for later database load
            dlist = dlist.values.tolist()
            for d in dlist:
                diseasegroupings.append(diseaselist[i])
                eachgrouplist.append(d)
            dlist.append('Location')
            dlist.append('Year')
            diseasedflist[i] = pd.DataFrame(columns = dlist)
        
        #slice and append data to dataframe per disease grouping
        for i in range(len(diseasedflist)):
            if i !=7:
                item = dropped[indexlist[i]:indexlist[i+1]-1]
            else:
                item = dropped[indexlist[i]:]
                
            item.set_index('Unnamed: 0', inplace=True)
            item = item[["ACT","NSW","NT","QLD","SA","TAS","VIC","WA","Aust","Last 5yearsmean"]].T
            item["Year"] = [year for x in range(10)]
            item.reset_index(inplace =True)
            item.rename(columns = {'index':"Location"}, inplace = True)
            diseasedflist[i]=diseasedflist[i].append(item, ignore_index=True)  
        

    #loop through rest of years to append to dataframes
    else:
        
        
        for i in range(len(diseasedflist)):
            if i !=7:
                item = dropped[indexlist[i]:indexlist[i+1]-1]
            else:
                item = dropped[indexlist[i]:]
                
            item.set_index('Unnamed: 0', inplace=True)
            item = item[["ACT","NSW","NT","QLD","SA","TAS","VIC","WA","Aust","Last 5yearsmean"]].T
            item["Year"] = [year for x in range(10)]
            item.reset_index(inplace =True)
            item.rename(columns = {'index':"Location"}, inplace = True)
            diseasedflist[i]=diseasedflist[i].append(item, ignore_index=True)
    
    # close year report so loop can run another report
    browser.find_by_id('close').click() 

        
        

first


In [6]:
## create dictionary of disease groupings

dic = {"DiseaseGroup":diseasegroupings,"Disease":eachgrouplist}

groupeddf = pd.DataFrame.from_dict(dic)

In [7]:
browser.quit()

In [14]:
for i in range(len(diseaselist)):
    print(diseaselist[i])

Bloodborne diseases
Gastrointestinal diseases
Other notifiable diseases
Quarantinable diseases
Sexually transmissible infections
Vaccine preventable diseases
Vectorborne diseases
Zoonoses


In [10]:
## create sqlite database

from sqlalchemy import create_engine

path = f"sqlite:///diseases.sqlite"
engine = create_engine(path)

connection = engine.connect()



In [34]:
# add each dataframe into database 
for i in range(len(diseaselist)):
    tablename = diseaselist[i].replace(" ", "")
    diseasedflist[i].to_sql(tablename,connection,if_exists='replace',index=True)






OperationalError: (sqlite3.OperationalError) index ix_Bloodbornediseases_index already exists
[SQL: CREATE INDEX "ix_Bloodbornediseases_index" ON "Bloodbornediseases" ("index")]
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [28]:
## add disease groups to database
groupeddf.to_sql("DiseaseGroups",connection, if_exists='replace',index=True)



OperationalError: (sqlite3.OperationalError) cannot commit - no transaction is active
[SQL: COMMIT TRANSACTION;]
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [None]:
  "Gastrointestinaldiseases", 
  "Othernotifiablediseases", 
  "Quarantinablediseases", 
  "Sexuallytransmissibleinfections", 
  "Vaccinepreventablediseases", 
  "Vectorbornediseases", 
  "Zoonoses"

In [None]:
  "Legionellosis", 
  "Leprosy", 
  "Meningococcal disease (invasive)", 
  "RSV", 
  "Tuberculosis", 
  "iGAS", 
  "Location", 
  "Year"
    
'Rubella congenital' REAL NOT NULL, 'Tetanus' REAL NOT NULL, 'Varicella zoster (unspecified)' REAL NOT NULL, 'Varicella zoster (shingles)' REAL NOT NULL,
    
    
    

In [45]:
connection.execute("PRAGMA foreign_keys=off;")

connection.execute("BEGIN TRANSACTION;")

connection.execute("ALTER TABLE Othernotifiablediseases RENAME TO old_cities;")

connection.execute("CREATE TABLE Othernotifiablediseases (id INTEGER NOT NULL PRIMARY KEY, 'Legionellosis' REAL NOT NULL, 'Leprosy' REAL NOT NULL, 'Meningococcal disease (invasive)' REAL NOT NULL, 'RSV' REAL NOT NULL, 'Tuberculosis' REAL NOT NULL, 'iGAS' REAL NOT NULL, Location TEXT NOT NULL, Year INTEGER NOT NULL);")

connection.execute("INSERT INTO Othernotifiablediseases SELECT * FROM old_cities;")

connection.execute("DROP TABLE old_cities;")

connection.execute("COMMIT;")

connection.execute("PRAGMA foreign_keys=on;")

OperationalError: (sqlite3.OperationalError) cannot commit - no transaction is active
[SQL: COMMIT;]
(Background on this error at: http://sqlalche.me/e/14/e3q8)

In [36]:
connection.execute("ALTER TABLE old_cities RENAME TO Bloodbornediseases2;")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1de80486070>

In [37]:
connection.execute("DROP TABLE Bloodbornediseases;")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1de805f7550>