In [None]:
'''
The script aims to parse HTML elements for the Illinois Geospatial Data Clearinghouse
and extract parsed content into a local CSV. The progress is maintained on GitHub
(https://github.com/BTAA-Geospatial-Data-Project/parse-html).


Files
-----
x.csv
	A local csv file stores existing urls that are prepared to parse.
output_yyyymmdd.csv
	The output file after parsing and it is followed by the action date.


Developers
----------
Original created on xxxxx
Created by Karen Majewicz  @karenmajewicz

Updated December 14, 2020
Updated by Ziying Cheng  @Ziiiiing

Updated May 26, 2021 for Illinois Geospatial Data Clearinghouse

'''

In [75]:
import csv
import time
import urllib.request
from bs4 import BeautifulSoup

# extract exising urls from local csv file
urls = []

with open('02a-01-sample.csv') as fr:
    reader = csv.reader(fr)  # reader object
    for row in reader:
        urls.append(row)


# store parsed elements for all urls
parseElements = []

for url in urls:
    page = urllib.request.urlopen(url[0]).read()
    soup = BeautifulSoup(page, "html.parser")
    print(f'Parsing {url[0]}')
    zips = soup.find(attrs={"type":"application/zip"})
    for zip in zips:
        try:
            downloadLink = soup.find('a', href=True)
            download = downloadLink['href']
        except:
            download = "none" 
        print(download)
    

Parsing https://clearinghouse.isgs.illinois.edu/data/geology/bedrock-valleys


TypeError: 'NoneType' object is not iterable

In [66]:
import csv
import time
import urllib.request
from bs4 import BeautifulSoup

# extract exising urls from local csv file
urls = []

with open('02a-01-sample.csv') as fr:
    reader = csv.reader(fr)  # reader object
    for row in reader:
        urls.append(row)


# store parsed elements for all urls
parseElements = []

for url in urls:
    page = urllib.request.urlopen(url[0]).read()
    soup = BeautifulSoup(page, "html.parser")
    print(f'Parsing {url[0]}')

    #TITLE - works
    titleField = soup.find(attrs={'id':'page-title'})
    title = titleField.text.strip()
    
    #METADATA LINK - works
    try:
        metadataLink = soup.find('a', href=True, text = "Link")
        metadata = metadataLink['href']
    except:
        metadata = "none"
        
    #SUMMARY - works
    try:
        summaryField = soup.find(attrs={"property":"content:encoded"})
        summary = summaryField.text.strip()
    except:
        summary = "none"
        
        
    #Everything else - needs to be parsed
    
    nodeContentField = soup.find(attrs={'class':'node-content'})
    nodeContent = nodeContentField.text.strip()

    #combine the scraped information
    parseElements.append([title,summary,metadata,nodeContent])

    
# generate action date with format YYYYMMDD    
    
actionDate = time.strftime('%Y%m%d')

# write outputs to local csv file
with open(f'output_{actionDate}.csv', 'w') as fw:
    fields = ['Title','Description','HTML','Content']

    writer = csv.writer(fw)
    writer.writerow(fields)           # fieldnames
    writer.writerows(parseElements)   # elements

    print('#### Job done ####')
    
    

Parsing https://clearinghouse.isgs.illinois.edu/data/climate/illinois-climate-network-soil-data
Parsing https://clearinghouse.isgs.illinois.edu/data/climate/illinois-climate-network-weather-data
Parsing https://clearinghouse.isgs.illinois.edu/data/coastal/HTEM/lake-michigan-coast-2017
Parsing https://clearinghouse.isgs.illinois.edu/data/coastal/habitat
Parsing https://clearinghouse.isgs.illinois.edu/data/coastal/UAS/IBSP
Parsing https://clearinghouse.isgs.illinois.edu/data/coastal/shorelines/UAS
Parsing https://clearinghouse.isgs.illinois.edu/data/coastal/bathy
Parsing https://clearinghouse.isgs.illinois.edu/data/elevation/illinois-height-modernization-ilhmp
Parsing https://clearinghouse.isgs.illinois.edu/data/elevation/surface-elevation-30-meter-digital-elevation-model-dem
Parsing https://clearinghouse.isgs.illinois.edu/data/elevation/surface-elevation-30-meter-shaded-relief-map
Parsing https://clearinghouse.isgs.illinois.edu/data/elevation/surface-elevation-301-foot-digital-elevation

Parsing https://clearinghouse.isgs.illinois.edu/data/land-cover/land-cover-illinois-1999-2000-data
Parsing https://clearinghouse.isgs.illinois.edu/data/land-cover/usda-nass-cropland-data-layer-illinois-1999-2006
Parsing https://clearinghouse.isgs.illinois.edu/data/land-cover/usda-nass-cropland-data-layer-illinois-2007
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/blm-illinois-public-land-survey-system
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/illinois-county-boundaries-polygons-and-lines
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/illinois-plss-townships
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/illinois-public-land-survey-system-plss-boundaries
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/illinois-state-boundary
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/indian-treaty-boundary-lines
Parsing https://clearinghouse.isgs.illinois.edu/data/reference/state-plane-zones
Parsing 