In [None]:
'''
The script aims to parse HTML elements for the Illinois Geospatial Data Clearinghouse
and extract parsed content into a local CSV. The progress is maintained on GitHub
(https://github.com/BTAA-Geospatial-Data-Project/parse-html).


Files
-----
02a-01.csv
	A local CSV file that stores the existing URLs to be parsed; the URL is read from the first column of each row.
output_02a-01_yyyymmdd.csv
	The output file written after parsing; its name ends with the action date (YYYYMMDD).


Developers
----------
Original created on xxxxx
Created by Karen Majewicz  @karenmajewicz

Updated December 14, 2020
Updated by Ziying Cheng  @Ziiiiing

Updated May 26, 2021 for Illinois Geospatial Data Clearinghouse

'''

In [77]:
import csv
import time
import urllib.request
from bs4 import BeautifulSoup

# Read the URLs to scrape from the local csv file, fetch each page, pull the
# title, summary, metadata link, download section and service section out of
# the HTML, and write one row per page to a dated output csv file.

# extract existing urls from local csv file
urls = []

# newline='' is what the csv module expects for any file it reads or writes
with open('02a-01.csv', newline='') as fr:
    reader = csv.reader(fr)  # reader object
    for row in reader:
        # skip blank rows so that url[0] below cannot raise IndexError
        if row and row[0].strip():
            urls.append(row)


# store parsed elements for all urls
parseElements = []

for url in urls:
    # the with-block closes the HTTP response once the body has been read
    with urllib.request.urlopen(url[0]) as response:
        page = response.read()
    soup = BeautifulSoup(page, "html.parser")
    print(f'Parsing {url[0]}')

    # TITLE - text of the element with id="page-title"; "none" when the
    # page has no such element (same fallback as the other fields)
    titleField = soup.find(attrs={'id': 'page-title'})
    title = titleField.text.strip() if titleField is not None else "none"

    # METADATA LINK - href of the first <a> whose text is exactly "Link"
    # (string= is the current name of the deprecated text= argument)
    metadataLink = soup.find('a', href=True, string="Link")
    metadata = metadataLink['href'] if metadataLink is not None else "none"

    # SUMMARY - text of the element with property="content:encoded"
    summaryField = soup.find(attrs={"property": "content:encoded"})
    summary = summaryField.text.strip() if summaryField is not None else "none"

    # Download file info - works, but pulls entire class.
    # Reset for every page: without this default a page that has no download
    # section would raise NameError (first page) or silently reuse the value
    # scraped from the previous page.
    fileInfo = "none"
    for fileContent in soup.find_all(attrs={'class': 'collapsible collapsed group-downloads field-group-htab form-wrapper'}):
        for downloadFields in fileContent.children:
            # each child overwrites the previous one, so the value that is
            # kept is the last child of the last matching element
            fileInfo = downloadFields
            print(downloadFields)

    # Service info - works, but pulls entire class.
    # NOTE(review): this class string uses "group_services" (underscore)
    # while the download one uses "group-downloads" (hyphen) - confirm
    # against the page markup.
    serviceInfo = "none"
    for serviceContent in soup.find_all(attrs={'class': 'collapsible collapsed group_services field-group-htab form-wrapper'}):
        for serviceFields in serviceContent.children:
            # same "last child wins" behavior as the download section
            serviceInfo = serviceFields

#     #Possible option for everything else - but would need to be extensively parsed
#     nodeContentField = soup.find(attrs={'class':'node-content'})
#     nodeContent = nodeContentField.text.strip()

    # combine the scraped information
    parseElements.append([title, summary, metadata, fileInfo, serviceInfo])

# generate action date with format YYYYMMDD
actionDate = time.strftime('%Y%m%d')

# write outputs to local csv file
# newline='' stops the csv module from emitting blank rows on Windows;
# utf-8 keeps non-ASCII page text from failing on the platform default codec
with open(f'output_02a-01_{actionDate}.csv', 'w', newline='', encoding='utf-8') as fw:
    fields = ['Title', 'Description', 'HTML', 'Download', 'Service']

    writer = csv.writer(fw)
    writer.writerow(fields)           # fieldnames
    writer.writerows(parseElements)   # elements

    print('#### Job done ####')
    
    

Parsing http://clearinghouse.isgs.illinois.edu/data/geology/elevation-base-barlow-limestone
<legend><span class="fieldset-legend">Data</span></legend>
<div class="fieldset-wrapper"><section class="field field-name-field-zip-data field-type-file field-label-above view-mode-full"><h2 class="field-label">Zip Data: </h2><div class="field-items"><div class="field-item even"><span class="file"><img alt="Package icon" class="file-icon" src="/modules/file/icons/package-x-generic.png" title="application/zip"/> <a href="http://clearinghouse.isgs.illinois.edu/sites/clearinghouse.isgs/files/Clearinghouse/data/ISGS/Geology/zips/IL_Structure_BaseBarlow_Elev_20ft_2009_Ln.zip" type="application/zip; length=4683903">IL_Structure_BaseBarlow_Elev_20ft_2009_Ln.zip</a></span></div></div></section></div>
#### Job done ####
