In [1]:
import csv 
import time
import urllib.request
from bs4 import BeautifulSoup

### STEP 1: Find All Data Links from Search Page

There appear to be fewer than 200 geodata datasets published on this site, so we append `ext_page_size=200` to `home_url` to get all search results on a single page.

Next, we use **Beautiful Soup** to find all dataset links and store them in a list.

In [2]:
# Search page listing every geodata dataset; ext_page_size=200 asks for up to
# 200 results on one page so no pagination handling is needed.
home_url = "https://data.humdata.org/search?ext_geodata=1&q=&ext_page_size=200"
# Read inside a context manager so the HTTP response is closed even if
# reading fails part-way through.
with urllib.request.urlopen(home_url) as response:
    home_page = response.read()
soup = BeautifulSoup(home_page, "html.parser")

# find geodata links
data_urls = []
linkFields = soup.find_all('div', {'class': 'dataset-heading'})
for tag in linkFields:
    link = tag.find('a')
    # A heading without an anchor (or an anchor without href) carries no
    # dataset link; skip it instead of failing on a None subscript.
    if link is None or not link.get('href'):
        continue
    # hrefs are concatenated onto the site root to build the dataset page URL
    url = 'https://data.humdata.org' + link['href']
    data_urls.append(url)

### STEP 2: Extract Metadata from Each Data Page

In [23]:
def _cell_for(soup, label):
    """Return the ``<td>`` following the ``<th>`` whose text is ``label``, or None."""
    header = soup.find('th', string=label)
    return header.find_next('td') if header is not None else None


def _cell_text(soup, label):
    """Return the stripped text of the table cell labelled ``label`` ('' if absent)."""
    cell = _cell_for(soup, label)
    return cell.text.strip() if cell is not None else ''


def _cell_links(soup, label):
    """Return the '|'-joined link texts of the table cell labelled ``label`` ('' if absent)."""
    cell = _cell_for(soup, label)
    if cell is None:
        return ''
    return '|'.join(a.text.strip() for a in cell.find_all('a'))


def _parse_date_issued(updated):
    """Convert a 'D MonthName YYYY' string to 'YYYY-MM-DD'; return '' if it does not parse."""
    try:
        # Too few tokens -> ValueError on unpacking; unknown month name ->
        # ValueError from strptime. Both mean "no usable date".
        day, month_name, year = updated.split()[:3]
        month = time.strptime(month_name, '%B').tm_mon
    except ValueError:
        return ''
    return '-'.join((year, str(month).zfill(2), day.zfill(2)))


def _parse_date_range(coverage):
    """Build 'FROM-TO' from the last whitespace token on each side of the first '-'.

    When ``coverage`` contains no '-', the single value is used for both ends.
    Returns '' when either side has no token to take.
    """
    parts = coverage.split('-')
    start_tokens = parts[0].split()
    end_tokens = (parts[1] if len(parts) > 1 else parts[0]).split()
    if not start_tokens or not end_tokens:
        return ''
    return '-'.join((start_tokens[-1], end_tokens[-1]))


def collect_metadata(url):
    """Download one dataset page and return its metadata as a flat list.

    Parameters
    ----------
    url : str
        Address of the dataset page to scrape.

    Returns
    -------
    list of str
        34 values in this order: title, alternativeTitle, description,
        language, creator, titleSource, resourceClass, isoTopCat, keyword,
        dateIssued, temporalCoverage, dateRange, spatialCoverage, bbox,
        resourceType, formatElement, information, download, mapServer,
        featureServer, imageServer, idElement, identifier, provider, code,
        memberOf, status, accrualMethod, dateAccessioned, rights,
        accessRights, suppressed, childRecord, fileSize.
        A field whose element is missing from the page is returned as ''
        rather than aborting the harvest; '?' marks fields not yet decided.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        data_page = response.read()
    soup = BeautifulSoup(data_page, "html.parser")

    # ##########################################
    # fields that can be extracted from the website
    title_tag = soup.find('h1', {'class': 'itemTitle dataset-title'})
    alternativeTitle = title_tag.text.strip() if title_tag is not None else ''

    notes = soup.find('div', {'class': 'notes embedded-content'})
    paragraphs = notes.find_all('p') if notes is not None else []
    description = '\n'.join(p.text.strip() for p in paragraphs)

    creator = _cell_text(soup, 'Contributor')

    titleSource = '?'       # publisher
    resourceClass = '?'

    keyword = _cell_links(soup, 'Tags')

    dateIssued = _parse_date_issued(_cell_text(soup, 'Updated'))

    temporalCoverage = _cell_text(soup, 'Date of Dataset')
    dateRange = _parse_date_range(temporalCoverage)

    spatialCoverage = _cell_links(soup, 'Location')

    information = url
    identifier = url

    bbox = '?'
    resourceType = '?'
    formatElement = '?'    # multiple files

    download = '?'   # multiple files, decide which one to use
    mapServer = '?'
    featureServer = '?'
    imageServer = '?'
    idElement = '?'  # maybe manually generate later ?

    # ##########################################
    # fields left empty for later manual edits
    title = ''
    isoTopCat = ''

    # ##########################################
    # fields with hardcoded values
    language = 'eng'
    provider = 'University of Minnesota'
    code = '?'
    memberOf = '?'
    status = 'Active'
    accrualMethod = '?'
    dateAccessioned = time.strftime("%Y-%m-%d")  # local date of this run
    rights = ''
    accessRights = 'Public'
    suppressed = 'FALSE'
    childRecord = 'FALSE'
    fileSize = '?'

    metadata = [title, alternativeTitle, description, language, creator,
                titleSource, resourceClass, isoTopCat, keyword, dateIssued,
                temporalCoverage, dateRange, spatialCoverage, bbox, resourceType,
                formatElement, information, download, mapServer, featureServer,
                imageServer, idElement, identifier, provider, code, memberOf,
                status, accrualMethod, dateAccessioned, rights, accessRights,
                suppressed, childRecord, fileSize]

    return metadata

In [24]:
# walk through every dataset link, report progress, and collect its metadata row
all_metadata = []
count = 0  # remains 0 when there are no links to harvest
for count, url in enumerate(data_urls, start=1):
    progress = '>>> [{}/{}] harvesting dataset:\n{}'.format(count, len(data_urls), url)
    print(progress)
    record = collect_metadata(url)
    all_metadata.append(record)
    

>>> [1/197] harvesting dataset:
https://data.humdata.org/dataset/beirut-port-explosion-operational-zones
>>> [2/197] harvesting dataset:
https://data.humdata.org/dataset/population-potentially-exposed-to-floods-between-12-21-july-2020-in-bangladesh
>>> [3/197] harvesting dataset:
https://data.humdata.org/dataset/satellite-detected-water-extent-as-of-21-july-2020-over-northwestern-region-of-bangladesh
>>> [4/197] harvesting dataset:
https://data.humdata.org/dataset/water-extent-as-of-20-july-2020-over-the-northeastern-region-of-bangladesh
>>> [5/197] harvesting dataset:
https://data.humdata.org/dataset/satellite-detected-water-extent-as-of-19-july-2020-of-bangladesh
>>> [6/197] harvesting dataset:
https://data.humdata.org/dataset/water-extent-as-of-18-july-2020-over-eastern-part-of-sylhet-division-bangladesh
>>> [7/197] harvesting dataset:
https://data.humdata.org/dataset/satellite-detected-water-extent-as-of-14-july-2020-over-province-2-of-nepal
>>> [8/197] harvesting dataset:
https://

>>> [63/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_roads
>>> [64/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_points_of_interest
>>> [65/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_populated_places
>>> [66/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_airports
>>> [67/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_health_facilities
>>> [68/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_waterways
>>> [69/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_financial_services
>>> [70/197] harvesting dataset:
https://data.humdata.org/dataset/hotosm_irl_north_education_facilities
>>> [71/197] harvesting dataset:
https://data.humdata.org/dataset/caracterizacion-wash-2019
>>> [72/197] harvesting dataset:
https://data.humdata.org/dataset/water-extents-as-of-5-november-2019-over-basse-kotto-prefec

>>> [132/197] harvesting dataset:
https://data.humdata.org/dataset/wildfires-east-of-tyre-in-south-and-el-nabatieh-governorates-lebanon
>>> [133/197] harvesting dataset:
https://data.humdata.org/dataset/waters-extents-as-of-11-october-2019-over-logone-et-chari-department-far-north-region-of-c
>>> [134/197] harvesting dataset:
https://data.humdata.org/dataset/damage-assessment-in-the-southeastern-part-of-new-mirpur-azad-jammu-and-kashmir-pakistan-a
>>> [135/197] harvesting dataset:
https://data.humdata.org/dataset/damage-assessment-of-tulehu-area-eastern-part-of-salahutu-district-maluku-tengah-regency-m
>>> [136/197] harvesting dataset:
https://data.humdata.org/dataset/damage-assessment-of-waai-area-eastern-part-of-salahutu-district-maluku-tengah-regency-mal
>>> [137/197] harvesting dataset:
https://data.humdata.org/dataset/damage-assessment-in-the-southern-part-of-new-mirpur-azad-jammu-and-kashmir-pakistan-as-of
>>> [138/197] harvesting dataset:
https://data.humdata.org/dataset/damage-

>>> [197/197] harvesting dataset:
https://data.humdata.org/dataset/nepal-openstreetmap-extracts


### STEP 3: Write a CSV Report

In [14]:
# CSV header row. The order must match, one-to-one, the 34 values in each
# record written below it; 'File Size' labels the last value of every record,
# which previously had no header.
fieldnames = ['Title', 'Alternative Title', 'Description', 'Language', 'Creator', 'Title Source', 'Resource Class',
              'ISO Topic Categories', 'Keyword', 'Date Issued', 'Temporal Coverage', 'Date Range', 'Spatial Coverage',
              'Bounding Box', 'Resource Type', 'Format', 'Information', 'Download', 'MapServer',
              'FeatureServer', 'ImageServer', 'ID', 'Identifier', 'Provider', 'Code', 'Member Of', 'Status',
              'Accrual Method', 'Date Accessioned', 'Rights', 'Access Rights', 'Suppressed', 'Child Record',
              'File Size']

In [15]:
# newline='' lets the csv module control line endings itself: without it,
# embedded newlines in quoted fields are mishandled and Windows gets an extra
# blank line after every row. An explicit UTF-8 encoding keeps non-ASCII text
# from failing on platforms whose default encoding is not UTF-8.
with open('All_Metadata.csv', 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw)
    writer.writerow(fieldnames)       # header row
    writer.writerows(all_metadata)    # one row per harvested dataset