Skip to content

Commit

Permalink
Update scraper.py
Browse files Browse the repository at this point in the history
  • Loading branch information
Feng-Gao committed Mar 12, 2019
1 parent 938b37c commit b709d7e
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions scraper.py
Expand Up @@ -85,11 +85,15 @@
resource_url = "https://data.taipei/dataset/detail/preview?id="+package_id+"&rid="+resource_id
result = requests.get(resource_url)
soup = BeautifulSoup(result.content,features="lxml")
#locate resource name via <div class="q-a_titile"><h6>resource name</h6></div>
resource_name = '"'+soup.find(attrs={"class":"q-a_titile"}).h6.text+'"'
#locate resource description via id=detailContent, which is a table, and get the second tr and its second th's text value
resource_desc = '"'+ soup.find(attrs={"id":"detailContent"}).contents[3].contents[1].contents[0] +'"'

try:
#locate resource name via <div class="q-a_titile"><h6>resource name</h6></div>
resource_name = '"'+soup.find(attrs={"class":"q-a_titile"}).h6.text+'"'
#locate resource description via id=detailContent, which is a table, and get the second tr and its second th's text value
resource_desc = '"'+ soup.find(attrs={"id":"detailContent"}).contents[3].contents[1].contents[0] +'"'
except Exception as ex:
print(ex)
resource_name = "BROKEN LINK"
resource_desc = "BROKEN LINK"

#package detail + resource detail as one record

Expand Down

0 comments on commit b709d7e

Please sign in to comment.