
Commit

Update scraper.py
temp run
Feng-Gao committed Apr 14, 2019
1 parent 9e44f2b commit 63d2da2
Showing 1 changed file with 64 additions and 70 deletions.
134 changes: 64 additions & 70 deletions scraper.py
@@ -45,75 +45,69 @@
'附件下载:'.encode('utf-8'):'',
}
package_count = 0
problem_list=[]
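#temp run: scrape only these two previously problematic datasets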
problem_list=[{'name':u'基层法律工作者','url':'http://www.datashanghai.gov.cn/query!queryGdsDataInfoById.action?type=0&dataId=AB0002015032'},
{'name':u'行政执法案由信息','url':'http://www.datashanghai.gov.cn/query!queryGdsDataInfoById.action?type=0&dataId=AC7002018008'}]

for u in url_list:
print(u)
result = requests.get(u,headers=headers)
for p in problem_list:
#we create a package_dict to store the parsed metadata for this dataset
package_dict = {
'today':datetime.date.today().strftime("%m/%d/%Y"),
'id':0,
'url':'',
'name':'',
'desc':'',
'org':'',
'topics':'',
'tags':'',
'created':'',
'updated':'',
'frequency':'',
'count':
{
'view':0,
'download':0
},
'format':''
}
#for each package block on the list page, we parse the URL to its detail page and the package title
package_dict['url'] = "http://www.datashanghai.gov.cn/"+p['url']
package_dict['name'] = p['name']
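#the id is just a running counter (package_count) in scrape order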
package_dict['id'] = package_count+1
package_count += 1
print(package_dict['url'])
print(package_dict['name'])
result = requests.get(package_dict['url'],headers=headers)
#now, for each package block, we fetch its detail page and parse its metadata
soup = BeautifulSoup(result.content,features="lxml")
#fetch all dt blocks and get rid of the first 5 as they are irrelevant
package_blocks = soup.find_all('dt')[5:]
for p in package_blocks:
#we create a package_dict to store the parsed metadata for this dataset
package_dict = {
'today':datetime.date.today().strftime("%m/%d/%Y"),
'id':0,
'url':'',
'name':'',
'desc':'',
'org':'',
'topics':'',
'tags':'',
'created':'',
'updated':'',
'frequency':'',
'count':
{
'view':0,
'download':0
},
'format':''
}
#for each package block on the list page, we parse the URL to its detail page and the package title
package_dict['url'] = "http://www.datashanghai.gov.cn/"+p.a['href']
package_dict['name'] = p.a['title']
package_dict['id'] = package_count+1
package_count += 1
print(package_dict['url'])
print(package_dict['name'])
result = requests.get(package_dict['url'],headers=headers)
#now, for each package block, we fetch its detail page and parse its metadata
soup = BeautifulSoup(result.content,features="lxml")
#there are 4 tables on the detail page
tables = soup.find_all('table')
#the first one contains metadata
try:
metadata_table = tables[0]
except:
print("add into problem_list to retry")
problem_list.append(package_dict['url'])
continue
trs = metadata_table.find_all('tr')
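#each metadata row is a th/td pair: the th holds the Chinese field label and the td holds its value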
for tr in trs:
key = re.sub('[\r\t\n ]+', '', tr.th.text)
value = re.sub('[\r\t\n ]+', '', tr.td.text)
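#labels are compared as utf-8 encoded bytes, presumably to sidestep str/unicode mismatches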
if key.encode('utf-8') == '访问/下载次数:'.encode('utf-8'):
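#the value has the form '<views>/<downloads>'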
view,download = value.split('/')
package_dict['count']['view'] = int(view)
package_dict['count']['download'] = int(download)
if key.encode('utf-8') == '附件下载:'.encode('utf-8'):
#datashanghai only shows an image-based format list for each data package
#so we iterate over each file's icon image to parse its format
imgs = tr.find_all(src=re.compile("images/"))
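#each icon's src is assumed to look like 'images/<format>.<ext>', so the second path segment minus its extension names the file format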
format = []
for i in imgs:
format.append(i['src'].split('/')[1].split('.')[0])
format = '|'.join(format)
package_dict['format'] = format
else:
# meta_dict labels that have no package_dict field map to '', so this assignment creates a spurious '' key in package_dict
package_dict[meta_dict[key.encode('utf-8')]] = value
del package_dict['']
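#('today','id') is the unique key, so re-running on the same day upserts rather than duplicating rows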
scraperwiki.sqlite.save(unique_keys=['today','id'],data=package_dict)
print('*******************end'+package_dict['name']+'end****************************')
print(problem_list)
#there are 4 tables on the detail page
tables = soup.find_all('table')
#the first one contains metadata
try:
metadata_table = tables[0]
except:
print("add into problem_list to retry")
problem_list.append(package_dict['url'])
continue
trs = metadata_table.find_all('tr')
for tr in trs:
key = re.sub('[\r\t\n ]+', '', tr.th.text)
value = re.sub('[\r\t\n ]+', '', tr.td.text)
if key.encode('utf-8') == '访问/下载次数:'.encode('utf-8'):
view,download = value.split('/')
package_dict['count']['view'] = int(view)
package_dict['count']['download'] = int(download)
if key.encode('utf-8') == '附件下载:'.encode('utf-8'):
#datashanghai only shows an image-based format list for each data package
#so we iterate over each file's icon image to parse its format
imgs = tr.find_all(src=re.compile("images/"))
format = []
for i in imgs:
format.append(i['src'].split('/')[1].split('.')[0])
format = '|'.join(format)
package_dict['format'] = format
else:
# meta_dict labels that have no package_dict field map to '', so this assignment creates a spurious '' key in package_dict
package_dict[meta_dict[key.encode('utf-8')]] = value
del package_dict['']
scraperwiki.sqlite.save(unique_keys=['today','id'],data=package_dict)
print('*******************end'+package_dict['name']+'end****************************')
