
First run

erinclark authored and erinclark committed Mar 6, 2016
1 parent 7812f87 commit 2dd9fd2f8076b2a6a54cd7af372a84c2e558cacb
Showing with 111 additions and 0 deletions.
  1. +9 −0 requirements.txt
  2. +102 −0 scraper.py
requirements.txt
@@ -1 +1,10 @@
-e git+http://github.com/openaustralia/scraperwiki-python.git@morph_defaults#egg=scraperwiki
lxml==3.4.4
cssselect==0.9.1
beautifulsoup4
python-dateutil
selenium
splinter>=0.7.3
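
Of the new dependencies, splinter (backed by selenium) drives PhantomJS to render the JavaScript-built pagination that plain urllib cannot see, while beautifulsoup4 and lxml parse the fetched HTML. A minimal sketch of that split, not part of the commit, assuming PhantomJS itself is installed on the host (it is not a pip package):

from splinter import Browser
from bs4 import BeautifulSoup

# Render the portal with PhantomJS, then hand the rendered HTML to BeautifulSoup.
browser = Browser("phantomjs")
browser.visit("https://www.jetro.go.jp/en/database/procurement/local/")
soup = BeautifulSoup(browser.html, "lxml")
browser.quit()
print soup.title.text
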
scraper.py
@@ -1 +1,103 @@
# -*- coding: utf-8 -*-
import sys
from datetime import datetime
import urllib

from splinter import Browser
from bs4 import BeautifulSoup
import scraperwiki

reload(sys)  # Reload does the trick!
sys.setdefaultencoding('UTF8')

tender_url = ''


def get_soup(url):
    # Fetch a static page and parse it with lxml.
    html = urllib.urlopen(url)
    return BeautifulSoup(html, "lxml")


def get_pages(portal):
    # The pagination is built by JavaScript, so drive PhantomJS via
    # splinter to read the number of the last results page.
    browser = Browser("phantomjs",
                      service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
    browser.visit(portal)
    html = browser.html
    browser.quit()
    soup = BeautifulSoup(html, "lxml")
    last_page = soup.find('li', id="lastPage").text
    return last_page


def get_links(url, base_url):
    # Collect the tender detail links from one results page.
    soup = get_soup(url)
    links = []
    hrefs = soup.find('span', text='Search').findNext('tbody').findAll('a')
    for a in hrefs:
        links.append(base_url + a['href'])
    return links


def get_title(soup):
    title = soup.find('div', {"class": "pbBlock pbTitleBlock "})\
        .findNext('h1').text.strip()
    return title


def get_detail(soup, label):
    # Match the <th> by its stripped text so surrounding whitespace in the
    # markup does not matter, then return the text of the paired <td>.
    th = soup.find('div', id="area0").find(
        'th', text=lambda t: t and t.strip() == label)
    return th.findNext('td').text


def get_pub_date(soup):
    date = soup.find('div', {"class": "box_item1"})\
        .findNext('p', {"class": "text right"}).text.strip()
    print date
    return date


if __name__ == '__main__':
    todays_date = str(datetime.now())
    portal = 'https://www.jetro.go.jp/en/database/procurement/local/?cat[]=&cat[]=&cat[]=&gov=&deadline=0&page='  # page number is appended below
    base_url = 'https://www.jetro.go.jp/en/database/procurement/local'
    pages = get_pages(portal)
    i = 0
    while i < int(pages):
        i += 1
        page = portal + str(i)
        print page
        links = get_links(page, base_url)
        for link in links:
            print link
            tender_soup = get_soup(link)
            tender_url = link
            tender_id = link[-21:-5]
            title = get_title(tender_soup)
            procuring_entity = get_detail(tender_soup, 'Procuring Entity')
            category = get_detail(tender_soup, 'Category')
            deadline = get_detail(tender_soup, 'Deadline')
            summary = get_detail(tender_soup, 'Summary')
            publication_date = get_pub_date(tender_soup)
            data = {"tender_url": unicode(tender_url),
                    "tender_id": unicode(tender_id),
                    "title": unicode(title),
                    "procuring_entity": unicode(procuring_entity),
                    "category": unicode(category),
                    "deadline": unicode(deadline),
                    "summary": unicode(summary),
                    "publication_date": unicode(publication_date),
                    "todays_date": todays_date}
            scraperwiki.sqlite.save(unique_keys=['tender_url'], data=data)
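
Each tender row is saved with scraperwiki.sqlite.save, keyed on tender_url, so re-runs update existing rows instead of duplicating them. A minimal sketch, not part of the commit, of reading the results back with scraperwiki.sqlite.select, assuming the default data table that save writes to:

import scraperwiki

# List the scraped tenders; each row comes back as a dict keyed by column name.
rows = scraperwiki.sqlite.select("tender_id, title, deadline from data")
for row in rows:
    print row["tender_id"], "|", row["deadline"]
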
