Permalink
Browse files

test

  • Loading branch information...
erinclark erinclark
erinclark authored and erinclark committed Mar 15, 2016
1 parent 10348ec commit a9a996a09c1bcf9063649b6f84b4a29c7b2a3eec
Showing with 2 additions and 6 deletions.
  1. +2 −6 scraper.py
View
@@ -11,15 +11,14 @@
tender_url = ''
def get_soup(url):
    """Fetch ``url`` and parse the response into a BeautifulSoup tree.

    The response is parsed with the "lxml" parser. The handle returned by
    ``urllib.urlopen`` is closed as soon as parsing finishes so the
    underlying connection is not left open.

    :param url: address of the page to download.
    :returns: the parsed ``BeautifulSoup`` document.
    """
    response = urllib.urlopen(url)
    try:
        # BeautifulSoup reads the file-like object while it is being
        # constructed, so the handle can be released right afterwards.
        return BeautifulSoup(response, "lxml")
    finally:
        response.close()
def get_pages(portal): #gets last page
browser = Browser("phantomjs", service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any'])
browser.visit(portal)
html = browser.html
@@ -30,7 +29,6 @@ def get_pages(portal):
def get_links(url, base_url):
soup = get_soup(url)
links = []
hrefs = soup.find('span', text='Search').findNext('tbody').findAll('a')
@@ -40,14 +38,12 @@ def get_links(url, base_url):
def get_title(soup):
    """Return the title text of a parsed page.

    Finds the ``div`` whose class is ``"pbBlock pbTitleBlock "``, moves to
    the next ``h1`` after it, and returns that heading's text with the
    surrounding whitespace stripped.
    """
    title_block = soup.find('div', {"class": "pbBlock pbTitleBlock "})
    heading = title_block.findNext('h1')
    return heading.text.strip()
def get_detail(soup, text):
try:
detail = soup.find('div', id="area0").find('th', text=text).findNext('td').text
except:
@@ -56,7 +52,6 @@ def get_detail(soup, text):
def get_pub_date(soup):
try:
date = soup.find('div', {"class":"box_item1"})\
.findNext('p', {"class":"text right"}).text.strip()
@@ -71,6 +66,7 @@ def get_pub_date(soup):
# Base address inside the JETRO procurement database section.
base_url = 'https://www.jetro.go.jp/en/database/procurement/local'
# Download and parse the portal page; `portal` is assigned outside this excerpt.
soup = get_soup(portal)
# get_pages is annotated "gets last page" where it is defined.
pages = get_pages(portal)
# Python 2 print statement: echoes whatever get_pages returned.
print pages
# NOTE(review): presumably an error log and a loop counter for the scrape
# that follows; their use is outside this excerpt -- confirm.
error_messages = []
i = 0

0 comments on commit a9a996a

Please sign in to comment.