Permalink
Browse files

lowered strings for string searches

  • Loading branch information...
erinclark erinclark
erinclark authored and erinclark committed Jan 7, 2016
1 parent c4794e7 commit 78f3e056632a71d335ecafe6e517b04cdac4bb41
Showing with 23 additions and 15 deletions.
  1. +23 −15 scraper.py
View
@@ -36,7 +36,16 @@ def get_tender_text(link):
tender_soup = BeautifulSoup(tender_html, "lxml")
tender_text = str(tender_soup.find('div',{"id":"content"}).findAll('td'))
tender_text = tender_text[tender_text.find('<br/>'):]
remove_list = ['<br/>', '</b>', '<b>', '</a>', '\\r', '\\n', '</tr>', '</table>', '</th>']
remove_list = ['<br/>',
'</b>',
'<b>',
'</a>',
'\\r',
'\\n',
'</tr>',
'</table>',
'</th>',
'<table border="0" cellpadding="0" cellspacing="1" width="100%"> <tr><th align="left">']
for r in remove_list:
tender_text = tender_text.replace(r, ' ')
tender_text = tender_text.replace('\\t', '.')
@@ -47,12 +56,14 @@ def get_tender_text(link):
def split_text(tender_text, front, back):
    """Return the text found between two markers in *tender_text*.

    The markers are located case-insensitively. The section runs from the
    first occurrence of *front* up to (not including) the first occurrence
    of *back* that follows it. If the section contains a colon, only the
    text after the first colon is kept; otherwise the whole section
    (including the *front* marker itself) is kept.

    Args:
        tender_text: The text to search.
        front: Marker at which the section starts.
        back: Marker at which the section ends.

    Returns:
        The extracted text encoded as UTF-8, with surrounding whitespace
        and leading/trailing '.' characters removed. An empty byte string
        is returned when *front* does not occur in *tender_text*.
    """
    # Lowercase once and reuse it for both searches. Offsets found in the
    # lowered copy are applied to the original text so the result keeps its
    # original casing.
    # NOTE(review): this assumes lower() does not change the text length,
    # which holds for ASCII input - confirm if non-ASCII pages are scraped.
    lowered = tender_text.lower()
    front = front.lower()
    back = back.lower()

    fro = lowered.find(front)
    if fro == -1:
        # find() signals "not found" with -1; using that as a slice start
        # would silently return a fragment from the end of the text.
        return b''

    # Look for the end marker only after the start marker, so an earlier
    # occurrence of *back* cannot produce an empty or reversed slice.
    to = lowered.find(back, fro + len(front))
    if to == -1:
        # No end marker: take everything up to the end of the text rather
        # than letting -1 drop the final character.
        to = len(tender_text)

    text_section = tender_text[fro:to]
    _, colon, after_colon = text_section.partition(':')
    item_text = after_colon if colon else text_section

    # A bytes argument keeps the '.' stripping valid on both Python 2
    # (where b'.' is a plain str) and Python 3.
    return item_text.encode('utf-8').strip().strip(b'.')
if __name__ == '__main__':
@@ -72,20 +83,17 @@ def split_text(tender_text, front, back):
tender_text = get_tender_text(link)
if 'II.1.1)' in tender_text:
authority_title = split_text(tender_text, 'Title:', 'I.1)')
contract_title = split_text(tender_text, 'II.1.1)', 'II.1.2)')
awarding_authority = split_text(tender_text, 'I.1)', 'I.2)')
authority_title = split_text(tender_text, 'title:', 'i.1)')
contract_title = split_text(tender_text, 'ii.1.1)', 'ii.1.2)')
awarding_authority = split_text(tender_text, 'i.1)', 'i.2)')
contract_type = ''
description = split_text(tender_text, 'II.1.4)', 'II.1.5)')
description = split_text(tender_text, 'ii.1.4)', 'ii.1.5)')
cpv_codes = ''
nuts_codes = ''
main_site = ''
print 'award auth'
awarding_authority_ref = split_text(tender_text, 'IV.3.1)', 'IV.3.2)')
print 'contract val'
contract_value = split_text(tender_text, 'II.2.1)', 'Section IV:')
print ' eoi'
eoi_or_award_date = split_text(tender_text, 'V.1)D', 'V.2)')
awarding_authority_ref = split_text(tender_text, 'iv.3.1)', 'iv.3.2)')
contract_value = split_text(tender_text, 'ii.2.1)', 'section iv')
eoi_or_award_date = split_text(tender_text, 'v.1)d', 'v.2)')
address_to_send = ''
other = ''

0 comments on commit 78f3e05

Please sign in to comment.