Permalink
Browse files

removed extra column

  • Loading branch information...
erinclark erinclark
erinclark authored and erinclark committed Jan 7, 2016
1 parent 6df5a89 commit bbef2e027fa51347df1dbdbac434c045cb6f70f8
Showing 1 changed file with 5 additions and 4 deletions.
  1. +5 −4 scraper.py
View
@@ -36,7 +36,7 @@ def get_tender_text(link):
tender_soup = BeautifulSoup(tender_html, "lxml")
tender_text = str(tender_soup.find('div',{"id":"content"}).findAll('td'))
tender_text = tender_text[tender_text.find('<br/>'):]
remove_list = ['<br/>', '</b>', '<b>', '</a>', '\\r', '\\n', '</tr>', '</table>']
remove_list = ['<br/>', '</b>', '<b>', '</a>', '\\r', '\\n', '</tr>', '</table>', '</th>']
for r in remove_list:
tender_text = tender_text.replace(r, ' ')
tender_text = tender_text.replace('\\t', '.')
@@ -52,7 +52,7 @@ def split_text(tender_text, front, back):
text_section = tender_text[fro:to]
item_text = text_section[text_section.find(':')+1:]
item_text = item_text.encode('utf-8').strip().strip('.')
#print item_text
print item_text
return item_text
if __name__ == '__main__':
@@ -80,8 +80,11 @@ def split_text(tender_text, front, back):
cpv_codes = ''
nuts_codes = ''
main_site = ''
print 'award auth'
awarding_authority_ref = split_text(tender_text, 'IV.3.1)', 'IV.3.2)')
print 'contract val'
contract_value = split_text(tender_text, 'II.2.1)', 'Section IV:')
print ' eoi'
eoi_or_award_date = split_text(tender_text, 'V.1)D', 'V.2)')
address_to_send = ''
other = ''
@@ -98,7 +101,6 @@ def split_text(tender_text, front, back):
awarding_authority_ref = split_text(tender_text, '8.', '9.')
contract_value = split_text(tender_text, '9.', '10.')
eoi_or_award_date = split_text(tender_text, '10.', '11.')
address_to_send = split_text(tender_text, '11.', '12.')
other = split_text(tender_text, '12.', '13.')
data = {"tender_url":unicode(tender_url),
@@ -112,7 +114,6 @@ def split_text(tender_text, front, back):
"main_site": unicode(main_site),
"awarding_authority_ref": unicode(awarding_authority_ref),
"eoi_or_award_date": unicode(eoi_or_award_date),
"address_to_send": unicode(address_to_send),
"other": unicode(other),
"full_text": tender_text}

0 comments on commit bbef2e0

Please sign in to comment.