From 61c7b4cf93f49f41e855d76f34fcde73730de128 Mon Sep 17 00:00:00 2001
From: Chahed
Date: Mon, 2 Nov 2015 23:40:10 +0100
Subject: [PATCH] Add scrapers for the Source procurement portals

---
 README.txt |   1 +
 scraper.py | 291 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 README.txt
 create mode 100644 scraper.py

diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..c8d1e5c
--- /dev/null
+++ b/README.txt
@@ -0,0 +1 @@
+Web Python Scrapers
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..0f84415
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,291 @@
+# Scrapes live, awarded and recurring contract notices from the "Source"
+# procurement portals (sourcederbyshire.co.uk and related sites) and saves
+# each notice as a row in a scraperwiki sqlite table.
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import scraperwiki
+
+def convertirUrl(url):
+    # Reduce a full URL to its scheme and host, e.g. "http://www.example.co.uk".
+    l = url.split('/')
+    newurl = l[0] + '//' + l[2]
+    return newurl
+
+def id_url(url):
+    # The last path segment of a contract URL doubles as its unique ID.
+    parts = url.split('/')
+    return parts[len(parts) - 1]
+
+def listUrl(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    NewUrl = convertirUrl(url)
+    soop = htmltext.find('div', {"class": "contract_set clearfix"})
+    links = soop.find_all('a')
+    href = []
+    # Each contract appears to be linked twice in the listing, so keep every other anchor.
+    for i in range(0, len(links) - 1):
+        if i % 2 == 0:
+            href.append(NewUrl + links[i].get('href'))
+
+    return href
+
+def get_numpages(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    Liste_page = htmltext.find('div', {"id": "pager"})
+    a = Liste_page.findAll('a')
+    el = a[len(a) - 1].get('href')
+    n = el.split('/')
+    # The page number sits at a different depth in the pager href depending on the listing type.
+    try:
+        num = n[6]
+    except:
+        num = n[4]
+    return num
+
+def suittext(text):
+    # Strip the quoting, bracket and escape artefacts left by str()-ing a contents list.
+    text = text.replace(", ,", "")
+    text = text.replace("'", "")
+    text = text.replace("\\n", "")
+    text = text.replace(" ", "")
+    text = text.replace("\\r", "")
+    text = text.replace("[", "")
+    text = text.replace("]", "")
+    return text
+
+def Reference(htmltext):
+    # Drop the leading label word and keep the rest of the reference line.
+    REF = htmltext.find('p', {"class": "reference_number"}).text
+    REFERENCE = REF.split(' ')
+    R = ""
+    for i in range(1, len(REFERENCE)):
+        R = R + " " + REFERENCE[i]
+    return R
+
+def Awarding_body_fc(htmltext):
+    try:
+        Awarding = htmltext.find('div', {"class": "contract_hd_left"})
+        Awarding_body_list = Awarding.findAll('a')
+        Awarding_body = str(Awarding_body_list[0])
+        Awarding_body = BeautifulSoup(Awarding_body, 'html.parser').text
+    except:
+        Awarding1 = htmltext.find('div', {"class": "contract_hd_left"})
+        Awarding = Awarding1.findAll('p')
+        # Separator assumed: a literal <br/> tag.
+        Awarding_list = str(Awarding[1]).split('<br/>')
+        Awarding_body = str(Awarding_list[1])
+        Awarding_body = BeautifulSoup(Awarding_body, 'html.parser').text
+    return Awarding_body
+
+def Detail_left_fc(htmltext):
+    Detail_left = str(htmltext.find('div', {"class": "detail_left"}).contents)
+    # Separators below assumed: consecutive <br/> tags as they appear in str() of a contents list.
+    a = Detail_left.split('<br/>, <br/>')
+    Description = a[1].split('<br/>, <br/>')
+    Description = Description[1].encode('ascii', 'ignore')
+    Description = BeautifulSoup(Description, 'html.parser').text
+    Description = suittext(Description)
+    return Description
+
+def Table(htmltext, id):
+    # Return the raw HTML of the id-th cell of the "additional data" table.
+    Tr = htmltext.find('table', {"class": "additional_data"}).findNext('tbody')
+    Table = Tr.findAll('td')
+    return str(Table[id])
+
+def Contact(htmltext):
+    Contact_Details = str(htmltext.find('div', {"class": "highlight_contact_bd"}).findNext('p').contents)
+    # Separator assumed: a literal <br/> tag.
+    c = Contact_Details.split('<br/>')
+    m = c[0] + c[1]
+    m = BeautifulSoup(m, 'html.parser').text
+    m = suittext(m)
+    return m
+
+def scrap_live(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    ID = id_url(url)
+
+    REFERENCE = Reference(htmltext)
+
+    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').text
+
+    Awarding_body = Awarding_body_fc(htmltext)
+
+    Description = Detail_left_fc(htmltext)
+
+    Contract_Type = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
+    Procurement_Process = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
+    Estimated_Value_TEXT_DESCRIPTION = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
+    Cat = suittext(Table(htmltext, 3))
+    Category = (BeautifulSoup(Cat, 'html.parser').text).split(',')
+    Category = str(Category)
+    CPV_codes = suittext(BeautifulSoup(Table(htmltext, 4), 'html.parser').text)
+    Suitable_for_SME = suittext(BeautifulSoup(Table(htmltext, 5), 'html.parser').text)
+
+    Document = htmltext.findAll('div', {"class": "highlight_date_body"})
+    Contact_Details = suittext(Contact(htmltext))
+
+    Email = htmltext.find('div', {"class": "c_email"}).text
+
+    DOCUMENT_AVAILABLE_UNTIL = suittext(BeautifulSoup(Document[0].getText(), 'html.parser').text)
+    # The submission deadline is assumed to sit in the last of the date blocks.
+    SUBMISSION_RETURN_BY = suittext(BeautifulSoup(Document[-1].getText(), 'html.parser').text)
+
+    data = {"ID": str(ID),
+            "Url": str(url),
+            "REFERENCE": str(REFERENCE),
+            "Title": str(Title),
+            "Awarding body": str(Awarding_body),
+            "Description": str(Description),
+            "Contract Type": str(Contract_Type),
+            "Procurement Process": str(Procurement_Process),
+            "Estimated Value TEXT DESCRIPTION": str(Estimated_Value_TEXT_DESCRIPTION),
+            "Category": str(Category),
+            "CPV codes": str(CPV_codes),
+            "Suitable for SME": str(Suitable_for_SME),
+            "DOCUMENT AVAILABLE UNTIL": str(DOCUMENT_AVAILABLE_UNTIL),
+            "SUBMISSION RETURN BY": str(SUBMISSION_RETURN_BY),
+            "Contact Details": str(Contact_Details),
+            "Email": str(Email)}
+    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)
+
+def scrap_awarded(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    ID = id_url(url)
+    REFERENCE = Reference(htmltext)
+
+    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').contents
+    Title = str(Title).encode('ascii', 'ignore')
+    Title = BeautifulSoup(Title, 'html.parser').text
+    Title = suittext(Title)
+    Awarding_body = Awarding_body_fc(htmltext)
+
+    Description = Detail_left_fc(htmltext)
+
+    try:
+        Startdate = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
+    except:
+        Startdate = "none"
+    try:
+        Enddate = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
+    except:
+        Enddate = "none"
+    try:
+        CPV_codes = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
+    except:
+        CPV_codes = "none"
+
+    Date_awarded = htmltext.find('div', {"class": "highlight_date_body"}).text
+    Awarded_to = htmltext.find('div', {"class": "highlight_contact_hd"}).findNext('p').contents
+    Awarded_to = str(Awarded_to).encode('ascii', 'ignore')
+    Awarded_to = BeautifulSoup(Awarded_to, 'html.parser').text
+    Awarded_to = suittext(Awarded_to)
+
+    data = {"ID": str(ID),
+            "Url": str(url),
+            "REFERENCE": str(REFERENCE),
+            "Title": str(Title),
+            "Awarding body": str(Awarding_body),
+            "Description": str(Description),
+            "Start Date": str(Startdate),
+            "End Date": str(Enddate),
+            "CPV Codes": str(CPV_codes),
+            "Date Awarded": str(Date_awarded),
+            "Awarded To": str(Awarded_to)}
+    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)
+
+def scrap_recurring(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+    ID = id_url(url)
+    REFERENCE = Reference(htmltext)
+
+    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').contents
+    Title = str(Title).encode('ascii', 'ignore')
+    Title = BeautifulSoup(Title, 'html.parser').text
+    Title = suittext(Title)
+    Awarding_body = Awarding_body_fc(htmltext)
+
+    Description = Detail_left_fc(htmltext)
+    try:
+        Contract_type = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
+    except:
+        Contract_type = "none"
+    try:
+        Option_to_extend = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
+    except:
+        Option_to_extend = "none"
+    try:
+        CPV_codes = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
+    except:
+        CPV_codes = "none"
+
+    EXISTING_CONTRACT_END_DATE = htmltext.find('div', {"class": "highlight_date_body"}).text
+
+    print("REFERENCE=" + REFERENCE)
+    print("Title=" + Title)
+    print("Awarding body=" + Awarding_body)
+    print("Description=" + Description)
+    print("Contract_type=" + Contract_type)
+    print("Extendoptions=" + Option_to_extend)
+    print("CPV=" + CPV_codes)
+    print("ExistingDate=" + EXISTING_CONTRACT_END_DATE)
+
+    data = {"ID": str(ID),
+            "Url": str(url),
+            "REFERENCE": str(REFERENCE),
+            "Title": str(Title),
+            "Awarding body": str(Awarding_body),
+            "Description": str(Description),
+            "Contract Type": str(Contract_type),
+            "Option to extend": str(Option_to_extend),
+            "CPV Codes": str(CPV_codes),
+            "EXISTING CONTRACT END DATE": str(EXISTING_CONTRACT_END_DATE)}
+    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)
+
+def extract_data(url):
+    # Walk the paginated listing for each contract type and scrape every notice on it.
+    l = ["awarded", "recurring", "live"]
+    for el in l:
+        urltry = url
+        if el == "awarded":
+            urltry = urltry + "contracts/awarded/type/awarded/page/"
+        elif el == "recurring":
+            urltry = urltry + "contracts/recurring/type/recurring/page/"
+        else:
+            urltry = urltry + "contracts/live/page/"
+        link = urltry + "1"
+        numb = get_numpages(link)
+        print(numb)
+        for i in range(1, int(numb) + 1):
+            url2 = urltry + str(i)
+            List = listUrl(url2)
+            for j in List:
+                if el == "awarded":
+                    scrap_awarded(j)
+                elif el == "live":
+                    scrap_live(j)
+                elif el == "recurring":
+                    scrap_recurring(j)
+
+def main():
+    urls = ["http://www.sourcederbyshire.co.uk/",
+            "http://www.sourceleicestershire.co.uk/",
+            "http://www.sourcelincolnshire.co.uk/",
+            "http://www.sourcenorthamptonshire.co.uk/",
+            "http://www.sourcenottinghamshire.co.uk/",
+            "http://www.sourcerutland.co.uk/",
+            "http://www.sourcecambridgeshire.co.uk/"]
+    for url in urls:
+        try:
+            extract_data(url)
+        except Exception:
+            # Keep going with the remaining portals if one of them fails.
+            pass
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file