Commit 61c7b4c, committed by Chahed on Nov 2, 2015 (initial commit)
README.txt
Web Python Scrapers
scraper.py
"""Scrape live, awarded and recurring contract notices from the source*.co.uk
procurement portals into a scraperwiki SQLite store."""
from bs4 import BeautifulSoup
from urllib.request import urlopen
import scraperwiki

def convertirUrl(url):
    # Reduce a full URL to its scheme and host ('http://example.com/a/b' -> 'http://example.com').
    parts = url.split('/')
    return parts[0] + '//' + parts[2]
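# For example, using one of the site URLs scraped in main() below:
#   convertirUrl('http://www.sourcederbyshire.co.uk/contracts/live/page/1')
#   returns 'http://www.sourcederbyshire.co.uk'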

def id_url(url):
    # Use the last path segment of the URL as the record ID.
    parts = url.split('/')
    return parts[-1]
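# For example (the contract path here is illustrative, not a confirmed URL shape):
#   id_url('http://www.sourcederbyshire.co.uk/contracts/show/12345') returns '12345'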

def listUrl(url):
    # Collect the contract links on one listing page, resolved against the site root.
    response = urlopen(url)
    htmltext = BeautifulSoup(response, 'html.parser')

    NewUrl = convertirUrl(url)
    soop = htmltext.find('div', {"class": "contract_set clearfix"})
    links = soop.find_all('a')
    href = []
    # Keep only every other anchor; the listing appears to repeat each contract link.
    for i in range(0, len(links) - 1, 2):
        href.append(NewUrl + links[i].get('href'))

    return href


def get_numpages(url):
    # Read the pager on a listing page and return the number of the last page.
    response = urlopen(url)
    htmltext = BeautifulSoup(response, 'html.parser')

    pager = htmltext.find('div', {"id": "pager"})
    a = pager.findAll('a')
    el = a[-1].get('href')
    n = el.split('/')
    # The page number sits at a different path depth depending on the section:
    # awarded/recurring URLs are two segments longer than live ones.
    try:
        num = n[6]
    except IndexError:
        num = n[4]
    return num
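# For example, if the pager's last link points at '/contracts/live/page/12'
# (an illustrative href), the split gives ['', 'contracts', 'live', 'page', '12'],
# n[6] raises IndexError, and the function returns '12' from n[4].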

def suittext(text):
    # Strip the list punctuation and escaped whitespace left behind when a tag's
    # .contents list is passed through str().
    text = text.replace(", ,", "")
    text = text.replace("'", "")
    text = text.replace("\\n", "")
    text = text.replace("\\r", "")
    text = text.replace("    ", "")
    text = text.replace("[", "")
    text = text.replace("]", "")
    return text
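# For example: suittext("['Quarterly supplies\\n']") returns 'Quarterly supplies'.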

def Reference(htmltext):
    # Drop the first word (presumably a label such as 'Reference:') and return the rest.
    REF = htmltext.find('p', {"class": "reference_number"}).text
    words = REF.split(' ')
    return ' '.join(words[1:])
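# For example, a paragraph reading 'Reference: DN-123456' (illustrative) would
# yield 'DN-123456'.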

def Awarding_body_fc(htmltext):
    # The awarding body is usually the first link in the contract header; on some
    # pages it is plain text after a <strong> label instead.
    try:
        Awarding = htmltext.find('div', {"class": "contract_hd_left"})
        Awarding_body_list = Awarding.findAll('a')
        Awarding_body = BeautifulSoup(str(Awarding_body_list[0]), 'html.parser').text
    except (AttributeError, IndexError):
        Awarding1 = htmltext.find('div', {"class": "contract_hd_left"})
        Awarding = Awarding1.findAll('p')
        Awarding_list = str(Awarding[1]).split('</strong>')
        Awarding_body = BeautifulSoup(Awarding_list[1], 'html.parser').text
    return Awarding_body

def Detail_left_fc(htmltext):
    # Pull the description text that follows the first <h4> heading in the detail column.
    Detail_left = str(htmltext.find('div', {"class": "detail_left"}).contents)
    after_heading = Detail_left.split('<h4>')[1].split('</h4>')[1]
    Description = after_heading.encode('ascii', 'ignore')
    Description = BeautifulSoup(Description, 'html.parser').text
    return suittext(Description)

def Table(htmltext, index):
    # Return the index-th <td> of the 'additional_data' table as raw HTML.
    tbody = htmltext.find('table', {"class": "additional_data"}).findNext('tbody')
    cells = tbody.findAll('td')
    return str(cells[index])
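# For example, Table(htmltext, 0) might return '<td>Goods and Services</td>'
# (the cell contents are illustrative); callers re-parse it to extract the text.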

def Contact(htmltext):
    # Keep the first two <br/>-separated segments of the contact block.
    Contact_Details = str(htmltext.find('div', {"class": "highlight_contact_bd"}).findNext('p').contents)
    c = Contact_Details.split('<br/>')
    m = BeautifulSoup(c[0] + c[1], 'html.parser').text
    return suittext(m)



def scrap_live(url):
    # Scrape one live-contract page and store it.
    response = urlopen(url)
    htmltext = BeautifulSoup(response, 'html.parser')

    ID = id_url(url)
    REFERENCE = Reference(htmltext)
    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').text
    Awarding_body = Awarding_body_fc(htmltext)
    Description = Detail_left_fc(htmltext)

    # The additional-data table lists the contract attributes in a fixed order.
    Contract_Type = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
    Procurement_Process = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
    Estimated_Value_TEXT_DESCRIPTION = suittext(Table(htmltext, 2))
    Cat = suittext(Table(htmltext, 3))
    # Stored as the string form of the comma-split category list.
    Category = str(BeautifulSoup(Cat, 'html.parser').text.split(','))
    CPV_codes = suittext(BeautifulSoup(Table(htmltext, 4), 'html.parser').text)
    Suitable_for_SME = suittext(BeautifulSoup(Table(htmltext, 5), 'html.parser').text)

    Document = htmltext.findAll('div', {"class": "highlight_date_body"})
    Contact_Details = suittext(Contact(htmltext))
    Email = htmltext.find('div', {"class": "c_email"}).text

    # First and second date blocks respectively.
    DOCUMENT_AVAILABLE_UNTIL = suittext(Document[0].getText())
    SUBMISSION_RETURN_BY = suittext(Document[1].getText())

    data = {"ID": str(ID),
            "Url": str(url),
            "REFERENCE": str(REFERENCE),
            "Title": str(Title),
            "Awarding body": str(Awarding_body),
            "Description": str(Description),
            "Contract Type": str(Contract_Type),
            "Procurement Process": str(Procurement_Process),
            "Estimated Value TEXT DESCRIPTION": str(Estimated_Value_TEXT_DESCRIPTION),
            "Category": str(Category),
            "CPV codes": str(CPV_codes),
            "Suitable for SME": str(Suitable_for_SME),
            "DOCUMENT AVAILABLE UNTIL": str(DOCUMENT_AVAILABLE_UNTIL),
            "SUBMISSION RETURN BY": str(SUBMISSION_RETURN_BY),
            "Contact Details": str(Contact_Details),
            "Email": str(Email)}
    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)


def scrap_awarded(url):
    # Scrape one awarded-contract page and store it.
    response = urlopen(url)
    htmltext = BeautifulSoup(response, 'html.parser')

    ID = id_url(url)
    REFERENCE = Reference(htmltext)

    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').contents
    Title = str(Title).encode('ascii', 'ignore')
    Title = BeautifulSoup(Title, 'html.parser').text
    Title = suittext(Title)
    Awarding_body = Awarding_body_fc(htmltext)

    Description = Detail_left_fc(htmltext)

    # Awarded pages do not always fill the additional-data table, so fall back to "none".
    try:
        Startdate = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
    except (AttributeError, IndexError):
        Startdate = "none"
    try:
        Enddate = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
    except (AttributeError, IndexError):
        Enddate = "none"
    try:
        CPV_codes = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
    except (AttributeError, IndexError):
        CPV_codes = "none"

    Date_awarded = htmltext.find('div', {"class": "highlight_date_body"}).text
    Awarded_to = htmltext.find('div', {"class": "highlight_contact_hd"}).findNext('p').contents
    Awarded_to = str(Awarded_to).encode('ascii', 'ignore')
    Awarded_to = BeautifulSoup(Awarded_to, 'html.parser').text
    Awarded_to = suittext(Awarded_to)

    data = {"ID": str(ID),
            "Url": str(url),
            "REFERENCE": str(REFERENCE),
            "Title": str(Title),
            "Awarding body": str(Awarding_body),
            "Description": str(Description),
            "Start Date": str(Startdate),
            "End Date": str(Enddate),
            "CPV Codes": str(CPV_codes),
            "Date Awarded": str(Date_awarded),
            "Awarded To": str(Awarded_to)}
    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)

def scrap_recurring(url):
    # Scrape one recurring-contract page and store it.
    response = urlopen(url)
    htmltext = BeautifulSoup(response, 'html.parser')
    ID = id_url(url)
    REFERENCE = Reference(htmltext)

    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').contents
    Title = str(Title).encode('ascii', 'ignore')
    Title = BeautifulSoup(Title, 'html.parser').text
    Title = suittext(Title)
    Awarding_body = Awarding_body_fc(htmltext)

    Description = Detail_left_fc(htmltext)
    try:
        Contract_type = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
    except (AttributeError, IndexError):
        Contract_type = "none"
    try:
        Option_to_extend = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
    except (AttributeError, IndexError):
        Option_to_extend = "none"
    try:
        CPV_codes = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
    except (AttributeError, IndexError):
        CPV_codes = "none"

    EXISTING_CONTRACT_END_DATE = htmltext.find('div', {"class": "highlight_date_body"}).text

    # Debug output for recurring contracts.
    print("REFERENCE=" + REFERENCE)
    print("Title=" + Title)
    print("Awarding body=" + Awarding_body)
    print("Description=" + Description)
    print("Contract type=" + Contract_type)
    print("Option to extend=" + Option_to_extend)
    print("CPV codes=" + CPV_codes)
    print("Existing end date=" + EXISTING_CONTRACT_END_DATE)

    data = {"ID": str(ID),
            "Url": str(url),
            "REFERENCE": str(REFERENCE),
            "Title": str(Title),
            "Awarding body": str(Awarding_body),
            "Description": str(Description),
            "Contract Type": str(Contract_type),
            "Option to extend": str(Option_to_extend),
            "CPV Codes": str(CPV_codes),
            "EXISTING CONTRACT END DATE": str(EXISTING_CONTRACT_END_DATE)}
    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)



def extract_data(url):
    # Each site lists contracts in three sections, each with its own URL pattern.
    for el in ["awarded", "recurring", "live"]:
        if el == "awarded":
            urltry = url + "contracts/awarded/type/awarded/page/"
        elif el == "recurring":
            urltry = url + "contracts/recurring/type/recurring/page/"
        else:
            urltry = url + "contracts/live/page/"
        numb = get_numpages(urltry + "1")
        print(numb)
        # Walk every page of the section and scrape each contract it links to.
        for i in range(1, int(numb) + 1):
            for j in listUrl(urltry + str(i)):
                if el == "awarded":
                    scrap_awarded(j)
                elif el == "live":
                    scrap_live(j)
                else:
                    scrap_recurring(j)



def main():
    urls = ["http://www.sourcederbyshire.co.uk/",
            "http://www.sourceleicestershire.co.uk/",
            "http://www.sourcelincolnshire.co.uk/",
            "http://www.sourcenorthamptonshire.co.uk/",
            "http://www.sourcenottinghamshire.co.uk/",
            "http://www.sourcerutland.co.uk/",
            "http://www.sourcecambridgeshire.co.uk/"]
    for url in urls:
        try:
            extract_data(url)
        except Exception as e:
            # Carry on with the remaining sites, but report the failure.
            print("Failed on " + url + ": " + str(e))


if __name__ == '__main__':
main()
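# To spot-check a single notice by hand instead of crawling every site
# (the URL shape is illustrative, not confirmed):
#   scrap_live('http://www.sourcederbyshire.co.uk/contracts/show/12345')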
