From 61c7b4cf93f49f41e855d76f34fcde73730de128 Mon Sep 17 00:00:00 2001
From: Chahed
Date: Mon, 2 Nov 2015 23:40:10 +0100
Subject: [PATCH] Add scrapers for the Source procurement portals

---
 README.txt |   1 +
 scraper.py | 291 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 README.txt
 create mode 100644 scraper.py

diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..c8d1e5c
--- /dev/null
+++ b/README.txt
@@ -0,0 +1 @@
+Web Python Scrapers
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..0f84415
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,291 @@
+# Scrapes live, awarded and recurring contract notices from the "Source"
+# procurement portals (sourcederbyshire.co.uk and related sites) and saves
+# each notice as a row in a scraperwiki sqlite table.
+from bs4 import BeautifulSoup
+from urllib.request import urlopen
+import scraperwiki
+
+def convertirUrl(url):
+    # Reduce a full URL to its scheme and host, e.g. "http://www.example.co.uk".
+    l = url.split('/')
+    newurl = l[0] + '//' + l[2]
+    return newurl
+
+def id_url(url):
+    # The last path segment of a contract URL doubles as its unique ID.
+    parts = url.split('/')
+    return parts[len(parts) - 1]
+
+def listUrl(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    NewUrl = convertirUrl(url)
+    soop = htmltext.find('div', {"class": "contract_set clearfix"})
+    links = soop.find_all('a')
+    href = []
+    # Each contract appears to be linked twice in the listing, so keep every other anchor.
+    for i in range(0, len(links) - 1):
+        if i % 2 == 0:
+            href.append(NewUrl + links[i].get('href'))
+
+    return href
+
+def get_numpages(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    Liste_page = htmltext.find('div', {"id": "pager"})
+    a = Liste_page.findAll('a')
+    el = a[len(a) - 1].get('href')
+    n = el.split('/')
+    # The page number sits at a different depth in the pager href depending on the listing type.
+    try:
+        num = n[6]
+    except:
+        num = n[4]
+    return num
+
+def suittext(text):
+    # Strip the quoting, bracket and escape artefacts left by str()-ing a contents list.
+    text = text.replace(", ,", "")
+    text = text.replace("'", "")
+    text = text.replace("\\n", "")
+    text = text.replace(" ", "")
+    text = text.replace("\\r", "")
+    text = text.replace("[", "")
+    text = text.replace("]", "")
+    return text
+
+def Reference(htmltext):
+    # Drop the leading label word and keep the rest of the reference line.
+    REF = htmltext.find('p', {"class": "reference_number"}).text
+    REFERENCE = REF.split(' ')
+    R = ""
+    for i in range(1, len(REFERENCE)):
+        R = R + " " + REFERENCE[i]
+    return R
+
+def Awarding_body_fc(htmltext):
+    try:
+        Awarding = htmltext.find('div', {"class": "contract_hd_left"})
+        Awarding_body_list = Awarding.findAll('a')
+        Awarding_body = str(Awarding_body_list[0])
+        Awarding_body = BeautifulSoup(Awarding_body, 'html.parser').text
+    except:
+        Awarding1 = htmltext.find('div', {"class": "contract_hd_left"})
+        Awarding = Awarding1.findAll('p')
+        # Separator assumed: a literal <br/> tag.
+        Awarding_list = str(Awarding[1]).split('<br/>')
+        Awarding_body = str(Awarding_list[1])
+        Awarding_body = BeautifulSoup(Awarding_body, 'html.parser').text
+    return Awarding_body
+
+def Detail_left_fc(htmltext):
+    Detail_left = str(htmltext.find('div', {"class": "detail_left"}).contents)
+    # Separators below assumed: consecutive <br/> tags as they appear in str() of a contents list.
+    a = Detail_left.split('<br/>, <br/>')
+    Description = a[1].split('<br/>, <br/>')
+    Description = Description[1].encode('ascii', 'ignore')
+    Description = BeautifulSoup(Description, 'html.parser').text
+    Description = suittext(Description)
+    return Description
+
+def Table(htmltext, id):
+    # Return the raw HTML of the id-th cell of the "additional data" table.
+    Tr = htmltext.find('table', {"class": "additional_data"}).findNext('tbody')
+    Table = Tr.findAll('td')
+    return str(Table[id])
+
+def Contact(htmltext):
+    Contact_Details = str(htmltext.find('div', {"class": "highlight_contact_bd"}).findNext('p').contents)
+    # Separator assumed: a literal <br/> tag.
+    c = Contact_Details.split('<br/>')
+    m = c[0] + c[1]
+    m = BeautifulSoup(m, 'html.parser').text
+    m = suittext(m)
+    return m
+
+def scrap_live(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    ID = id_url(url)
+
+    REFERENCE = Reference(htmltext)
+
+    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').text
+
+    Awarding_body = Awarding_body_fc(htmltext)
+
+    Description = Detail_left_fc(htmltext)
+
+    Contract_Type = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
+    Procurement_Process = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
+    Estimated_Value_TEXT_DESCRIPTION = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
+    Cat = suittext(Table(htmltext, 3))
+    Category = (BeautifulSoup(Cat, 'html.parser').text).split(',')
+    Category = str(Category)
+    CPV_codes = suittext(BeautifulSoup(Table(htmltext, 4), 'html.parser').text)
+    Suitable_for_SME = suittext(BeautifulSoup(Table(htmltext, 5), 'html.parser').text)
+
+    Document = htmltext.findAll('div', {"class": "highlight_date_body"})
+    Contact_Details = suittext(Contact(htmltext))
+
+    Email = htmltext.find('div', {"class": "c_email"}).text
+
+    DOCUMENT_AVAILABLE_UNTIL = suittext(BeautifulSoup(Document[0].getText(), 'html.parser').text)
+    # The submission deadline is assumed to sit in the last of the date blocks.
+    SUBMISSION_RETURN_BY = suittext(BeautifulSoup(Document[-1].getText(), 'html.parser').text)
+
+    data = {"ID": str(ID),
+            "Url": str(url),
+            "REFERENCE": str(REFERENCE),
+            "Title": str(Title),
+            "Awarding body": str(Awarding_body),
+            "Description": str(Description),
+            "Contract Type": str(Contract_Type),
+            "Procurement Process": str(Procurement_Process),
+            "Estimated Value TEXT DESCRIPTION": str(Estimated_Value_TEXT_DESCRIPTION),
+            "Category": str(Category),
+            "CPV codes": str(CPV_codes),
+            "Suitable for SME": str(Suitable_for_SME),
+            "DOCUMENT AVAILABLE UNTIL": str(DOCUMENT_AVAILABLE_UNTIL),
+            "SUBMISSION RETURN BY": str(SUBMISSION_RETURN_BY),
+            "Contact Details": str(Contact_Details),
+            "Email": str(Email)}
+    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)
+
+def scrap_awarded(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+
+    ID = id_url(url)
+    REFERENCE = Reference(htmltext)
+
+    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').contents
+    Title = str(Title).encode('ascii', 'ignore')
+    Title = BeautifulSoup(Title, 'html.parser').text
+    Title = suittext(Title)
+    Awarding_body = Awarding_body_fc(htmltext)
+
+    Description = Detail_left_fc(htmltext)
+
+    try:
+        Startdate = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
+    except:
+        Startdate = "none"
+    try:
+        Enddate = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
+    except:
+        Enddate = "none"
+    try:
+        CPV_codes = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
+    except:
+        CPV_codes = "none"
+
+    Date_awarded = htmltext.find('div', {"class": "highlight_date_body"}).text
+    Awarded_to = htmltext.find('div', {"class": "highlight_contact_hd"}).findNext('p').contents
+    Awarded_to = str(Awarded_to).encode('ascii', 'ignore')
+    Awarded_to = BeautifulSoup(Awarded_to, 'html.parser').text
+    Awarded_to = suittext(Awarded_to)
+
+    data = {"ID": str(ID),
+            "Url": str(url),
+            "REFERENCE": str(REFERENCE),
+            "Title": str(Title),
+            "Awarding body": str(Awarding_body),
+            "Description": str(Description),
+            "Start Date": str(Startdate),
+            "End Date": str(Enddate),
+            "CPV Codes": str(CPV_codes),
+            "Date Awarded": str(Date_awarded),
+            "Awarded To": str(Awarded_to)}
+    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)
+
+def scrap_recurring(url):
+    response = urlopen(url)
+    htmltext = BeautifulSoup(response, 'html.parser')
+    ID = id_url(url)
+    REFERENCE = Reference(htmltext)
+
+    Title = htmltext.find('div', {"class": "contract_hd_left"}).findNext('h1').contents
+    Title = str(Title).encode('ascii', 'ignore')
+    Title = BeautifulSoup(Title, 'html.parser').text
+    Title = suittext(Title)
+    Awarding_body = Awarding_body_fc(htmltext)
+
+    Description = Detail_left_fc(htmltext)
+    try:
+        Contract_type = BeautifulSoup(Table(htmltext, 0), 'html.parser').text
+    except:
+        Contract_type = "none"
+    try:
+        Option_to_extend = suittext(BeautifulSoup(Table(htmltext, 1), 'html.parser').text)
+    except:
+        Option_to_extend = "none"
+    try:
+        CPV_codes = suittext(BeautifulSoup(Table(htmltext, 2), 'html.parser').text)
+    except:
+        CPV_codes = "none"
+
+    EXISTING_CONTRACT_END_DATE = htmltext.find('div', {"class": "highlight_date_body"}).text
+
+    print("REFERENCE=" + REFERENCE)
+    print("Title=" + Title)
+    print("Awarding body=" + Awarding_body)
+    print("Description=" + Description)
+    print("Contract_type=" + Contract_type)
+    print("Extendoptions=" + Option_to_extend)
+    print("CPV=" + CPV_codes)
+    print("ExistingDate=" + EXISTING_CONTRACT_END_DATE)
+
+    data = {"ID": str(ID),
+            "Url": str(url),
+            "REFERENCE": str(REFERENCE),
+            "Title": str(Title),
+            "Awarding body": str(Awarding_body),
+            "Description": str(Description),
+            "Contract Type": str(Contract_type),
+            "Option to extend": str(Option_to_extend),
+            "CPV Codes": str(CPV_codes),
+            "EXISTING CONTRACT END DATE": str(EXISTING_CONTRACT_END_DATE)}
+    scraperwiki.sqlite.save(unique_keys=['ID'], data=data)
+
+def extract_data(url):
+    # Walk the paginated listing for each contract type and scrape every notice on it.
+    l = ["awarded", "recurring", "live"]
+    for el in l:
+        urltry = url
+        if el == "awarded":
+            urltry = urltry + "contracts/awarded/type/awarded/page/"
+        elif el == "recurring":
+            urltry = urltry + "contracts/recurring/type/recurring/page/"
+        else:
+            urltry = urltry + "contracts/live/page/"
+        link = urltry + "1"
+        numb = get_numpages(link)
+        print(numb)
+        for i in range(1, int(numb) + 1):
+            url2 = urltry + str(i)
+            List = listUrl(url2)
+            for j in List:
+                if el == "awarded":
+                    scrap_awarded(j)
+                elif el == "live":
+                    scrap_live(j)
+                elif el == "recurring":
+                    scrap_recurring(j)
+
+def main():
+    urls = ["http://www.sourcederbyshire.co.uk/",
+            "http://www.sourceleicestershire.co.uk/",
+            "http://www.sourcelincolnshire.co.uk/",
+            "http://www.sourcenorthamptonshire.co.uk/",
+            "http://www.sourcenottinghamshire.co.uk/",
+            "http://www.sourcerutland.co.uk/",
+            "http://www.sourcecambridgeshire.co.uk/"]
+    for url in urls:
+        try:
+            extract_data(url)
+        except Exception:
+            # Keep going with the remaining portals if one of them fails.
+            pass
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file