Skip to content

Commit

Permalink
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
Browse files Browse the repository at this point in the history
  • Loading branch information
DrJosepon authored and DrJosepon committed Jun 18, 2015
0 parents commit e462bc0
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Ignore output of scraper
data.sqlite
1 change: 1 addition & 0 deletions README.textile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This scraper collects the members of the UK Parliament. It is deliberately very simple and was written as a teaching example for schoolofdata.org.
29 changes: 29 additions & 0 deletions scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import scraperwiki
import lxml.html

url = "http://www.parliament.uk/mps-lords-and-offices/mps/"


def parse_member(name_cell, constituency_cell):
    """Build the record for one MP from the text of a table row.

    ``name_cell`` is the text of the first column and is expected to look
    like ``"Last, First (Party)"``; ``constituency_cell`` is the text of the
    second column.

    Returns a dict with the keys ``first``, ``last``, ``party``,
    ``constituency`` and ``unique``, or ``None`` when ``name_cell`` contains
    no comma and therefore cannot be split into last and first name.

    All fields are stripped of surrounding whitespace, so ``first`` no longer
    carries the space that follows the comma. ``unique`` is built from the
    stripped values.
    """
    # partition() never raises, unlike unpacking the result of split(),
    # which fails as soon as a cell has an unexpected number of separators.
    last, comma, rest = name_cell.partition(",")
    if not comma:
        return None

    # Split on the FIRST "(" so that parties which themselves contain
    # parentheses, e.g. "Labour (Co-op)", stay in one piece.
    first, _, party = rest.partition("(")
    party = party.strip()
    if party.endswith(")"):
        # Drop only the closing parenthesis that matches the one split on.
        party = party[:-1].strip()

    first = first.strip()
    last = last.strip()
    return {
        "first": first,
        "last": last,
        "party": party,
        "constituency": constituency_cell.strip(),
        "unique": "%s-%s-%s" % (first, last, party),
    }


def main():
    """Scrape the list of MPs and save one record per MP."""
    html = scraperwiki.scrape(url)  # get the webpage
    root = lxml.html.fromstring(html)  # parse it so we can query it

    tables = root.cssselect("table")
    if len(tables) < 2:
        raise IndexError(
            "expected at least two tables at %s, found %d" % (url, len(tables))
        )
    member_table = tables[1]  # the second table (index 0 is the first one)

    for row in member_table.cssselect("tbody tr"):
        cells = row.cssselect("td")
        if len(cells) < 2:
            # only process rows where both columns are present
            continue
        record = parse_member(cells[0].text_content(), cells[1].text_content())
        if record is None:
            # name column did not have the "Last, First" shape; skip the row
            continue
        scraperwiki.sqlite.save(unique_keys=["unique"], data=record)


if __name__ == "__main__":
    main()

0 comments on commit e462bc0

Please sign in to comment.