-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fork of code from ScraperWiki at https://classic.scraperwiki.com/scra…
- Loading branch information
DrJosepon
authored and
DrJosepon
committed
Jun 18, 2015
0 parents
commit e462bc0
Showing
3 changed files
with
32 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Ignore output of scraper | ||
data.sqlite |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
This scraper scrapes the members of UK Parliament - it is very simple and made as an instruction piece for schoolofdata.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import scraperwiki
import lxml.html

# Scrape the list of UK Members of Parliament from parliament.uk and save
# one row per MP into the local ScraperWiki sqlite store.

url = "http://www.parliament.uk/mps-lords-and-offices/mps/"

html = scraperwiki.scrape(url)  # fetch the raw webpage

root = lxml.html.fromstring(html)  # parse the HTML so we can query the DOM

table = root.cssselect("table")[1]  # the MP list is the second table on the page (0 is the first)

rows = table.cssselect("tbody tr")  # every data row in that table

for row in rows:
    content = row.cssselect("td")  # the row's column cells

    if len(content) > 1:
        # Only process rows where both columns are present (skips header/filler rows).
        constituency = content[1].text_content().strip()  # constituency is the second column

        # The first column looks like "Last, First (Party)".
        # maxsplit=1 prevents a ValueError ("too many values to unpack")
        # when a cell happens to contain more than one comma or " (".
        (last, first_party) = content[0].text_content().split(",", 1)
        (first, party) = first_party.split(" (", 1)  # separate first name from party
        party = party.replace(")", "").strip()  # drop the closing parenthesis
        first = first.strip()  # the split leaves a leading space on the first name
        last = last.strip()

        mp = {
            "first": first,
            "last": last,
            "party": party,
            "constituency": constituency,
            # Composite key so re-running the scraper updates existing rows
            # instead of inserting duplicates.
            "unique": "%s-%s-%s" % (first, last, party),
        }
        scraperwiki.sqlite.save(unique_keys=["unique"], data=mp)  # save the entry