public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
moustaki (author)
Thu Mar 06 11:59:54 -0800 2008
commit  01c615b2df65967d110c4ed85cd9a89e6a986168
tree    adb43bc94e5b44e36063513d32495b33bf5dfafe
parent  3121dd142062dc7deffc8fe65c71fb40fde74a7e
flatscrap / flatscrap.py
100755 80 lines (61 sloc) 2.228 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/python
import urllib
import sys
from BeautifulSoup import BeautifulSoup
from geopy import geocoders
from rdflib import ConjunctiveGraph
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib import plugin
 
 
#print "Scrapping "+sys.argv[1]
 
# Download the page whose URL is the first command-line argument.
f = urllib.urlopen(sys.argv[1])
try:
    html = f.read()
finally:
    # Close the handle even when the read fails part-way through.
    f.close()

# Parse the markup once; the scraping statements further down query `soup`.
soup = BeautifulSoup(html)
 
def clean(atom):
    """Return *atom* with every space and newline character removed.

    All occurrences are dropped, including those in the middle of the
    text, not just leading or trailing ones.
    """
    return atom.replace(' ', '').replace('\n', '')
 
 
# Now, let's scrape!
 
# Pull the ad's fields out of the parsed page.  Every lookup is positional
# (first match, then fixed child indexes), so a page that lacks one of the
# mandatory elements raises here instead of yielding partial data.
location = clean(soup('span', 'location')[0].contents[1])
title = clean(soup('div', id="title")[0].contents[0].contents[0])
description = soup('div', id="desc")[0].contents[0].contents[0].contents[0]

# The contact reference is the first attribute value of the third child of
# the 'email' span.  Site-relative CGI links are turned into absolute URLs;
# anything else is kept as found.
email1 = soup('span', 'email')[0].contents[2].attrs[0][1]
if email1.startswith('/cgi-bin'):
    email = "http://www.gumtree.com" + email1
else:
    email = email1

# The image is optional: when the expected element is missing, fall back to
# an empty string.  Only the errors that absent or unexpected markup can
# produce are caught, so KeyboardInterrupt/SystemExit still propagate.
try:
    image = "http://www.gumtree.com" + soup('div', id="images")[0].contents[1].attrs[0][1]
except (LookupError, AttributeError, TypeError):
    image = ''
 
#tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])
 
# Geocoding
# Resolve the scraped location text to a place name and coordinates.
# NOTE(review): the key below (presumably a Google Maps API key) is committed
# in source -- consider moving it to configuration.
GOOGLE_API_KEY = 'ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA'
g = geocoders.Google(GOOGLE_API_KEY)
place, coordinates = g.geocode(location)
lat, lng = coordinates
 
 
#print "Location: " + location
#print "Title: " + title
#print "Description: " + description
#print "Email: "+email
#print "Image: "+image
 
 
# Vocabularies used to describe the ad.
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
GT = Namespace("http://purl.org/ontology/flat/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
DC = Namespace("http://purl.org/dc/elements/1.1/")
WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
graph = ConjunctiveGraph()

# Nodes: the flat itself (a fragment relative to the output document), a
# blank node for the geocoded place, and URI nodes for contact and image.
flat = URIRef("#flat")
p = BNode()
e = URIRef(email)
i = URIRef(image)

# The flat and the place it is based near.
graph.add((flat, RDF.type, GT['Flat']))
graph.add((flat, FOAF['based_near'], p))

# The place: geocoder's name as label, scraped location text as title,
# plus the coordinates.
graph.add((p, RDFS.label, Literal(place)))
graph.add((p, DC['title'], Literal(location)))
graph.add((p, WGS['lat'], Literal(lat)))
graph.add((p, WGS['long'], Literal(lng)))

# Contact reference, title and description of the flat.
graph.add((flat, FOAF['mbox'], e))
if image:
    # `image` is '' when the page had no picture; adding the triple anyway
    # would assert a depiction at the empty relative URI, so skip it.
    graph.add((flat, FOAF['depiction'], i))
graph.add((flat, DC['title'], Literal(title)))
graph.add((flat, DC['description'], Literal(description)))
 
# Write the graph out using the 'rdf' serializer format.
# With an output path (second command-line argument) the serialization goes
# to that file; serialize() is handed a destination in that case, so its
# return value is not the serialized text and is deliberately not printed.
# Without an output path, emit the serialization on standard output rather
# than failing on the missing argument.
if len(sys.argv) > 2:
    graph.serialize(destination=sys.argv[2], format='rdf')
else:
    sys.stdout.write(graph.serialize(format='rdf'))