public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
moustaki (author)
Sat Mar 08 08:51:33 -0800 2008
commit  7cb804bc6ed9f65e0ca7e07fa44048d45c544f14
tree    35fd0792c3ee222bd620a2834bbd1ec5ac384ea0
parent  277b9df29d61b9cc7f8e6f635eeb185d44ee4a0d
flatscrap / flatscrap.py
100755 80 lines (62 sloc) 2.282 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/python
import os
import sys
import urllib

from BeautifulSoup import BeautifulSoup
from geopy import geocoders
from rdflib import ConjunctiveGraph
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib import plugin
from rdflib.syntax.serializers import TurtleSerializer
 
#print "Scrapping "+sys.argv[1]
 
# The script needs two arguments: the ad URL (argv[1]) and the file the RDF
# is written to (argv[2]).  Check both up front so that a missing output
# path is reported before any network work is done.
if len(sys.argv) < 3:
    sys.stderr.write("Usage: %s <ad-url> <output-file>\n" % sys.argv[0])
    sys.exit(1)

# Download the raw HTML of the ad.  The handle is closed even when read()
# raises, so the connection is never leaked.
f = urllib.urlopen(sys.argv[1])
try:
    html = f.read()
finally:
    f.close()

# Parse the page into the tree that the scraping code queries.
soup = BeautifulSoup(html)
 
def clean(atom):
    """Return ``atom`` with every space and newline character removed.

    All occurrences are dropped, not only leading/trailing ones; other
    whitespace such as tabs is left in place.
    """
    without_spaces = atom.replace(' ', '')
    return without_spaces.replace('\n', '')
 
 
# Now, let's scrape!
# Each field is picked out of the parsed page by a tag/attribute lookup
# followed by fixed positions in the matched element's children.

# Prefix that turns a site-relative link into an absolute URL.
GUMTREE_ROOT = "http://www.gumtree.com"

location = clean(soup('span', 'location')[0].contents[1])
title = clean(soup('div', id="title")[0].contents[0].contents[0])
description = soup('div', id="desc")[0].contents[0].contents[0].contents[0]

# The email value is the value of the first attribute of the third child of
# the 'email' span; values starting with /cgi-bin are site-relative.
email1 = soup('span', 'email')[0].contents[2].attrs[0][1]
if email1.startswith('/cgi-bin'):
    email = GUMTREE_ROOT + email1
else:
    email = email1

# The image is optional: when the expected element or attribute is missing,
# fall back to an empty string.  Only lookup failures are caught, so Ctrl-C
# and unrelated errors still propagate.
try:
    image = GUMTREE_ROOT + soup('div', id="images")[0].contents[1].attrs[0][1]
except (IndexError, KeyError, AttributeError, TypeError):
    image = ''
 
#tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])
 
# Geocoding
# The API key can be supplied through the GOOGLE_MAPS_API_KEY environment
# variable; the embedded key is used only when that variable is unset or
# empty, which keeps the previous behaviour by default.
# NOTE(review): a key committed to source control is effectively public --
# consider revoking it and relying on the environment variable alone.
api_key = (os.environ.get('GOOGLE_MAPS_API_KEY')
           or 'ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA')
g = geocoders.Google(api_key)
# The result is unpacked as a (place, (lat, lng)) pair.
place, (lat, lng) = g.geocode(location)
 
 
#print "Location: " + location
#print "Title: " + title
#print "Description: " + description
#print "Email: "+email
#print "Image: "+image
 
 
# Vocabularies used to describe the flat.
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
GT = Namespace("http://purl.org/ontology/flat/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
DC = Namespace("http://purl.org/dc/elements/1.1/")
WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
graph = ConjunctiveGraph()

# Nodes of the description: the flat itself (a relative URI), a blank node
# for the geocoded place, and URI references for the email and image values.
flat = URIRef("#flat")
p = BNode()
e = URIRef(email)
i = URIRef(image)

# The flat and the place it is based near.
graph.add((flat, RDF.type, GT['Flat']))
graph.add((flat, FOAF['based_near'], p))
graph.add((p, RDFS.label, Literal(place)))
graph.add((p, DC['title'], Literal(location)))
graph.add((p, WGS['lat'], Literal(lat)))
graph.add((p, WGS['long'], Literal(lng)))

# Contact, picture and text of the ad.
graph.add((flat, FOAF['mbox'], e))
# image is '' when no picture could be scraped.  An empty URI reference
# resolves to the document itself, so adding it would claim that the flat is
# depicted by the output file; skip the triple in that case.
if image:
    graph.add((flat, FOAF['depiction'], i))
graph.add((flat, DC['title'], Literal(title)))
graph.add((flat, DC['description'], Literal(description)))

# Write the graph as RDF/XML to the path given in sys.argv[2].  With a
# destination, serialize() writes the file itself and its return value is
# not the document, so it is no longer printed to stdout.
graph.serialize(destination=sys.argv[2], format='xml')