public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
moustaki (author)
Thu Mar 06 11:07:45 -0800 2008
commit  419a804d446669005734e621d0990d3078bf34b4
tree    10e835ad624db7caaf13189c70fd4e422fb3959f
parent  f1a66b250ddc871ec225d054f5af65ee11db8e92
flatscrap / gtscrap.py
100755 70 lines (52 sloc) 1.804 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/python
import urllib
import sys
from BeautifulSoup import BeautifulSoup
from rdflib import ConjunctiveGraph
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib import plugin
 
 
# Fetch the advert page whose URL is given as the first command-line
# argument and parse it with BeautifulSoup.

# Exit with a usage message instead of an IndexError traceback when the
# URL argument is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: " + sys.argv[0] + " <advert-url>")

print("Scrapping " + sys.argv[1])

f = urllib.urlopen(sys.argv[1])
try:
    html = f.read()
finally:
    # Close the handle even when the read fails part-way through.
    f.close()

# Parse tree queried by the rest of the script.
soup = BeautifulSoup(html)
 
def clean(atom):
    """Return *atom* with every space and newline character removed."""
    without_spaces = atom.replace(' ', '')
    return without_spaces.replace('\n', '')
 
 
# Now, let's scrape!
#
# Each lookup below indexes into fixed child/attribute positions of the
# parsed page, so it depends on the exact markup of the advert page.
# NOTE(review): presumably matches Gumtree's layout when this was written --
# confirm against the current pages.

# Prefix used to turn site-relative links into absolute URLs.
site_root = "http://www.gumtree.com"

location = clean(soup('span', 'location')[0].contents[1])
title = clean(soup('div', id="title")[0].contents[0].contents[0])
# The description is kept verbatim (not passed through clean()).
description = soup('div', id="desc")[0].contents[0].contents[0].contents[0]

# First attribute value of the third child of the 'email' span; values
# starting with /cgi-bin are site-relative and get the site prefix.
email1 = soup('span', 'email')[0].contents[2].attrs[0][1]
if email1.startswith('/cgi-bin'):
    email = site_root + email1
else:
    email = email1

# The image is optional: fall back to an empty string when the expected
# markup is missing or shaped differently. Only lookup/attribute/type
# failures are caught so that Ctrl-C and SystemExit still propagate.
try:
    image = site_root + soup('div', id="images")[0].contents[1].attrs[0][1]
except (LookupError, AttributeError, TypeError):
    image = ''

# Telephone extraction is currently disabled.
#tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])


print("Location: " + location)
print("Title: " + title)
print("Description: " + description)
print("Email: " + email)
print("Image: " + image)
 
 
# Describe the scraped advert as an RDF graph and print its serialization.

# Vocabularies used for the description.
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
GT = Namespace("http://purl.org/ontology/flat/")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")
DC = Namespace("http://purl.org/dc/elements/1.1/")
graph = ConjunctiveGraph()

flat = BNode()        # anonymous node typed as GT:Flat
p = BNode()           # anonymous node carrying the location label
e = URIRef(email)

graph.add((flat, RDF.type, GT['Flat']))
graph.add((flat, FOAF['based_near'], p))
graph.add((p, RDFS.label, Literal(location)))
graph.add((flat, FOAF['mbox'], e))
# Triple terms must be rdflib nodes, so the image URL is wrapped in URIRef
# (as the email is above) rather than added as a plain string. No depiction
# is asserted when the scrape found no image (empty string).
if image:
    graph.add((flat, FOAF['depiction'], URIRef(image)))
graph.add((flat, DC['title'], Literal(title)))
graph.add((flat, DC['description'], Literal(description)))

print(graph.serialize(format='rdf'))