public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
Search Repo:
Prodding filenames
moustaki (author)
Thu Mar 06 11:59:54 -0800 2008
commit  01c615b2df65967d110c4ed85cd9a89e6a986168
tree    adb43bc94e5b44e36063513d32495b33bf5dfafe
parent  3121dd142062dc7deffc8fe65c71fb40fde74a7e
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
0
@@ -1 +1,79 @@
0
+#!/usr/bin/python
0
+import urllib
0
+import sys
0
+from BeautifulSoup import BeautifulSoup
0
+from geopy import geocoders
0
+from rdflib import ConjunctiveGraph
0
+from rdflib import BNode, Literal, Namespace, URIRef
0
+from rdflib import plugin
0
+
0
+
0
+#print "Scrapping "+sys.argv[1]
0
+
0
+f = urllib.urlopen(sys.argv[1])
0
+html = f.read()
0
+f.close()
0
+
0
+soup = BeautifulSoup(html)
0
+
0
+def clean(atom):
0
+ t1 = ''.join(atom.rsplit(' '))
0
+ t2 = ''.join(t1.rsplit('\n'))
0
+ return t2
0
+
0
+
0
+# Now, let's scrap!
0
+
0
+location = clean(soup('span','location')[0].contents[1])
0
+title = clean(soup('div',id="title")[0].contents[0].contents[0])
0
+description = soup('div',id="desc")[0].contents[0].contents[0].contents[0]
0
+email1 = soup('span','email')[0].contents[2].attrs[0][1]
0
+if email1.startswith('/cgi-bin'):
0
+ email = "http://www.gumtree.com"+email1
0
+else :
0
+ email = email1
0
+try:
0
+ image = "http://www.gumtree.com"+soup('div',id="images")[0].contents[1].attrs[0][1]
0
+except:
0
+ image = ''
0
+
0
+#tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])
0
+
0
+# Geocoding
0
+g = geocoders.Google('ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA')
0
+place, (lat,lng) = g.geocode(location)
0
+
0
+
0
+#print "Location: " + location
0
+#print "Title: " + title
0
+#print "Description: " + description
0
+#print "Email: "+email
0
+#print "Image: "+image
0
+
0
+
0
+RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
0
+RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
0
+GT = Namespace("http://purl.org/ontology/flat/")
0
+FOAF = Namespace("http://xmlns.com/foaf/0.1/")
0
+DC = Namespace("http://purl.org/dc/elements/1.1/")
0
+WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
0
+graph = ConjunctiveGraph()
0
+
0
+flat = URIRef("#flat")
0
+p = BNode()
0
+e = URIRef(email)
0
+i = URIRef(image)
0
+
0
+graph.add((flat,RDF.type,GT['Flat']))
0
+graph.add((flat,FOAF['based_near'],p))
0
+graph.add((p,RDFS.label,Literal(place)))
0
+graph.add((p,DC['title'],Literal(location)))
0
+graph.add((p,WGS['lat'],Literal(lat)))
0
+graph.add((p,WGS['long'],Literal(lng)))
0
+graph.add((flat,FOAF['mbox'],e))
0
+graph.add((flat,FOAF['depiction'],i))
0
+graph.add((flat,DC['title'],Literal(title)))
0
+graph.add((flat,DC['description'],Literal(description)))
0
+
0
+print graph.serialize(destination=sys.argv[2],format='rdf')
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
@@ -1,79 +1 @@
0
-#!/usr/bin/python
0
-import urllib
0
-import sys
0
-from BeautifulSoup import BeautifulSoup
0
-from geopy import geocoders
0
-from rdflib import ConjunctiveGraph
0
-from rdflib import BNode, Literal, Namespace, URIRef
0
-from rdflib import plugin
0
-
0
-
0
-#print "Scrapping "+sys.argv[1]
0
-
0
-f = urllib.urlopen(sys.argv[1])
0
-html = f.read()
0
-f.close()
0
-
0
-soup = BeautifulSoup(html)
0
-
0
-def clean(atom):
0
- t1 = ''.join(atom.rsplit(' '))
0
- t2 = ''.join(t1.rsplit('\n'))
0
- return t2
0
-
0
-
0
-# Now, let's scrap!
0
-
0
-location = clean(soup('span','location')[0].contents[1])
0
-title = clean(soup('div',id="title")[0].contents[0].contents[0])
0
-description = soup('div',id="desc")[0].contents[0].contents[0].contents[0]
0
-email1 = soup('span','email')[0].contents[2].attrs[0][1]
0
-if email1.startswith('/cgi-bin'):
0
- email = "http://www.gumtree.com"+email1
0
-else :
0
- email = email1
0
-try:
0
- image = "http://www.gumtree.com"+soup('div',id="images")[0].contents[1].attrs[0][1]
0
-except:
0
- image = ''
0
-
0
-#tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])
0
-
0
-# Geocoding
0
-g = geocoders.Google('ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA')
0
-place, (lat,lng) = g.geocode(location)
0
-
0
-
0
-#print "Location: " + location
0
-#print "Title: " + title
0
-#print "Description: " + description
0
-#print "Email: "+email
0
-#print "Image: "+image
0
-
0
-
0
-RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
0
-RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
0
-GT = Namespace("http://purl.org/ontology/flat/")
0
-FOAF = Namespace("http://xmlns.com/foaf/0.1/")
0
-DC = Namespace("http://purl.org/dc/elements/1.1/")
0
-WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
0
-graph = ConjunctiveGraph()
0
-
0
-flat = URIRef("#flat")
0
-p = BNode()
0
-e = URIRef(email)
0
-i = URIRef(image)
0
-
0
-graph.add((flat,RDF.type,GT['Flat']))
0
-graph.add((flat,FOAF['based_near'],p))
0
-graph.add((p,RDFS.label,Literal(place)))
0
-graph.add((p,DC['title'],Literal(location)))
0
-graph.add((p,WGS['lat'],Literal(lat)))
0
-graph.add((p,WGS['long'],Literal(lng)))
0
-graph.add((flat,FOAF['mbox'],e))
0
-graph.add((flat,FOAF['depiction'],i))
0
-graph.add((flat,DC['title'],Literal(title)))
0
-graph.add((flat,DC['description'],Literal(description)))
0
-
0
-print graph.serialize(destination=sys.argv[2],format='rdf')

Comments

    No one has commented yet.