public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
Search Repo:
Finally working with Exhibit
works in single or batch mode
moustaki (author)
Sat Mar 08 11:30:53 -0800 2008
commit  46b9b83c496cd25fd4c1a625752a5f388616ebc2
tree    2f804796330901a3399ce0e08029d52271f3f194
parent  da3177547e90e4f55cbb65018050a4a51d079ba7
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
0
@@ -1 +1,45 @@
0
+#!/usr/bin/python
0
+
0
+import sys
0
+import feedparser
0
+import simplejson
0
+from FlatScrap import *
0
+
0
class BatchScrap:
    """Scrape the ads linked from an RSS feed and export them as Exhibit JSON.

    Each feed entry's first link is handed to ``FlatScrap``; the scraped
    objects are kept in ``self.fs`` and can then be serialized with
    ``outJs``.
    """

    def __init__(self, rss, limit=10):
        """Remember the feed location and the maximum number of ads to scrape.

        rss   -- URL (or anything ``feedparser.parse`` accepts) of the feed.
        limit -- upper bound on the number of feed entries processed.
        """
        self.rss = rss
        self.limit = limit
        # Initialised here so that outJs() on an instance that has not been
        # scraped yet writes an empty item list instead of raising
        # AttributeError.
        self.title = None
        self.urls = []
        self.fs = []

    def scrap(self):
        """Parse the feed and scrape at most ``self.limit`` of its entries.

        Fills ``self.title`` (feed title), ``self.urls`` (ad URLs, in feed
        order) and ``self.fs`` (one scraped ``FlatScrap`` per URL). There is
        no per-entry error handling: an exception raised while scraping one
        ad propagates and stops the batch.
        """
        parsed = feedparser.parse(self.rss)
        self.title = parsed.feed.title
        entries = parsed['entries']
        self.urls = []
        self.fs = []
        count = max(min(len(entries), self.limit), 0)
        for entry in entries[:count]:
            url = entry.links[0].href
            self.urls.append(url)
            flat = FlatScrap(url)
            flat.scrap()
            self.fs.append(flat)

    def _items(self):
        """Build the list of Exhibit item dicts, one per scraped ad."""
        items = []
        for flat in self.fs:
            item = {
                'type': 'Flat',
                'label': flat.title,
                'description': flat.description,
                'email': flat.email,
            }
            if flat.place == '':
                # No geocoded place is available: fall back to the raw
                # location text and emit no coordinates for this ad.
                item['address'] = flat.location
            else:
                item['address'] = flat.place
                item['location'] = flat.location
                item['addressLatLng'] = str(flat.lat) + "," + str(flat.lng)
            item['imageURL'] = flat.image
            items.append(item)
        return items

    def outJs(self, file):
        """Write the scraped ads to the path ``file`` as ``{"items": [...]}``.

        The file is closed even when serialization fails.
        """
        data = {"items": self._items()}
        out = open(file, 'w')
        try:
            simplejson.dump(data, out)
        finally:
            out.close()
0
+
0
+
0
def main(argv):
    """Scrape up to 50 ads from the feed in argv[1] and write JSON to argv[2].

    Returns a process exit status: 0 on success, 2 when the two required
    arguments are missing (a usage line is printed to stderr in that case).
    """
    if len(argv) < 3:
        sys.stderr.write("usage: %s <gumtreerss> <jsonfile>\n" % argv[0])
        return 2
    bs = BatchScrap(argv[1], limit=50)
    bs.scrap()
    bs.outJs(argv[2])
    return 0


# Guarded so that importing this module does not start a scrape.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
...
22
23
24
25
 
26
27
28
29
...
33
34
35
36
37
38
39
40
41
 
 
 
 
 
 
 
 
42
43
44
45
...
46
47
48
49
50
 
 
 
 
 
 
 
 
 
 
 
 
 
51
52
53
54
55
...
90
91
92
93
94
95
96
 
 
 
 
...
22
23
24
 
25
26
27
28
29
...
33
34
35
 
 
 
 
 
36
37
38
39
40
41
42
43
44
45
46
47
48
...
49
50
51
 
 
52
53
54
55
56
57
58
59
60
61
62
63
64
65
 
66
67
68
...
103
104
105
 
 
 
 
106
107
108
109
0
@@ -22,7 +22,7 @@
0
                 return t2
0
 
0
 
0
- def scrap(self) :
0
+ def scrap(self,geolocation=True, geostring='London UK') :
0
     f = urllib.urlopen(self.url)
0
     html = f.read()
0
     f.close()
0
0
@@ -33,12 +33,15 @@
0
     self.location = self.__clean(soup('span','location')[0].contents[1])
0
     self.title = self.__clean(soup('div',id="title")[0].contents[0].contents[0])
0
     self.description = soup('div',id="desc")[0].contents[0].contents[0].contents[0]
0
- email1 = soup('span','email')[0].contents[2].attrs[0][1]
0
- if email1.startswith('/cgi-bin'):
0
- self.email = "http://www.gumtree.com"+email1
0
- else :
0
- self.email = email1
0
     try:
0
+ email1 = soup('span','email')[0].contents[2].attrs[0][1]
0
+ if email1.startswith('/cgi-bin'):
0
+ self.email = "http://www.gumtree.com"+email1
0
+ else :
0
+ self.email = email1
0
+ except:
0
+ self.email = ''
0
+ try:
0
       self.image = "http://www.gumtree.com"+soup('div',id="images")[0].contents[1].attrs[0][1]
0
     except:
0
       self.image = ''
0
0
@@ -46,10 +49,20 @@
0
     #tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])
0
 
0
     # Geocoding
0
- g = geocoders.Google('ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA')
0
- self.place, (self.lat,self.lng) = g.geocode(self.location)
0
+ if geolocation==True:
0
+ try:
0
+ search = self.location + " " + geostring
0
+ g = geocoders.Google('ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA')
0
+ self.place, (self.lat,self.lng) = g.geocode(search)
0
+ except:
0
+ self.place= ''
0
+ self.lat=''
0
+ self.lng=''
0
+ else:
0
+ self.place= ''
0
+ self.lat=''
0
+ self.lng=''
0
 
0
-
0
     #print "Location: " + location
0
     #print "Title: " + title
0
     #print "Description: " + description
0
@@ -90,8 +103,8 @@
0
 
0
 
0
 # Main
0
-
0
-fs = FlatScrap(sys.argv[1])
0
-fs.scrap()
0
-fs.out(sys.argv[2])
0
+#if len(sys.argv)==3:
0
+# fs = FlatScrap(sys.argv[1])
0
+# fs.scrap()
0
+# fs.out(sys.argv[2])
0
...
1
 
 
 
 
 
 
 
 
...
 
1
2
3
4
5
6
7
8
0
@@ -1,2 +1,9 @@
0
-./flatscrap.py <gumtreeurl> <outputfile>
0
+./SingleScrap.py <gumtreeurl> <outputfile>
0
+e.g.
0
+./SingleScrap.py http://www.gumtree.com/london/93/21153193.html test.rdf
0
+
0
+./BatchScrap <gumtreerss> <jsonfile>
0
+e.g.
0
+./BatchScrap http://www.gumtree.com/london/3.xml flatscrap.js
0
+(Exhibit JSON working with flatscrap.html)
...
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
0
@@ -1 +1,8 @@
0
+#!/usr/bin/python
0
+import sys
0
+from FlatScrap import *
0
+
0
def main(argv):
    """Scrape the single ad at argv[1] and write the result to argv[2].

    Returns a process exit status: 0 on success, 2 when the two required
    arguments are missing (a usage line is printed to stderr in that case).
    """
    if len(argv) < 3:
        sys.stderr.write("usage: %s <gumtreeurl> <outputfile>\n" % argv[0])
        return 2
    fs = FlatScrap(argv[1])
    fs.scrap()
    fs.out(argv[2])
    return 0


# Guarded so that importing this module does not start a scrape.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
...
6
7
8
9
 
10
11
12
13
14
 
 
 
 
 
 
15
16
17
...
6
7
8
 
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
0
@@ -6,12 +6,18 @@
0
   
0
     <script src="http://static.simile.mit.edu/exhibit/api-2.0/exhibit-api.js"
0
             type="text/javascript"></script>
0
-
0
+ <script src="http://static.simile.mit.edu/exhibit/extensions-2.0/map/map-extension.js?gmapkey="></script>
0
     <style>
0
     </style>
0
  </head>
0
  <body>
0
     <h1>Gum Tree adds</h1>
0
+ <div ex:role="view"
0
+ ex:viewClass="Map"
0
+ ex:latlng=".addressLatLng"
0
+ ex:colorKey=".type">
0
+ </div>
0
+
0
     <table width="100%">
0
         <tr valign="top">
0
             <td ex:role="viewPanel">

Comments

    No one has commented yet.