public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
Search Repo:
moustaki (author)
Mon Mar 10 08:29:17 -0700 2008
commit  b5a5358a5e39612d4fe7fe7f5543526761d622a8
tree    e3d2a0909652752e7fcccd50196447ae831fe01a
parent  46b9b83c496cd25fd4c1a625752a5f388616ebc2
flatscrap / BatchScrap.py
100755 44 lines (37 sloc) 1.114 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python
 
import sys
import feedparser
import simplejson
from FlatScrap import *
 
class BatchScrap :
  """Scrape the ads linked from an RSS feed and export them as JSON.

  The first link of each feed entry is handed to ``FlatScrap``; the scraped
  objects are kept on the instance so that ``outJs`` can serialise them.
  """

  def __init__(self,rss,limit=10):
    """Store the feed source and the maximum number of entries to scrape.

    rss   -- feed source, passed unchanged to ``feedparser.parse``
    limit -- upper bound on the number of feed entries ``scrap`` visits
    """
    self.rss = rss
    self.limit = limit
    # Start with empty results so outJs() is usable before scrap() has run.
    self.title = None
    self.urls = []
    self.fs = []

  def scrap(self) :
    """Parse the feed and scrape the first ``limit`` entries.

    Sets ``self.title`` to the feed title, ``self.urls`` to the first link
    of each visited entry and ``self.fs`` to the matching ``FlatScrap``
    objects (after their own ``scrap()`` has been called). Results of a
    previous call are discarded.
    """
    d = feedparser.parse(self.rss)
    self.title = d.feed.title
    entries = d['entries']
    self.urls = []
    self.fs = []
    # Clamp at 0: a negative limit must visit nothing, whereas a negative
    # slice bound would instead drop entries from the end of the list.
    count = max(0, min(len(entries), self.limit))
    for entry in entries[:count]:
      url = entry.links[0].href
      self.urls.append(url)
      f = FlatScrap(url)
      f.scrap()
      self.fs.append(f)

  def _item(self,fs) :
    """Build the JSON item dictionary for one scraped ``FlatScrap`` object."""
    item = {'type':'Flat','label':fs.title,'description':fs.description,'email':fs.email}
    if fs.place=='':
      # No place recorded: the raw location is the only address available.
      item['address'] = fs.location
    else:
      item['address'] = fs.place
      item['location'] = fs.location
      item['addressLatLng'] = str(fs.lat) + "," + str(fs.lng)
    item['imageURL'] = fs.image
    return item

  def outJs(self,file) :
    """Write the scraped ads to ``file`` as a JSON object ``{"items": [...]}``.

    file -- path of the output file; it is opened in ``'w'`` mode, so an
            existing file is overwritten.

    Every item carries type, label, description, email, address and
    imageURL. Items whose ``place`` is not the empty string additionally
    carry location and addressLatLng ("lat,lng").
    """
    payload = {"items":[self._item(fs) for fs in self.fs]}
    # The context manager closes the file even if serialisation fails.
    with open(file,'w') as out:
      simplejson.dump(payload,out)
 
 
def main(argv=None):
  """Command-line entry point: scrape a feed and write the result as JSON.

  argv -- argument list laid out like ``sys.argv`` (program name, feed
          source, output file); defaults to ``sys.argv`` when omitted.

  Returns 0 on success, or 2 when the two required arguments are missing.
  At most 50 feed entries are scraped.
  """
  if argv is None:
    argv = sys.argv
  # Validate before scraping so a missing output path is reported at once
  # instead of after the whole feed has been fetched.
  if len(argv) < 3:
    prog = argv[0] if argv else "BatchScrap.py"
    sys.stderr.write("usage: %s <rss-feed> <output-json-file>\n" % prog)
    return 2
  bs = BatchScrap(argv[1],limit=50)
  bs.scrap()
  bs.outJs(argv[2])
  return 0


# Run only when executed as a script, so importing the module has no
# side effects.
if __name__ == "__main__":
  sys.exit(main())