public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
Search Repo:
moustaki (author)
Mon Mar 10 08:29:17 -0700 2008
commit  b5a5358a5e39612d4fe7fe7f5543526761d622a8
tree    e3d2a0909652752e7fcccd50196447ae831fe01a
parent  46b9b83c496cd25fd4c1a625752a5f388616ebc2
flatscrap / FlatScrap.py
100755 109 lines (87 sloc) 3.012 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/python
import urllib
import sys
from BeautifulSoup import BeautifulSoup
from geopy import geocoders
from rdflib import ConjunctiveGraph
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib import plugin
from rdflib.syntax.serializers import TurtleSerializer
 
#print "Scrapping "+sys.argv[1]
 
 
class FlatScrap(object):
  """Scrape a single gumtree ad page and write what was found as RDF.

  Intended call order is ``FlatScrap(url)``, then ``scrap()``, then
  ``out(destination)``.  ``scrap()`` sets the attributes ``location``,
  ``title``, ``description``, ``email``, ``image``, ``place``, ``lat``
  and ``lng``; ``out()`` reads all of them, so calling ``out()`` before
  ``scrap()`` raises AttributeError.
  """

  # Prefix put in front of site-relative links found in the page.
  BASE_URL = "http://www.gumtree.com"

  # Key passed to geopy's Google geocoder.
  GOOGLE_API_KEY = 'ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA'

  def __init__(self, url):
    """Store the address of the ad page; nothing is downloaded yet."""
    self.url = url

  def __clean(self, atom):
    """Return ``atom`` with every space and newline character removed."""
    # NOTE(review): this drops *all* spaces, including the ones between
    # words, not just surrounding whitespace - confirm that is what is
    # wanted for titles and locations.
    return atom.replace(' ', '').replace('\n', '')

  def scrap(self, geolocation=True, geostring='London UK'):
    """Download the ad page and copy its fields onto the instance.

    geolocation -- when true, geocode the location text of the ad.
    geostring   -- text appended to the location text (after a single
                   space) to build the geocoder query.

    ``location``, ``title`` and ``description`` are mandatory: if the
    page does not have the expected structure the lookup error
    propagates to the caller.  ``email`` and ``image`` are best effort
    and are set to '' when they cannot be extracted.  ``place``, ``lat``
    and ``lng`` are set to '' when geocoding is switched off or fails.
    """
    page = urllib.urlopen(self.url)
    try:
      html = page.read()
    finally:
      # Release the connection even when the read fails.
      page.close()
    soup = BeautifulSoup(html)

    # Mandatory fields; errors are deliberately not caught here.
    self.location = self.__clean(soup('span', 'location')[0].contents[1])
    self.title = self.__clean(soup('div', id="title")[0].contents[0].contents[0])
    self.description = soup('div', id="desc")[0].contents[0].contents[0].contents[0]

    # Contact link: value of the first attribute of the third child of
    # the first 'email' span (presumably an href - verify against a
    # live page).  Site-relative links are made absolute.
    try:
      link = soup('span', 'email')[0].contents[2].attrs[0][1]
      if link.startswith('/cgi-bin'):
        self.email = self.BASE_URL + link
      else:
        self.email = link
    except Exception:
      self.email = ''

    # Picture: value of the first attribute of the second child of the
    # 'images' div, always prefixed with the site address.
    try:
      self.image = self.BASE_URL + soup('div', id="images")[0].contents[1].attrs[0][1]
    except Exception:
      self.image = ''

    # Telephone extraction is currently disabled:
    #tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])

    # Geocoding.  Work on locals and publish once at the end, so a
    # failure part-way through can never leave the three attributes in
    # a half-updated state.
    place, lat, lng = '', '', ''
    if geolocation:
      try:
        search = self.location + " " + geostring
        geocoder = geocoders.Google(self.GOOGLE_API_KEY)
        place, (lat, lng) = geocoder.geocode(search)
      except Exception:
        # Best effort: any geocoding problem yields blank values.
        place, lat, lng = '', '', ''
    self.place = place
    self.lat = lat
    self.lng = lng

  def out(self, file):
    """Serialise the scraped ad as RDF/XML.

    file -- destination handed unchanged to ``graph.serialize``.

    Requires ``scrap()`` to have been called first, because every
    attribute it sets is read here.
    """
    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    GT = Namespace("http://purl.org/ontology/flat/")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    DC = Namespace("http://purl.org/dc/elements/1.1/")
    WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
    graph = ConjunctiveGraph()

    flat = URIRef("#flat")
    p = BNode()  # anonymous node describing where the flat is
    # NOTE(review): when scrap() could not find an email or image these
    # are empty URI references - confirm whether those triples should
    # be emitted at all.
    e = URIRef(self.email)
    i = URIRef(self.image)

    # The flat itself.
    graph.add((flat, RDF.type, GT['Flat']))
    graph.add((flat, FOAF['based_near'], p))
    graph.add((flat, FOAF['mbox'], e))
    graph.add((flat, FOAF['depiction'], i))
    graph.add((flat, DC['title'], Literal(self.title)))
    graph.add((flat, DC['description'], Literal(self.description)))

    # The place the flat is based near.
    graph.add((p, RDFS.label, Literal(self.place)))
    graph.add((p, DC['title'], Literal(self.location)))
    graph.add((p, WGS['lat'], Literal(self.lat)))
    graph.add((p, WGS['long'], Literal(self.lng)))

    # The return value of serialize() is echoed to stdout exactly as
    # before.  NOTE(review): with a destination supplied this may print
    # nothing useful - confirm whether the echo is wanted.
    print(graph.serialize(destination=file, format='xml'))
 
 
 
 
# Main
#if len(sys.argv)==3:
#  fs = FlatScrap(sys.argv[1])
#  fs.scrap()
#  fs.out(sys.argv[2])