public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
moustaki (author)
Sat Mar 08 11:30:53 -0800 2008
commit  46b9b83c496cd25fd4c1a625752a5f388616ebc2
tree    2f804796330901a3399ce0e08029d52271f3f194
parent  da3177547e90e4f55cbb65018050a4a51d079ba7
flatscrap / FlatScrap.py
100755 112 lines (87 sloc) 3.012 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/python
import urllib
import sys
from BeautifulSoup import BeautifulSoup
from geopy import geocoders
from rdflib import ConjunctiveGraph
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib import plugin
from rdflib.syntax.serializers import TurtleSerializer
 
#print "Scrapping "+sys.argv[1]
 
 
class FlatScrap :
  """Scrape one Gumtree advert page and serialise what was found as RDF.

  Typical use:

    fs = FlatScrap(url)
    fs.scrap()
    fs.out(path)

  After scrap() the instance carries the attributes location, title,
  description, email, image, place, lat and lng.  They all start out as
  empty strings so that out() can be called on an instance even when
  scrap() has not (successfully) run.
  """

  # Prefix used to turn site-relative links found in the page into
  # absolute URLs.
  GUMTREE_BASE = "http://www.gumtree.com"

  # Default key handed to geopy's Google geocoder.
  # NOTE(review): a key committed to source control should be treated as
  # public; prefer passing api_key to scrap() from configuration.
  GOOGLE_API_KEY = 'ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA'

  def __init__(self,url) :
    """Remember the advert URL; nothing is fetched until scrap()."""
    self.url = url
    # Scraped fields (filled in by scrap()).
    self.location = ''
    self.title = ''
    self.description = ''
    self.email = ''
    self.image = ''
    # Geocoding results (filled in by scrap() when geocoding succeeds).
    self.place = ''
    self.lat = ''
    self.lng = ''

  def __clean(self,atom):
    """Return atom with every space and every newline character removed.

    Note that this removes ALL spaces, including those between words,
    not only leading/trailing whitespace.
    """
    return atom.replace(' ', '').replace('\n', '')

  def scrap(self,geolocation=True, geostring='London UK', api_key=None) :
    """Download the advert page and extract its fields into attributes.

    geolocation -- when true, geocode the scraped location.
    geostring   -- text appended to the scraped location before it is
                   sent to the geocoder.
    api_key     -- key for geopy's Google geocoder; defaults to
                   GOOGLE_API_KEY when None.

    location, title and description are mandatory: if the page does not
    have the expected structure the lookup error propagates to the
    caller.  email, image and the geocoding result are best effort and
    fall back to '' on any failure.
    """
    f = urllib.urlopen(self.url)
    try:
      html = f.read()
    finally:
      # Release the connection even when the read fails.
      f.close()
    soup = BeautifulSoup(html)

    # Now, let's scrap!

    self.location = self.__clean(soup('span','location')[0].contents[1])
    self.title = self.__clean(soup('div',id="title")[0].contents[0].contents[0])
    self.description = soup('div',id="desc")[0].contents[0].contents[0].contents[0]

    # Optional fields: a missing element must not abort the whole scrape,
    # but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
      email1 = soup('span','email')[0].contents[2].attrs[0][1]
      if email1.startswith('/cgi-bin'):
        # Site-relative reply link: make it absolute.
        self.email = self.GUMTREE_BASE+email1
      else :
        self.email = email1
    except Exception:
      self.email = ''
    try:
      self.image = self.GUMTREE_BASE+soup('div',id="images")[0].contents[1].attrs[0][1]
    except Exception:
      self.image = ''

    # Geocoding: start from the "unknown" state and only overwrite it once
    # the geocoder has returned a complete (place, (lat, lng)) answer, so a
    # partial failure never leaves the three attributes inconsistent.
    self.place = ''
    self.lat = ''
    self.lng = ''
    if geolocation:
      if api_key is None:
        api_key = self.GOOGLE_API_KEY
      try:
        search = self.location + " " + geostring
        g = geocoders.Google(api_key)
        place, (lat, lng) = g.geocode(search)
      except Exception:
        pass  # best effort: keep the empty defaults set above
      else:
        self.place = place
        self.lat = lat
        self.lng = lng

  def out(self,file) :
    """Serialise the scraped advert as RDF/XML to file.

    file is passed unchanged as the destination argument of
    graph.serialize().  The mbox and depiction triples are only emitted
    when an e-mail link / image was actually scraped: an empty URIRef
    would be an empty relative reference rather than a real mailbox or
    picture.
    """

    # RDF output

    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    GT = Namespace("http://purl.org/ontology/flat/")
    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
    DC = Namespace("http://purl.org/dc/elements/1.1/")
    WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
    graph = ConjunctiveGraph()

    flat = URIRef("#flat")
    p = BNode()  # anonymous node describing where the flat is

    graph.add((flat,RDF.type,GT['Flat']))
    graph.add((flat,FOAF['based_near'],p))
    graph.add((p,RDFS.label,Literal(self.place)))
    graph.add((p,DC['title'],Literal(self.location)))
    graph.add((p,WGS['lat'],Literal(self.lat)))
    graph.add((p,WGS['long'],Literal(self.lng)))
    if self.email:
      graph.add((flat,FOAF['mbox'],URIRef(self.email)))
    if self.image:
      graph.add((flat,FOAF['depiction'],URIRef(self.image)))
    graph.add((flat,DC['title'],Literal(self.title)))
    graph.add((flat,DC['description'],Literal(self.description)))

    # Kept from the original: echo whatever serialize() returns.  The
    # parenthesised single-argument form behaves the same on Python 2
    # and also parses on Python 3.
    print(graph.serialize(destination=file,format='xml'))
 
 
 
 
# Main
#if len(sys.argv)==3:
#  fs = FlatScrap(sys.argv[1])
#  fs.scrap()
#  fs.out(sys.argv[2])