public this repo is viewable by everyone
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
commit  102ef28e58945239a4b62f0d0a11bd4e6cfba8f8
tree    609deb7d55499311853eda561e6d10e502509a57
parent  b5a5358a5e39612d4fe7fe7f5543526761d622a8
flatscrap / FlatScrape.py
100755 109 lines (87 sloc) 3.014 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/python
import urllib
import sys
from BeautifulSoup import BeautifulSoup
from geopy import geocoders
from rdflib import ConjunctiveGraph
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib import plugin
from rdflib.syntax.serializers import TurtleSerializer
 
#print "Scrapping "+sys.argv[1]
 
 
class FlatScrape:
    """Scrape a single Gumtree flat advert and export it as RDF.

    Usage: construct with the advert URL, call ``scrape()`` to populate the
    instance attributes, then ``out(destination)`` to serialise them.

    Attributes set by ``scrape()``:
      location, title -- advert text with spaces and newlines removed
      description     -- first nested node of the ``div#desc`` element
      email, image    -- link strings, or '' when they could not be found
      place, lat, lng -- geocoder result, or '' when disabled/unavailable
    """

    # Prefix prepended to site-relative links found in the advert markup.
    GUMTREE_BASE = "http://www.gumtree.com"

    # Key handed to geocoders.Google().
    # NOTE(review): a credential committed to source; kept here (as an
    # overridable class attribute) so existing behaviour is unchanged, but it
    # should come from configuration.
    GOOGLE_API_KEY = 'ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA'

    def __init__(self, url):
        """Store the advert URL; nothing is fetched until ``scrape()``."""
        self.url = url

    def __clean(self, atom):
        """Return ``atom`` with every space and newline character removed."""
        # NOTE(review): this drops ALL spaces, including those between words
        # (e.g. in titles), not only leading/trailing ones. Preserved as-is;
        # confirm whether a plain strip was intended.
        return atom.replace(' ', '').replace('\n', '')

    def scrape(self, geolocation=True, geostring='London UK'):
        """Download the advert page and extract its fields onto ``self``.

        geolocation -- when true, geocode the scraped location.
        geostring   -- text appended to the location before geocoding, to
                       disambiguate the search.

        Email, image and geocoding are best-effort: any failure leaves the
        corresponding attributes as ''. Errors while fetching the page, and
        lookup errors (e.g. IndexError) when the location, title or
        description markup is missing, propagate to the caller.
        """
        page = urllib.urlopen(self.url)
        try:
            html = page.read()
        finally:
            # Release the connection even if the read fails.
            page.close()
        soup = BeautifulSoup(html)

        # Mandatory fields: these lookups are deliberately not guarded.
        self.location = self.__clean(soup('span', 'location')[0].contents[1])
        self.title = self.__clean(soup('div', id="title")[0].contents[0].contents[0])
        self.description = soup('div', id="desc")[0].contents[0].contents[0].contents[0]

        # Optional fields. "except Exception" keeps the best-effort behaviour
        # without also swallowing KeyboardInterrupt / SystemExit.
        try:
            link = soup('span', 'email')[0].contents[2].attrs[0][1]
            if link.startswith('/cgi-bin'):
                # Site-relative reply link: make it absolute.
                self.email = self.GUMTREE_BASE + link
            else:
                self.email = link
        except Exception:
            self.email = ''

        try:
            self.image = self.GUMTREE_BASE + soup('div', id="images")[0].contents[1].attrs[0][1]
        except Exception:
            self.image = ''

        # TODO: telephone extraction (div#replyto) is not implemented.

        # Geocoding: default to "unknown", overwrite only on success.
        self.place = self.lat = self.lng = ''
        if geolocation:
            try:
                geocoder = geocoders.Google(self.GOOGLE_API_KEY)
                self.place, (self.lat, self.lng) = geocoder.geocode(self.location + " " + geostring)
            except Exception:
                # The unpacking may have assigned some attributes before
                # failing; reset all three so they stay consistent.
                self.place = self.lat = self.lng = ''

    def out(self, file):
        """Serialise the scraped advert as RDF/XML.

        file -- destination passed straight to ``graph.serialize`` (for
                example a file path). The name shadows a builtin but is kept
                so keyword callers continue to work.

        Must be called after ``scrape()``, which sets the attributes read
        here. Values that scrape() left as '' are omitted from the graph
        rather than emitted as empty URIs/literals.
        """
        RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        GT = Namespace("http://purl.org/ontology/flat/")
        FOAF = Namespace("http://xmlns.com/foaf/0.1/")
        DC = Namespace("http://purl.org/dc/elements/1.1/")
        WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
        graph = ConjunctiveGraph()

        flat = URIRef("#flat")
        place_node = BNode()

        graph.add((flat, RDF.type, GT['Flat']))
        graph.add((flat, DC['title'], Literal(self.title)))
        graph.add((flat, DC['description'], Literal(self.description)))
        graph.add((flat, FOAF['based_near'], place_node))
        graph.add((place_node, DC['title'], Literal(self.location)))

        # Compare against '' explicitly: a latitude/longitude of 0.0 is a
        # legitimate value and must not be dropped by a truthiness test.
        geo_values = (
            (RDFS.label, self.place),
            (WGS['lat'], self.lat),
            (WGS['long'], self.lng),
        )
        for predicate, value in geo_values:
            if value != '':
                graph.add((place_node, predicate, Literal(value)))

        # URIRef('') would be a reference to the document itself, which is
        # wrong for a missing mailbox or picture.
        if self.email != '':
            graph.add((flat, FOAF['mbox'], URIRef(self.email)))
        if self.image != '':
            graph.add((flat, FOAF['depiction'], URIRef(self.image)))

        # With a destination the serialisation is written there, so the
        # original "print" of the call's return value was spurious output.
        graph.serialize(destination=file, format='xml')
 
 
 
 
# Main (example command-line usage: <advert URL> <output file>)
#if len(sys.argv)==3:
#  fs = FlatScrape(sys.argv[1])
#  fs.scrape()
#  fs.out(sys.argv[2])