public
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
Search Repo:
moustaki (author)
Sat Mar 08 09:41:35 -0800 2008
commit  da3177547e90e4f55cbb65018050a4a51d079ba7
tree    a31ad0df5702afa6a9a877129c5ff5ae8ab7ed97
parent  7cb804bc6ed9f65e0ca7e07fa44048d45c544f14
flatscrap / FlatScrap.py
100755 99 lines (72 sloc) 2.712 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#!/usr/bin/python
import urllib
import sys
from BeautifulSoup import BeautifulSoup
from geopy import geocoders
from rdflib import ConjunctiveGraph
from rdflib import BNode, Literal, Namespace, URIRef
from rdflib import plugin
from rdflib.syntax.serializers import TurtleSerializer
 
#print "Scrapping "+sys.argv[1]
 
 
class FlatScrap(object):
    """Scrape one Gumtree flat advert and export it as RDF.

    Usage: construct with the advert URL, call scrap() to download and
    parse the page (and geocode its location), then call out() to
    serialise the collected data as RDF/XML.
    """

    # Site root, prepended to site-relative links found in the page.
    BASE_URL = "http://www.gumtree.com"

    # Key handed to the Google geocoder.
    # NOTE(review): this key is committed in source; consider loading it
    # from configuration or the environment instead.
    GOOGLE_API_KEY = 'ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA'

    def __init__(self, url):
        """Remember the advert URL; nothing is fetched until scrap()."""
        self.url = url

    def __clean(self, atom):
        """Return atom with every space and newline character removed.

        NOTE(review): spaces *between* words are removed as well (e.g.
        "Mile End" becomes "MileEnd"). This is the historical behaviour
        and is kept unchanged; confirm whether trimming/collapsing
        whitespace was what was actually intended.
        """
        return atom.replace(' ', '').replace('\n', '')

    def scrap(self):
        """Download the advert page and extract its fields.

        Sets self.location, self.title, self.description, self.email and
        self.image from the page markup, then geocodes self.location and
        sets self.place, self.lat and self.lng.

        Raises whatever the page lookups raise (e.g. IndexError) when an
        expected element other than the image is missing, and whatever
        urllib / the geocoder raise on failure.
        """
        page = urllib.urlopen(self.url)
        try:
            html = page.read()
        finally:
            # Release the connection even when the read fails.
            page.close()
        soup = BeautifulSoup(html)

        # The positional indexing below mirrors the page layout the
        # scraper was written against.
        self.location = self.__clean(soup('span', 'location')[0].contents[1])
        self.title = self.__clean(
            soup('div', id="title")[0].contents[0].contents[0])
        self.description = (
            soup('div', id="desc")[0].contents[0].contents[0].contents[0])

        # Value of the first attribute of the contact element (presumably
        # its href -- confirm against the page markup).
        email1 = soup('span', 'email')[0].contents[2].attrs[0][1]
        if email1.startswith('/cgi-bin'):
            # Site-relative reply link: make it absolute.
            self.email = self.BASE_URL + email1
        else:
            self.email = email1

        # The image is optional: fall back to '' when the lookup fails,
        # but do not swallow unrelated errors such as KeyboardInterrupt.
        try:
            self.image = self.BASE_URL + (
                soup('div', id="images")[0].contents[1].attrs[0][1])
        except (IndexError, AttributeError, KeyError, TypeError):
            self.image = ''

        # TODO: the telephone number (div id="replyto") is not extracted yet.

        # Geocoding
        g = geocoders.Google(self.GOOGLE_API_KEY)
        self.place, (self.lat, self.lng) = g.geocode(self.location)

    def out(self, file):
        """Serialise the scraped advert as RDF/XML to file.

        file is passed straight to the graph's serialize() as its
        destination. scrap() must have been called first, since the
        attributes it sets are read here.

        The foaf:depiction triple is only emitted when an image was
        found; an empty URI reference would otherwise be written.
        """
        # RDF output
        RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        GT = Namespace("http://purl.org/ontology/flat/")
        FOAF = Namespace("http://xmlns.com/foaf/0.1/")
        DC = Namespace("http://purl.org/dc/elements/1.1/")
        WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
        graph = ConjunctiveGraph()

        flat = URIRef("#flat")
        place = BNode()  # anonymous node describing where the flat is

        graph.add((flat, RDF.type, GT['Flat']))
        graph.add((flat, FOAF['based_near'], place))
        graph.add((place, RDFS.label, Literal(self.place)))
        graph.add((place, DC['title'], Literal(self.location)))
        graph.add((place, WGS['lat'], Literal(self.lat)))
        graph.add((place, WGS['long'], Literal(self.lng)))
        graph.add((flat, FOAF['mbox'], URIRef(self.email)))
        if self.image:
            graph.add((flat, FOAF['depiction'], URIRef(self.image)))
        graph.add((flat, DC['title'], Literal(self.title)))
        graph.add((flat, DC['description'], Literal(self.description)))

        # The RDF goes to the destination; the return value of serialize()
        # is not the document, so it is no longer printed.
        graph.serialize(destination=file, format='xml')
 
 
 
 
# Main

def main(argv):
    """Scrape the advert at argv[1] and write its RDF/XML to argv[2].

    argv is the full argument list, program name first. Returns a
    process exit status: 0 on success, 2 when the URL or the output
    destination is missing (a usage line is written to stderr).
    """
    if len(argv) < 3:
        prog = argv[0] if argv else "FlatScrap.py"
        sys.stderr.write("Usage: %s <gumtree-ad-url> <output-file>\n" % prog)
        return 2
    fs = FlatScrap(argv[1])
    fs.scrap()
    fs.out(argv[2])
    return 0


# Only scrape when executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main(sys.argv))