Public: this repo is viewable by everyone
Description: Small RDF Python scraper for Gumtree ads (includes geocoding)
Clone URL: git://github.com/moustaki/flatscrap.git
Just renaming from scrap to scrape
moustaki (author)
2 months ago
commit  102ef28e58945239a4b62f0d0a11bd4e6cfba8f8
tree    609deb7d55499311853eda561e6d10e502509a57
parent  b5a5358a5e39612d4fe7fe7f5543526761d622a8
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
@@ -1,45 +0,0 @@
0
-#!/usr/bin/python
0
-
0
-import sys
0
-import feedparser
0
-import simplejson
0
-from FlatScrap import *
0
-
0
-class BatchScrap :
0
-  
0
-  def __init__(self,rss,limit=10):
0
-    self.rss = rss
0
-    self.limit = limit
0
-  
0
-  def scrap(self) :
0
-    d = feedparser.parse(self.rss)
0
-    self.title = d.feed.title
0
-    e = d['entries']
0
-    self.urls=[]
0
-    self.fs=[]
0
-    for k in range(0,min(len(e),self.limit)) :
0
-      url = e[k].links[0].href
0
-      self.urls.append(url)
0
-      f = FlatScrap(url)
0
-      f.scrap()
0
-      self.fs.append(f)
0
-  
0
-  def outJs(self,file) :
0
-    items = []
0
-    for k in range(0,len(self.fs)):
0
-      fs = self.fs[k]
0
-      latlong = str(fs.lat) + "," + str(fs.lng)
0
-      if fs.place=='':
0
-        items.append({'type':'Flat','label':fs.title,'description':fs.description,'email':fs.email,'address':fs.location,'imageURL':fs.image})
0
-      else:
0
-        items.append({'type':'Flat','label':fs.title,'description':fs.description,'email':fs.email,'address':fs.place,'location':fs.location,'addressLatLng':latlong,'imageURL':fs.image})
0
-    json = {"items":items}
0
-    f = open(file,'w')
0
-    simplejson.dump(json,f),
0
-    f.close()
0
-
0
-
0
-bs = BatchScrap(sys.argv[1],limit=50)
0
-bs.scrap()
0
-bs.outJs(sys.argv[2])
0
-
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
0
@@ -0,0 +1,45 @@
0
+#!/usr/bin/python
0
+
0
+import sys
0
+import feedparser
0
+import simplejson
0
+from FlatScrape import *
0
+
0
+class BatchScrape :
0
+  
0
+  def __init__(self,rss,limit=10):
0
+    self.rss = rss
0
+    self.limit = limit
0
+  
0
+  def scrape(self) :
0
+    d = feedparser.parse(self.rss)
0
+    self.title = d.feed.title
0
+    e = d['entries']
0
+    self.urls=[]
0
+    self.fs=[]
0
+    for k in range(0,min(len(e),self.limit)) :
0
+      url = e[k].links[0].href
0
+      self.urls.append(url)
0
+      f = FlatScrape(url)
0
+      f.scrape()
0
+      self.fs.append(f)
0
+  
0
+  def outJs(self,file) :
0
+    items = []
0
+    for k in range(0,len(self.fs)):
0
+      fs = self.fs[k]
0
+      latlong = str(fs.lat) + "," + str(fs.lng)
0
+      if fs.place=='':
0
+        items.append({'type':'Flat','label':fs.title,'description':fs.description,'email':fs.email,'address':fs.location,'imageURL':fs.image})
0
+      else:
0
+        items.append({'type':'Flat','label':fs.title,'description':fs.description,'email':fs.email,'address':fs.place,'location':fs.location,'addressLatLng':latlong,'imageURL':fs.image})
0
+    json = {"items":items}
0
+    f = open(file,'w')
0
+    simplejson.dump(json,f),
0
+    f.close()
0
+
0
+
0
+bs = BatchScrape(sys.argv[1],limit=50)
0
+bs.scrape()
0
+bs.outJs(sys.argv[2])
0
+
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
@@ -1,111 +0,0 @@
0
-#!/usr/bin/python
0
-import urllib
0
-import sys
0
-from BeautifulSoup import BeautifulSoup
0
-from geopy import geocoders
0
-from rdflib import ConjunctiveGraph
0
-from rdflib import BNode, Literal, Namespace, URIRef
0
-from rdflib import plugin
0
-from rdflib.syntax.serializers import TurtleSerializer
0
-
0
-#print "Scrapping "+sys.argv[1]
0
-
0
-
0
-class FlatScrap :
0
-
0
-  def __init__(self,url) :
0
-    self.url = url
0
-
0
- def __clean(self,atom):
0
- t1 = ''.join(atom.rsplit(' '))
0
- t2 = ''.join(t1.rsplit('\n'))
0
- return t2
0
-
0
-
0
-  def scrap(self,geolocation=True, geostring='London UK') :
0
-    f = urllib.urlopen(self.url)
0
-    html = f.read()
0
-    f.close()
0
-    soup = BeautifulSoup(html)
0
-
0
-    # Now, let's scrap!
0
-
0
-    self.location = self.__clean(soup('span','location')[0].contents[1])
0
-    self.title = self.__clean(soup('div',id="title")[0].contents[0].contents[0])
0
-    self.description = soup('div',id="desc")[0].contents[0].contents[0].contents[0]
0
-    try:
0
-      email1 = soup('span','email')[0].contents[2].attrs[0][1]
0
-      if email1.startswith('/cgi-bin'):
0
-        self.email = "http://www.gumtree.com"+email1
0
-      else :
0
-        self.email = email1
0
-    except:
0
-      self.email = ''
0
-    try:
0
-      self.image = "http://www.gumtree.com"+soup('div',id="images")[0].contents[1].attrs[0][1]
0
-    except:
0
-      self.image = ''
0
-
0
-    #tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])
0
-
0
-    # Geocoding
0
-    if geolocation==True:
0
-      try:  
0
-        search = self.location + " " + geostring
0
-        g = geocoders.Google('ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA')
0
-        self.place, (self.lat,self.lng) = g.geocode(search)
0
-      except:
0
-        self.place= ''
0
-        self.lat=''
0
-        self.lng=''
0
-    else:
0
-      self.place= ''
0
-      self.lat=''
0
-      self.lng=''
0
-
0
-    #print "Location: " + location
0
-    #print "Title: " + title
0
-    #print "Description: " + description
0
-    #print "Email: "+email
0
-    #print "Image: "+image
0
-
0
-  def out(self,file) :
0
-
0
-    # RDF output
0
-
0
-    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
0
-    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
0
-    GT = Namespace("http://purl.org/ontology/flat/")
0
-    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
0
-    DC = Namespace("http://purl.org/dc/elements/1.1/")
0
-    WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
0
-    graph = ConjunctiveGraph()
0
-    
0
-    flat = URIRef("#flat")
0
-    p = BNode()
0
-    e = URIRef(self.email)
0
-    i = URIRef(self.image)
0
-    
0
-    graph.add((flat,RDF.type,GT['Flat']))
0
-    graph.add((flat,FOAF['based_near'],p))
0
-    graph.add((p,RDFS.label,Literal(self.place)))
0
-    graph.add((p,DC['title'],Literal(self.location)))
0
-    graph.add((p,WGS['lat'],Literal(self.lat)))
0
-    graph.add((p,WGS['long'],Literal(self.lng)))
0
-    graph.add((flat,FOAF['mbox'],e))
0
-    graph.add((flat,FOAF['depiction'],i))
0
-    graph.add((flat,DC['title'],Literal(self.title)))
0
-    graph.add((flat,DC['description'],Literal(self.description)))
0
-
0
-    print graph.serialize(destination=file,format='xml')
0
-
0
-
0
-
0
-
0
-# Main
0
-#if len(sys.argv)==3:
0
-#  fs = FlatScrap(sys.argv[1])
0
-#  fs.scrap()
0
-#  fs.out(sys.argv[2])
0
-
0
-
...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
0
@@ -0,0 +1,111 @@
0
+#!/usr/bin/python
0
+import urllib
0
+import sys
0
+from BeautifulSoup import BeautifulSoup
0
+from geopy import geocoders
0
+from rdflib import ConjunctiveGraph
0
+from rdflib import BNode, Literal, Namespace, URIRef
0
+from rdflib import plugin
0
+from rdflib.syntax.serializers import TurtleSerializer
0
+
0
+#print "Scrapping "+sys.argv[1]
0
+
0
+
0
+class FlatScrape :
0
+
0
+  def __init__(self,url) :
0
+    self.url = url
0
+
0
+ def __clean(self,atom):
0
+ t1 = ''.join(atom.rsplit(' '))
0
+ t2 = ''.join(t1.rsplit('\n'))
0
+ return t2
0
+
0
+
0
+  def scrape(self,geolocation=True, geostring='London UK') :
0
+    f = urllib.urlopen(self.url)
0
+    html = f.read()
0
+    f.close()
0
+    soup = BeautifulSoup(html)
0
+
0
+    # Now, let's scrap!
0
+
0
+    self.location = self.__clean(soup('span','location')[0].contents[1])
0
+    self.title = self.__clean(soup('div',id="title")[0].contents[0].contents[0])
0
+    self.description = soup('div',id="desc")[0].contents[0].contents[0].contents[0]
0
+    try:
0
+      email1 = soup('span','email')[0].contents[2].attrs[0][1]
0
+      if email1.startswith('/cgi-bin'):
0
+        self.email = "http://www.gumtree.com"+email1
0
+      else :
0
+        self.email = email1
0
+    except:
0
+      self.email = ''
0
+    try:
0
+      self.image = "http://www.gumtree.com"+soup('div',id="images")[0].contents[1].attrs[0][1]
0
+    except:
0
+      self.image = ''
0
+
0
+    #tel = clean(soup('div',id="replyto")[0].contents[0].contents[3])
0
+
0
+    # Geocoding
0
+    if geolocation==True:
0
+      try:  
0
+        search = self.location + " " + geostring
0
+        g = geocoders.Google('ABQIAAAAu0AMQcAkvqfViJpEeSH_-hT2yXp_ZAY8_ufC3CFXhHIE1NvwkxQ0_Z6CDgX2Q08wvAh1aYjckybfeA')
0
+        self.place, (self.lat,self.lng) = g.geocode(search)
0
+      except:
0
+        self.place= ''
0
+        self.lat=''
0
+        self.lng=''
0
+    else:
0
+      self.place= ''
0
+      self.lat=''
0
+      self.lng=''
0
+
0
+    #print "Location: " + location
0
+    #print "Title: " + title
0
+    #print "Description: " + description
0
+    #print "Email: "+email
0
+    #print "Image: "+image
0
+
0
+  def out(self,file) :
0
+
0
+    # RDF output
0
+
0
+    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
0
+    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
0
+    GT = Namespace("http://purl.org/ontology/flat/")
0
+    FOAF = Namespace("http://xmlns.com/foaf/0.1/")
0
+    DC = Namespace("http://purl.org/dc/elements/1.1/")
0
+    WGS = Namespace("http://www.w3.org/2003/01/geo/wgs84_pos#")
0
+    graph = ConjunctiveGraph()
0
+    
0
+    flat = URIRef("#flat")
0
+    p = BNode()
0
+    e = URIRef(self.email)
0
+    i = URIRef(self.image)
0
+    
0
+    graph.add((flat,RDF.type,GT['Flat']))
0
+    graph.add((flat,FOAF['based_near'],p))
0
+    graph.add((p,RDFS.label,Literal(self.place)))
0
+    graph.add((p,DC['title'],Literal(self.location)))
0
+    graph.add((p,WGS['lat'],Literal(self.lat)))
0
+    graph.add((p,WGS['long'],Literal(self.lng)))
0
+    graph.add((flat,FOAF['mbox'],e))
0
+    graph.add((flat,FOAF['depiction'],i))
0
+    graph.add((flat,DC['title'],Literal(self.title)))
0
+    graph.add((flat,DC['description'],Literal(self.description)))
0
+
0
+    print graph.serialize(destination=file,format='xml')
0
+
0
+
0
+
0
+
0
+# Main
0
+#if len(sys.argv)==3:
0
+#  fs = FlatScrap(sys.argv[1])
0
+#  fs.scrap()
0
+#  fs.out(sys.argv[2])
0
+
0
+
...
1
 
2
3
 
4
5
 
6
7
 
8
9
...
 
1
2
 
3
4
 
5
6
 
7
8
9
0
@@ -1,9 +1,9 @@
0
-./SingleScrap.py <gumtreeurl> <outputfile>
0
+./SingleScrape.py <gumtreeurl> <outputfile>
0
 eg.
0
-./SingleScrap.py http://www.gumtree.com/london/93/21153193.html test.rdf
0
+./SingleScrape.py http://www.gumtree.com/london/93/21153193.html test.rdf
0
 
0
-./BatchScrap <gumtreerss> <jsonfile>
0
+./BatchScrape <gumtreerss> <jsonfile>
0
 eg.
0
-./BatchScrap http://www.gumtree.com/london/3.xml flatscrap.js
0
+./BatchScrape http://www.gumtree.com/london/3.xml flatscrap.js
0
 (Exhibit JSON working with flatscrap.html)
0
 
...
1
2
3
4
5
6
7
8
...
 
 
 
 
 
 
 
 
0
@@ -1,8 +0,0 @@
0
-#!/usr/bin/python
0
-import sys
0
-from FlatScrap import *
0
-
0
-fs = FlatScrap(sys.argv[1])
0
-fs.scrap()
0
-fs.out(sys.argv[2])
0
-
...
 
 
 
 
 
 
 
 
...
1
2
3
4
5
6
7
8
0
@@ -0,0 +1,8 @@
0
+#!/usr/bin/python
0
+import sys
0
+from FlatScrape import *
0
+
0
+fs = FlatScrape(sys.argv[1])
0
+fs.scrape()
0
+fs.out(sys.argv[2])
0
+

Comments

    No one has commented yet.