wikisnip.py
import itertools
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup

# Use Google App Engine's urlfetch when available; fall back to urllib2.
try:
    from google.appengine.api import urlfetch
except ImportError:
    urlfetch = None


def wget(url):
    if urlfetch:
        return urlfetch.fetch(url).content
    else:
        req = urllib2.Request(url)
        req.add_header("User-Agent", "Mozilla/5.0 (Compatible)")
        return urllib2.urlopen(req).read()


def wikisnip(url):
    """Extract the lead section of a Wikipedia article, with absolute links."""
    html = wget(url)
    soup = BeautifulSoup(html)
    div = soup.find('div', {'id': 'bodyContent'})
    snip = BeautifulSoup('')
    for node in div.childGenerator():
        # Skip bare strings, tables/scripts, and navigation boilerplate nodes.
        if (isinstance(node, basestring) or
                node.name.lower() in ["table", "script"] or
                node.get('id') in ["siteSub", "contentSub", "jump-to-nav"] or
                node.get('class') in ['dablink', 'toclimit-2']):
            continue
        # Stop at the first section heading: only the lead section is kept.
        if node.name.lower() == "h2":
            break
        snip.append(node)
    # Rewrite relative links so the snippet still works outside wikipedia.org.
    for a in snip.findAll('a'):
        if a.get('href'):
            a['href'] = urlparse.urljoin(url, a['href'])
    return snip
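

# Example usage -- a minimal sketch, not part of the original script.
# The article URL below is only an illustration; any Wikipedia article URL
# should work, since wikisnip() only relies on the standard bodyContent div.
if __name__ == "__main__":
    snippet = wikisnip("https://en.wikipedia.org/wiki/Python_(programming_language)")
    # wikisnip() returns a BeautifulSoup fragment; str() renders it as HTML.
    print str(snippet)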