/
microformats__yelp_hreview.py
62 lines (47 loc) · 1.75 KB
/
microformats__yelp_hreview.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: utf-8 -*-
import sys
import re
import urllib2
import json
import HTMLParser
from BeautifulSoup import BeautifulSoup
# Pass in a URL that contains hReview info such as
# http://www.yelp.com/biz/bangkok-golden-fort-washington-2
# The URL is the first (and only) command-line argument. Exit with a usage
# message on stderr (exit status 1) instead of an IndexError traceback when
# it is missing.
if len(sys.argv) < 2:
    sys.exit('Usage: %s <url-of-page-containing-hreviews>' % sys.argv[0])
url = sys.argv[1]
# Parse out some of the pertinent information for a Yelp review
# Unfortunately, the quality of hReview implementations varies
# widely so your mileage may vary. This code is *not* a spec
# parser by any stretch. See http://microformats.org/wiki/hreview
def parse_hreviews(url):
    """Fetch ``url`` and return the hReview entries found in the page.

    Every element whose class is ``hreview`` and that has more than one
    child node is turned into a dict with the keys ``reviewer``,
    ``dtreviewed``, ``rating`` and ``description``. A field whose markup
    is absent from a review is reported as ``None`` so that one incomplete
    review does not abort the whole parse.

    Args:
        url: Address of a page containing hReview markup.

    Returns:
        A list of dicts, one per review, in document order.

    Raises:
        urllib2.URLError: The page could not be fetched.
        HTMLParser.HTMLParseError: The page could not be parsed.
        A short message is printed before either is re-raised.
    """
    try:
        page = urllib2.urlopen(url)
    except urllib2.URLError:
        print('Failed to fetch ' + url)
        raise  # bare raise keeps the original traceback intact

    try:
        soup = BeautifulSoup(page)
    except HTMLParser.HTMLParseError:
        print('Failed to parse ' + url)
        raise
    finally:
        # Release the HTTP response whether or not parsing succeeded.
        page.close()

    all_hreviews = []
    for hreview in soup.findAll(True, 'hreview'):
        # Ignore matches that are empty or hold only a single child node.
        if not (hreview and len(hreview) > 1):
            continue

        # The rating value lives in the ``title`` attribute of a nested
        # ``value-title`` element; any link of that chain may be missing.
        rating_node = hreview.find(True, 'rating')
        value_node = None
        if rating_node is not None:
            value_node = rating_node.find(True, 'value-title')
        rating = None
        if value_node is not None:
            rating = value_node.get('title')

        all_hreviews.append({
            # As of 1 Jan 2010, Yelp does not implement reviewer as an
            # hCard, per the spec, so it is read as plain text.
            'reviewer': _text_by_class(hreview, 'reviewer'),
            'dtreviewed': _text_by_class(hreview, 'dtreviewed'),
            'rating': rating,
            'description': _text_by_class(hreview, 'description'),
        })
    return all_hreviews


def _text_by_class(node, css_class):
    """Return the text of the first descendant of ``node`` carrying
    ``css_class``, or ``None`` when there is no such descendant."""
    match = node.find(True, css_class)
    if match is None:
        return None
    return match.text
# Fetch the page named on the command line and pull out its reviews.
reviews = parse_hreviews(url)

# From here you could do something more interesting, such as plotting the
# reviews over time or mining the description text. For now the result is
# simply dumped as pretty-printed JSON.
print(json.dumps(reviews, indent=4))