forked from snarron/doctorbook
-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_wiki_links.py
117 lines (90 loc) · 3.54 KB
/
get_wiki_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
from urllib2 import urlopen
import re
import json
from nltk import metrics
from bs4 import BeautifulSoup, Tag
class WikiUrlFetch():
    """Look up a term on DBpedia's keyword-search API and resolve the
    matching Wikipedia URLs.

    Constructing an instance performs the lookup immediately:
      self.cleaned_term -- the input term lowercased and stripped to [a-z0-9 ]
      self.results      -- list of result dicts, each with 'term', 'url',
                           'match' ('exact' | 'good-partial' | 'partial' |
                           'none') and, when resolvable, 'wiki_url'
    """

    def __init__(self, term):
        self.cleaned_term = self.clean_term(term)
        self.results = self.get_wiki_url(self.cleaned_term)

    def clean_term(self, term):
        """Lowercase *term* and drop every character that is not a letter,
        digit, or space."""
        return re.sub(r"[^A-Za-z0-9 ]", "", term.lower())

    def check_dbpedia(self, term):
        """Query the DBpedia KeywordSearch endpoint for *term* and return
        the ranked results (see rank_dbpedia_results).

        Returns "" when the HTTP request fails, which callers iterate as
        an empty result set.
        """
        api = 'http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=8&QueryString='
        #api = 'http://lookup.dbpedia.org/api/search.asmx/PrefixSearch?MaxHits=10&QueryString='
        try:
            # clean_term leaves only [a-z0-9 ], so encoding spaces is the
            # only URL-escaping the query string needs
            response = urlopen(api + term.replace(' ', '%20'))
        except Exception:  # network/HTTP failure: treat as "no results"
            return ""
        soup = BeautifulSoup(response.read())
        results = []
        # Guard against a <uri> element appearing before its <label>
        current_label = ''
        for result in soup.findAll('result'):
            for child in result.children:
                if isinstance(child, Tag):
                    if child.name == 'label':
                        current_label = child.string.lower()
                    if child.name == 'uri':
                        results.append({'term': current_label.encode('utf-8'),
                                        'url': child.string.encode('utf-8')})
        return self.rank_dbpedia_results(results, term)

    def normalize(self, string):
        """Return *string* with its space-separated words sorted, so word
        order does not affect edit distance."""
        strings = string.split(" ")
        strings.sort()
        return " ".join(strings)

    def rank_dbpedia_results(self, results, term):
        """Rank *results* by edit distance between normalized labels and
        the normalized query *term*.

        logic:
          distance 0   -> single 'exact' match
          distance 1-3 -> all matches within distance 3, 'good-partial'
          distance > 3 -> first two results, 'partial' (results unsorted)
          no results   -> one sentinel entry with match 'none'
        """
        matches = []
        for result in results:
            matches.append([metrics.edit_distance(self.normalize(result['term']),
                                                  self.normalize(term)), result])
        # Sort on the distance only: comparing the dict payloads on ties
        # is meaningless (and an error on Python 3)
        matches.sort(key=lambda m: m[0])
        if len(matches) == 0:
            return [{'match': 'none', 'term': term.encode('utf-8')}]
        elif matches[0][0] == 0:
            new_results = [matches[0][1]]
            new_results[0]['match'] = 'exact'
            return new_results
        elif matches[0][0] <= 3:
            new_results = []
            for match in matches:
                if match[0] <= 3:
                    result = match[1]
                    result['match'] = 'good-partial'
                    new_results.append(result)
            return new_results
        else:
            new_results = []
            for result in results[0:2]:
                result['match'] = 'partial'
                new_results.append(result)
            return new_results

    def wiki_url(self, url):
        """Resolve a DBpedia resource *url* to the Wikipedia page URL it
        describes, via the entity's foaf:primaryTopic triple.

        Returns None when the entity page cannot be fetched or has no
        primaryTopic entry.
        """
        # +1 so the leading '/' is excluded from the entity name; without
        # it the request URL contained a 'data//Term.json' double slash
        term = url[url.rfind('/') + 1:]
        entity_page = 'http://dbpedia.org/data/{}.json'.format(term)
        try:
            response = urlopen(entity_page)
        except Exception:  # unreachable/missing entity: no wiki URL
            return
        data = json.loads(response.read())
        for key, value in data.items():
            if 'http://xmlns.com/foaf/0.1/primaryTopic' in value:
                return key.encode('utf-8')

    def get_wiki_url(self, term):
        """Look *term* up on DBpedia and attach a 'wiki_url' to every
        result that actually matched something."""
        results = self.check_dbpedia(term)
        for result in results:
            if result['match'] != 'none':
                wiki = self.wiki_url(result['url'])
                result['wiki_url'] = wiki
        return results
if __name__ == '__main__':
    # Quick manual smoke test: run the lookup for a couple of sample terms.
    for sample in ('Nunberg, Geoff', 'Penguins'):
        w = WikiUrlFetch(sample)