-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsolr.py
123 lines (88 loc) · 3.05 KB
/
solr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# -*- coding: utf-8 -*-
import json
import urllib
import urllib2
"""
Helper class for querying an Apache Solr database.
"""
class Solr:
    """Helper for querying an Apache Solr server over its HTTP select endpoint.

    Usage:
        solr = Solr()
        solr.setSolrURL('http://host:8983/solr/core/select')
        docs = solr.getDocuments(solr.buildSolrQuery({'field': 'value'}))
    """

    def __init__(self):
        # Endpoint must be set via setSolrURL() before any query method is called.
        self.solr_url = None

    def setSolrURL(self, url):
        """Set the Solr select endpoint URL used by all request methods."""
        self.solr_url = url

    def _makeRequest(self, q, fq, start, rows):
        """POST one query to Solr and return the decoded JSON response dict.

        q     -- Solr query string
        fq    -- optional filter query (omitted from the request when falsy)
        start -- zero-based offset of the first row to return
        rows  -- number of rows to return (0 = counts only, no documents)
        """
        params = {
            'q': q,
            'start': start,
            'rows': rows,
            'wt': 'json',
        }
        if fq:
            params['fq'] = fq
        body = urllib.urlencode(params)
        res = urllib2.urlopen(urllib2.Request(self.solr_url, body))
        try:
            return json.loads(res.read())
        finally:
            # Fix: responses were previously never closed (leaked sockets).
            res.close()

    def _getDocumentsFromIDChunk(self, ids):
        """Fetch the documents whose `id` field matches any value in `ids`.

        Called once per chunk by getDocumentsFromIDs(); the chunk size is kept
        below Solr's maxBooleanClauses limit by the caller.
        """
        # Python 2: urlencode needs byte strings for non-ASCII ids.
        ids = [x.encode('utf8') for x in ids]
        ids_qclause = '"' + '" OR "'.join(ids) + '"'
        return self.getDocuments('id: (' + ids_qclause + ')')

    def buildSolrQuery(self, q_dict):
        """Build an AND-conjunction Solr query from {field: value} pairs."""
        clauses = ['%s:"%s"' % (field, value) for field, value in q_dict.items()]
        return ' AND '.join(clauses)

    def getResultsFound(self, q, fq=None):
        """Return the total number of results matching `q` (and optional `fq`)."""
        # rows=0: Solr still reports numFound without shipping any documents.
        res = self._makeRequest(q, fq, start=0, rows=0)
        return res['response']['numFound']

    def getDocuments(self, q, fq=None):
        """Fetch ALL documents matching `q`, paging through results 100 at a time.

        Prints progress to stdout; returns a list of document dicts.
        """
        res = self._makeRequest(q, fq, start=0, rows=0)
        numFound = res['response']['numFound']
        print("%d documents found." % numFound)
        interval = 100
        # Fetch one extra page in case the index grew since the count was taken.
        numFound += interval
        articles = []
        for start in range(0, numFound, interval):
            res = self._makeRequest(q, fq, start=start, rows=interval)
            articles.extend(res['response']['docs'])
            if start % 1000 == 0 and start > 0:
                print('%d documents collected.' % start)
        return articles

    def getDocumentsFromIDs(self, ids, maxclauses=1024):
        """Fetch documents by explicit id list.

        The id list is split into chunks of `maxclauses` so each request stays
        under Solr's maxBooleanClauses limit (default 1024).
        """
        id_chunks = [ids[i:i + maxclauses]
                     for i in range(0, len(ids), maxclauses)]
        docs = []
        # NB: making copies of this list will use up memory fast!
        for n, chunk in enumerate(id_chunks, 1):
            print("### Chunk %d of %d" % (n, len(id_chunks)))
            docs.extend(self._getDocumentsFromIDChunk(chunk))
        return docs