openlibrary / openlibrary

One webpage for every book ever published!

This URL has Read+Write access

openlibrary / solr / logdaemon.py
100644 218 lines (176 sloc) 6.496 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import sys
sys.path.insert(0, "../infogami")
 
# Path of the log file that main() opens and parses.  Only one setting is
# live; the alternatives that used to be assigned here and then immediately
# overwritten are kept as comments so they can be swapped back in.
# logfile = '/1/dbg/import-logs/dbglog'
# logfile = '/1/pharos/db/authortest'
# logfile = '/1/pharos/db/good'
# logfile = '/1/pharos/db/crap'
# logfile = '/tmp/log.test'
logfile = '/1/pharos/db/pharos'

# Destination for the generated XML when main() is switched to its
# write-to-file branch instead of posting to solr.
# outfile = sys.stdout
outfile = open('solr1.xml', 'w')

# Append-only record of (oca_identifier, name, timestamp) tuples written
# by emit_doc().
oca_map = open('oca-map.log', 'a')

# tcp socket of solr server
solr = ('localhost', 8983)
 
from infogami.tdb.logger import parse, parse1, parse2a, parse2b
import infogami.tdb.tdb
import web
 
import re
import socket
from itertools import *
from itools import *
from cStringIO import StringIO
from operator import itemgetter
from time import time
 
# Positional accessors: fst(seq) returns seq[0], snd(seq) returns seq[1].
fst, snd = itemgetter(0), itemgetter(1)
 
def setup():
    """Store the 'pharos' postgres settings in web.config and call web.load().

    NOTE(review): the database name, user and password are hard-coded here.
    """
    db_settings = {
        "dbn": "postgres",
        "db": "pharos",
        "user": "pharos",
        "pw": "pharos",
    }
    web.config.db_parameters = db_settings
    web.load()
 
def logparse(log_fd):
    """Return the parsed item stream for the log read from ``log_fd``.

    The input is run through ``parse1`` (with ``infinite=True``) and that
    result is handed to ``parse2b``; both are imported from
    ``infogami.tdb.logger``.
    """
    first_pass = parse1(log_fd, infinite=True)
    return parse2b(first_pass)
 
def speed():
    p = logparse(logfile)
    t0=time()
    n = sum(1 for x in p)
    dt = time()-t0
    print '%d items, %.3f seconds, %.2f items/sec'% (n, dt, n/dt)
 
# Runs at module load time, before anything below: stores the database
# parameters in web.config and calls web.load().
setup()
 
# ================================================================
 
# from exclude import excluded_fields, multivalued_fields
import solr_fields
 
def main():
    import time as _time
 
    global t,k
    # out = open('solr.xml', 'w')
    t1 = t0 = time()
    print 'start time: ', _time.ctime(t0)
 
    log_fd = open(logfile)
    lastpos_fd = open('lastpos', 'r+', 0)
    lastpos = int(open('lastpos').readline())
    print 'seeking to %d'% lastpos
    log_fd.seek(lastpos)
 
    for i,t in enumerate(logparse(log_fd)):
# print (t,t.type,type(t.type),t.type.name, type(t.type.name))
        if time()-t1 > 5 or i % 100 == 0:
            print (i, time()-t1, time()-t0)
            sys.stdout.flush()
            t1 = time()
 
        if t.type.name not in ('delete', 'edition'):
            continue
        if t.type.name == 'delete': action = 'delete'
        else: action = 'add'
 
        outbuf = StringIO()
        print >>outbuf, "<%s>"% action
        if emit_doc (outbuf, action, t) is None:
            continue
        print >>outbuf, "</%s>"% action
 
        if 1:
            xml = outbuf.getvalue()
            # print 'xml:(%s)'% xml
            solr_response = solr_submit(xml)
            assert '<result status="0">' in solr_response, solr_response
            # print 'solr response: ((%s))\n'% solr_response
 
        else:
            outfile.write(outbuf.getvalue())
            outfile.flush()
 
        lastpos_fd.seek(0)
        log_pos = log_fd.tell()
        lastpos_fd.write('%d\n'% log_pos)
        lastpos_fd.flush()
 
    lastpos_fd.close()
 
def sort_canon(s):
    """Return ``s`` normalized for use as a sort key.

    Leading and trailing whitespace is removed, the text is lower-cased and
    every remaining run of whitespace is collapsed to a single space.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence that
    # only works because Python passes unknown escapes through unchanged.
    return re.sub(r'\s+', ' ', s.strip().lower())
 
# Dict of fields for which a corresponding sortable field is also
# emitted. The field values will have to be transformed into a
# sorting key, e.g. by stripping punctuation and folding case, or by
# inserting leading zeros for numeric fields, but for now we just
# use the identity function. The dict entries are of the form:
# fieldname : (new field name, conversion function)
 
def identity(x):
    """Return ``x`` itself; a no-op conversion function."""
    return x
 
# Maps a source field name to (sortable field name, conversion function).
# Every entry below is currently commented out, so the dict is empty and
# the `k in sorted_field_dict` branch of emit_doc() never runs.
sorted_field_dict = {
# 'authors': ('creatorSorter', sort_canon),
# 'title': ('titleSorter', sort_canon),
    # for publish_date, also output 'date' which basic query
    # treats specially to handle ranges
# 'publish_date': ('date', identity),
    }
 
# Names of all items handed to emit_doc() so far.  It is only ever added to,
# so it grows for as long as the process runs.
ids_seen = set()

# Write one <doc> element describing log item `t` to `outbuf`.
#
#   outbuf -- file-like object the XML is printed to; must also provide
#             getvalue(), which supplies the return value.
#   action -- 'delete' emits only the identifier field; any other value
#             also emits the fields found in t.d.
#   t      -- log item; t.name and t.d are read here.  t.d is used both as a
#             container (`in`, iteration) and via attribute access.
#   loss   -- counter of dropped documents.  The default count() object is
#             created once, when the function is defined, so the numbering
#             carries over from one call to the next.
#
# Returns outbuf.getvalue() -- i.e. everything in the buffer, including
# whatever the caller wrote before this call -- or None when emitting a
# field raises infogami.tdb.tdb.NotFound.
#
# Raises AssertionError if t.name was seen before, if t.d contains a field
# named 'text' or 'identifier', or if a singleton field holds a list whose
# length is not 1.
#
# NOTE(review): on the None path the name stays in ids_seen and the partial
# output stays in outbuf.
# NOTE(review): for action == 'delete' the caller wraps this <doc> element in
# <delete>...</delete>; confirm that solr accepts that form of delete.
def emit_doc(outbuf, action, t, loss=count()):
    assert t.name not in ids_seen, t.name
    # 'text' and 'identifier' are field names generated below, so they must
    # not also occur as fields of the item itself.
    for forbidden in ('text', 'identifier'):
        assert forbidden not in t.d
    ids_seen.add(t.name)

    print >>outbuf, "<doc>"

    emit_field(outbuf, 'identifier', t.name)

    # Record the oca_identifier -> name pairing (for adds and deletes alike).
    if 'oca_identifier' in t.d:
        print >> oca_map, (t.d.oca_identifier, t.name, time())
        oca_map.flush()

    if action != 'delete':
        for k in t.d:
            v = getattr(t.d, k)

            try:
                emit_field(outbuf, k, v)
            except infogami.tdb.tdb.NotFound, e:
                # give up on the whole document and report it as dropped
                print ('dropping', loss.next(), time(), t.name)
                return None

            # Checked only after the field has already been written.
            if k in solr_fields.singleton_fields:
                # should do something better than crash the importer
                # on finding this error. but it shouldn't happen. @@
                assert type(v) != list or len(v) == 1, (t,k,v)

            # Unless excluded, the value is emitted a second time under the
            # field name 'text'.
            if k not in solr_fields.excluded_fields:
                emit_field(outbuf, 'text', v)
            # Also emit the converted value under its sortable field name.
            if k in sorted_field_dict:
                sfname, conversion = sorted_field_dict[k]
                global z # debug @@
                z = (t,k,v)
                if type(v) != list:
                    assert type(v) == str
                    v = [v]
                emit_field(outbuf, sfname, map(conversion,map(str,v)))

    # the literal's own "\n" plus print's newline leave a blank line after
    # the closing tag
    print >>outbuf, "</doc>\n"
    return outbuf.getvalue()
 
def emit_field(outbuf,
               field_name,
               field_val,
               non_strings = set()):
    from cgi import escape
    assert escape(field_name) == field_name
 
    if type(field_val) == list:
        for v in field_val:
            emit_field(outbuf, field_name, v)
    else:
        # some fields are numeric--may need to pad these with
        # leading zeros for sorting purposes, but don't bother for now. @@
        if type(field_val) != str:
            field_val = str(field_val)
            if field_name not in non_strings:
                # temporarily print out names of fields which
                # are found to (sometimes) actually contain a
                # non-string value. @@
                print 'non-string fieldname', field_name
                non_strings.add(field_name)
 
        print >>outbuf, ' <field name="%s">%s</field>'% (field_name,
                                                         escape(field_val))
 
def solr_submit(xml):
    """Submit an XML document to solr and return the raw HTTP response.

    ``xml`` is POSTed to /solr/update on the server named by the
    module-level ``solr`` (host, port) pair.  The return value is
    everything the server sent back -- status line, headers and body -- as
    one string.

    The request asks the server to close the connection after responding,
    and the response is read until end-of-stream: a single recv() may return
    only part of the response (for instance the headers without the body).
    Header lines end in CRLF as HTTP requires.
    """
    head = ''.join([
        'POST /solr/update HTTP/1.1\r\n',
        'Host: %s:%d\r\n'% solr,
        'Content-type:text/xml; charset=utf-8\r\n',
        'Content-Length: %d\r\n'% len(xml),
        'Connection: close\r\n',
        '\r\n',
    ])
    chunks = []
    sock = socket.socket()
    try:
        sock.connect(solr)
        sock.sendall(head)
        sock.sendall(xml)
        while True:
            chunk = sock.recv(10000)
            if not chunk:
                # empty read: the server has closed the connection
                break
            chunks.append(chunk)
    finally:
        sock.close()
    return ''.join(chunks)
 
# Starts the importer whenever this file is executed or imported; there is
# no `if __name__ == '__main__'` guard.
main()