<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array"/>
  <modified type="array">
    <modified>
      <diff>@@ -14,9 +14,9 @@ class LRU(object):
     &gt;&gt;&gt; assert (4 in a) and (5 in a)
     &gt;&gt;&gt; assert (1 not in a) and (2 not in a) # these are aged out
     &quot;&quot;&quot;
-    def __init__(self, max_size = 1000):
+    def __init__(self, min_retention = 1000):
         self.cache = {}
-        self.max_size = max_size
+        self.min_retention = min_retention
         self.size = 0
         self.timestamp = count().next
 
@@ -34,12 +34,12 @@ class LRU(object):
         if a is not None:
             a[1] = value
         else:
-            if len(self.cache) &gt;= 2*self.max_size:
+            if len(self.cache) &gt;= 2*self.min_retention:
                 # replace the cache with a new one containing only the
-                # self.max_size newest entries.
+                # self.min_retention newest entries.
                 chopped = sorted(self.cache.iteritems(),
                                  key=(lambda (k,(ts,v)): ts),
-                                 reverse=True)[:self.max_size]
+                                 reverse=True)[:self.min_retention]
                 self.cache = dict(chopped)
             self.cache[key] = [self.timestamp(), value]
         </diff>
      <filename>solr/LRU.py</filename>
    </modified>
    <modified>
      <diff>@@ -1,56 +1,173 @@
+#!/usr/bin/python2.5
 from __future__ import with_statement
 import pdb                              # @@
-import re
+import re,cgi
 from collections import defaultdict
 from itertools import *
 from itools import *
+import os,stat,sys
+from xml.etree import cElementTree as ET
 
-interesting = set(('title',
-                   'title_prefix_len',
-                   'title_sort',
-                   'authors',
-                   'publish_date'))
+from facet_hash import facet_token
+token_counts = defaultdict(int)
 
-# file -&gt; [([(str,str)], {str:[int,str]})]
-# @slicer(20)
+def mprint(*args):
+    print args
+    sys.stdout.flush()
+
+class Record(object):
+    def __init__(self, buf, dd):
+        self.buf = buf                  # [(name, value)]
+        self.dd = dd                    # { name : [Int] }
+        
+global dxml
+
+# @slicer(5)
 def alldocs(fp):
     while True:
-        # this relies on being able to ignore some &lt;add&gt; and &lt;/add&gt; lines
-        for x in fp:
-            if x == '&lt;doc&gt;\n':
-                break
+        buf,dd = parse_doc_xml(fp)
+        if len(buf) == 0:
+            print ('alldocs done', buf,dd)
+            return
+        yield (buf, dd)
+
+
+global desc_flush
+desc_flush = 0
+
+def parse_doc_xml(fp):
+    global dxml, desc_flush
+    dxml = get_doc_xml(fp).decode('utf-8')
+
+    dd = defaultdict(list)
+    buf = []
+
+    from traceback import print_exc
+
+    try:
+        e = ET.XML(dxml)
+    except (Exception,SyntaxError), x:
+        print 'syntax error:'
+        for i,line in zip(count(1), dxml.split('\n')):
+            print '%-3d|%s'% (i,line)
+        raise ValueError, (x,dd)
+        
+    fname = ''
+    for i,t in enumerate(e.getiterator('field')):
+        if fname == 'description':
+            # this gets rid of the current catchall ('text') field
+            # if the PRECEDING field was a description.  We used to
+            # index descriptions as part of the catchall text and so
+            # it's included in the input xml, but it's messing up the
+            # search results so we get rid of it here.        
+            assert t.get('name') == 'text', t
+            assert t.text == ftext, (i, e, fname, t.text, ftext)
+            ftext = ''
+            desc_flush += 1
         else:
-            # we never found the beginning of a document, so we're done
-            return # sentinel
-
-        dd = defaultdict(list)
-        buf = []
-        pat = re.compile(r' *\&lt;field name=&quot;(.*?)&quot;\&gt;(.*?)\&lt;/field\&gt;')
-        for i,x in enumerate(fp):
-            g = pat.match(x)
-            if not g:
-                break
-            fname, fval = g.groups()
-            buf.append((fname, fval))
-            if True or fname in interesting:
+            ftext = t.text if t.text is not None else ''
+
+        fname = t.get('name')
+        dd[fname].append((i,ftext))
+        buf.append((fname,ftext))
+
+    return (buf, dd)
+    
+def get_doc_xml(fp):
+    for x in fp:
+        if x.strip() == '&lt;doc&gt;': break
+    else:
+        raise StopIteration
+    a = [x]
+    for x in fp:
+        a.append(x)
+        if x.strip() == '&lt;/doc&gt;': break
+    return ''.join(a)
+
+# file -&gt; [([(str,str)], {str:[int,str]})]
+# @slicer(20)
+def alldocs_old(fp):
+    raise AssertionError, &quot;this function is obsolete!!!!&quot;
+
+    nmissing = 0                        # @@
+    try:
+        while True:
+            # this relies on being able to ignore some &lt;add&gt; and &lt;/add&gt; lines
+            for x in fp:
+                if x == '&lt;doc&gt;\n':
+                    break
+            else:
+                # we never found the beginning of a document, so we're done
+                return # sentinel
+
+            dd = defaultdict(list)
+            buf = []
+            pat = re.compile(r' *\&lt;field name=&quot;(.*?)&quot;\&gt;(.*?)\&lt;/field\&gt;')
+            for i,x in enumerate(fp):
+                g = pat.match(x)
+                if not g:
+                    break
+                fname, fval = g.groups()
+
+                buf.append((fname, fval))
                 dd[fname].append((i,fval))
-        yield buf,dd
 
-testfile = 'xml/solr1.xml'
+            if 'title' not in dd:
+                # need to fix this!!!  Sometimes we don't read a
+                # complete record because an xml field is spread across
+                # several lines.  For now, we just ignore the docs
+                # that lose their titles this way. :-(((  @@
+                nmissing += 1
+                if nmissing % 500 == 0:
+                    mprint ('missing title', nmissing, dd)
+            else:
+                yield buf,dd
+    finally:
+        mprint ('missed titles', nmissing)
+
 testfile = 'xml/s2.xml'
+testfile = 'xml/solr1.xml'
+out_dir = 'barf9'
 
-import os,stat,sys
+testfile = '/x-home/phr/my-ol/pharos/pharos/access/access0000.xml'
+testfile = 'access_in/b0.xml'
+out_dir = 'access_out'
+
+params = {'out_dir': out_dir, 'in_filename': testfile }
+def update_params():
+    for x in sys.argv:
+        v = x.split('=')
+        if len(v) == 2:
+            a,b = v
+            assert re.match('^[a-z_]+$', a)
+            params[a] = b
+
+update_params()
 
 bunch_size = 1000
 
+import gzip
+from contextlib import closing
+def zopen(file, *mode):
+    if file.endswith('.gz'):
+        return closing(gzip.open(file,*mode))
+    return open(file,*mode)
+
+def main2():
+    try:
+        main1()
+    except AssertionError, m:
+        print 'assertion error (%s)'%m
+        pdb.set_trace()
+
 def main():
     for i,b in enumerate(bunches()):
-        with open('barf/%05d.xml'% i, 'w') as f:
+        with zopen('%s/%05d.xml.gz'% (params['out_dir'], i), 'w') as f:
             f.write('&lt;add&gt;\n')
             for j,x in b:
                 map_ (f.write, dump(x))
             f.write('&lt;/add&gt;\n')
-
+    mprint ('date fields seen', date_fields_seen)
 
 def bunches():
     for i,p in groupby(enumerate(process()),
@@ -62,91 +179,184 @@ def process():
     from time import time,ctime
     t1 = t0 = time()
 
-    print ('start', ctime())
-    tf = open(testfile)
-    tsize = os.stat(testfile)[stat.ST_SIZE]
-    print ('file size', tsize)
+    filename = params['in_filename']
+    mprint ('start', filename, ctime())
+    tf = open(filename)
+    tsize = os.stat(filename)[stat.ST_SIZE]
+    mprint ('file size', tsize)
+
+    # pdb.set_trace()
 
     for i,(buf,dd) in enumerate(alldocs(tf)):
-        if i % 1000 == 0:
+        if i % 2500 == 0:
             t2 = time()
             pos = tf.tell()
-            print ('nrec', i, t2-t1, t2-t0, pos, float(pos)/tsize)
-            # print ('buf,dd', buf, dd)
+            mprint ('nrec', i, t2-t1, t2-t0, pos, float(pos)/tsize)
+            # mprint ('buf,dd', buf, dd)
             t1=t2
         hack_title(buf, dd)
+        hack_date(buf, dd)
+        hack_field_aliases(buf, dd)
         tokens = get_facet_tokens(buf, dd)
         buf.append(('facet_tokens', ' '.join(tokens)))
         yield buf
 
-    print ('done', i, time()-t0)
+    mprint ('done', i, time()-t0)
+
+field_aliases = (
+    ('language_code', 'language'),
+    ('subjects', 'subject'),
+    )
 
 # [str,str], {str : [int, str]} -&gt; None
-# mutate dictionary dd
+# mutate buf, dd
+def hack_field_aliases(buf, dd):
+    # this is a kludge, it should happen earlier in the process
+    def d(f): return dd.get(f, [])
+    for src,target in field_aliases:
+        ns = sorted(d(src)+d(target), key=lambda(a,b): b)
+    # stub: doesn't change anything for now @@
+
+# [str,str], {str : [int, str]} -&gt; None
+# mutate buf, should mutate dd too @@
 def hack_title(buf, dd):
     title_v = dd.get('title')
-    assert len(title_v) == 1, title_v
+    try:
+        assert title_v and len(title_v) == 1, (title_v, dd)
+    except AssertionError:
+        mprint('missing title',title_v,dd)
+        raise
+        return
     idx,title = title_v[0]
-    assert buf[idx][0] == 'title'
+    assert buf[idx][0] == 'title',buf
 
     tpl_v = dd.get('title_prefix_len')
     if tpl_v:
         assert len(tpl_v) == 1, (title_v,tpl_v)
-        tpl = int(tpl_v[0])
+        try:
+            tpl = int(tpl_v[0][1])
+        except TypeError, e:
+            mprint('tpl_v failure',tpl_v,dd,e.args)
+            raise
     else:
         tpl = 0
 
     title_sort = title[tpl:]
     if tpl:
-        print ('tpl', title, title_sort)
+        pass # mprint ('tpl', title, title_sort)
     assert 'title_sort' not in dd
-    buf.append(('title_sort', title_sort))
+    buf.append(('titleSorter', title_sort))
+
+date_fields_seen = defaultdict(int)
 
+# [str,str], {str : [int, str]} -&gt; None
+# mutate dictionary dd
+def hack_date(buf, dd):
+    for d,v in dd.iteritems():
+        if 'date' in d:
+            date_fields_seen[d] += 1
+            if d !='publish_date':
+                print ('alternate date field', (d, v))
+    
+    assert 'facet_year' not in dd
+    publish_dates = dd.get('publish_date', [])
+    assert len(publish_dates) &lt;= 1
+    if publish_dates:
+        yyyy = re.search(r'\d{4}', publish_dates[0][1])
+        apparent_year = int(yyyy.group(0)) if yyyy else 0
+        yprint.a('publish_dates',publish_dates, yyyy, apparent_year)
+        if yyyy and (1500 &lt; apparent_year &lt; 2010):
+            facet_year = facetize_year(apparent_year)
+            dd['facet_year'] = [(len(buf), facet_year)]
+            buf.append(('facet_year', facet_year))
+            py = str(apparent_year)
+            dd['publication_year'] = [(len(buf), py)]
+            buf.append(('publication_year', py))  # add as stored field to solr @@
+            yprint.a('buf-dd', buf, dd)
+
+# String -&gt; String
+def facetize_year(yyyy):
+    &quot;&quot;&quot;Convert 4-digit numeric year to the facet string for its
+    date range, usually a 20 year period.  The facet strings
+    are 2000, 1980, 1960, 1940, 1920, pre1920, and unknown
+    &gt;&gt;&gt; print facetize_year(2007)
+    2000
+    &gt;&gt;&gt; print facetize_year(2000)
+    2000
+    &gt;&gt;&gt; print facetize_year(1997)
+    1980
+    &gt;&gt;&gt; print facetize_year(1923)
+    1920
+    &gt;&gt;&gt; print facetize_year(1920)
+    1920
+    &gt;&gt;&gt; print facetize_year(1919)
+    pre1920
+    &gt;&gt;&gt; print facetize_year(5864)  # hebrew calendar year
+    unknown
+    &quot;&quot;&quot;
+    y = int(yyyy)
+    if 1920 &lt;= y &lt;= 2010:
+        return '%d' % (y - (y % 20))
+    elif y &lt; 1920:
+        return 'pre1920'
+    else:
+        return 'unknown'
+
+# [str,str], {str : [int, str]} -&gt; None
+# mutate buf and dd
+def hack_isbn(buf, dd):
+    isbns = dd.get('ISBN_10', []) + dd.get('ISBN_13', [])
+    if isbns:
+        ii = ' '.join(isbns)
+        dd['publication_year'] = [(len(buf), ii)]
+        buf.append(('ISBN', ii))
+        
 def get_facet_tokens(buf, dd):
     # for each facet field:
     #    generate facet tokens
     # insert tokens into buf
 
+    global token_counts
+
     # should get facet fields from schema, not put them here. @@
-    facet_fields = ('authors', 'publisher')
+    facet_fields = ('authors',
+                    'publisher',
+                    'subject',
+                    ('subjects', 'subject'),
+                    'source',
+                    'language',
+                    'language_code',
+                    'has_fulltext',
+                    'facet_year',
+                    )
     tokens = []
-    for field in facet_fields:
+    for xfield in facet_fields:
+        if type(xfield) == tuple:
+            field, flabel = xfield
+        else:
+            field = flabel = xfield
         for idx,v in dd.get(field, []):
             assert buf[idx][0] == field
-            tokens.append(facet_token(field, v))
+            token_counts[field] += 1
+            tokens.append(facet_token(flabel, v))
     return tokens
-    
-import string
-from hashlib import md5 as mkhash
-
-# choose token length to make collisions unlikely (if there is a
-# rare collision once in a while, we tolerate it, it just means
-# that users may occasionally see some extra search results.
-# don't make it excessively large because the tokens do use index space.
-# The probability of a collision is approx.  1 - exp(-k**2 / (2*n)) where
-# k = total # of facet tokens (= # of books * avg # of fields)
-# n = 26 ** facet_token_length
-# so for k = 10**8 and facet_token_length = 12,
-# this probability is 1 - exp(-1e16/(2*26**12)) = approx 0.05.
-# (That's the prob of EVER getting a collision, not the prob. of
-# seeing a collision on any particular query).
-
-facet_token_length = 12
-
-# str, str -&gt; str
-def facet_token(field, v):
-    token = []
-    q = int(mkhash('FT,%s,%s'%(field,v)).hexdigest(), 16)
-    for i in xrange(facet_token_length):
-        q,r = divmod(q, 26)
-        token.append(string.lowercase[r])
-    return ''.join(token)
 
+class yprint:
+    m = 0
+    @staticmethod
+    def a(*args):
+        if yprint.m &lt; 20:
+            mprint (yprint.m, args)
+        yprint.m += 1
+            
 # [(str, [str])] -&gt; seq[str]
 def dump(buf):
     yield '&lt;doc&gt;\n'
     for name, val in buf:
-        yield '  &lt;field name=&quot;%s&quot;&gt;%s&lt;/field&gt;\n'% (name, val)
+        if val:
+            yield '  &lt;field name=&quot;%s&quot;&gt;%s&lt;/field&gt;\n'% \
+                  (name,
+                   cgi.escape(val.encode('utf-8')))
     yield '&lt;/doc&gt;\n'
 
 def map_(func, seq):
@@ -178,4 +388,8 @@ def solr_submit(solr, xml):
         sock.close()
         return response
 
-main()
+if __name__ == '__main__':
+    import doctest
+    # doctest.testmod()
+    main()
+</diff>
      <filename>solr/convert-xml.py</filename>
    </modified>
    <modified>
      <diff>@@ -56,48 +56,25 @@ def doc_seq():
 
 # doc -&gt; doc
 def fix_doc(d):
-    global sid                          # @@
-
     try:
         e = ET.XML(d)
     except SyntaxError, x:
         raise ValueError, (x,d)
-    sid = defaultdict(list)
-    for a in e.getiterator('field'):
-        field_name = a.get('name')
-        # the description field is basically SEO spam, so don't index it
-        if field_name != 'description':
-            sid[field_name].append(a.text)
-
-    # this tuple unpack should raise an exception if there's more than
-    # one title field
-    (title,) = sid['title']
-    if 'title_prefix_len' in sid:
-        (tlx,) = sid['title_prefix_len'] # tuple unpack as above
-        sort_title = title[:tlx]
-    else:
-        sort_title = title
-
-    assert 'sort_title' not in sid
-    sid['sort_title'].append(sort_title)
-    
-    print ('sid',sid)
 
-    def mk_xml():
-        yield '&lt;doc&gt;'
-        for a,b in sid.iteritems():
-            for bi in b:
-                yield '  &lt;field name=&quot;%s&quot;&gt;%s&lt;/field&gt;'% (a,bi)
-        yield '&lt;/doc&gt;'
+    (title,) = (x for x in e.findall('field') if x.get('name')=='title')
+    tplx = list(x for x in e.findall('field') if x.get('name')=='title_prefix_len')
 
-    xml_out = '\n'.join(mk_xml())
+    assert len(tplx) &lt;= 1
+    tp_len = int(tplx[0]) if tplx else 0
+    ts = ET.SubElement(e, 'field')
+    ts.set('name', 'titlesort')
+    ts.text = title.text[tp_len:]
+    if tp_len:
+        print (tp_len, title.text)
 
-    print ('xml_out', xml_out)
+    # xml_out = '\n'.join(mk_xml())
+    xml_out = ET.tostring(e)
     return xml_out
-                              
-                              
-
-    # do nothing with e, this is just to time the parser
 
 def tf2():
     global d
@@ -110,13 +87,14 @@ def testfix():
 
 import operator
 snd = operator.itemgetter(1)
-
-def cc(groupsize=100):
+    
+# [doc], int -&gt; [compound doc]
+def cc(seq, groupsize=100):
     # generate sequence of compound docs that can be injected
     # into solr
 
     # emit runs of groupsize elements
-    bb = groupby(enumerate(doc_seq()), lambda (i,d): i//groupsize)
+    bb = groupby(enumerate(seq), lambda (i,d): i//groupsize)
     for i,d in bb:
         yield '&lt;add&gt;' + \
               '\n\n&lt;!-- #### --&gt;\n\n'.join(b for a,b in d) + \
@@ -140,13 +118,13 @@ def inject():
     print ('done',i,t2-t0)
         
 import socket
-frotz = False
+frotz = False                           # debugging cruft @@
 def solr_submit(solr, xml):
     global frotz
     &quot;&quot;&quot;submit an XML document to solr&quot;&quot;&quot;
     sock = socket.socket()
 
-    if not frotz:
+    if not frotz:                       # @@
         frotz = True
         with open('frotz','w') as f:
             f.write(xml)</diff>
      <filename>solr/flat.py</filename>
    </modified>
    <modified>
      <diff>@@ -7,11 +7,12 @@ def slicer(n):
         return lambda *a,**kw: islice(g(*a,**kw), n)
     return s2
 
+from operator import itemgetter
+snd = itemgetter(1)           # snd(tuple t) = second element of t
+
 # separate sequence into runs of size `runsize'
 def runs(seq, runsize):
     from itertools import groupby, imap
-    from operator import itemgetter
-    snd = itemgetter(1)           # snd(tuple t) = second element of t
     for n,g in groupby(enumerate(seq),
                        lambda (n,s): n // runsize):
         yield imap(snd, g)</diff>
      <filename>solr/itools.py</filename>
    </modified>
    <modified>
      <diff>@@ -1,15 +1,20 @@
 import sys
+from time import time
+
+sys.path.insert(0, &quot;..&quot;)
 sys.path.insert(0, &quot;../infogami&quot;)
 
 logfile = '/1/dbg/import-logs/dbglog'
 logfile = '/1/pharos/db/authortest'
 logfile = '/1/pharos/db/good'
 logfile = '/1/pharos/db/pharos'
+logfile = '/x-home/phr/pharos-log'
 # logfile = '/1/pharos/db/crap'
 
 #logfile = '/tmp/log.test'
 outfile = sys.stdout
-outfile = open('solr1.xml', 'w')
+
+outfile = open('solr1-%f.xml' % time(), 'w')
 oca_map = open('oca-map.log', 'a')
 
 # tcp socket of solr server
@@ -21,23 +26,25 @@ import web
 
 import re
 import socket
+import random, string
 from itertools import *
 from itools import *
 from cStringIO import StringIO
 from operator import itemgetter
-from time import time
 
 fst = itemgetter(0)
 snd = itemgetter(1)
 
 def setup():
     web.config.db_parameters = dict(dbn=&quot;postgres&quot;,
-#                                    db=&quot;pharos&quot;,
+#                                    host='apollonius.us.archive.org',
+                                    host='localhost',
                                     db=&quot;pharos&quot;,
                                     user=&quot;pharos&quot;,
                                     pw=&quot;pharos&quot;)
     web.load()
 
+# @slicer(20)
 def logparse(log_fd):
     return parse2b(parse1(log_fd,
                           infinite=True))
@@ -56,7 +63,19 @@ setup()
 # from exclude import excluded_fields, multivalued_fields
 import solr_fields
 
+sys.path.append('../pharos')
+import run
+
+import pdb
+debug = False
+
 def main():
+    if debug:
+        pdb.run('main2()')
+    else:
+        main2()
+
+def main2():
     import time as _time
 
     global t,k
@@ -71,16 +90,22 @@ def main():
     log_fd.seek(lastpos)
 
     for i,t in enumerate(logparse(log_fd)):
-#        print (t,t.type,type(t.type),t.type.name, type(t.type.name))
+        # print (t,t.type,type(t.type),t.type.name, type(t.type.name))
         if time()-t1 &gt; 5 or i % 100 == 0:
             print (i, time()-t1, time()-t0)
             sys.stdout.flush()
             t1 = time()
 
-        if t.type.name not in ('delete', 'edition'):
+        assert t.type.name.startswith('type/')
+        typename = t.type.name[5:]
+        assert '/' not in typename
+
+        action = {'delete': 'delete',
+                  'edition': 'add'}.get(typename)
+        if action is None:
+            # this is probably an author record; anyway it's something
+            # that we don't index.
             continue
-        if t.type.name == 'delete': action = 'delete'
-        else: action = 'add'
 
         outbuf = StringIO()
         print &gt;&gt;outbuf, &quot;&lt;%s&gt;&quot;% action
@@ -88,7 +113,7 @@ def main():
             continue
         print &gt;&gt;outbuf, &quot;&lt;/%s&gt;&quot;% action
 
-        if 1:
+        if 0:
             xml = outbuf.getvalue()
             # print 'xml:(%s)'% xml
             solr_response = solr_submit(xml)
@@ -132,7 +157,13 @@ sorted_field_dict = {
 ids_seen = set()
 
 def emit_doc(outbuf, action, t, loss=count()):
-    assert t.name not in ids_seen, t.name
+
+    if t.name in ids_seen:
+        # This is not supposed to happen and there was an assertion
+        # against it, but it kept triggering so we ignore it for now.
+        print ('error', loss.next(), time(), t.name)
+        return ''
+
     for forbidden in ('text', 'identifier'):
         assert forbidden not in t.d
     ids_seen.add(t.name)
@@ -141,13 +172,24 @@ def emit_doc(outbuf, action, t, loss=count()):
 
     emit_field(outbuf, 'identifier', t.name)
 
-    if 'oca_identifier' in t.d:
-        print &gt;&gt; oca_map, (t.d.oca_identifier, t.name, time())
-        oca_map.flush()
+    # if 'oca_identifier' in t.d:
+    #    print &gt;&gt; oca_map, (t.d.oca_identifier, t.name, time())
+    #    oca_map.flush()
 
     if action != 'delete':
         for k in t.d:
-            v = getattr(t.d, k)
+            if k == 'authors':
+                def translate(a):
+                    try:                   return a.d.name
+                    except (AttributeError,infogami.tdb.tdb.NotFound), e:
+                        id_str = t.d.get('identifier', '(no identifier)')
+                        print ('nameless_author', loss.next(), a, id_str, e.args)
+                        return a
+
+                v = list(translate(a) for a in getattr(t.d,k))
+                # print 'expanded authors (%s)=&gt;(%s)'% (getattr(t.d,k), v)
+            else:
+                v = getattr(t.d, k)
 
             try:
                 emit_field(outbuf, k, v)
@@ -162,6 +204,7 @@ def emit_doc(outbuf, action, t, loss=count()):
 
             if k not in solr_fields.excluded_fields:
                 emit_field(outbuf, 'text', v)
+
             if k in sorted_field_dict:
                 sfname, conversion = sorted_field_dict[k]
                 global z                    # debug @@
@@ -170,7 +213,22 @@ def emit_doc(outbuf, action, t, loss=count()):
                     assert type(v) == str
                     v = [v]
                 emit_field(outbuf, sfname, map(conversion,map(str,v)))
+
+        # emit a field indicating the availability of fulltext,
+        # so we can give it a big scoring bonus at query time.
+        if 'oca_identifier' in t.d:
+            emit_field(outbuf, 'has_fulltext', '1')
+        else:
+            emit_field(outbuf, 'has_fulltext', '0')
                        
+        # make an xfacet field (random 10-character &quot;word&quot;).
+        # this is for use in statistical faceting.
+        # might want to add more such words or fields, to help make
+        # multiple overlapping queries get uncorrelated result sets.
+        random_xword = ''.join(random.choice(string.lowercase)
+                               for i in xrange(10))
+        emit_field(outbuf, 'xfacet', random_xword)
+
     print &gt;&gt;outbuf, &quot;&lt;/doc&gt;\n&quot;
     return outbuf.getvalue()
 
@@ -200,6 +258,8 @@ def emit_field(outbuf,
                                                          escape(field_val))
 
 def solr_submit(xml):
+    raise ValueError, 'oops'
+
     &quot;&quot;&quot;submit an XML document to solr&quot;&quot;&quot;
     sock = socket.socket()
     try:</diff>
      <filename>solr/log2.py</filename>
    </modified>
    <modified>
      <diff>@@ -1,7 +1,8 @@
 # get set of meta tag names to exclude from solr slop field
 from xml.etree.ElementTree import ElementTree as ET
 
-_schema_file = ET().parse('/home/phr/petabox/solr/example/solr/conf/schema.xml')
+# _schema_file = ET().parse('/home/phr/petabox/solr/example/solr/conf/schema.xml')
+_schema_file = ET().parse('solr-schema.xml')
 
 def _checkfields(fieldname, attrname, pred, resultname):
     def m():</diff>
      <filename>solr/solr_fields.py</filename>
    </modified>
  </modified>
  <removed type="array"/>
  <parents type="array">
    <parent>
      <id>23291afc7301303f5c1aeef8a60533907ac9c500</id>
    </parent>
  </parents>
  <author>
    <name>phr@ia301442.us.archive.org</name>
    <email>phr@ia301442.us.archive.org</email>
  </author>
  <url>http://github.com/openlibrary/openlibrary/commit/c8885d4594587c171d3ce4c0c396230569f9b659</url>
  <id>c8885d4594587c171d3ce4c0c396230569f9b659</id>
  <committed-date>2008-04-06T01:03:32-07:00</committed-date>
  <authored-date>2008-04-06T01:03:32-07:00</authored-date>
  <message>commit a bunch of changes to internal tools, to satisfy hg merge.
This is awful stuff that is largely obsolete and that no one should
want to mess with.</message>
  <tree>32b2f38eb247b7918e1455850413ed05326fa33b</tree>
  <committer>
    <name>phr@ia301442.us.archive.org</name>
    <email>phr@ia301442.us.archive.org</email>
  </committer>
</commit>
