1
+ <?xml version =" 1.0" encoding =" UTF-8" ?>
2
+ <schema name =" mirlyn" version =" 1.4" >
3
+ <types >
4
+
5
+ <!--
6
+ #########################
7
+ #### Stock solr types ###
8
+ #########################
9
+ -->
10
+
11
+
12
<!-- Numeric types, all Trie-based with norms omitted.
     The plain variants use precisionStep 0; the t-prefixed variants use precisionStep 8. -->
<fieldType name="int"     class="solr.TrieIntField"    precisionStep="0" positionIncrementGap="0" omitNorms="true"/>
<fieldType name="float"   class="solr.TrieFloatField"  precisionStep="0" positionIncrementGap="0" omitNorms="true"/>
<fieldType name="long"    class="solr.TrieLongField"   precisionStep="0" positionIncrementGap="0" omitNorms="true"/>
<fieldType name="double"  class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0" omitNorms="true"/>
<fieldType name="tint"    class="solr.TrieIntField"    precisionStep="8" positionIncrementGap="0" omitNorms="true"/>
<fieldType name="tfloat"  class="solr.TrieFloatField"  precisionStep="8" positionIncrementGap="0" omitNorms="true"/>
<fieldType name="tlong"   class="solr.TrieLongField"   precisionStep="8" positionIncrementGap="0" omitNorms="true"/>
<fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0" omitNorms="true"/>
21
+
22
<!-- Date/Time

     Values take the form 1995-12-31T23:59:59Z, a more restricted form of the
     canonical representation of dateTime:
       http://www.w3.org/TR/xmlschema-2/#dateTime
     The trailing "Z" designates UTC time and is mandatory.
     Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
     All other components are mandatory.

     Expressions relative to "NOW" can also be used to compute the value, e.g.:

       NOW/HOUR
         ... round to the start of the current hour
       NOW-1DAY
         ... exactly 1 day prior to now
       NOW/DAY+6MONTHS+3DAYS
         ... 6 months and 3 days in the future from the start of the current day

     Consult the DateField javadocs for more information.

     Note: for faster range queries, consider the tdate type.
-->
<fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0" omitNorms="true"/>
47
+
48
<!-- tdate: a Trie-based date field (precisionStep 6) for faster date range
     queries and date faceting. -->
<fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0" omitNorms="true"/>
50
+
51
<!-- boolean: holds "true" or "false"; norms omitted, missing values sort last. -->
<fieldType name="boolean" class="solr.BoolField" omitNorms="true" sortMissingLast="true"/>
53
+
54
<!-- Binary data type. The data should be sent/retrieved as Base64-encoded strings. -->
<fieldType name="binary" class="solr.BinaryField"/>
56
+
57
<!-- ignored: neither indexed nor stored, so anything sent to a field of this
     type is accepted (multiple values allowed) and then discarded. -->
<fieldType name="ignored" class="solr.StrField" indexed="false" stored="false" multiValued="true"/>
59
+
60
<!-- location: a specialized field for geospatial search.
     If indexed, this fieldType must not be multivalued.
     NOTE(review): sub-fields are named with the "_coordinate" suffix, but no
     matching *_coordinate dynamicField is visible in this schema; presumably
     one must be added before a field of this type is declared. Confirm. -->
<fieldType name="location" subFieldSuffix="_coordinate" class="solr.LatLonType"/>
62
+
63
<!-- geohash: a Geohash is a compact representation of a latitude/longitude
     pair in a single field.
     See http://wiki.apache.org/solr/SpatialSearch -->
<fieldType name="geohash" class="solr.GeoHashField"/>
68
+
69
<!-- string: stored verbatim, not analyzed; norms omitted, missing values sort last. -->
<fieldType name="string" class="solr.StrField" omitNorms="true" sortMissingLast="true"/>
71
+
72
+
73
+
74
+ <!--
75
+ ######################################
76
+ ########### Text Types #############
77
+ ######################################
78
+
79
+ -->
80
+
81
<!-- text: a standard text type, with ICU tokenization and Unicode normalization.
     - With the ICU folding, we get:
         + NFKC normalization (precomposing),
         + Unicode case folding (i.e., lowercasing),
         + search term folding (removing accents, etc.).
     - Synonyms can be put in syn.txt (see the sample synonyms.txt file in the Solr example).
     - The word delimiter splits on case changes and numbers (e.g., code4lib).
       NOTE(review): folding lowercases every token before it reaches the word
       delimiter, so the case-change split presumably never fires here. Confirm.
     - The CJK filters produce bigrams for those languages.
     - Remove Duplicates does what it says on the tin.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
103
+
104
<!-- text_stemmed: the same chain as text, with a KStem stemming step added
     right after ICU folding (i.e., before synonyms and the word delimiter). -->
<fieldType name="text_stemmed" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.KStemFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
118
+
119
<!-- text_(l|r|lr): text that is anchored on one or both ends.
     These are useful for phrase searches only; for non-phrase searches we're
     basically just adding one or two useless tokens to the mix.

     They are mostly the same as text (minus the word delimiter), with the
     addition of one or two anchor tokens. We don't stem these.

     text_lr is essentially an "exact match" where "exact" means
     "...except for runs of spaces, case, diacritics, and most punctuation".
     I find it useful for boosting the bejeebus out of exact title matches.
-->

<!-- text_l: anchored on the left with the token AAAA. -->
<fieldType name="text_l" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- Char filters run on the raw value ahead of the tokenizer, so the anchor
         is declared first: prepend AAAA to the whole value. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^(.*)$" replacement="AAAA $1"/>
    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <!-- Fold any run of separator characters inside a token to a single space. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="\p{Z}+" replacement=" "/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
147
+
148
<!-- text_r: like text_l, but anchored on the right with the token ZZZZ. -->
<fieldType name="text_r" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- Char filters run on the raw value ahead of the tokenizer, so the anchor
         is declared first: append ZZZZ to the whole value. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^(.*)$" replacement="$1 ZZZZ"/>
    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <!-- Fold any run of separator characters inside a token to a single space. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="\p{Z}+" replacement=" "/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
164
+
165
<!-- text_lr: anchored on both ends (AAAA on the left, ZZZZ on the right);
     basically a more forgiving "exact match". -->
<fieldType name="text_lr" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- Char filters run on the raw value ahead of the tokenizer, so the anchors
         are declared first: wrap the whole value in AAAA ... ZZZZ. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="^(.*)$" replacement="AAAA $1 ZZZZ"/>
    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>
    <!-- Fold any run of separator characters inside a token to a single space. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="\p{Z}+" replacement=" "/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
185
+
186
+
187
<!-- sane_string: a string for exact matches, but trim, fold multiple spaces,
     and ditch some closing punctuation. Designed for facets where the
     values might not be as controlled as you'd like (e.g., LCSH).

     Note that you might want to facet on sane_string, but allow searches
     against text_lr, since the latter does lowercasing, synonyms, and
     Unicode folding.
-->
<fieldType name="sane_string" class="solr.TextField" positionIncrementGap="1000" omitNorms="true">
  <analyzer>
    <!-- The whole value is one token. -->
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <!-- Strip a trailing run of spaces, commas, periods, '!', '?', or '/'. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="(.*?)[ ,.!?/]+$" replacement="$1"/>
    <filter class="solr.TrimFilterFactory"/>
    <!-- Fold every run of separator characters to a single space. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="\p{Z}+" replacement=" "/>
  </analyzer>
</fieldType>
209
+
210
+ <!--
211
+ ###################################################
212
+ ########## Useful library types ################
213
+ ##################################################
214
+
215
+ -->
216
+
217
<!-- numericID: normalize a number-like identifier (OCLC, ISBN, ISSN, etc.).
     - take the first run of digits/dashes/dots, at least six characters long
       and optionally followed by an X or x, and throw away everything else
     - lowercase it (i.e., turn any trailing X into an x)
     - ditch everything that's not a number or an 'x'
     - drop whatever is now shorter than six characters
     - ditch any leading zeros
-->
<fieldType name="numericID" class="solr.TextField" positionIncrementGap="1000" omitNorms="true">
  <analyzer>
    <tokenizer class="solr.KeywordTokenizerFactory"/>

    <!-- Start by finding the first substring that starts with a digit, ends with
         a digit, and has at least four digits, dashes, or dots in between,
         followed by an optional X. Throw away everything else, and stick a
         '***' on the front as an anchor. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^.*?(\p{N}[\p{N}\-\.]{4,}\p{N}[xX]?).*$" replacement="***$1"/>

    <!-- This is a little silly, but basically we find anything that does *not*
         start with '*' and throw it all away, on the basis that if it had
         contained a valid number, it would start with a '*' due to the pattern
         replacement above. The '*' is nice in that it's a wildcard character
         and will throw an error if your search actually *does* start with it. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^[^\*].*$" replacement=""/>

    <!-- Get rid of the '***' anchor. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^\*\*\*" replacement=""/>

    <!-- Lowercase it and get rid of anything that's not a number or an 'x'. -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory" pattern="[^\p{N}x]" replacement="" replace="all"/>

    <!-- Throw away everything that's not long enough anymore: we keep at least
         five digits plus an 'x', or six digits. This also removes values that
         the second step reduced to the empty string. min is 6 to match that
         floor; a higher value would silently drop valid short identifiers. -->
    <filter class="solr.LengthFilterFactory" min="6" max="100"/>

    <!-- Finally, get rid of leading zeros. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^0*" replacement=""/>
  </analyzer>
</fieldType>
272
+
273
<!-- csn (comma-separated numbers): a list of numbers, separated by commas.
     Each value goes through basically the same transform as numericID.

     This is more restrictive than numericID, obviously, in that we can't
     allow there to be commas in the individual values.
-->
<fieldType name="csn" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
  <analyzer>
    <!-- Split on commas, swallowing any whitespace around them. -->
    <tokenizer class="solr.PatternTokenizerFactory" pattern="\s*,\s*"/>

    <!-- Keep the first number-like substring (digit, then at least four digits,
         dashes, or dots, then a digit and an optional X) and mark it with a
         '***' anchor. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^.*?(\p{N}[\p{N}\-\.]{4,}\p{N}[xX]?).*$" replacement="***$1"/>

    <!-- Tokens without the anchor contained no valid number: blank them out. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^[^\*].*$" replacement=""/>

    <!-- Remove the '***' anchor; it can only sit at the start of the token. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^\*\*\*" replacement=""/>

    <!-- Lowercase, then strip everything that's not a number or an 'x'. -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory" pattern="[^\p{N}x]" replacement="" replace="all"/>

    <!-- Drop tokens shorter than six characters (including the blanked-out
         ones). Six is the shortest value the first pattern can yield once
         dashes and dots are gone. -->
    <filter class="solr.LengthFilterFactory" min="6" max="100"/>

    <!-- Finally, get rid of leading zeros. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^0*" replacement=""/>
  </analyzer>
</fieldType>
303
+
304
<!-- callnoprefix: use edge n-grams to index every left-anchored substring
     (1 to 15 characters) of the call number, after lowercasing it and
     throwing away spaces and dots.

     The n-grams are produced at index time only. A query is normalized the
     same way but kept as a single term, so it matches only call numbers that
     start with it. Generating n-grams for the query as well would turn it
     into an OR over all of its own prefixes.
-->
<fieldType name="callnoprefix" class="solr.TextField" omitNorms="true">
  <analyzer type="index">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory" pattern="[\p{Z}\.]" replacement="" replace="all"/>
    <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" side="front"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory" pattern="[\p{Z}\.]" replacement="" replace="all"/>
    <!-- The index holds prefixes of at most 15 characters, so cut longer
         queries down to their first 15 characters to keep them matchable. -->
    <filter class="solr.PatternReplaceFilterFactory" pattern="^(.{15}).*$" replacement="$1"/>
  </analyzer>
</fieldType>
318
+
319
+
320
+
321
+
322
+ </types >
323
+
324
<fields>
  <!-- Document key (named by uniqueKey). -->
  <field name="id" type="string" stored="true" indexed="true"/>

  <!-- Single-valued cleaned-up string. -->
  <field name="sane" type="sane_string" stored="true" indexed="true"/>

  <!-- General text, one field per text flavour: plain (also the default
       search field), anchored both ends, left-anchored, right-anchored. -->
  <field name="text" type="text"    stored="true" indexed="true" multiValued="true"/>
  <field name="tf"   type="text_lr" stored="true" indexed="true" multiValued="true"/>
  <field name="tl"   type="text_l"  stored="true" indexed="true" multiValued="true"/>
  <field name="tr"   type="text_r"  stored="true" indexed="true" multiValued="true"/>

  <!-- Normalized identifiers. -->
  <field name="numeric" type="numericID" stored="true" indexed="true" multiValued="true"/>
  <field name="csn"     type="csn"       stored="true" indexed="true" multiValued="true"/>

  <!-- Names, analyzed as plain text. -->
  <field name="name"      type="text" stored="true" indexed="true" multiValued="true"/>
  <field name="othername" type="text" stored="true" indexed="true" multiValued="true"/>
</fields>
337
+
338
+
339
<!-- The id field is the document key. -->
<uniqueKey>id</uniqueKey>

<!-- Queries that name no field go to the text field, with OR between terms. -->
<defaultSearchField>text</defaultSearchField>
<solrQueryParser defaultOperator="OR"/>
342
+
343
+ </schema >
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
0 commit comments