|
70 | 70 | <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
|
71 | 71 |
|
72 | 72 |
|
73 |
| - |
74 |
| - <!-- |
75 |
| - ###################################### |
76 |
| - ########### Text Types ############# |
77 |
| - ###################################### |
78 |
| - |
79 |
| - --> |
80 |
| - |
81 |
| - <!-- text - A standard text type, with icu tokenization and unicode normalization. |
82 |
| - - With the ICUFolding, we get: |
83 |
| - + NFKC normalization (precomposing), |
84 |
| - + Unicode case folding (i.e., lowercasing) |
85 |
| - + search term folding (removing accents, etc). |
86 |
| - - Synonyms can be put in syn.txt (see sample synonyms.txt file in solr example) |
87 |
| - - Word delimiter splits on CaseChange and numbers (e.g., code4lib). |
88 |
| - - The CJK stuff produces bigrams for those languages |
89 |
| - - Remove Duplicates does what it says on the tin. |
90 |
| - --> |
91 |
| - <fieldtype name="text" class="solr.TextField" positionIncrementGap="1000"> |
92 |
| - <analyzer> |
93 |
| - <tokenizer class="solr.ICUTokenizerFactory"/> |
94 |
| - <filter class="solr.ICUFoldingFilterFactory"/> |
95 |
| - <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/> |
96 |
| - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> |
97 |
| - <filter class="solr.CJKWidthFilterFactory"/> |
98 |
| - <filter class="solr.CJKBigramFilterFactory"/> |
99 |
| - <filter class="solr.TrimFilterFactory"/> |
100 |
| - <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
101 |
| - </analyzer> |
102 |
| - </fieldtype> |
103 |
| - |
104 |
| - <!-- same as text, but with some stemming thrown in --> |
105 |
| - <fieldtype name="text_stemmed" class="solr.TextField" positionIncrementGap="1000"> |
106 |
| - <analyzer> |
107 |
| - <tokenizer class="solr.ICUTokenizerFactory"/> |
108 |
| - <filter class="solr.ICUFoldingFilterFactory"/> |
109 |
| - <filter class="solr.KStemFilterFactory"/> |
110 |
| - <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/> |
111 |
| - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/> |
112 |
| - <filter class="solr.CJKWidthFilterFactory"/> |
113 |
| - <filter class="solr.CJKBigramFilterFactory"/> |
114 |
| - <filter class="solr.TrimFilterFactory"/> |
115 |
| - <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
116 |
| - </analyzer> |
117 |
| - </fieldtype> |
118 |
| - |
119 |
| - <!-- text_(l|r|lr): text that is anchored on one or both ends. |
120 |
| - These are useful for phrase searches only; for non-phrase searches we're |
121 |
| - basically just adding one or two useless tokens to the mix. |
122 |
| - |
123 |
| - It's mostly the same as text, but with the addition of one or |
124 |
| - two anchors. We don't stem these. |
125 |
| - |
126 |
| - text_lr is essentially an "exact match" where "exact" means |
127 |
| - "...except for runs of spaces, case, diacritics, and most punctuation". |
128 |
| - I find it useful for boosting the bejeebus out of exact title matches. |
129 |
| - --> |
130 |
| - |
131 |
| - <fieldtype name="text_l" class="solr.TextField" positionIncrementGap="1000"> |
132 |
| - <analyzer> |
133 |
| - <tokenizer class="solr.ICUTokenizerFactory"/> |
134 |
| - <filter class="solr.ICUFoldingFilterFactory"/> |
135 |
| - <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/> |
136 |
| - <filter class="solr.CJKWidthFilterFactory"/> |
137 |
| - <filter class="solr.CJKBigramFilterFactory"/> |
138 |
| - <filter class="solr.TrimFilterFactory"/> |
139 |
| - <filter class="solr.PatternReplaceFilterFactory" |
140 |
| - pattern="\p{Z}+" replacement=" " |
141 |
| - /> |
142 |
| - <charFilter class="solr.PatternReplaceCharFilterFactory" |
143 |
| - pattern="^(.*)$" replacement="AAAA $1" /> |
144 |
| - <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
145 |
| - </analyzer> |
146 |
| - </fieldtype> |
147 |
| - |
148 |
| - <fieldtype name="text_r" class="solr.TextField" positionIncrementGap="1000"> |
149 |
| - <analyzer> |
150 |
| - <tokenizer class="solr.ICUTokenizerFactory"/> |
151 |
| - <filter class="solr.ICUFoldingFilterFactory"/> |
152 |
| - <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/> |
153 |
| - <filter class="solr.CJKWidthFilterFactory"/> |
154 |
| - <filter class="solr.CJKBigramFilterFactory"/> |
155 |
| - <filter class="solr.TrimFilterFactory"/> |
156 |
| - <filter class="solr.PatternReplaceFilterFactory" |
157 |
| - pattern="\p{Z}+" replacement=" " |
158 |
| - /> |
159 |
| - <charFilter class="solr.PatternReplaceCharFilterFactory" |
160 |
| - pattern="^(.*)$" replacement="$1 ZZZZ" /> |
161 |
| - <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
162 |
| - </analyzer> |
163 |
| - </fieldtype> |
164 |
| - |
165 |
| - <!-- text_lr anchors on both ends and is basically a more forgiving |
166 |
| - "exact match" |
167 |
| - --> |
168 |
| - |
169 |
| - <fieldtype name="text_lr" class="solr.TextField" positionIncrementGap="1000"> |
170 |
| - <analyzer> |
171 |
| - <tokenizer class="solr.ICUTokenizerFactory"/> |
172 |
| - <filter class="solr.ICUFoldingFilterFactory"/> |
173 |
| - <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/> |
174 |
| - <filter class="solr.CJKWidthFilterFactory"/> |
175 |
| - <filter class="solr.CJKBigramFilterFactory"/> |
176 |
| - <filter class="solr.TrimFilterFactory"/> |
177 |
| - <filter class="solr.PatternReplaceFilterFactory" |
178 |
| - pattern="\p{Z}+" replacement=" " |
179 |
| - /> |
180 |
| - <charFilter class="solr.PatternReplaceCharFilterFactory" |
181 |
| - pattern="^(.*)$" replacement="AAAA $1 ZZZZ" /> |
182 |
| - <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> |
183 |
| - </analyzer> |
184 |
| - </fieldtype> |
185 |
| - |
186 |
| - |
187 |
| - <!-- |
188 |
| - sane_string: a string for exact matches, but trim, fold multiple spaces, |
189 |
| - and ditch some closing punctuation. Designed for facets where the |
190 |
| - values might not be as controlled as you'd like (e.g., LCSH) |
191 |
| - |
192 |
| - Note that you might want to facet on sane_string, but allow searches against |
193 |
| - text_lr, since the latter does lowercasing, synonyms, and |
194 |
| - unicode folding. |
195 |
| - --> |
196 |
| - |
197 |
| - <fieldtype name="sane_string" class="solr.TextField" positionIncrementGap="1000" omitNorms="true"> |
198 |
| - <analyzer> |
199 |
| - <tokenizer class="solr.KeywordTokenizerFactory"/> |
200 |
| - <filter class="solr.PatternReplaceFilterFactory" |
201 |
| - pattern="(.*?)[ ,.!?/]+$" replacement="$1" |
202 |
| - /> |
203 |
| - <filter class="solr.TrimFilterFactory"/> |
204 |
| - <filter class="solr.PatternReplaceFilterFactory" |
205 |
| - pattern="\p{Z}+" replacement=" " |
206 |
| - /> |
207 |
| - </analyzer> |
208 |
| - </fieldtype> |
209 |
| - |
210 |
| - <!-- |
211 |
| - ################################################### |
212 |
| - ########## Useful library types ################ |
213 |
| - ################################################## |
214 |
| - |
215 |
| - --> |
216 |
| - |
217 |
| - <!-- numericID: |
| 73 | + <!-- numericID: |
218 | 74 | - take the first string of digits/dashes/dots and an optional X or x
|
219 | 75 | that is at least six characters long (OCLC, ISBN, ISSN, etc.)
|
220 | 76 | and throw away everything that's left
|
|
224 | 80 | - ditch any leading zeros
|
225 | 81 | -->
|
226 | 82 |
|
227 |
| - <fieldtype name="numericID" class="solr.TextField" positionIncrementGap="1000" omitNorms="true"> |
228 |
| - <analyzer> |
229 |
| - <tokenizer class="solr.KeywordTokenizerFactory"/> |
230 |
| - <!-- Start by finding the first substring that starts with a digit, ends with a digit, and |
231 |
| - has at least six digits, dashes, or dots in-between, followed by an optional X |
232 |
| - Throw away everything else, and stick a '***' on the front as an anchor |
233 |
| - --> |
234 |
| - <filter class="solr.PatternReplaceFilterFactory" |
235 |
| - pattern="^.*?(\p{N}[\p{N}\-\.]{6,}\p{N}[xX]?).*$" replacement="***$1" |
236 |
| - /> |
237 |
| - |
238 |
| - <!-- This is a little silly, but basically we find anything that does *not* |
239 |
| - start with '*' and throw it all away, on the basis that if it had contained |
240 |
| - a valid number, it would start with a '*' due to the pattern replacement |
241 |
| - above. The '*' is nice in that it's a wildcard character and will throw an |
242 |
| - error if your search actually *does* start with it. --> |
243 |
| - |
244 |
| - <filter class="solr.PatternReplaceFilterFactory" |
245 |
| - pattern="^[^\*].*$" replacement="" |
246 |
| - /> |
247 |
| - |
248 |
| - <!-- Get rid of the '***' --> |
249 |
| - |
250 |
| - <filter class="solr.PatternReplaceFilterFactory" |
251 |
| - pattern="^\*\*\*" replacement="" |
252 |
| - /> |
253 | 83 |
|
254 |
| - <!-- Lowercase it and get rid of anything that's not a number or an 'x' --> |
255 |
| - <filter class="solr.LowerCaseFilterFactory"/> |
256 |
| - <filter class="solr.PatternReplaceFilterFactory" |
257 |
| - pattern="[^\p{N}x]" replacement="" replace="all" |
258 |
| - /> |
259 | 84 |
|
260 |
| - <!-- Throw away everything that's not long enough anymore (i.e., shorter than |
261 |
| - eight characters, counting digits and an optional 'x'). This will include stuff from the |
262 |
| - second step that got reduced to the empty string. --> |
263 |
| - <filter class="solr.LengthFilterFactory" min="8" max="100" /> |
264 |
| - |
265 |
| - |
266 |
| - <!-- Finally, get rid of leading zeros --> |
267 |
| - <filter class="solr.PatternReplaceFilterFactory" |
268 |
| - pattern="^0*" replacement="" |
269 |
| - /> |
270 |
| - </analyzer> |
271 |
| - </fieldtype> |
272 |
| - |
273 |
| - <!-- csn (comma-separated numbers) is a list of numbers, separated by commas. Do basically |
274 |
| - the same transform as numericID, but in this case allow |
275 |
| - commas to separate values. |
276 |
| - |
277 |
| - This is more restrictive than numericID, obviously, in that we can't |
278 |
| - allow there to be commas in the input |
279 |
| - --> |
280 |
| - <fieldtype name="csn" class="solr.TextField" positionIncrementGap="100" omitNorms="true"> |
281 |
| - <analyzer> |
282 |
| - <tokenizer class="solr.PatternTokenizerFactory" pattern="\s*,\s*" /> |
283 |
| - <filter class="solr.PatternReplaceFilterFactory" |
284 |
| - pattern="^.*?(\p{N}[\p{N}\-\.]{4,}\p{N}[xX]?).*$" replacement="***$1" |
285 |
| - /> |
286 |
| - <filter class="solr.PatternReplaceFilterFactory" |
287 |
| - pattern="^[^\*].*$" replacement="" |
288 |
| - /> |
289 |
| - <filter class="solr.PatternReplaceFilterFactory" |
290 |
| - pattern="\*\*\*" replacement="" |
291 |
| - /> |
292 |
| - <filter class="solr.LowerCaseFilterFactory"/> |
293 |
| - <filter class="solr.PatternReplaceFilterFactory" |
294 |
| - pattern="[^\p{N}x]" replacement="" replace="all" |
295 |
| - /> |
296 |
| - <filter class="solr.LengthFilterFactory" min="8" max="100" /> |
297 |
| - |
298 |
| - <filter class="solr.PatternReplaceFilterFactory" |
299 |
| - pattern="^0*" replacement="" |
300 |
| - /> |
301 |
| - </analyzer> |
302 |
| - </fieldtype> |
303 |
| - |
304 |
| - <!-- callnoprefix: use edgengram to index every left-anchored substring |
305 |
| - of the call number, throwing away spaces and dots |
306 |
| - --> |
307 |
| - |
308 |
| - <fieldtype name="callnoprefix" class="solr.TextField" omitNorms="true"> |
309 |
| - <analyzer> |
310 |
| - <tokenizer class="solr.KeywordTokenizerFactory"/> |
311 |
| - <filter class="solr.LowerCaseFilterFactory"/> |
312 |
| - <filter class="solr.PatternReplaceFilterFactory" |
313 |
| - pattern="[\p{Z}\.]" replacement="" replace="all" |
314 |
| - /> |
315 |
| - <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" side="front"/> |
316 |
| - </analyzer> |
317 |
| - </fieldtype> |
318 |
| - |
319 |
| - |
320 |
| - |
| 85 | + <fieldtype name="numericID" class="solr.TextField" positionIncrementGap="1000" omitNorms="true"> |
| 86 | + <analyzer> |
| 87 | + <tokenizer class="solr.KeywordTokenizerFactory"/> |
| 88 | + <filter class="solr.PatternReplaceFilterFactory" pattern="^.*?(\p{N}[\p{N}\-\.]{5,}\p{N}[xX]?).*$" replacement="***$1"/> |
| 89 | + <filter class="solr.PatternReplaceFilterFactory" pattern="^[^\*].*$" replacement=""/> |
| 90 | + <filter class="solr.PatternReplaceFilterFactory" pattern="^\*\*\*" replacement=""/> |
| 91 | + <filter class="solr.LowerCaseFilterFactory"/> |
| 92 | + <filter class="solr.PatternReplaceFilterFactory" pattern="[^\p{N}x]" replacement="" replace="all"/> |
| 93 | + <filter class="solr.LengthFilterFactory" min="8" max="14"/> |
| 94 | + <filter class="solr.PatternReplaceFilterFactory" pattern="^0*" replacement=""/> |
| 95 | + </analyzer> |
| 96 | + </fieldtype> |
321 | 97 |
|
322 | 98 | </types>
|
323 | 99 |
|
324 | 100 | <fields>
|
325 | 101 | <field name="id" type="string" indexed="true" stored="true" />
|
326 |
| - <field name="sane" type="sane_string" indexed="true" stored="true"/> |
327 |
| - <field name="text" type="text" indexed="true" stored="true" multiValued="true" /> |
328 |
| - <field name="tf" type="text_lr" indexed="true" stored="true" multiValued="true" /> |
329 |
| - <field name="tl" type="text_l" indexed="true" stored="true" multiValued="true" /> |
330 |
| - <field name="tr" type="text_r" indexed="true" stored="true" multiValued="true" /> |
331 |
| - <field name="numeric" type="numericID" indexed="true" stored="true" multiValued="true" /> |
332 |
| - <field name="csn" type="csn" indexed="true" stored="true" multiValued="true" /> |
333 |
| - <field name="name" type="text" indexed="true" stored="true" multiValued="true" /> |
334 |
| - <field name="othername" type="text" indexed="true" stored="true" multiValued="true" /> |
335 |
| - |
| 102 | + <field name="*_numeric" type="numericID" indexed="true" stored="true" multiValued="true"/> |
336 | 103 | </fields>
|
337 | 104 |
|
338 | 105 |
|
|
0 commit comments