Skip to content

Commit 6accc51

Browse files
committed
Simplified for instructional purposes
1 parent deb313a commit 6accc51

File tree

1 file changed

+14
-247
lines changed

1 file changed

+14
-247
lines changed

solr/conf/schema.xml

+14-247
Original file line numberDiff line numberDiff line change
@@ -70,151 +70,7 @@
7070
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
7171

7272

73-
74-
<!--
75-
######################################
76-
########### Text Types #############
77-
######################################
78-
79-
-->
80-
81-
<!-- text - A standard text type, with icu tokenization and unicode normalization.
82-
- With the ICUFolding, we get:
83-
+ NFKC normalization (precomposing),
84-
+ Unicode case folding (i.e., lowercasing)
85-
+ search term folding (removing accents, etc).
86-
- Synonyms can be put in syn.txt (see sample synonyms.txt file in solr example)
87-
- Word delimiter splits on CaseChange and numbers (e.g., code4lib).
88-
- The CJK stuff produces bigrams for those languages
89-
- Remove Duplicates does what it says on the tin.
90-
-->
91-
<fieldtype name="text" class="solr.TextField" positionIncrementGap="1000">
92-
<analyzer>
93-
<tokenizer class="solr.ICUTokenizerFactory"/>
94-
<filter class="solr.ICUFoldingFilterFactory"/>
95-
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
96-
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
97-
<filter class="solr.CJKWidthFilterFactory"/>
98-
<filter class="solr.CJKBigramFilterFactory"/>
99-
<filter class="solr.TrimFilterFactory"/>
100-
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
101-
</analyzer>
102-
</fieldtype>
103-
104-
<!-- same as text, but with some stemming thrown in -->
105-
<fieldtype name="text_stemmed" class="solr.TextField" positionIncrementGap="1000">
106-
<analyzer>
107-
<tokenizer class="solr.ICUTokenizerFactory"/>
108-
<filter class="solr.ICUFoldingFilterFactory"/>
109-
<filter class="solr.KStemFilterFactory"/>
110-
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
111-
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
112-
<filter class="solr.CJKWidthFilterFactory"/>
113-
<filter class="solr.CJKBigramFilterFactory"/>
114-
<filter class="solr.TrimFilterFactory"/>
115-
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
116-
</analyzer>
117-
</fieldtype>
118-
119-
<!-- text_(l|r|lr): text that is anchored on one or both ends.
120-
These are useful for phrase searches only; for non-phrase searches we're
121-
basically just adding one or two useless tokens to the mix.
122-
123-
It's mostly the same as text, but with the addition of one or
124-
two anchors. We don't stem these.
125-
126-
text_lr is essentially an "exact match" where "exact" means
127-
"...except for runs of spaces, case, diacritics, and most punctuation".
128-
I find it useful for boosting the bejeebus out of exact title matches.
129-
-->
130-
131-
<fieldtype name="text_l" class="solr.TextField" positionIncrementGap="1000">
132-
<analyzer>
133-
<tokenizer class="solr.ICUTokenizerFactory"/>
134-
<filter class="solr.ICUFoldingFilterFactory"/>
135-
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
136-
<filter class="solr.CJKWidthFilterFactory"/>
137-
<filter class="solr.CJKBigramFilterFactory"/>
138-
<filter class="solr.TrimFilterFactory"/>
139-
<filter class="solr.PatternReplaceFilterFactory"
140-
pattern="\p{Z}+" replacement=" "
141-
/>
142-
<charFilter class="solr.PatternReplaceCharFilterFactory"
143-
pattern="^(.*)$" replacement="AAAA $1" />
144-
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
145-
</analyzer>
146-
</fieldtype>
147-
148-
<fieldtype name="text_r" class="solr.TextField" positionIncrementGap="1000">
149-
<analyzer>
150-
<tokenizer class="solr.ICUTokenizerFactory"/>
151-
<filter class="solr.ICUFoldingFilterFactory"/>
152-
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
153-
<filter class="solr.CJKWidthFilterFactory"/>
154-
<filter class="solr.CJKBigramFilterFactory"/>
155-
<filter class="solr.TrimFilterFactory"/>
156-
<filter class="solr.PatternReplaceFilterFactory"
157-
pattern="\p{Z}+" replacement=" "
158-
/>
159-
<charFilter class="solr.PatternReplaceCharFilterFactory"
160-
pattern="^(.*)$" replacement="$1 ZZZZ" />
161-
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
162-
</analyzer>
163-
</fieldtype>
164-
165-
<!-- text_lr is fully anchored (on both ends) and is basically a more forgiving
166-
"exact match"
167-
-->
168-
169-
<fieldtype name="text_lr" class="solr.TextField" positionIncrementGap="1000">
170-
<analyzer>
171-
<tokenizer class="solr.ICUTokenizerFactory"/>
172-
<filter class="solr.ICUFoldingFilterFactory"/>
173-
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
174-
<filter class="solr.CJKWidthFilterFactory"/>
175-
<filter class="solr.CJKBigramFilterFactory"/>
176-
<filter class="solr.TrimFilterFactory"/>
177-
<filter class="solr.PatternReplaceFilterFactory"
178-
pattern="\p{Z}+" replacement=" "
179-
/>
180-
<charFilter class="solr.PatternReplaceCharFilterFactory"
181-
pattern="^(.*)$" replacement="AAAA $1 ZZZZ" />
182-
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
183-
</analyzer>
184-
</fieldtype>
185-
186-
187-
<!--
188-
sane_string: a string for exact matches, but trim, fold multiple spaces,
189-
and ditch some closing punctuation. Designed for facets where the
190-
values might not be as controlled as you'd like (e.g., LCSH)
191-
192-
Note that you might want to facet on sane_string, but allow searches against
193-
text_lr, since the latter does lowercasing, synonyms, and
194-
unicode folding.
195-
-->
196-
197-
<fieldtype name="sane_string" class="solr.TextField" positionIncrementGap="1000" omitNorms="true">
198-
<analyzer>
199-
<tokenizer class="solr.KeywordTokenizerFactory"/>
200-
<filter class="solr.PatternReplaceFilterFactory"
201-
pattern="(.*?)[ ,.!?/]+$" replacement="$1"
202-
/>
203-
<filter class="solr.TrimFilterFactory"/>
204-
<filter class="solr.PatternReplaceFilterFactory"
205-
pattern="\p{Z}+" replacement=" "
206-
/>
207-
</analyzer>
208-
</fieldtype>
209-
210-
<!--
211-
###################################################
212-
########## Useful library types ################
213-
##################################################
214-
215-
-->
216-
217-
<!-- numericID:
73+
<!-- numericID:
21874
- take the first string of digits/dashes/dots and an optional X or x
21975
that contains at least eight digits, where a trailing X or x counts as one (OCLC, ISBN, ISSN, etc.)
22076
and throw away everything that's left
@@ -224,115 +80,26 @@
22480
- ditch any leading zeros
22581
-->
22682

227-
<fieldtype name="numericID" class="solr.TextField" positionIncrementGap="1000" omitNorms="true">
228-
<analyzer>
229-
<tokenizer class="solr.KeywordTokenizerFactory"/>
230-
<!-- Start by finding the first substring that starts with a digit, ends with a digit, and
231-
has at least six digits, dashes, or dots in between, followed by an optional X
232-
Throw away everything else, and stick a '***' on the front as an anchor
233-
-->
234-
<filter class="solr.PatternReplaceFilterFactory"
235-
pattern="^.*?(\p{N}[\p{N}\-\.]{6,}\p{N}[xX]?).*$" replacement="***$1"
236-
/>
237-
238-
<!-- This is a little silly, but basically we find anything that does *not*
239-
start with '*' and throw it all away, on the basis that if it had contained
240-
a valid number, it would start with a '*' due to the pattern replacement
241-
above. The '*' is nice in that it's a wildcard character and will throw an
242-
error if your search actually *does* start with it. -->
243-
244-
<filter class="solr.PatternReplaceFilterFactory"
245-
pattern="^[^\*].*$" replacement=""
246-
/>
247-
248-
<!-- Get rid of the '***' -->
249-
250-
<filter class="solr.PatternReplaceFilterFactory"
251-
pattern="^\*\*\*" replacement=""
252-
/>
25383

254-
<!-- Lowercase it and get rid of anything that's not a number or an 'x' -->
255-
<filter class="solr.LowerCaseFilterFactory"/>
256-
<filter class="solr.PatternReplaceFilterFactory"
257-
pattern="[^\p{N}x]" replacement="" replace="all"
258-
/>
25984

260-
<!-- Throw away everything that's not long enough anymore (long enough means at least
261-
eight characters: digits plus an optional trailing 'x'). This will include stuff from the
262-
second step that got reduced to the empty string. -->
263-
<filter class="solr.LengthFilterFactory" min="8" max="100" />
264-
265-
266-
<!-- Finally, get rid of leading zeros -->
267-
<filter class="solr.PatternReplaceFilterFactory"
268-
pattern="^0*" replacement=""
269-
/>
270-
</analyzer>
271-
</fieldtype>
272-
273-
<!-- csn (comma-separated numbers) is a list of numbers, separated by commas. Do basically
274-
the same transform as numericID, but in this case allow
275-
commas to separate values.
276-
277-
This is more restrictive than numericID, obviously, in that we can't
278-
allow there to be commas in the input
279-
-->
280-
<fieldtype name="csn" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
281-
<analyzer>
282-
<tokenizer class="solr.PatternTokenizerFactory" pattern="\s*,\s*" />
283-
<filter class="solr.PatternReplaceFilterFactory"
284-
pattern="^.*?(\p{N}[\p{N}\-\.]{4,}\p{N}[xX]?).*$" replacement="***$1"
285-
/>
286-
<filter class="solr.PatternReplaceFilterFactory"
287-
pattern="^[^\*].*$" replacement=""
288-
/>
289-
<filter class="solr.PatternReplaceFilterFactory"
290-
pattern="\*\*\*" replacement=""
291-
/>
292-
<filter class="solr.LowerCaseFilterFactory"/>
293-
<filter class="solr.PatternReplaceFilterFactory"
294-
pattern="[^\p{N}x]" replacement="" replace="all"
295-
/>
296-
<filter class="solr.LengthFilterFactory" min="8" max="100" />
297-
298-
<filter class="solr.PatternReplaceFilterFactory"
299-
pattern="^0*" replacement=""
300-
/>
301-
</analyzer>
302-
</fieldtype>
303-
304-
<!-- callnoprefix: use edgengram to index every left-anchored substring
305-
of the call number, throwing away spaces and dots
306-
-->
307-
308-
<fieldtype name="callnoprefix" class="solr.TextField" omitNorms="true">
309-
<analyzer>
310-
<tokenizer class="solr.KeywordTokenizerFactory"/>
311-
<filter class="solr.LowerCaseFilterFactory"/>
312-
<filter class="solr.PatternReplaceFilterFactory"
313-
pattern="[\p{Z}\.]" replacement="" replace="all"
314-
/>
315-
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" side="front"/>
316-
</analyzer>
317-
</fieldtype>
318-
319-
320-
85+
<fieldtype name="numericID" class="solr.TextField" positionIncrementGap="1000" omitNorms="true">
86+
<analyzer>
87+
<tokenizer class="solr.KeywordTokenizerFactory"/>
88+
<filter class="solr.PatternReplaceFilterFactory" pattern="^.*?(\p{N}[\p{N}\-\.]{5,}\p{N}[xX]?).*$" replacement="***$1"/>
89+
<filter class="solr.PatternReplaceFilterFactory" pattern="^[^\*].*$" replacement=""/>
90+
<filter class="solr.PatternReplaceFilterFactory" pattern="^\*\*\*" replacement=""/>
91+
<filter class="solr.LowerCaseFilterFactory"/>
92+
<filter class="solr.PatternReplaceFilterFactory" pattern="[^\p{N}x]" replacement="" replace="all"/>
93+
<filter class="solr.LengthFilterFactory" min="8" max="14"/>
94+
<filter class="solr.PatternReplaceFilterFactory" pattern="^0*" replacement=""/>
95+
</analyzer>
96+
</fieldtype>
32197

32298
</types>
32399

324100
<fields>
325101
<field name="id" type="string" indexed="true" stored="true" />
326-
<field name="sane" type="sane_string" indexed="true" stored="true"/>
327-
<field name="text" type="text" indexed="true" stored="true" multiValued="true" />
328-
<field name="tf" type="text_lr" indexed="true" stored="true" multiValued="true" />
329-
<field name="tl" type="text_l" indexed="true" stored="true" multiValued="true" />
330-
<field name="tr" type="text_r" indexed="true" stored="true" multiValued="true" />
331-
<field name="numeric" type="numericID" indexed="true" stored="true" multiValued="true" />
332-
<field name="csn" type="csn" indexed="true" stored="true" multiValued="true" />
333-
<field name="name" type="text" indexed="true" stored="true" multiValued="true" />
334-
<field name="othername" type="text" indexed="true" stored="true" multiValued="true" />
335-
102+
<field name="*_numeric" type="numericID" indexed="true" stored="true" multiValued="true"/>
336103
</fields>
337104

338105

0 commit comments

Comments
 (0)