Skip to content

Commit 3959a3f

Browse files
committed
Added original schema
1 parent 4880093 commit 3959a3f

File tree

1 file changed

+363
-0
lines changed

1 file changed

+363
-0
lines changed

solr/conf/schema.xml.bak

+363
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
<?xml version="1.0" encoding="UTF-8" ?>
2+
<schema name="mirlyn" version="1.4">
3+
<types>
4+
5+
<!--
6+
#########################
7+
#### Stock solr types ###
8+
#########################
9+
-->
10+
11+
12+
<!-- Numeric -->
13+
<!-- Plain numerics: precisionStep="0" turns off the extra trie precision levels. -->
<fieldType name="int" class="solr.TrieIntField"
           omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="float" class="solr.TrieFloatField"
           omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="long" class="solr.TrieLongField"
           omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="double" class="solr.TrieDoubleField"
           omitNorms="true" positionIncrementGap="0" precisionStep="0"/>

<!-- "t" variants: same classes with precisionStep="8", intended for faster range queries. -->
<fieldType name="tint" class="solr.TrieIntField"
           omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
<fieldType name="tfloat" class="solr.TrieFloatField"
           omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
<fieldType name="tlong" class="solr.TrieLongField"
           omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
<fieldType name="tdouble" class="solr.TrieDoubleField"
           omitNorms="true" positionIncrementGap="0" precisionStep="8"/>
21+
22+
<!-- Date/Time
23+
24+
The format for this date field is of the form 1995-12-31T23:59:59Z, and
25+
is a more restricted form of the canonical representation of dateTime
26+
http://www.w3.org/TR/xmlschema-2/#dateTime
27+
The trailing "Z" designates UTC time and is mandatory.
28+
Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z
29+
All other components are mandatory.
30+
31+
Expressions can also be used to denote calculations that should be
32+
performed relative to "NOW" to determine the value, ie...
33+
34+
NOW/HOUR
35+
... Round to the start of the current hour
36+
NOW-1DAY
37+
... Exactly 1 day prior to now
38+
NOW/DAY+6MONTHS+3DAYS
39+
... 6 months and 3 days in the future from the start of
40+
the current day
41+
42+
Consult the DateField javadocs for more information.
43+
44+
Note: For faster range queries, consider the tdate type
45+
-->
46+
<fieldType name="date" class="solr.TrieDateField"
           precisionStep="0" positionIncrementGap="0" omitNorms="true"/>
47+
48+
<!-- A Trie based date field for faster date range queries and date faceting. -->
49+
<fieldType name="tdate" class="solr.TrieDateField"
           precisionStep="6" positionIncrementGap="0" omitNorms="true"/>
50+
51+
<!-- boolean type: "true" or "false" -->
52+
<fieldType name="boolean" class="solr.BoolField"
           omitNorms="true" sortMissingLast="true"/>
53+
54+
<!-- Binary data type. The data should be sent/retrieved as Base64-encoded strings. -->
55+
<!-- Tag spelled "fieldType" to match the other stock type declarations in this file. -->
<fieldType name="binary" class="solr.BinaryField"/>
56+
57+
<!-- Ignored -->
58+
<!-- Neither indexed nor stored, so values sent to a field of this type are dropped.
     Tag spelled "fieldType" to match the other stock type declarations in this file. -->
<fieldType name="ignored" class="solr.StrField"
           indexed="false" stored="false" multiValued="true"/>
59+
60+
<!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
61+
<!-- NOTE(review): subFieldSuffix points at "*_coordinate" sub-fields, but no such
     field or dynamicField is declared in the fields section of this schema; confirm
     one is added before any field uses this type. -->
<fieldType name="location"
           class="solr.LatLonType"
           subFieldSuffix="_coordinate"/>
62+
63+
<!--
64+
A Geohash is a compact representation of a latitude longitude pair in a single field.
65+
See http://wiki.apache.org/solr/SpatialSearch
66+
-->
67+
<!-- Tag spelled "fieldType" to match the other stock type declarations in this file. -->
<fieldType name="geohash" class="solr.GeoHashField"/>
68+
69+
<!-- String -->
70+
<fieldType name="string" class="solr.StrField"
           omitNorms="true" sortMissingLast="true"/>
71+
72+
73+
74+
<!--
75+
######################################
76+
########### Text Types #############
77+
######################################
78+
79+
-->
80+
81+
<!-- text - A standard text type, with icu tokenization and unicode normalization.
82+
- With the ICUFolding, we get:
83+
+ NFKC normalization (precomposing),
84+
+ Unicode case folding (i.e., lowercasing)
85+
+ search term folding (removing accents, etc).
86+
- Synonyms can be put in syn.txt (see sample synonyms.txt file in solr example)
87+
- Word delimiter splits on letter/number transitions (e.g., code4lib); case is already folded by the time it runs, so case-change splitting never fires.
88+
- The CJK stuff produces bigrams for those languages
89+
- Remove Duplicates does what it says on the tin.
90+
-->
91+
<fieldType name="text" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- ICU tokenization followed by ICU folding. -->
    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>

    <!-- Synonyms are read from syn.txt; matching ignores case and expand is off. -->
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>

    <!-- Emit word and number parts plus catenated words and catenated numbers,
         but not the all-parts catenation. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            catenateWords="1" catenateNumbers="1" catenateAll="0"/>

    <!-- CJK width normalization, then bigrams. -->
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>

    <!-- Final cleanup: trim each token and drop duplicates. -->
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
103+
104+
<!-- same as text, but with some stemming thrown in -->
105+
<fieldType name="text_stemmed" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- ICU tokenization followed by ICU folding. -->
    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>

    <!-- The only difference from the plain text type: KStem runs here, before the
         synonym filter, so synonym lookups see the stemmed token. -->
    <filter class="solr.KStemFilterFactory"/>

    <!-- Synonyms are read from syn.txt; matching ignores case and expand is off. -->
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>

    <!-- Emit word and number parts plus catenated words and catenated numbers,
         but not the all-parts catenation. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1" generateNumberParts="1"
            catenateWords="1" catenateNumbers="1" catenateAll="0"/>

    <!-- CJK width normalization, then bigrams. -->
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>

    <!-- Final cleanup: trim each token and drop duplicates. -->
    <filter class="solr.TrimFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
118+
119+
<!-- text_(l|r|lr): text that is anchored on one or both ends.
120+
These are useful for phrase searches only; for non-phrase searches we're
121+
basically just adding one or two useless tokens to the mix.
122+
123+
It's mostly the same as text, but with the addition of one or
124+
two anchors. We don't stem these.
125+
126+
text_lr is essentially an "exact match" where "exact" means
127+
"...except for runs of spaces, case, diacritics, and most punctuation".
128+
I find it useful for boosting the bejeebus out of exact title matches.
129+
-->
130+
131+
<fieldType name="text_l" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- Char filters act on the raw field value before tokenization, so this one is
         declared first: it prepends the literal left anchor "AAAA " to the input. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory"
                pattern="^(.*)$" replacement="AAAA $1"/>

    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>

    <!-- Collapse any run of Unicode separator characters inside a token to one space.
         NOTE(review): tokens coming out of the ICU tokenizer presumably never contain
         separators, which would make this a no-op; confirm before removing. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="\p{Z}+" replacement=" "/>

    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
147+
148+
<fieldType name="text_r" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- Char filters act on the raw field value before tokenization, so this one is
         declared first: it appends the literal right anchor " ZZZZ" to the input. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory"
                pattern="^(.*)$" replacement="$1 ZZZZ"/>

    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>

    <!-- Collapse any run of Unicode separator characters inside a token to one space.
         NOTE(review): tokens coming out of the ICU tokenizer presumably never contain
         separators, which would make this a no-op; confirm before removing. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="\p{Z}+" replacement=" "/>

    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
164+
165+
<!-- text_lr (fully anchored) anchors on both ends and is basically a more forgiving
166+
"exact match"
167+
-->
168+
169+
<fieldType name="text_lr" class="solr.TextField" positionIncrementGap="1000">
  <analyzer>
    <!-- Char filters act on the raw field value before tokenization, so this one is
         declared first: it wraps the input in both literal anchors,
         "AAAA " on the left and " ZZZZ" on the right. -->
    <charFilter class="solr.PatternReplaceCharFilterFactory"
                pattern="^(.*)$" replacement="AAAA $1 ZZZZ"/>

    <tokenizer class="solr.ICUTokenizerFactory"/>
    <filter class="solr.ICUFoldingFilterFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.CJKWidthFilterFactory"/>
    <filter class="solr.CJKBigramFilterFactory"/>
    <filter class="solr.TrimFilterFactory"/>

    <!-- Collapse any run of Unicode separator characters inside a token to one space.
         NOTE(review): tokens coming out of the ICU tokenizer presumably never contain
         separators, which would make this a no-op; confirm before removing. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="\p{Z}+" replacement=" "/>

    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>
185+
186+
187+
<!--
188+
sane_string: a string for exact matches, but trim, fold multiple spaces,
189+
and ditch some closing punctuation. Designed for facets where the
190+
values might not be as controlled as you'd like (e.g., LCSH)
191+
192+
Note that you might want to facet on sane_string, but allow searches against
193+
text_lr, since the latter does lowercasing, synonyms, and
194+
unicode folding.
195+
-->
196+
197+
<fieldType name="sane_string" class="solr.TextField" positionIncrementGap="1000" omitNorms="true">
  <analyzer>
    <!-- The whole value stays a single token; case and diacritics are left alone. -->
    <tokenizer class="solr.KeywordTokenizerFactory"/>

    <!-- Strip a trailing run of spaces, commas, periods, '!', '?' or '/'. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="(.*?)[ ,.!?/]+$" replacement="$1"/>

    <!-- Trim whatever whitespace is left at either end. -->
    <filter class="solr.TrimFilterFactory"/>

    <!-- Fold every run of Unicode separator characters into a single space. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="\p{Z}+" replacement=" "/>
  </analyzer>
</fieldType>
209+
210+
<!--
211+
###################################################
212+
########## Useful library types ################
213+
##################################################
214+
215+
-->
216+
217+
<!-- numericID:
218+
- take the first string of digits/dashes/dots and an optional X or x
219+
that is at least six characters long (OCLC, ISBN, ISSN, etc.)
220+
and throw away everything that's left
221+
- lowercase it (i.e., turn any trailing X into an x)
222+
- ditch everything that's not a number or an 'x'
223+
-
224+
- ditch any leading zeros
225+
-->
226+
227+
<fieldType name="numericID" class="solr.TextField" positionIncrementGap="1000" omitNorms="true">
  <analyzer>
    <!-- The whole field value is one token; every step below rewrites that token. -->
    <tokenizer class="solr.KeywordTokenizerFactory"/>

    <!-- Step 1: find the first run that starts with a digit, continues with at least
         four digits, dashes or dots, ends with a digit, and may be followed by a
         single X or x (six characters minimum). Throw away everything else and
         stick a '***' on the front as a marker. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^.*?(\p{N}[\p{N}\-\.]{4,}\p{N}[xX]?).*$" replacement="***$1"/>

    <!-- Step 2: anything that does *not* start with '*' gets replaced by the empty
         string, on the basis that a value containing a usable number would have been
         given the '***' prefix by step 1. The original author picked '*' because it
         is a wildcard character and a search that really starts with it will error. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^[^\*].*$" replacement=""/>

    <!-- Step 3: remove the '***' marker again. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^\*\*\*" replacement=""/>

    <!-- Step 4: lowercase (turns a trailing X into x), then delete every character
         that is neither a number nor an 'x'. -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="[^\p{N}x]" replacement="" replace="all"/>

    <!-- Step 5: keep only tokens between 8 and 100 characters long. This also drops
         the tokens that step 2 reduced to the empty string.
         NOTE(review): the earlier wording of this comment described a minimum of
         five digits plus an optional 'x', or six digits, which matches the
         six-character minimum of the step 1 regex but not min="8". Confirm which
         threshold is intended. -->
    <filter class="solr.LengthFilterFactory" min="8" max="100"/>

    <!-- Step 6: finally, strip leading zeros. This runs after the length check, so
         the zeros still count toward the minimum length. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^0*" replacement=""/>
  </analyzer>
</fieldType>
272+
273+
<!-- csn (comma-separated numbers) is a list of numbers, separated by commas. Do basically
274+
the same transform as numericID, but in this case allow
275+
commas to separate values.
276+
277+
This is more restrictive than numericID, obviously, in that we can't
278+
allow there to be commas in the input
279+
-->
280+
<fieldType name="csn" class="solr.TextField" positionIncrementGap="100" omitNorms="true">
  <analyzer>
    <!-- Split the value on commas, swallowing whitespace around each comma. Every
         resulting token then goes through the same steps as the numericID type. -->
    <tokenizer class="solr.PatternTokenizerFactory" pattern="\s*,\s*"/>

    <!-- Step 1: keep the first run that starts with a digit, continues with at least
         four digits, dashes or dots, ends with a digit, and may be followed by a
         single X or x; prefix it with '***' as a marker. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^.*?(\p{N}[\p{N}\-\.]{4,}\p{N}[xX]?).*$" replacement="***$1"/>

    <!-- Step 2: blank out any token that does not start with '*'. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^[^\*].*$" replacement=""/>

    <!-- Step 3: remove the marker. Anchored to the start, where step 1 puts it, to
         match the numericID type; any other asterisks are removed by the
         digits-and-x filter in step 4, so the final token is unchanged. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^\*\*\*" replacement=""/>

    <!-- Step 4: lowercase, then delete every character that is neither a number
         nor an 'x'. -->
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="[^\p{N}x]" replacement="" replace="all"/>

    <!-- Step 5: keep only tokens between 8 and 100 characters long; this also drops
         tokens blanked out in step 2. -->
    <filter class="solr.LengthFilterFactory" min="8" max="100"/>

    <!-- Step 6: strip leading zeros. -->
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="^0*" replacement=""/>
  </analyzer>
</fieldType>
303+
304+
<!-- callnoprefix: use edgengram to index every left-anchored substring
305+
of the call number, throwing away spaces and dots
306+
-->
307+
308+
<fieldType name="callnoprefix" class="solr.TextField" omitNorms="true">
  <!-- Index side: lowercase the whole call number, delete separators and dots, then
       index every left-anchored prefix from 1 to 15 characters. -->
  <analyzer type="index">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="[\p{Z}\.]" replacement="" replace="all"/>
    <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="15" side="front"/>
  </analyzer>

  <!-- Query side: same normalization but no n-grams. With a single shared analyzer
       the query itself was expanded into all of its prefixes, so a query could match
       on as little as its first character. Keeping the query whole makes it match
       only values that start with it. A normalized query longer than 15 characters
       cannot match, because no longer gram is indexed. -->
  <analyzer type="query">
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="[\p{Z}\.]" replacement="" replace="all"/>
  </analyzer>
</fieldType>
318+
319+
320+
321+
322+
</types>
323+
324+
<fields>
  <!-- Identifier and single-valued cleaned-up string -->
  <field name="id"        type="string"      stored="true" indexed="true"/>
  <field name="sane"      type="sane_string" stored="true" indexed="true"/>

  <!-- Plain text and its anchored variants ("tf" uses the fully anchored text_lr type) -->
  <field name="text"      type="text"        stored="true" indexed="true" multiValued="true"/>
  <field name="tf"        type="text_lr"     stored="true" indexed="true" multiValued="true"/>
  <field name="tl"        type="text_l"      stored="true" indexed="true" multiValued="true"/>
  <field name="tr"        type="text_r"      stored="true" indexed="true" multiValued="true"/>

  <!-- Normalized identifiers -->
  <field name="numeric"   type="numericID"   stored="true" indexed="true" multiValued="true"/>
  <field name="csn"       type="csn"         stored="true" indexed="true" multiValued="true"/>

  <!-- Names -->
  <field name="name"      type="text"        stored="true" indexed="true" multiValued="true"/>
  <field name="othername" type="text"        stored="true" indexed="true" multiValued="true"/>
</fields>
337+
338+
339+
<!-- The "id" field is the unique key for documents. -->
<uniqueKey>id</uniqueKey>

<!-- Query defaults: search the "text" field and combine terms with OR. -->
<defaultSearchField>text</defaultSearchField>
<solrQueryParser defaultOperator="OR" />
342+
343+
</schema>
344+
345+
346+
347+
348+
349+
350+
351+
352+
353+
354+
355+
356+
357+
358+
359+
360+
361+
362+
363+

0 commit comments

Comments
 (0)