Getting ready for SST #2

billdueber · billdueber · commit e2a69e37b2e3 · 2012-03-05T17:01:22.000-05:00
diff --git a/exampledocs/names.xml b/exampledocs/names.xml
@@ -14,23 +14,20 @@
  See the License for the specific language governing permissions and
  limitations under the License.
 -->
+<update>
+  <delete><query>*:*</query></delete> <!-- delete-by-query using the match-all query *:*, i.e. clear out whatever is already indexed -->
+  <commit/> <!-- commit the delete before the sample documents below are added -->
+  <add> <!-- two sample documents; each has an id plus two values for the multi-valued name_text field -->
+    <doc>
+      <field name="id">1</field>
+      <field name="name_text">William John James Dueber</field>
+      <field name="name_text">Rufus Xavier Sarsparella</field>  <!-- NOTE(review): surname may be a typo for "Sarsaparilla"; confirm the intended spelling before relying on exact matches -->
+    </doc>
+    <doc>
+      <field name="id">2</field>
+      <field name="name_text">Mike Dueber</field>
+      <field name="name_text">William Penn</field>  
+    </doc>
+  </add>
 
-<add>
-<doc>
-  <field name="id">SP2514N</field>
-  <field name="text">Just a test of the anchor</field>
-  <field name="tf">Just a test of the anchor</field>
-  <field name="tl">Just a test of the anchor</field>
-  <field name="tr">Just a test of the anchor</field>
-  <field name="numeric">ISBN13: 12345-2234X (behind stacks from 1990)</field>
-  <field name="numeric">ISBN134455</field>
-  <field name="csn">1234-5678, 11223344, 123456-89-9X, 111</field>
-  <field name="name">William James John Dueber</field>
-  <field name="name">Jesus H. Tapdancing Christ on a pogo stick with his sister nancy</field>
-  <field name="othername">Rufus Xavier Sarsaparilla</field>
-  <field name="othername">Jumping Jack Flash</field>
-  
-</doc>
-
-</add>
-
+</update>
diff --git a/solr/conf/schema.xml b/solr/conf/schema.xml
@@ -95,11 +95,37 @@
     </analyzer>
     </fieldtype>       
     
+    <!-- text - A standard text type, with ICU tokenization and Unicode normalization.
+           - Word delimiter runs first so it still sees the original case: it splits on CaseChange and numbers (e.g., code4lib).
+           - With the ICUFolding (placed after the word delimiter because it lowercases), we get:
+             + NFKC normalization (precomposing),
+             + Unicode case folding (i.e., lowercasing)
+             + search term folding (removing accents, etc).
+           - Synonyms can be put in syn.txt (see sample synonyms.txt file in solr example)
+           - The CJK stuff produces bigrams for those languages
+           - Remove Duplicates does what it says on the tin.
+    -->
+    <fieldtype name="text" class="solr.TextField" positionIncrementGap="1000">
+      <analyzer>
+        <tokenizer class="solr.ICUTokenizerFactory"/>
+          <filter class="solr.WordDelimiterFilterFactory" splitOnCaseChange="1" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
+          <filter class="solr.ICUFoldingFilterFactory"/>
+          <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="false"/>
+          <filter class="solr.CJKWidthFilterFactory"/>
+          <filter class="solr.CJKBigramFilterFactory"/>
+          <filter class="solr.TrimFilterFactory"/>
+          <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+      </analyzer>
+    </fieldtype>
+    
+    
+    
   </types>
   
   <fields>
     <field name="id" type="string" indexed="true" stored="true" />
-    <field name="*_numeric" type="numericID" indexed="true" stored="true" multiValued="true"/>    
+    <dynamicField name="*_numeric" type="numericID" indexed="true" stored="true" multiValued="true"/>   <!-- any field whose name ends in _numeric is handled by the numericID type; indexed, stored, multi-valued -->
+    <dynamicField name="*_text" type="text" indexed="true" stored="true" multiValued="true"/>   <!-- any field whose name ends in _text (e.g. name_text) is handled by the "text" type; indexed, stored, multi-valued -->
   </fields>