# Example of indexing and searching PubMed abstracts with PyLucene

In [1]:
import lucene

In [2]:
# Start the JVM through JCC; the Java-backed Lucene imports in the cells
# below are made only after this call. The returned JCCEnv is displayed as
# the cell output.
lucene.initVM()

<jcc.JCCEnv at 0x7f3a0097b048>

In [3]:
from org.apache.lucene.analysis.standard import StandardTokenizer, StandardAnalyzer
from org.apache.lucene.analysis import StopFilter, TokenFilter
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.tartarus.snowball.ext import EnglishStemmer
from java.io import StringReader
from org.apache.lucene.util import Version

### Tokenization, stop-word removal and stemming

In [4]:
# Split a sample phrase into tokens with Lucene's StandardTokenizer and
# print one token per line.
stok = StandardTokenizer()
sread = StringReader("cancers and their symptoms")
stok.setReader(sread)
# The term attribute object is shared by all tokens of the stream, so it is
# looked up once; printing it shows the text of the current token.
term_attr = stok.getAttribute(CharTermAttribute.class_)
try:
    stok.reset()
    while stok.incrementToken():
        print(term_attr)
    # Finish the stream as the TokenStream consumer workflow requires.
    stok.end()
finally:
    # Always release the tokenizer (and its reader), even if iteration fails.
    stok.close()

cancers
and
their
symptoms


In [5]:
# Tokenize the sample phrase and drop English stop words ("and", "their").
stok = StandardTokenizer()
sread = StringReader("cancers and their symptoms")
stok.setReader(sread)
sfil = StopFilter(stok, StandardAnalyzer.ENGLISH_STOP_WORDS_SET)
# The filter shares its attributes with the wrapped tokenizer; fetch the
# term attribute once and print it for every token that survives the filter.
term_attr = sfil.getAttribute(CharTermAttribute.class_)
try:
    # reset() is called on the outermost stream (the one being consumed);
    # it propagates to the wrapped tokenizer, which must not be reset twice.
    sfil.reset()
    while sfil.incrementToken():
        print(term_attr)
    sfil.end()
finally:
    # Closing the filter also closes the underlying tokenizer.
    sfil.close()

cancers
symptoms


In [6]:
# Tokenize the sample phrase, drop English stop words, then reduce every
# remaining token to its stem with the Snowball English stemmer.
es = EnglishStemmer()
stok = StandardTokenizer()
sread = StringReader("cancers and their symptoms")
stok.setReader(sread)
sfil = StopFilter(stok, StandardAnalyzer.ENGLISH_STOP_WORDS_SET)
# The term attribute object is shared by all tokens, so look it up once.
term_attr = sfil.getAttribute(CharTermAttribute.class_)
try:
    # reset() is called on the outermost stream (the one being consumed);
    # it propagates to the wrapped tokenizer, which must not be reset twice.
    sfil.reset()
    while sfil.incrementToken():
        # The stemmer works on plain strings: load the token, stem in place,
        # then read the result back.
        es.setCurrent(str(term_attr))
        es.stem()
        print(es.getCurrent())
    sfil.end()
finally:
    # Closing the filter also closes the underlying tokenizer.
    sfil.close()

cancer
symptom


In [7]:
from org.apache.lucene.document import Document, TextField, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory, RAMDirectory, FSDirectory
from java.io import File
from java.nio.file import Paths

### Indexing PubMed abstracts

In [8]:
# Index the database: prepare the on-disk index location and writer settings.

# The index lives in the 'index' directory, relative to the notebook.
# (The original notebook kept an in-memory alternative commented out:
#  ind_dir = RAMDirectory())
path = Paths.get('index')
ind_dir = SimpleFSDirectory(path)

# Writer configuration using the standard analyzer.
conf = IndexWriterConfig(StandardAnalyzer())

# Empty placeholder document; the indexing cell builds its own Document
# objects and rebinds this name.
doc = Document()

In [9]:
# Parse the XML: load one MEDLINE baseline file and keep its root element.
import xml.etree.ElementTree as ET

medline_xml = '../pubmed-xml/baseline/medline17n0001.xml'
tree = ET.parse(medline_xml)
root = tree.getroot()

In [10]:
# Sanity check: show the tag of the parsed document's root element.
print(root.tag)

PubmedArticleSet


In [11]:
def _element_text(element):
    """Return all text inside `element`, including text nested in child tags.

    Returns an empty string when `element` is None or contains no text, so
    callers never pass None on to Lucene.
    """
    if element is None:
        return ''
    return ''.join(element.itertext())


# Index the title and abstract of every article that has an abstract.
#
# An IndexWriterConfig cannot be shared between IndexWriter instances, so a
# fresh one is built here; this lets the cell be re-run on its own.
# NOTE(review): the writer opens the index in its default mode, so re-running
# this cell against an existing 'index' directory presumably adds the same
# documents again - confirm, and consider IndexWriterConfig.OpenMode.CREATE.
conf = IndexWriterConfig(StandardAnalyzer())
ind_wr = IndexWriter(ind_dir, conf)
try:
    for pmed_article in root.findall('PubmedArticle'):
        # The path lookup yields None if either level is missing, instead of
        # raising AttributeError on a chained .find().
        article = pmed_article.find('MedlineCitation/Article')
        if article is None:
            continue
        abstract = article.find('Abstract')
        if abstract is None:
            continue
        # Only the first AbstractText element is indexed, as before.
        abstract_text = _element_text(abstract.find('AbstractText'))
        if not abstract_text:
            # Nothing to search in; a missing text cannot be stored as a field.
            continue
        title_text = _element_text(article.find('ArticleTitle'))

        doc = Document()
        doc.add(TextField('title', title_text, Field.Store.YES))
        doc.add(TextField('abstract', abstract_text, Field.Store.YES))
        ind_wr.addDocument(doc)
finally:
    # Always close the writer so the index write lock is released, even when
    # an article fails to index.
    ind_wr.close()

### Searching the index

In [12]:
from org.apache.lucene.index import IndexReader, LeafReader, DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser

In [13]:
# Open the index written above and look up the term 'protein' in the
# 'abstract' field, keeping at most the 10 best-scoring hits.
ind_reader = DirectoryReader.open(ind_dir)
ind_searcher = IndexSearcher(ind_reader)

query_parser = QueryParser('abstract', StandardAnalyzer())
query = query_parser.parse('protein')

hits = ind_searcher.search(query, 10)
print("{}{}".format(hits.totalHits, " documents found."))

1644 documents found.


#### Results: titles of the top 10 hits

In [16]:
# Print the stored title of every returned hit, each followed by blank lines.
for score_doc in hits.scoreDocs:
    hit_doc = ind_searcher.doc(score_doc.doc)
    title_field = hit_doc.getField('title')
    # Same output as printing the title and then "\n\n" in two calls.
    print(title_field.stringValue(), "\n\n", sep="\n")

The binding of calcium to a salivary phosphoprotein, protein C, and comparison with calcium binding to protein A, a related salivary phosphoprotein.



Neurotrophic activity of brain extracts in forelimb regeneration of the urodele, Triturus.



[Demonstration of protein kinase activities in the coronary artery of cattle].



Solubility and heat stability of whey protein concentrates.



Structure and assembly of the capsid of bacteriophage P22.



[Investigation of rat brain prealbumins].



Use of immobilized light-harvesting chlorophyll a/b protein to study the stoichiometry of its self-association.



Effect of proteolytic enzymes on the binding of cobalamin to R protein and intrinsic factor. In vitro evidence that a failure to partially degrade R protein is responsible for cobalamin malabsorption in pancreatic insufficiency.



[Rheology and spinning of alkaline solution of field bean protein and casein].



Some properties of erythrocuprein treated by organic solvents.



