In [1]:
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.document import Document, Field, StringField, TextField, IntPoint
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.queryparser.classic import QueryParser
import simplemma
import csv

In [2]:
# Path configuration for the links data set.

# CSV input that is read when the links are indexed.
csv_data_path = "/data/processing/data/output/links.csv"
# Filesystem directory holding the Lucene index built from the links CSV.
index_path = "/data/indexes/links"

In [3]:
# Start the JVM that PyLucene runs on (headless, so no AWT display is needed).
# The JVM can only be configured once per process; re-running this cell
# therefore reuses the already running VM instead of calling initVM() with
# vmargs a second time.
vm_env = lucene.getVMEnv()
vm_env if vm_env is not None else lucene.initVM(vmargs=['-Djava.awt.headless=true'])

<jcc.JCCEnv at 0x7fe8a8127d30>

In [4]:
# Build the IndexWriter used to populate the links index.

analyzer = StandardAnalyzer()
config = IndexWriterConfig(analyzer)
# OpenMode.CREATE: start a new index instead of appending to an existing one.
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

index_dir = Paths.get(index_path)
store = SimpleFSDirectory(index_dir)
writer = IndexWriter(store, config)

In [5]:
# Index the links CSV: one Lucene document per (lemma, form, postfix) row.
# On success the batch is committed and the writer closed; on any failure the
# uncommitted documents are discarded and the writer is still shut down, so the
# index write lock is not left behind for the next run.

try:
    # The encoding is given explicitly instead of relying on the platform
    # default (assumes the CSV is UTF-8 -- TODO confirm with the producer).
    # newline="" lets the csv module do its own newline handling.
    with open(csv_data_path, encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, fieldnames=['lemma', 'form', 'postfix'])
        for row in reader:
            doc = Document()

            # Each column is stored lower-cased under its own field name.
            for name in ('lemma', 'form', 'postfix'):
                doc.add(Field(name, row[name].lower(), StringField.TYPE_STORED))
            # The whole row, in its original casing, is kept as analyzed text.
            doc.add(Field('content', str(row), TextField.TYPE_STORED))

            writer.addDocument(doc)

    writer.commit()
except BaseException:
    # rollback() drops everything since the last commit and closes the writer.
    writer.rollback()
    raise
else:
    writer.close()

In [6]:
# Path configuration for the sentences data set.

# CSV input that is read when the sentences are indexed.
sen_data_path = "/data/processing/data/output/sentences.csv"
# Filesystem directory holding the Lucene index built from the sentences CSV.
sen_index_path = "/data/indexes/sentences"

In [7]:
# Build the IndexWriter used to populate the sentences index.

analyzer = StandardAnalyzer()
config = IndexWriterConfig(analyzer)
# OpenMode.CREATE: start a new index instead of appending to an existing one.
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

sen_index_dir = Paths.get(sen_index_path)
store = SimpleFSDirectory(sen_index_dir)
writer = IndexWriter(store, config)

In [9]:
# Index the sentences CSV: one Lucene document per row, with the sentence
# stored as analyzed text in the "content" field.
# On success the batch is committed and the writer closed; on any failure the
# uncommitted documents are discarded and the writer is still shut down, so the
# index write lock is not left behind for the next run.

try:
    # The encoding is given explicitly instead of relying on the platform
    # default (assumes the CSV is UTF-8 -- TODO confirm with the producer).
    # newline="" lets the csv module do its own newline handling.
    with open(sen_data_path, encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, fieldnames=['sentence'])
        for row in reader:
            doc = Document()
            doc.add(Field('content', row['sentence'], TextField.TYPE_STORED))
            writer.addDocument(doc)

    writer.commit()
except BaseException:
    # rollback() drops everything since the last commit and closes the writer.
    writer.rollback()
    raise
else:
    writer.close()

In [15]:
# Open the sentences index for reading and create the analyzer used to parse
# queries against it.
sentences_index_path = "/data/indexes/sentences"
sentences_directory = SimpleFSDirectory(Paths.get(sentences_index_path))
sentences_reader = DirectoryReader.open(sentences_directory)
sentences_searcher = IndexSearcher(sentences_reader)
analyzer = StandardAnalyzer()

# Smoke test: look up one word in the "content" field, asking for at most
# two hits, and print the stored sentence of each hit.
content_parser = QueryParser("content", analyzer)
query = content_parser.parse("teneda")
top_hits = sentences_searcher.search(query, 2)
score_docs = top_hits.scoreDocs

for hit in score_docs:
    stored = sentences_searcher.doc(hit.doc)
    print("Content:", stored.get("content"))

Content: Po čase sa stal kráľom tohto ostrova, ktorý dostal po ňom meno Teneda.
Content: Tennes alebo Tennés alebo Tenés je v gréckej mytológii syn boha Apolóna a dcéry trójskeho kráľa Laomedonta Prokleie, kráľ ostrova Teneda.


In [44]:
from org.apache.lucene.search import BooleanQuery, BooleanClause

# Open the links index for reading; `links_searcher` and `analyzer` are the
# module-level objects that search() below relies on.
links_index_path = "/data/indexes/links"
links_directory = SimpleFSDirectory(Paths.get(links_index_path))
links_reader = DirectoryReader.open(links_directory)
links_searcher = IndexSearcher(links_reader)
analyzer = StandardAnalyzer()

def search(word, distance=0, num=5):
    """Look up ``word`` in the links index and print the matching entries.

    The word is queried against both the ``form`` and the ``lemma`` field;
    the two clauses are optional (SHOULD), so a document matching either
    field is a hit. Results are printed, nothing is returned.

    Uses the module-level ``links_searcher`` and ``analyzer``.

    Args:
        word: Text to look up. Query-parser syntax characters in it are
            escaped, so it is always treated as literal text.
        distance: Value appended to the term as the fuzzy operand
            (``word~distance``).
        num: Maximum number of hits to retrieve and print.
    """
    # Escape query syntax (e.g. ':', '*', '(') so that an arbitrary word can
    # neither break parsing nor be interpreted as a query operator.
    fuzzy_term = QueryParser.escape(word) + "~" + str(distance)

    query = BooleanQuery.Builder()
    for field in ("form", "lemma"):
        clause = QueryParser(field, analyzer).parse(fuzzy_term)
        query.add(clause, BooleanClause.Occur.SHOULD)
    built_query = query.build()

    score_docs = links_searcher.search(built_query, num).scoreDocs
    print("=======================================================")
    print("Word:", word)
    print("Distance:", distance)

    for i, score_doc in enumerate(score_docs):
        print("-------------------------------")
        print("Result", i)
        # Fetch the stored fields of the hit by its internal document id.
        doc = links_searcher.doc(score_doc.doc)
        print('lemma:', doc.get("lemma"))
        print('postfix:', doc.get("postfix"))
        print('content:', doc.get("content"))

#     if len(score_docs) == 0:
#         print("Result")
#         print("simplemma:", simplemma.lemmatize(word, lang_data))

#     examples = search_examples(word)
#     if len(examples) != 0:
#         print("examples:")
#     for example in examples:
#         print(example)


In [45]:
# Example lookup using search()'s defaults (distance=0, at most 5 results).
search("teneda")

Word: teneda
Distance: 0
-------------------------------
Result 0
lemma: tenedos
postfix: 
content: {'lemma': 'Tenedos', 'form': 'Teneda', 'postfix': ''}
