# Lab 2 - FTS

In [1]:
from elasticsearch import Elasticsearch
import functools
import os

ElasticSearch communicates through REST API. In the development process of this notebook the ES instance's security features such as SSL were disabled to allow unrestricted access. This was necessary to not use SSL keys/certs or to not point to the their location.

In [2]:
host = "http://localhost:9200"
es = Elasticsearch(host)
es.info().body

{'name': 'MacBook-Pro-5.lan',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'PooxqAlHSxyYoL2hC3MGDw',
 'version': {'number': '8.4.3',
  'build_flavor': 'default',
  'build_type': 'tar',
  'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
  'build_date': '2022-10-04T07:17:24.662462378Z',
  'build_snapshot': False,
  'lucene_version': '9.3.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [3]:
polish_analyzer = "polish_law_analyzer"


In [4]:
# Checks if indices already exist, if so deletes them.
if es.indices.exists(index="polish_law_index"):
    es.indices.delete(index="polish_law_index")

response = es.indices.create(
    index="polish_law_index",
    settings={
        'analysis': {
            'analyzer': {
                f'{polish_analyzer}': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        "lowercase",  # lowercase filter
                        "synonym",  # introduce synonyms, defined below
                        "morfologik_stem"  # Morfologic as the lemmatizer
                    ]
                }
            },
            'filter': {
                'synonym': {
                    'type': "synonym",
                    'expand': True,  # TODO
                    'synonyms': [
                        "kpk => kodeks postępowania karnego",
                        "kpc => kodeks postępowania cywilnego",
                        "kk => kodeks karny",
                        "kc => kodeks cywilny"
                    ]
                }
            }
        }
    },
    mappings={
        'properties': {
            'text': {
                'type': "text",
                'analyzer': f'{polish_analyzer}',
            }
        }
    }
)
response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'polish_law_index'})

Reading documents and uploading.

In [5]:
files_folder = "ustawy"

def read_documents() -> dict[str, str]:
    file_names = os.listdir(files_folder)
    return {
        name: _read_document(name, files_folder)
        for name in file_names
        if name.endswith(".txt")
    }

def _read_document(name: str, path: str) -> str:
    with open(os.path.join(path, name), 'r') as f:
        return f.read()

documents = list(read_documents().items())


In [6]:
def map_document_to_action(doc_name: str) -> dict:
    return {
        'index': {
            '_id': doc_name,
        }
    }

def map_document_to_source(doc_name: str, doc_text: str) -> dict:
    return {
        'name': doc_name,
        'text': doc_text
    }

def bulk_documents(documents: list[tuple[str, str]]) -> list[dict]:
    return functools.reduce(
        lambda acc, x: acc + [map_document_to_action(x[0])] + [map_document_to_source(*x)],
        documents,
        []
    )

Task 5: Load the data to the ES instance.

In [7]:
n = 300
documents_chunked = [documents[i:i + n] for i in range(0, len(documents), n)]
n_chunks = len(documents_chunked)
for i, docs in enumerate(documents_chunked, 1):
    print(f"Uploading chunk {i}/{n_chunks}")
    res = es.bulk(index="doc", operations=bulk_documents(docs))
    print(f"Took: {res['took']}, errors: {res['errors']}")

Uploading chunk 1/4
Took: 1112, errors: False
Uploading chunk 2/4
Took: 604, errors: False
Uploading chunk 3/4
Took: 997, errors: False
Uploading chunk 4/4
Took: 622, errors: False


Task 6: Determine the number of legislative acts containing the word ustawa (in any form).

In [8]:
response = es.count(
    index="doc", 
    query={'match': {'text': {'query': "ustawa"}}}
    )
count_ustawa = response['count']
print("Count of the word 'ustawa':", count_ustawa)

Count of the word 'ustawa': 1178


Task 7: Determine the number of occurrences of the word ustawa by searching for this particular form, including the other inflectional forms.

In [9]:
response = es.termvectors(
    index="doc",
    id="1993_602.txt",  # first file in the folder
    fields=["text"],
    filter_path=["term_vectors.text.terms.ustawa.ttf"],
    term_statistics=True
)
print(response)
ttf_ustawa = response['term_vectors']['text']['terms']['ustawa']['ttf']
print('Number of occurrences of the word "ustawa":', ttf_ustawa)

{'term_vectors': {'text': {'terms': {'ustawa': {'ttf': 3235}}}}}
Number of occurrences of the word "ustawa": 3235


Task 8: Determine the number of occurrences of the word ustaw by searching for this particular form, including the other inflectional forms.

In [10]:
response = es.indices.analyze(
    index="doc",
    analyzer=ES_ANALYZER,
    # analyzer='polish_law_analyzer',
    text="ustaw" 
)
print(response)
words_ustaw = [t['token'] for t in response['tokens']]
words_ustaw

NameError: name 'ES_ANALYZER' is not defined

In [11]:
# The problem with selecting "polish_law_analyzer", finds the word "ustaw" in indices without specified analyzer

response = es.indices.analyze(
    index="doc",
    # analyzer=polish_analyzer, 
    text="ustaw" 
)
print(response)
words_ustaw = [t['token'] for t in response['tokens']]
words_ustaw

{'tokens': [{'token': 'ustaw', 'start_offset': 0, 'end_offset': 5, 'type': '<ALPHANUM>', 'position': 0}]}


['ustaw']

Task 9: Determine the number of legislative acts containing the words 'kodeks postępowania cywilnego' in the specified order, but in any inflection form.

In [12]:
response = es.count(
    index="doc", 
    query={'match_phrase': {'text': "kodeks postępowania cywilnego"}}
    )
count_kpc = response['count']
print("Count of 'kodeks postepowanie cywilnego':", count_kpc)

Count of 'kodeks postepowanie cywilnego': 44


Task 10: Determine the number of legislative acts containing the words 'wchodzi w życie' (in any form) allowing for up to 2 additional words in the searched phrase.

In [13]:
response = es.count(
    index="doc", 
    query={'match_phrase': {'text': {'query': "wchodzi w życie", 'slop': 2}}}
)
print(response)
count_wwz = response['count']
print("Number of documents containing the phrase 'wchodzi w życie':", count_wwz)

{'count': 1174, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
Number of documents containing the phrase 'wchodzi w życie': 1174


Task 11: Determine the 10 documents that are the most relevant for the phrase konstytucja.

In [14]:
word = "konstytucja"

res = es.search(
    index="doc",
    query={'match': {'text': word}},
    filter_path=["hits.hits._id", "hits.hits._score"],
    size=10
)
print(res)
res["hits"]["hits"]



{'hits': {'hits': [{'_id': '1997_629.txt', '_score': 9.633655}, {'_id': '1999_688.txt', '_score': 7.6440825}]}}


[{'_id': '1997_629.txt', '_score': 9.633655},
 {'_id': '1999_688.txt', '_score': 7.6440825}]

Task 12: Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task.

In [15]:
response = es.search(
    index="doc",
    query={'match': {'text': "konstytucja"}},
    highlight={'fields': {'text': {'number_of_fragments': 3}}},
    filter_path=["hits.hits._id", "hits.hits.highlight"],
    size=10
)
print(response)
response['hits']['hits']

{'hits': {'hits': [{'_id': '1997_629.txt', 'highlight': {'text': ['Zasady, na których opierać się ma <em>Konstytucja</em> mogą\n                być poddane pod referendum.']}}, {'_id': '1999_688.txt', 'highlight': {'text': ['Projekt ustawy nie może dotyczyć spraw, dla których <em>Konstytucja</em>\nRzeczypospolitej Polskiej zastrzega wyłączną']}}]}}


[{'_id': '1997_629.txt',
  'highlight': {'text': ['Zasady, na których opierać się ma <em>Konstytucja</em> mogą\n                być poddane pod referendum.']}},
 {'_id': '1999_688.txt',
  'highlight': {'text': ['Projekt ustawy nie może dotyczyć spraw, dla których <em>Konstytucja</em>\nRzeczypospolitej Polskiej zastrzega wyłączną']}}]