In [150]:
import requests as rq
from utils import read_bills_without_normalize
import os
import regex
from elasticsearch import Elasticsearch
from pathlib import Path

# Check if ES works

In [151]:
# Base URL of the Elasticsearch node; it is reused both by the official client
# below and by the raw `requests` calls in later cells.
es_url = "http://localhost:9200"
# Official Python client bound to the same node.
es = Elasticsearch(es_url)

In [152]:
# Connectivity check: ask the node for its basic info and display the response.
es.info()

ObjectApiResponse({'name': '35f5cbf1a125', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'ZW8ilXXfTOS_RXihBz65Tg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

# Constants

In [153]:
# Name under which the custom analyzer is registered in the index settings.
es_analyzer = "polish_es_analyzer"
# Name of the index that stores the bills.
es_index = "bills"

# Create analyzer

In [154]:
# Start from a clean slate: drop the index left over from a previous run
# and display the JSON body that Elasticsearch sends back.
resp = rq.delete(f"{es_url}/{es_index}")
resp.json()

{'acknowledged': True}

In [155]:
# NOTE(review): the index was deleted in the previous cell and is only re-created
# in the next one, so this refresh targets a missing index (the recorded output
# is a 404). A refresh is presumably meant to run after documents are indexed — confirm.
rq.post(url = f"{es_url}/{es_index}/_refresh")

<Response [404]>

In [156]:
# Create the index together with its analysis chain and field mapping in one request.
# Per the analyzer docs, a custom analyzer is declared in the settings of the index
# that uses it:
#   https://www.elastic.co/guide/en/elasticsearch/reference/current/analyzer.html
# Synonym token filter reference:
#   https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html

# Expands the legal-code abbreviations into their full names.
synonym_filter = {
    "type": "synonym",
    # NOTE(review): the docs describe `expand` for comma-separated equivalent
    # synonyms; it may have no effect on the explicit `=>` rules below — confirm.
    "expand": True,
    "synonyms": [
        "kpk => kodeks postępowania karnego",
        "kpc => kodeks postępowania cywilnego",
        "kk => kodeks karny",
        "kc => kodeks cywilny",
    ],
}

# Filter order matters: lowercase first, then synonym expansion, then stemming.
# `morfologik_stem` is presumably provided by an installed analysis plugin — verify.
polish_analyzer = {
    "type": "custom",
    "tokenizer": "standard",
    "filter": ["lowercase", "synonym", "morfologik_stem"],
}

response = rq.put(
    f"{es_url}/{es_index}",
    json={
        "settings": {
            "analysis": {
                "analyzer": {es_analyzer: polish_analyzer},
                "filter": {"synonym": synonym_filter},
            }
        },
        "mappings": {
            "properties": {
                # The field name must match the key of the JSON documents indexed later.
                "text": {"type": "text", "analyzer": es_analyzer},
            }
        },
    },
)

In [157]:
# Display the JSON body returned by the index-creation request.
response.json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'bills'}

In [158]:
# Smoke test of the analysis chain: run the four abbreviations through the
# custom analyzer and display the tokens it produces.
response = rq.get(
    f"{es_url}/{es_index}/_analyze",
    json=dict(text="kpc i kpk i kk i kc", analyzer=es_analyzer),
)
response.json()

{'tokens': [{'token': 'kodeks',
   'start_offset': 0,
   'end_offset': 3,
   'type': 'SYNONYM',
   'position': 0},
  {'token': 'i',
   'start_offset': 4,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 1},
  {'token': 'postępowanie',
   'start_offset': 4,
   'end_offset': 5,
   'type': 'SYNONYM',
   'position': 1},
  {'token': 'postępować',
   'start_offset': 4,
   'end_offset': 5,
   'type': 'SYNONYM',
   'position': 1},
  {'token': 'cywilny',
   'start_offset': 6,
   'end_offset': 9,
   'type': 'SYNONYM',
   'position': 2},
  {'token': 'kodeks',
   'start_offset': 6,
   'end_offset': 9,
   'type': 'SYNONYM',
   'position': 2},
  {'token': 'i',
   'start_offset': 10,
   'end_offset': 11,
   'type': '<ALPHANUM>',
   'position': 3},
  {'token': 'postępowanie',
   'start_offset': 10,
   'end_offset': 11,
   'type': 'SYNONYM',
   'position': 3},
  {'token': 'postępować',
   'start_offset': 10,
   'end_offset': 11,
   'type': 'SYNONYM',
   'position': 3},
  {'token': 'karny',
 

# Read bills without normalization

In [159]:
# Locate the corpus relative to the directory the notebook is run from.
current_dir = os.getcwd()
bills_path = f"{current_dir}/data/first_ex_data/ustawy"
# Later cells index the result by bill id and iterate over `.items()`, so it is
# presumably a dict of bill id -> raw text — verify against `utils`.
bills_dict = read_bills_without_normalize(bills_path)

In [160]:
# Number of bills that were loaded (one entry per bill id).
len(bills_dict)

1178

In [161]:
# Peek at the raw text of one bill; the same id is queried for term vectors later on.
bills_dict["1999_700"]

'\n\n\n\nDz.U. z 1999 r. Nr 63, poz. 700 \n                                       \n                                                                               \n                                                                               \n                                                                               \n                                                                               \n                                    USTAWA\n                            z dnia 8 lipca 1999 r.\n                                       \n            o zmianie ustawy o powszechnym ubezpieczeniu zdrowotnym\n                                       \n                                       \n                                    Art. 1.\nW ustawie z dnia 6 lutego 1997 r. o powszechnym ubezpieczeniu zdrowotnym (Dz.U.\nNr 28, poz. 153 i Nr 75, poz. 468, z 1998 r. Nr 117, poz. 756, Nr 137, poz.\n887, Nr 144, poz. 929 i Nr 162, poz. 1116 oraz z 1999 r. Nr 45, poz. 439 i Nr\n49, poz. 483), w art. 

# Load data to ES index

In [162]:
# for file_id, file_content in bills_dict.items():
#     response = rq.delete(url = f"{es_url}/{es_index}/_doc/{file_id}")

In [163]:
# Index every bill under its own id (PUT /<index>/_doc/<bill id>).
# The body uses the "text" key because that is the field mapped to the custom analyzer.
for file_id, file_content in bills_dict.items():
    response = rq.put(
        url=f"{es_url}/{es_index}/_doc/{file_id}",
        json={"text": f"{file_content}"},
    )
    # Fail loudly on any HTTP error instead of silently skipping a document.
    # (Index responses carry a `result` key, not `acknowledged`, so the status
    # code is the reliable success signal here.)
    response.raise_for_status()

# Explicitly refresh the index so the search cells below see every document,
# even when the whole notebook is executed in one go.
rq.post(url=f"{es_url}/{es_index}/_refresh").raise_for_status()

{'_index': 'bills', '_id': '2001_874', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_583', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_1853', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_753', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 3, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_440', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 4, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1375', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 5, '_primary_term': 1}
{'_index': 'bi

{'_index': 'bills', '_id': '2003_1661', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 57, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_752', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 58, '_primary_term': 1}
{'_index': 'bills', '_id': '1995_482', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 59, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_1852', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 60, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_875', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 61, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_1156', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 62, '_primary_term': 1}
{'_inde

{'_index': 'bills', '_id': '2004_890', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 107, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_884', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 108, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_31', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 109, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1001', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 110, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_146', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 111, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1209', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 112, '_primary_term': 1}
{'_

{'_index': 'bills', '_id': '2003_1302', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 159, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_557', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 160, '_primary_term': 1}
{'_index': 'bills', '_id': '1999_528', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 161, '_primary_term': 1}
{'_index': 'bills', '_id': '1995_724', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 162, '_primary_term': 1}
{'_index': 'bills', '_id': '1995_730', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 163, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_466', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 164, '_primary_term': 1}
{'_

{'_index': 'bills', '_id': '2000_1191', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 209, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_640', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 210, '_primary_term': 1}
{'_index': 'bills', '_id': '1999_688', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 211, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_584', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 212, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_873', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 213, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_783', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 214, '_primary_term': 1}
{'_

{'_index': 'bills', '_id': '1997_435', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 268, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_140', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 269, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_247', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 270, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_1300', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 271, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_555', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 272, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_469', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 273, '_primary_term': 1}
{'_

{'_index': 'bills', '_id': '2002_1763', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 318, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_650', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 319, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_2256', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 320, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_678', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 321, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_744', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 322, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_1039', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 323, '_primary_term': 1}
{

{'_index': 'bills', '_id': '2004_574', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 381, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_636', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 382, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1922', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 383, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_492', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 384, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_2581', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 385, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_959', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 386, '_primary_term': 1}
{'

{'_index': 'bills', '_id': '2000_179', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 435, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1444', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 436, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1322', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 437, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_1322', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 438, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1450', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 439, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1843', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 440, '_primary_term': 1}

{'_index': 'bills', '_id': '1997_119', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 498, '_primary_term': 1}
{'_index': 'bills', '_id': '1999_1063', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 499, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1086', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 500, '_primary_term': 1}
{'_index': 'bills', '_id': '1998_1063', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 501, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1265', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 502, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_493', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 503, '_primary_term': 1}


{'_index': 'bills', '_id': '1997_735', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 548, '_primary_term': 1}
{'_index': 'bills', '_id': '1995_618', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 549, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1443', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 550, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1325', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 551, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_718', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 552, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1319', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 553, '_primary_term': 1}
{

{'_index': 'bills', '_id': '2001_85', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 608, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_573', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 609, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_719', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 610, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_2124', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 611, '_primary_term': 1}
{'_index': 'bills', '_id': '1994_344', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 612, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_1318', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 613, '_primary_term': 1}
{'_

{'_index': 'bills', '_id': '1997_117', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 660, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_238', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 661, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_995', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 662, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_462', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 663, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_2776', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 664, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_2277', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 665, '_primary_term': 1}
{'

{'_index': 'bills', '_id': '1995_617', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 715, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_32', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 716, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1693', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 717, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_818', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 718, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_1316', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 719, '_primary_term': 1}
{'_index': 'bills', '_id': '1993_646', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 720, '_primary_term': 1}
{'_

{'_index': 'bills', '_id': '2002_1142', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 781, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_262', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 782, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_298', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 783, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_501', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 784, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_2406', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 785, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_41', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 786, '_primary_term': 1}
{'_

{'_index': 'bills', '_id': '1997_299', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 831, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_439', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 832, '_primary_term': 1}
{'_index': 'bills', '_id': '1998_1122', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 833, '_primary_term': 1}
{'_index': 'bills', '_id': '1994_602', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 834, '_primary_term': 1}
{'_index': 'bills', '_id': '1995_439', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 835, '_primary_term': 1}
{'_index': 'bills', '_id': '2002_1802', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 836, '_primary_term': 1}
{'

{'_index': 'bills', '_id': '2001_749', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 883, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_1098', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 884, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_661', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 885, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_624', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 886, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1535', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 887, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_630', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 888, '_primary_term': 1}
{'

{'_index': 'bills', '_id': '2004_1208', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 940, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_2701', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 941, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_401', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 942, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_367', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 943, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_273', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 944, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_1546', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 945, '_primary_term': 1}
{

{'_index': 'bills', '_id': '2003_760', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 996, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1407', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 997, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_774', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 998, '_primary_term': 1}
{'_index': 'bills', '_id': '2003_1454', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 999, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_301', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1000, '_primary_term': 1}
{'_index': 'bills', '_id': '1999_530', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1001, '_primary_term': 1}


{'_index': 'bills', '_id': '2001_499', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1049, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_44', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1050, '_primary_term': 1}
{'_index': 'bills', '_id': '2002_90', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1051, '_primary_term': 1}
{'_index': 'bills', '_id': '1995_488', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1052, '_primary_term': 1}
{'_index': 'bills', '_id': '1997_770', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1053, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_1189', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1054, '_primary_term': 1}

{'_index': 'bills', '_id': '2003_758', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1107, '_primary_term': 1}
{'_index': 'bills', '_id': '1999_440', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1108, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_41', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1109, '_primary_term': 1}
{'_index': 'bills', '_id': '2000_857', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1110, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1403', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1111, '_primary_term': 1}
{'_index': 'bills', '_id': '2001_1365', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1112, '_primary_term': 

{'_index': 'bills', '_id': '1996_175', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1157, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_2507', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1158, '_primary_term': 1}
{'_index': 'bills', '_id': '2004_2249', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1159, '_primary_term': 1}
{'_index': 'bills', '_id': '1999_1234', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1160, '_primary_term': 1}
{'_index': 'bills', '_id': '1995_713', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1161, '_primary_term': 1}
{'_index': 'bills', '_id': '1996_773', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1162, '_primary_term'

# Determine the number of legislative acts containing the word ustawa (in any form)

In [164]:
# Full-text `match` query on the analyzed "text" field; the query string goes
# through the field's analyzer, so inflected forms of "ustawa" match as well.
resp = rq.get(
    f"{es_url}/{es_index}/_search",
    json={"query": {"match": {"text": {"query": "ustawa"}}}},
)

In [165]:
# Number of matching documents as reported in the search response.
resp.json()['hits']['total']['value']

1178

# Determine the number of occurrences of the word ustawa by searching for this particular form, including the other inflectional forms.

In [166]:
# Request the term vectors of one bill via the official client.
# `term_statistics=True` adds per-term statistics such as `ttf` to the response.
resp = es.termvectors(index=es_index, id="1999_700", term_statistics=True, fields=["text"])

In [167]:
# `ttf` of the stem "ustawa" taken from the term statistics of the response.
# Raises KeyError if the queried document does not contain that stem.
resp['term_vectors']['text']['terms']['ustawa']['ttf']

24934

In [168]:
# Same lookup as with the client, this time through the raw REST endpoint.
response = rq.get(
    f"{es_url}/{es_index}/_termvectors/1999_700",
    json={"term_statistics": True, "fields": ["text"]},
)

# Drill down to the `ttf` statistic of the stem "ustawa".
response.json()["term_vectors"]["text"]["terms"]["ustawa"]["ttf"]

24934

# Determine the number of occurrences of the word ustaw by searching for this particular form, including the other inflectional forms.

In [169]:
# Check which stems the custom analyzer produces for the surface form "ustaw".
response = rq.get(
    f"{es_url}/{es_index}/_analyze",
    json=dict(text="ustaw", analyzer=es_analyzer),
)

response.json()

{'tokens': [{'token': 'ustawa',
   'start_offset': 0,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'ustawić',
   'start_offset': 0,
   'end_offset': 5,
   'type': '<ALPHANUM>',
   'position': 0}]}

In [170]:
# The term statistics are read through the term vectors of a concrete document,
# so the bill behind this id has to contain both stems ("ustawa" and "ustawić");
# otherwise the lookup below raises KeyError.
response = rq.get(
    f"{es_url}/{es_index}/_termvectors/1993_599",
    json={"term_statistics": True, "fields": ["text"]},
)

# Add up the `ttf` statistics of both stems the analyzer yields for "ustaw".
# NOTE(review): a token like "ustaw" is emitted as both stems at one position,
# so it is presumably counted in each `ttf` — confirm this sum is the intended figure.
sum(
    response.json()["term_vectors"]["text"]["terms"][lemma]["ttf"]
    for lemma in ("ustawa", "ustawić")
)

25847

# Determine the number of legislative acts containing the words kodeks postępowania cywilnego in the specified order, but in any inflection form.

In [171]:
# Phrase query via the client: the analyzed terms must appear in this order.
es.search(
    index=es_index,
    query={"match_phrase": {"text": {"query": "kodeks postępowania cywilnego"}}},
)["hits"]["total"]["value"]

99

In [172]:
# Same phrase query issued against the REST endpoint for cross-checking.
response = rq.get(
    f"{es_url}/{es_index}/_search",
    json={"query": {"match_phrase": {"text": {"query": "kodeks postępowania cywilnego"}}}},
)

# Number of matching documents reported by the search response.
response.json()["hits"]["total"]["value"]

99

# Determine the number of legislative acts containing the words wchodzi w życie (in any form) allowing for up to 2 additional words in the searched phrase.

In [173]:
# Phrase query with `slop`, which relaxes how strictly the terms must be adjacent.
es.search(
    index=es_index,
    query={"match_phrase": {"text": {"query": "wchodzi w życie", "slop": 2}}},
)["hits"]["total"]["value"]

1174

In [174]:
# REST variant of the sloppy phrase query, used to cross-check the client result.
response = rq.get(
    f"{es_url}/{es_index}/_search",
    json={"query": {"match_phrase": {"text": {"query": "wchodzi w życie", "slop": 2}}}},
)

# Number of matching documents reported by the search response.
response.json()["hits"]["total"]["value"]

1174

# Determine the 10 documents that are the most relevant for the phrase konstytucja.

In [175]:
# Ten best-scoring bills for "konstytucja", fetched through the client.
res = es.search(
    index=es_index,
    query={"match": {"text": {"query": "konstytucja"}}},
    size=10,
)["hits"]["hits"]
# One line per hit: left-aligned document id, then its relevance score.
for hit in res:
    print(f"{hit['_id']:<9} | {hit['_score']}")

1997_629  | 6.867635
2000_443  | 6.662749
1997_604  | 6.6320543
1996_350  | 6.626803
1997_642  | 6.251624
2001_23   | 6.0579295
1996_199  | 5.928016
1999_688  | 5.8496947
2001_1082 | 5.466536
1997_681  | 5.466536


In [176]:
# Ten best-scoring bills for "konstytucja", fetched through the REST endpoint.
response = rq.get(
    f"{es_url}/{es_index}/_search",
    json={
        "query": {"match": {"text": {"query": "konstytucja"}}},
        "size": 10,
    },
)

res_list = response.json()["hits"]["hits"]
# One line per hit: left-aligned document id, then its relevance score.
for hit in res_list:
    print(f"{hit['_id']:<9} | {hit['_score']}")

1997_629  | 6.867635
2000_443  | 6.662749
1997_604  | 6.6320543
1996_350  | 6.626803
1997_642  | 6.251624
2001_23   | 6.0579295
1996_199  | 5.928016
1999_688  | 5.8496947
2001_1082 | 5.466536
1997_681  | 5.466536


# Print the excerpts containing the word konstytucja (up to three excerpts per document) from the previous task.

In [177]:
# Re-run the "konstytucja" query and ask for highlighted fragments of the
# "text" field, at most three per document.
# Highlighting reference:
#   https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html
response = rq.get(
    f"{es_url}/{es_index}/_search",
    json={
        "query": {"match": {"text": {"query": "konstytucja"}}},
        "highlight": {"fields": {"text": {"number_of_fragments": 3}}},
        "size": 10,
    },
)

resp_list = response.json()["hits"]["hits"]
for bill in resp_list:
    # Header line: document id and relevance score.
    print(f"{bill['_id']:<9} | {bill['_score']}")
    # Each highlighted fragment is followed by a dashed separator line.
    for excerpt in bill["highlight"]["text"]:
        print(excerpt, "-------------------------", sep="\n")
    # A bar of pipes closes the block of one document.
    print('||||||||||||||||||||||||||')


1997_629  | 6.867635
o zmianie ustawy konstytucyjnej o trybie przygotowania
           i uchwalenia <em>Konstytucji</em> Rzeczypospolitej
-------------------------
W ustawie  konstytucyjnej z  dnia 23 kwietnia 1992 r. o trybie przygotowania i 
uchwalenia <em>Konstytucji</em>
-------------------------
Do zgłoszenia projektu <em>Konstytucji</em> załącza się wykaz 
                obywateli popierających zgłoszenie
-------------------------
||||||||||||||||||||||||||
2000_443  | 6.662749
umowy międzynarodowej i nie wypełnia przesłanek określonych w art. 89
     ust. 1 lub art. 90 <em>Konstytucji</em>
-------------------------
międzynarodowej lub załącznika nie
     wypełnia przesłanek określonych w art. 89 ust. 1 lub art. 90 <em>Konstytucji</em>
-------------------------
co do zasadności wyboru
  trybu ratyfikacji umowy międzynarodowej, o którym mowa w art. 89 ust. 2
  <em>Konstytucji</em>
-------------------------
||||||||||||||||||||||||||
1997_604  | 6.6320543
Jeżeli Trybunał Konstytuc