In [1]:
from elasticsearch import Elasticsearch
from datasets import load_dataset
import pandas as pd
import numpy as np

ds = load_dataset("clarin-knext/fiqa-pl", "corpus")
df_corpus = pd.DataFrame(ds['corpus'])
print(df_corpus.head())

ds_q = load_dataset("clarin-knext/fiqa-pl", "queries")
ds_q = ds_q['queries']
print(ds_q[0])

ds_qrels = load_dataset("clarin-knext/fiqa-pl-qrels")
df_qrels = pd.DataFrame(ds_qrels['test'])
print(df_qrels.head())

df_corpus = df_corpus.rename(columns={'_id': 'id'})
df_corpus['id'] = df_corpus['id'].astype(np.int64)

  from .autonotebook import tqdm as notebook_tqdm


  _id title                                               text
0   3        Nie mówię, że nie podoba mi się też pomysł szk...
1  31        Tak więc nic nie zapobiega fałszywym ocenom po...
2  56        Nigdy nie możesz korzystać z FSA dla indywidua...
3  59        Samsung stworzył LCD i inne technologie płaski...
4  63        Oto wymagania SEC: Federalne przepisy dotycząc...
{'_id': '0', 'title': '', 'text': 'Co jest uważane za wydatek służbowy w podróży służbowej?'}
   query-id  corpus-id  score
0         8     566392      1
1         8      65404      1
2        15     325273      1
3        18      88124      1
4        26     285255      1


In [2]:
df_joined = pd.merge(df_corpus, df_qrels, left_on='id', right_on='corpus-id', how='left').drop(columns=['corpus-id'])
df_joined.fillna({'score': 0, 'query-id': -1}, inplace=True)
print(df_joined.head())

   id title                                               text  query-id  \
0   3        Nie mówię, że nie podoba mi się też pomysł szk...      -1.0   
1  31        Tak więc nic nie zapobiega fałszywym ocenom po...      -1.0   
2  56        Nigdy nie możesz korzystać z FSA dla indywidua...      -1.0   
3  59        Samsung stworzył LCD i inne technologie płaski...      -1.0   
4  63        Oto wymagania SEC: Federalne przepisy dotycząc...      -1.0   

   score  
0    0.0  
1    0.0  
2    0.0  
3    0.0  
4    0.0  


## Zad 1 & 2

In [3]:
elasticsearch = Elasticsearch(hosts=["http://localhost:9200"])
print(elasticsearch.info())

{'name': 'node-1', 'cluster_name': 'my-application-cluster', 'cluster_uuid': '3dsu0IVGSgCf0dqmsaR0NQ', 'version': {'number': '8.15.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98adf7bf6bb69b66ab95b761c9e5aadb0bb059a3', 'build_date': '2024-09-19T10:06:03.564235954Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


## Zad 3 & 4

In [4]:
synonyms_set = [{"id": "synonym-1", "synonyms": "styczeń, sty, I"},
                {"id": "synonym-2", "synonyms": "luty, lut, II"},
                {"id": "synonym-3", "synonyms": "marzec, mar, III"},
                {"id": "synonym-4", "synonyms": "kwiecień, kwi, IV"},
                {"id": "synonym-5", "synonyms": "maj, V"},
                {"id": "synonym-6", "synonyms": "czerwiec, cze, VI"},
                {"id": "synonym-7", "synonyms": "lipiec, lip, VII"},
                {"id": "synonym-8", "synonyms": "sierpień, sie, VIII"},
                {"id": "synonym-9", "synonyms": "wrzesień, wrz, IX"},
                {"id": "synonym-10", "synonyms": "październik, paź, X"},
                {"id": "synonym-11", "synonyms": "listopad, lis, XI"},
                {"id": "synonym-12", "synonyms": "grudzień, gru, XII"}]

elasticsearch.synonyms.put_synonym(id="months-synonyms", synonyms_set=synonyms_set)

ObjectApiResponse({'result': 'updated', 'reload_analyzers_details': {'_shards': {'total': 2, 'successful': 1, 'failed': 0}, 'reload_details': [{'index': 'fiqa-pl', 'reloaded_analyzers': ['polish_analyzer_synonyms_included'], 'reloaded_node_ids': ['3FM9a4CGRYK2mHCLDNy-Zw']}]}})

In [5]:
analyzers = ["lowercase_analyzer", "polish_analyzer_morfologik_included", "polish_analyzer_synonyms_included"]

settings = {
    "analysis": {
        "analyzer": {
            "lowercase_analyzer": {
                "tokenizer": "standard",
                "filter": ["lowercase"],
            },
            "polish_analyzer_morfologik_included": {
                "tokenizer": "standard",
                "filter": ["lowercase", "morfologik_stem", "lowercase"],
            },
            "polish_analyzer_synonyms_included": {
                "tokenizer": "standard",
                "filter": ["polish_months_filter", "lowercase", "morfologik_stem", "lowercase"],
            },
        },
        "filter": {
            "polish_months_filter": {
                "type": "synonym_graph",
                "synonyms_set": "months-synonyms",
                "updateable": True,
            }
        },
    }
}

mappings = {
    "properties": {
        "text": {
            "type": "text",
            "analyzer": "polish_analyzer_morfologik_included",
            "search_analyzer": "polish_analyzer_morfologik_included",
        }
    }
}

def create_index(index_name, settings, mappings):
    if elasticsearch.indices.exists(index=index_name):
        elasticsearch.indices.delete(index=index_name, ignore_unavailable=True)
    elasticsearch.indices.create(index=index_name, mappings=mappings, settings=settings)

def create_with_custom_analyzer(index_name, settings, analyzer_name):
    mappings = {
        "properties": {
            "text": {
                "type": "text",
                "analyzer": analyzer_name,
                "search_analyzer": analyzer_name,
            }
        }
    }
    create_index(index_name, settings, mappings)


In [6]:
def populate_index(index_name, ds):
    operations = []
    for document in ds:
        operations.append({"index": {"_index": index_name}})
        operations.append({"id": document["id"], "text": document["text"]})
    elasticsearch.bulk(operations=operations, request_timeout=60)

In [7]:
import re as reg

def count_word(amt_of_documents, texts, add_synonyms=False):
    name_kwiecien_rule_regex = "(?:kwie(?:(?:cień)|(?:tni)(?:(?:a(?:(?:mi)|(?:ch)){0,1})|(?:owi)|(?:em)|(?:u)|(?:e)|(?:ów)|(?:om)){0,1}))"
    if add_synonyms:
        name_kwiecien_rule_regex = "(?:" + name_kwiecien_rule_regex + "|(?:kwi)|(?:IV))"
    occurences_of_kwiecien = 0
    for i in range(amt_of_documents):
        occurences_of_kwiecien += len(reg.findall(name_kwiecien_rule_regex, texts[i]['_source']['text'], reg.IGNORECASE))
    return occurences_of_kwiecien

## Zad 5 & 7

In [9]:
create_with_custom_analyzer("fiqa-pl", settings, "polish_analyzer_morfologik_included")
populate_index("fiqa-pl", df_joined.to_dict(orient='records'))

  elasticsearch.bulk(operations=operations, request_timeout=60)


## Zad 8

In [10]:
searching_word = "kwiecień"

In [11]:
result = elasticsearch.search(index="fiqa-pl", size=500, query={"multi_match": {"query": searching_word,"analyzer": "polish_analyzer_morfologik_included", "fields": ["text"]}})
amt_of_documents = result['hits']['total']['value']
print("documents without synonyms: ", amt_of_documents)
print("matches without synonyms: ", count_word(amt_of_documents, result['hits']['hits']))

documents without synonyms:  257
matches without synonyms:  354


In [12]:
result = elasticsearch.search(index="fiqa-pl", size=500, query={"multi_match": {"query": searching_word, "analyzer": "polish_analyzer_synonyms_included", "fields": ["text"]}})
amt_of_documents = result['hits']['total']['value']
print("documents with synonyms: ", amt_of_documents)
print("matches with synonyms: ", count_word(amt_of_documents, result['hits']['hits'], True))

documents with synonyms:  306
matches with synonyms:  463


## Zad 10

## Compute NDCG@5

In [13]:
corpus_query_mapping = {} # query_id -> [corpus_ids]

for i, row in df_joined.iterrows():
    if row['query-id'] == -1:
        continue
    if row['query-id'] not in corpus_query_mapping:
        corpus_query_mapping[row['query-id']] = []
    corpus_query_mapping[row['query-id']].append(row['id'])

ranking = []
maxi = 0
id_maxi = -1
for query_id, corpus_ids in corpus_query_mapping.items():
    ranking.append((corpus_ids, query_id))
    if len(corpus_ids) > maxi:
        maxi = len(corpus_ids)
        id_maxi = query_id

ranking = sorted(ranking, key=lambda x: len(x[0]), reverse=True)
best_queries = []
for idx, (corpus_ids, query_id) in enumerate(ranking):
    print("query_id: ", query_id, " amount of documents: ", len(corpus_ids))
    best_queries.append(int(query_id))
    if idx == 10: break


query_id:  5993.0  amount of documents:  15
query_id:  2348.0  amount of documents:  15
query_id:  6005.0  amount of documents:  13
query_id:  6131.0  amount of documents:  12
query_id:  776.0  amount of documents:  12
query_id:  6002.0  amount of documents:  11
query_id:  5511.0  amount of documents:  10
query_id:  659.0  amount of documents:  10
query_id:  10497.0  amount of documents:  10
query_id:  3909.0  amount of documents:  10
query_id:  4409.0  amount of documents:  10


In [14]:
def calculate_ndcg(result, searching_query_id):
    amount_of_one = len(corpus_query_mapping[searching_query_id])

    DCG = 0
    for idx, hit in enumerate(result['hits']['hits']):
        gain = 0
        corpus_id = hit['_source']['id']
        if corpus_id in corpus_query_mapping[searching_query_id]:
            gain = 1
        DCG += gain / np.log2(idx + 2)

    IDCG = 0
    for idx in range(amount_of_one):
        IDCG += 1 / np.log2(idx + 2)

    if IDCG == 0:
        nDCG = 0
    else: 
        nDCG = DCG / IDCG
    print("nDCG: ", nDCG)
    return nDCG

In [15]:
def compare_analyzers(searching_query_id):
    searching_word = ds_q[searching_query_id]['text']
    # print("searching word: ", searching_word)
    result = elasticsearch.search(index="fiqa-pl", size=10000, query={"multi_match": {"query": searching_word, "analyzer": "polish_analyzer_morfologik_included", "fields": ["text"]}})
    amt_of_documents = result['hits']['total']['value']
    print("documents without synonyms: ", amt_of_documents)
    ndcg1 = calculate_ndcg(result, searching_query_id)

    print(" ")

    result = elasticsearch.search(index="fiqa-pl", size=10000, query={"multi_match": {"query": searching_word, "analyzer": "polish_analyzer_synonyms_included", "fields": ["text"]}})
    amt_of_documents = result['hits']['total']['value']
    print("documents with synonyms: ", amt_of_documents)
    ndcg2 = calculate_ndcg(result, searching_query_id)
    print(" ")

    result = elasticsearch.search(index="fiqa-pl", size=10000, query={"multi_match": {"query": searching_word, "analyzer": "lowercase_analyzer", "fields": ["text"]}})
    amt_of_documents = result['hits']['total']['value']
    print("documents without synonyms and lemmatization: ", amt_of_documents)
    ndcg3 = calculate_ndcg(result, searching_query_id)
    print(" ")

    result = elasticsearch.search(index="fiqa-pl", size=10000, query={"multi_match": {"query": searching_word, "analyzer": "polish_analyzer_synonyms_included", "fields": ["text"]}})
    amt_of_documents = result['hits']['total']['value']
    print("documents with synonyms and lemmatization: ", amt_of_documents)
    ndcg4 = calculate_ndcg(result, searching_query_id)
    print(" ")
    return ndcg1, ndcg2, ndcg3, ndcg4

In [16]:
nDCG_collected = []
for query_id in best_queries:
    if query_id > 6648:
        continue
    print("query_id: ", query_id)
    nDCG1, nDCG2, nDCG3, nDCG4 = compare_analyzers(query_id)
    nDCG_collected.append(nDCG1)
    nDCG_collected.append(nDCG2)
    nDCG_collected.append(nDCG3)
    nDCG_collected.append(nDCG4)

print("nDCG average: ", np.mean(nDCG_collected))

query_id:  5993
documents without synonyms:  10000
nDCG:  0.026717807395236236
 
documents with synonyms:  10000
nDCG:  0.026717807395236236
 
documents without synonyms and lemmatization:  10000
nDCG:  0.0
 
documents with synonyms and lemmatization:  10000
nDCG:  0.026717807395236236
 
query_id:  2348
documents without synonyms:  10000
nDCG:  0.1359527185551999
 
documents with synonyms:  10000
nDCG:  0.1359527185551999
 
documents without synonyms and lemmatization:  10000
nDCG:  0.04089654341096002
 
documents with synonyms and lemmatization:  10000
nDCG:  0.1359527185551999
 
query_id:  6005
documents without synonyms:  10000
nDCG:  0.02964862909292818
 
documents with synonyms:  10000
nDCG:  0.02964862909292818
 
documents without synonyms and lemmatization:  10000
nDCG:  0.03154922871116617
 
documents with synonyms and lemmatization:  10000
nDCG:  0.02964862909292818
 
query_id:  6131
documents without synonyms:  10000
nDCG:  0.029969257893291325
 
documents with synonyms:  100

## (Optional) Zad 11
Find three questions from the test subset with the following features:

In [17]:
def look_for_relevant_document_at_given_position(position):
    if position > 10000:
        position = 10000
    for idx, query in enumerate(ds_q):
        result = elasticsearch.search(index="fiqa-pl", size=10000, query={"multi_match": {"query": query['text'], "analyzer": "polish_analyzer_synonyms_included", "fields": ["text"]}})
        
        if (idx/len(ds_q))*100 % 100 == 0:
            print(idx)

        lower_sum = []
        for idx, hit in enumerate(result["hits"]["hits"]):
            gain = 0
            corpus_id = result['hits']['hits'][idx]['_source']['id']
            if query['_id'] in corpus_query_mapping and corpus_id in corpus_query_mapping[query['_id']]:
                gain = 1
            if idx < position:
                lower_sum.append(gain)
        
        gain = 0
        corpus_id = result['hits']['hits'][position]['_source']['id']
        if query['_id'] in corpus_query_mapping and corpus_id in corpus_query_mapping[query['_id']]:
            gain = 1

        if position < len(result["hits"]["hits"]) and gain == 1 and sum(lower_sum) == 0:
        
            print("Searched query: ", query['text'], " found at: ", idx)
            break
        if position >= len(ds_q) and sum(lower_sum) == 0:
            print("Searched query: ", query['text'], " found at: ", idx)
            break

### the relevant document is returned by ES at position 1,

In [18]:
# look_for_relevant_document_at_given_position(1) # took too long

### the relevant document is returned by ES at position 4 or 5.

In [19]:
# look_for_relevant_document_at_given_position(4) # took too long

In [20]:
# look_for_relevant_document_at_given_position(5) # took too long

### the relevant document is returned by ES is not found.

In [21]:
look_for_relevant_document_at_given_position(len(ds_q))

0
Searched query:  Co jest uważane za wydatek służbowy w podróży służbowej?  found at:  9999


## Answers to questions

1. What are the strengths and weaknesses of regular expressions versus full text search regarding processing of text?

**Regular expressions give more control over searching but are much more complex. In the presented example FTS found fewer results than regExp in previous labs. But FTS is more convenient and also gives satisfying results, almost the same as regexp. FTS has a good ecosystem and a lot of features that facilitate and improve data processing. In most cases, FTS will be enough, but for precise outcomes, we should probably use both methods to ensure the best results.**

2. Can an LLM be applied in the context of searching for documents? Justify your answer, excluding the obvious observation that an LLM can be used to formulate the answer.

**LLM has a tremendous impact on searching. As far as LLM can recognize the text LLM can change, enrich or interpret it. LLM can take the query and try to understand its context, by generating more sentences or guessing the most probable connections and asking the search engine again. Also if the text can be represented in another form or expression then the generative model can translate it in such a way e.g. to a different language.**