In [114]:
import numpy as np
import pandas as pd
import random
from datasets import load_dataset

from haystack.nodes import DensePassageRetriever
import torch
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.pipelines import DocumentSearchPipeline
from elasticsearch import Elasticsearch

In [2]:
random.seed(2137)

In [None]:
"""
aktualnie jest zaladowana baza danych do roberty, jesli jest z nia problem nalezy usunac plik .db i od nowa zaladowac dane, obliczyc embeddingi, zapisac i mozna kontynuowac prace
"""

In [3]:
df_texts = pd.DataFrame.from_dict(load_dataset("clarin-knext/fiqa-pl", "corpus")['corpus'])
df_texts.head(2)

Unnamed: 0,_id,title,text
0,3,,"Nie mówię, że nie podoba mi się też pomysł szk..."
1,31,,Tak więc nic nie zapobiega fałszywym ocenom po...


In [4]:
df_qa = pd.DataFrame(load_dataset("clarin-knext/fiqa-pl-qrels")['test'])
df_qa.head(2)

Unnamed: 0,query-id,corpus-id,score
0,8,566392,1
1,8,65404,1


In [5]:
df_q = pd.DataFrame(load_dataset("clarin-knext/fiqa-pl", "queries")['queries'])
df_q.head(2)

Unnamed: 0,_id,title,text
0,0,,Co jest uważane za wydatek służbowy w podróży ...
1,4,,Wydatki służbowe - ubezpieczenie samochodu pod...


# Building document store for E5:

In [10]:
# Baza danych Faiss
doc_store = FAISSDocumentStore( embedding_dim=768, similarity='cosine')

In [11]:
dicts = []
for id, text in zip(df_texts['_id'], df_texts['text']):
  dicts.append({
        'content': ""+text,
        'meta': {'id': id}
    })


In [12]:
# load data to database:
doc_store.write_documents(dicts)

Writing Documents: 60000it [00:28, 2091.77it/s]                           


In [13]:
# Save database to avoid re-indexing all time, now we can use load() instead of write_documents
doc_store.save(index_path="./data/my_index.faiss", config_path="./data/my_config.json")

In [22]:
device = "mps" if torch.backends.mps.is_available() else "cpu"  # Jeśli MPS jest dostępny, używaj go, w przeciwnym razie CPU.
e5 = EmbeddingRetriever(
    document_store=doc_store,
    embedding_model="intfloat/e5-base-v2",
    model_format="transformers",  # Make sure we specify the transformers model format
    pooling_strategy="reduce_mean",  # This is the pooling method used to train the e5 models
    top_k=5,
    max_seq_len=512,
    batch_size=64,
    devices=[device],
)
# compute embeddings of all documents using e5
doc_store.update_embeddings(e5)

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.88 Batches/s]
Updating Embedding:   0%|          | 0/57597 [00:00<?, ? docs/s]
Inferencing Samples:   0%|          | 0/157 [00:00<?, ? Batches/s][A
Inferencing Samples:   1%|          | 1/157 [00:05<14:31,  5.59s/ Batches][A
Inferencing Samples:   1%|▏         | 2/157 [00:10<12:56,  5.01s/ Batches][A
Inferencing Samples:   2%|▏         | 3/157 [00:14<12:29,  4.87s/ Batches][A
Inferencing Samples:   3%|▎         | 4/157 [00:19<12:22,  4.85s/ Batches][A
Inferencing Samples:   3%|▎         | 5/157 [00:24<12:04,  4.77s/ Batches][A
Inferencing Samples:   4%|▍         | 6/157 [00:28<11:51,  4.71s/ Batches][A
Inferencing Samples:   4%|▍         | 7/157 [00:33<11:48,  4.72s/ Batches][A
Inferencing Samples:   5%|▌         | 8/157 [00:38<11:46,  4.74s/ Batches][A
Inferencing Samples:   6%|▌         | 9/157 [00:43<11:39,  4.73s/ Batches][A
Inferencing Samples:   6%|▋         | 10/157 [00:47<11:33,  4.72s/ Batches][A
Inferencing

In [25]:
# Again save with new embeddings
doc_store.save(index_path="./data/my_index.faiss", config_path="./data/my_config.json")

In [51]:
pipe = DocumentSearchPipeline(e5)
prediction = pipe.run(
    query="lool", params={"Retriever": {"top_k": 5}}
)
  

Inferencing Samples: 100%|██████████| 1/1 [00:06<00:00,  6.74s/ Batches]


In [54]:
len(prediction['documents'])

5

In [71]:
for elem in prediction['documents']:
    print(elem.score)
    print(elem.meta['id'])

0.8988051414489746
597929
0.8953127861022949
424253
0.892010509967804
571786
0.891658753156662
73148
0.8916545212268829
524495


In [72]:
ids = [elem.meta['id'] for elem in prediction['documents']]

In [93]:
query = "Twoje zapytanie tutaj"
retrieved_docs = e5.retrieve(query)
for doc in retrieved_docs:
    print(doc.meta['id'])  # Zwróci treść istotnych dokumentów


Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  1.48 Batches/s]

173187
331722
332858
2206
387147





# Elastic Search:

In [43]:

es = Elasticsearch(["http://elastics:password@localhost:9200"], verify_certs=False)
try:
    resp = es.info()
    print(resp)
except Exception as e:
    print(f"Error: {e}")


{'name': 'node-1', 'cluster_name': 'my-application-cluster', 'cluster_uuid': 'gBI4PdSoQuCa8sMPxLY-yQ', 'version': {'number': '8.15.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98adf7bf6bb69b66ab95b761c9e5aadb0bb059a3', 'build_date': '2024-09-19T10:06:03.564235954Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [103]:
def compute_dcg(scores):
    return sum(score / np.log(idx + 2) for idx, score in enumerate(scores))


def compute_ndcg(relevant_scores, retrieved_scores, k=5):
    dcg = compute_dcg(retrieved_scores[:k])  # to sa te ktore zwrocil nasz 'model'
    ideal_dcg = compute_dcg(
        sorted(relevant_scores, reverse=True)[:k])  # relevant uzywamy do idealnego dcg (idealne ulozenie odpowiedzi)
    return dcg / ideal_dcg if ideal_dcg > 0 else 0


NDCG_SIZE = 5


def search_and_compute_ndcg(index_name, analyzator_content, test_data, ndcg_size, query_column_name):
    ndcg_scores = []

    # obliczamy dla kazdej query dostepnej w testowym zbiorze danych
    for index, row in test_data.iterrows():
        # query id
        query_id = row["query-id"]
        # query text
        query = df_q[df_q['_id'] == str(query_id)][query_column_name].values[0]
        # Wykonanie zapytania do Elasticsearch
        search_query = {
                "query": {
                    "match": {
                        analyzator_content: query,
                    }
                }
            }
        # bierzemy 5 pierwszych dopasowań od Elastic search (dostał query) zwraca nam 5 dokumentów
        response = es.search(index=index_name, body=search_query, size=ndcg_size)
        retrieved_docs = [hit["_id"] for hit in response["hits"]["hits"]]  # id 5 dokumentow zwrocone przez ES
        print(retrieved_docs)
        # Wszysktie A które pasuja do Q (z labelowanego dataset)
        good_answers = df_qa[df_qa['query-id'] == int(query_id)]
        # sortuje je po ich 'score', one i tak mają 1 ale na przyszlosc z lepszym datasetem zeby gralo bo tak sie realizuje IDCG
        good_answers = good_answers.sort_values(by='score', ascending=False)
        # Biore posortowane kolejne elementy z dobrymyim odpowiedziami, jesli nie ma ich (5) to uzupelniam 0 ami aby było zawsze 5 elementów - prawidlowe ndcg tak działa
        relenvant_answears = list(good_answers['score'][:ndcg_size]) + [0] * (
                    ndcg_size - len(good_answers))  # idealne odpowiedzi
        # print(relenvant_answears) -> cos w stylu [1,1,0,0,0]

        retrived_answears = [0 for _ in range(ndcg_size)]  #otrzymane odpowiedzi
        for idx, doc_found in enumerate(retrieved_docs):
            if int(doc_found) in good_answers['corpus-id'].values:
                retrived_answears[idx] = good_answers[good_answers['corpus-id'] == int(doc_found)]['score'].iloc[0]

        #print(retrived_answears) # -> cos w stylu [0,1,0,0,0]
        ndcg = compute_ndcg(relenvant_answears, retrived_answears, k=5)
        ndcg_scores.append(ndcg)  # ndcg dla kazdego query sumujemy

    # Zwracamy średnie NDCG dla wszystkich zapytań
    return np.mean(ndcg_scores)

In [104]:
qa_no_duplicates = df_qa.drop_duplicates(subset='query-id')
index_name = "fiqa_index"

# Neural search E5

In [131]:
def search_and_compute_ndcg_doc_store(test_data, ndcg_size, query_column_name):
    ndcg_scores = []

    # obliczamy dla kazdej query dostepnej w testowym zbiorze danych
    for index, row in test_data.iterrows():
        # query id
        query_id = row["query-id"]
        # query text
        query = df_q[df_q['_id'] == str(query_id)][query_column_name].values[0]
        # Wykonanie zapytania do Document store:
        prediction = e5.retrieve(query) # Zdefiniowane na górze
        #retrieved_docs = [elem.content for elem in prediction['documents']]
        retrieved_docs = [elem.meta['id'] for elem in prediction]
        #print(query)
        #print(retrieved_docs)
        #print("###")
        #print(retrieved_docs)
        # Wszysktie A które pasuja do Q (z labelowanego dataset)
        good_answers = df_qa[df_qa['query-id'] == int(query_id)]
        # sortuje je po ich 'score', one i tak mają 1 ale na przyszlosc z lepszym datasetem zeby gralo bo tak sie realizuje IDCG
        good_answers = good_answers.sort_values(by='score', ascending=False)
        # Biore posortowane kolejne elementy z dobrymyim odpowiedziami, jesli nie ma ich (5) to uzupelniam 0 ami aby było zawsze 5 elementów - prawidlowe ndcg tak działa
        relenvant_answears = list(good_answers['score'][:ndcg_size]) + [0] * (
                    ndcg_size - len(good_answers))  # idealne odpowiedzi
        # print(relenvant_answears) -> cos w stylu [1,1,0,0,0]

        retrived_answears = [0 for _ in range(ndcg_size)]  #otrzymane odpowiedzi
        for idx, doc_found in enumerate(retrieved_docs):
            if int(doc_found) in good_answers['corpus-id'].values:
                retrived_answears[idx] = good_answers[good_answers['corpus-id'] == int(doc_found)]['score'].iloc[0]

        #print(retrived_answears) # -> cos w stylu [0,1,0,0,0]
        ndcg = compute_ndcg(relenvant_answears, retrived_answears, k=5)
        ndcg_scores.append(ndcg)  # ndcg dla kazdego query sumujemy

    # Zwracamy średnie NDCG dla wszystkich zapytań
    return np.mean(ndcg_scores)

# Neural Search using Roberta

In [116]:
# Creating new FAISS database for roberta embeddings
document_store_r = FAISSDocumentStore(similarity="dot_product")


In [117]:
document_store_r.write_documents(dicts)

Writing Documents: 60000it [00:29, 2035.18it/s]                           


In [118]:
r_retriever = DensePassageRetriever(
    document_store=document_store_r,
    query_embedding_model= 'sdadas/mmlw-retrieval-roberta-base',
    passage_embedding_model= 'sdadas/mmlw-retrieval-roberta-base'
)

Using a model of type 'roberta' which might be incompatible with DPR encoders. Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors.
Using a model of type 'roberta' which might be incompatible with DPR encoders. Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors.


In [119]:
document_store_r.update_embeddings(r_retriever)

Updating Embedding:   0%|          | 0/57597 [00:00<?, ? docs/s]
Create embeddings:   0%|          | 0/10000 [00:00<?, ? Docs/s][A
Create embeddings:   0%|          | 16/10000 [00:04<45:31,  3.65 Docs/s][A
Create embeddings:   0%|          | 32/10000 [00:04<22:21,  7.43 Docs/s][A
Create embeddings:   0%|          | 48/10000 [00:05<14:18, 11.59 Docs/s][A
Create embeddings:   1%|          | 64/10000 [00:05<10:31, 15.74 Docs/s][A
Create embeddings:   1%|          | 80/10000 [00:06<08:25, 19.62 Docs/s][A
Create embeddings:   1%|          | 96/10000 [00:06<07:09, 23.06 Docs/s][A
Create embeddings:   1%|          | 112/10000 [00:07<06:21, 25.92 Docs/s][A
Create embeddings:   1%|▏         | 128/10000 [00:07<05:50, 28.16 Docs/s][A
Create embeddings:   1%|▏         | 144/10000 [00:08<05:29, 29.88 Docs/s][A
Create embeddings:   2%|▏         | 160/10000 [00:08<05:15, 31.16 Docs/s][A
Create embeddings:   2%|▏         | 176/10000 [00:09<05:04, 32.27 Docs/s][A
Create embeddings:   2%|▏  

In [120]:
r_pipe = DocumentSearchPipeline(r_retriever)

In [124]:
# Our dense retrievers require the use of specific prefixes and suffixes when encoding texts. For this model, each query should be preceded by the prefix "zapytanie: "

def search_and_compute_ndcg_doc_store_r(test_data, ndcg_size, query_column_name):
    ndcg_scores = []

    # obliczamy dla kazdej query dostepnej w testowym zbiorze danych
    for index, row in test_data.iterrows():
        # query id
        query_id = row["query-id"]
        # query text
        query = "zapytanie: " + str(df_q[df_q['_id'] == str(query_id)][query_column_name].values[0])
        # Wykonanie zapytania do Document store z wykorzystaniem Polish roberty:
        prediction = r_pipe.run(query=query, params={"Retriever": {"top_k": 5}})
        #retrieved_docs = [elem.content for elem in prediction['documents']]
        retrieved_docs = [elem.meta['id'] for elem in prediction['documents']]
        #print(query)
        #print(retrieved_docs)
        #print("###")
        #print(retrieved_docs)
        # Wszysktie A które pasuja do Q (z labelowanego dataset)
        good_answers = df_qa[df_qa['query-id'] == int(query_id)]
        # sortuje je po ich 'score', one i tak mają 1 ale na przyszlosc z lepszym datasetem zeby gralo bo tak sie realizuje IDCG
        good_answers = good_answers.sort_values(by='score', ascending=False)
        # Biore posortowane kolejne elementy z dobrymyim odpowiedziami, jesli nie ma ich (5) to uzupelniam 0 ami aby było zawsze 5 elementów - prawidlowe ndcg tak działa
        relenvant_answears = list(good_answers['score'][:ndcg_size]) + [0] * (
                    ndcg_size - len(good_answers))  # idealne odpowiedzi
        # print(relenvant_answears) -> cos w stylu [1,1,0,0,0]

        retrived_answears = [0 for _ in range(ndcg_size)]  #otrzymane odpowiedzi
        for idx, doc_found in enumerate(retrieved_docs):
            if int(doc_found) in good_answers['corpus-id'].values:
                retrived_answears[idx] = good_answers[good_answers['corpus-id'] == int(doc_found)]['score'].iloc[0]

        #print(retrived_answears) # -> cos w stylu [0,1,0,0,0]
        ndcg = compute_ndcg(relenvant_answears, retrived_answears, k=5)
        ndcg_scores.append(ndcg)  # ndcg dla kazdego query sumujemy

    # Zwracamy średnie NDCG dla wszystkich zapytań
    return np.mean(ndcg_scores)

In [132]:
mean_ndcg_query_doc_store_r = search_and_compute_ndcg_doc_store_r(qa_no_duplicates, 5, query_column_name='text')
mean_ndcg_query_doc_store = search_and_compute_ndcg_doc_store(qa_no_duplicates, 5, query_column_name='text')
mean_ndcg_query = search_and_compute_ndcg(index_name, 'content_synon', qa_no_duplicates, NDCG_SIZE, query_column_name='text')

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  9.59 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.42 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.79 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.34 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.58 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.95 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.91 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.89 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.49 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.78 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.72 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 11.61 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 12.84 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00

['309023', '65404', '438975', '316359', '529879']
['224000', '204288', '233145', '373905', '325273']
['348480', '397510', '309023', '357520', '203820']
['356884', '176284', '557369', '141738', '86760']
['290293', '20796', '122114', '217661', '536703']
['28764', '278824', '30343', '533185', '338700']
['588194', '115274', '502616', '50310', '270420']
['545296', '192516', '462831', '103842', '44152']
['295522', '400230', '135196', '535207', '301161']
['530596', '154931', '509111', '411063', '146657']
['122908', '590310', '146657', '490223', '562777']
['592192', '538208', '352363', '504479', '438666']
['385881', '510087', '381751', '423083', '217875']
['134497', '271691', '430890', '371880', '69800']
['73427', '395139', '28974', '244318', '215267']
['71338', '58611', '88124', '366761', '504317']
['324686', '307595', '113448', '350889', '192589']
['360139', '315168', '393838', '187110', '413976']
['372909', '431685', '515254', '328341', '72321']
['504317', '334603', '157233', '297965', '151

In [133]:
print("NDCG@5 dla Embeddings Polish Roberta: ", mean_ndcg_query_doc_store_r)
print("NDCG@5 dla Embeddings E5 multi-lang base: ", mean_ndcg_query_doc_store)
print("NDCG@5 Elastic Search: ", mean_ndcg_query)

NDCG@5 dla Embeddings Polish Roberta:  0.2744230248463447
NDCG@5 dla Embeddings E5 multi-lang base:  0.06898782996558572
NDCG@5 Elastic Search:  0.1851291130797741


# Podsumowanie wyników
E5 base - slabo, wyniki gorsze niz ES i klasyfikator Herbert. Moze to byc spowodowane parametrami modelu podczas treningu (np. duzy batch), mozna by poeksperymentowac z parametrami treningu. Moze to wynikac wprost z modelu, tzn. jest to model w wersji base a wiec nie najwiekszy oraz to model do wielu jezykow, to domyslnie bedzie sobie gorzej radzil z Polskim jezykiem, dodatkowo trudnym tekstem.
Polish Roberta - W koncu udalo przebic sie wynik ES i to znacznie (z 0.18 na 0.27). Model ten byl fine-tunowany na Polskich danych, wiec tutaj odrazu przewaga i mozna bylo sie spodziewac lepszych wynikow niz wielojezykowa E5, to sie poprawio, ponadto lepsze wyniki niz ES.


# Questions

Which of the methods: lexical match (e.g. ElasticSearch) or dense representation works better?
Dense representation, ale model trenowany na Polskich danych.
Which of the methods is faster?
ES i to znacznie
Try to determine the other pros and cons of using lexical search and dense document retrieval models.

lexical:
+ Szybkosc
+ Intuicyjnosc
+ stare, sprawdzone duze wsparcie
- gorsze wyniki

Dense:
- wieksza zlozonosc obliczeniowa
+ trudniejsze do zrouzeminia, interpretowalnosc
+ lepsze wyniki
+ prostsza implementacja niz ES (subiektywna opinia)


# Wnioski
Kolejne ciekae ćwiczenie. Pozwoliło zapoznać się z nową biblioteka haystack oraz zobaczyc jak dziala wyszukiwanie neuronowe. Zastosowanie SOTA modeli, bibliotek na ciekawym trudnym problemie, pomoglo to bardzo w zrozumieniu jak dziala wyszukiwanie passages za pomoca sieci neuronowych i jak to zrobic w praktyce, bardzo dobry wstep do QA i poszerzenie wiedzy. Uporzadkowala sie wiedza jak wyszukujemy lexykalnie a jak dense.