In [1]:
%cd /Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/

/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions


#### First run in terminal:
docker run --rm -p 9200:9200 -p 9300:9300 -e "xpack.security.enabled=false" -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:8.7.0

In [2]:
import sys
import os
src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(src_dir)

from read_data.read_data import read_urls_questions, get_questions, get_url_content_tuples, get_relevant_docs
from elasticsearch import Elasticsearch
from retrieval.sparse_retrieval.bm25 import set_index, get_result_tuples

In [3]:
es_client = Elasticsearch("http://localhost:9200")

In [4]:
#es_client.info().body

In [16]:
collected, questions =  read_urls_questions(os.path.join('data/reference_urls/reference_urls_collected.csv'),
                                            os.path.join('data/question_answer/questions_updated_urls.csv'),
                                            clean_url_nan=True) # read collected urls and questions + remove unsuccessful collection

In [17]:
col_urls = set(list(collected['URL'])) # check how many common items i have - unfortunately only 20
list_of_lists= list(questions['Cleaned_URLs'])
flattened_list = [item for sublist in list_of_lists for item in sublist]
# Assuming your two lists are named 'list1' and 'list2'
common_items = set(col_urls).intersection(flattened_list)

In [18]:
len(common_items)

113

In [19]:
filtered_questions2 = questions[questions['URLs'].apply(lambda urls: any(url in urls for url in common_items))]

In [153]:
df_filtered = questions[~questions['URLs'].apply(lambda urls: any(url.startswith("https://www.rijksoverheid.nl/") for url in urls))]

In [154]:
question_list = get_questions(df_filtered)
document_list = get_url_content_tuples(collected)
mappings = {
        "properties": {
            "url": {"type": "text"},
            "text": {"type": "text", "analyzer": "standard", "similarity": "BM25"}
    }
}

In [21]:
es_client = set_index(es_client=es_client, collected=collected, mappings=mappings) # 48 secs


In [172]:
results = get_result_tuples(es_client=es_client, questions=question_list, n=100)

In [173]:
from retrieval.evaluation.evaluate import calculate_recall_at_k, calculate_average_recall

In [174]:
from irmetrics.topk import recall

In [175]:
def recall_k(results, relevant_docs, k):
    """
    Compute Recall@K
    Input:
        results: A sorted list of tuples (document_id, score), with the most relevant document in the first position
        relevant_docs: A set of relevant documents.
        k: the cut-off
    Output: Recall@K
    """
    # Get unique document IDs from the results and relevant_docs
    unique_results = list(set(result[0] for result in results))
    unique_relevant_docs = list(set(relevant_docs))

    if k > len(unique_results):
        k = len(unique_results)

    relevant_count = 0
    for i in range(k):
        if unique_results[i] in unique_relevant_docs:  # Check if result is in relevant
            relevant_count += 1

    k_recall = float(relevant_count) / float(len(unique_relevant_docs))

    return k_recall

def get_k_urls(bm25_results):
    urls = []
    counter = 1
    for result in bm25_results:

        urls.append((result['url'], result['score']))
        counter+=1
    return urls

def calculate_recall_at_k(queries, search_results, relevant_docs, k_values, ranking_method):
    """
    Calculate Recall@K for a list of queries and specified values of k.
    Input:
        queries - a list of queries
        collection: a list of tuples (document_id, document_content)
        relevant_docs: a dictionary where the key is the query and the value is a set of relevant document IDs
        k_values: a list of k values for recall calculation
    Output: a dictionary where the key is the query and the value is a dictionary of recall values at each k
    """
    recall_results = {}

    for query in queries:
        if ranking_method=='bm25':
            results = get_k_urls(search_results[query][1])
        elif ranking_method=='tf-idf':
            results = search_results[query]
        relevant = relevant_docs[query]

        recall_values = {}
        for k in k_values:
            recall_ = recall(list(set(results)), list(set(relevant)), k=k)
            recall_values[k] = recall_

        recall_results[query] = recall_values

    return recall_results


def calculate_average_recall(recall_results):
    """
    Calculate the average recall values per recall type from a dictionary of recall values.
    Input:
        recall_results: a dictionary where the key is the query and the value is a dictionary of recall values at each k
    Output: a dictionary where the key is the recall type and the value is the average recall value
    """
    average_recall = {}

    for query_recall in recall_results.values():
        for k, recall_ in query_recall.items():
            if k not in average_recall:
                average_recall[k] = 0.0
            average_recall[k] += recall_

    query_count = len(recall_results)
    for k in average_recall:
        average_recall[k] /= query_count

    return average_recall

In [176]:
relevant_docs = get_relevant_docs(questions)

In [177]:
relevant_docs = get_relevant_docs(questions)
recalls = calculate_recall_at_k(question_list, results, relevant_docs, [1, 5, 10, 15], 'bm25') # think of a metric similarity recall? 
calculate_average_recall(recalls)

ValueError: operands could not be broadcast together with shapes (74,) (77,) (74,) 

In [29]:
! pip install ir-metrics

Collecting ir-metrics
  Downloading ir_metrics-0.1.6-py3-none-any.whl (9.6 kB)
Installing collected packages: ir-metrics
Successfully installed ir-metrics-0.1.6


In [30]:
from irmetrics.topk import recall

In [178]:
predictions = []
for question in results.keys():
    urls = []
    for result in results[question][1]:
        urls.append(result['url'])
    predictions.append(list(set(urls)))

In [179]:
true = []
for question in relevant_docs.keys():
    true.append(list(set(relevant_docs[question])))

In [180]:
import numpy as np
from irmetrics.topk import rr

# Calculate the Mean Reciprocal Rank for each question
mrr_values = []
for i in range(len(predictions)):
    true_values = true[i]
    mrr = rr(true_values, predictions[i], k =10)
    mrr_values.append(mrr)

# Calculate the average Mean Reciprocal Rank
average_mrr = np.mean(mrr_values)

print("Average Mean Reciprocal Rank:", average_mrr)


Average Mean Reciprocal Rank: 0.031638071895424835


In [186]:
import numpy as np
from irmetrics.topk import recall

# Calculate the Mean Reciprocal Rank for each question
mrr_values = []
for i in range(len(predictions)):
    true_values = true[i]
    mrr = recall(true_values, predictions[i], k=5)
    mrr_values.append(mrr)

# Calculate the average Mean Reciprocal Rank
average_recall = np.mean(mrr_values)

print("Average Mean Reciprocal Rank:", average_recall)

Average Mean Reciprocal Rank: 0.0661764705882353


In [182]:
import numpy as np
from irmetrics.topk import precision


# Calculate the Mean Reciprocal Rank for each question
mrr_values = []
for i in range(len(predictions)):
    true_values = true[i]
    mrr = precision(true_values, predictions[i], k=10)
    mrr_values.append(mrr)

# Calculate the average Mean Reciprocal Rank
average_recall = np.mean(mrr_values)

print("Average Mean Reciprocal Rank:", average_recall)

Average Mean Reciprocal Rank: 0.012499999999999999


In [94]:
recall(predictions, true, k=10).mean()

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (136,) + inhomogeneous part.

In [68]:
urls

['https://www.amsterdam.nl/publish/pages/931968/raadsbrief_positief_perspectief_-_weerbaar_opgroeien_in_amsterdam.pdf',
 'https://www.rijksoverheid.nl/onderwerpen/veilig-leren-en-werken-in-het-onderwijs/veiligheid-op-school',
 'https://www.rijksoverheid.nl/documenten/kamerstukken/2021/11/02/antwoorden-kamervragen-over-het-analyseren-van-data-van-de-proco-app',
 'https://www.ggd.amsterdam.nl/gezond-wonen/zwemmen-open-water/',
 'https://www.rijksoverheid.nl/actueel/nieuws/2021/09/16/hulpteams-bij-het-vinden-van-werk-overal-in-het-land-van-start',
 'https://www.rijksoverheid.nl/documenten/kamerstukken/2021/05/11/antwoorden-kamervragen-van-het-lid-kathmann-pvda-over-sancties-voor-racisme-en-discriminatie-binnen-de-politie',
 'https://www.rijksoverheid.nl/actueel/nieuws/2022/07/08/verbod-op-ongerichte-gokreclames-en-sponsoring',
 'https://www.amsterdam.nl/veelgevraagd/?caseid=%7BD6E280FB-4A76-40A0-9B88-12B87E446FA6%7D',
 'https://www.amsterdam.nl/sociaaldomein/zorg-jeugd/artikelen/specialis

In [71]:
list(results.keys())[5]

'\n \n1. Is het college het met de fractie van GroenLinks eens dat de grote hoeveelheid aan \ngokreclame het risico op problematisch gokgedrag en risico op verslaving vergroot?  \n \n'

In [70]:
question_list[5]

'\n \n1. Is het college het met de fractie van GroenLinks eens dat de grote hoeveelheid aan \ngokreclame het risico op problematisch gokgedrag en risico op verslaving vergroot?  \n \n'

In [69]:
relevant_docs[question_list[5]]

['https://www.rijksoverheid.nl/actueel/nieuws/2022/07/08/verbod-op-ongerichte-gokreclames-en-sponsoring']

In [55]:
list(results.keys())[1]

'\n7.\nKan het college de reeds bestaande zwemplekken in Amsterdam en de directe \nomgeving, die zich op fietsafstand bevinden, beter communiceren zodat mensen \nweten waar ze allemaal heen kunnen op de fiets om te zwemmen?\n'

In [73]:
set(relevant_docs[question_list[5]])

{'https://www.rijksoverheid.nl/actueel/nieuws/2022/07/08/verbod-op-ongerichte-gokreclames-en-sponsoring'}

In [83]:
recall(list(set(relevant_docs[question_list[5]])), list(set(urls)), k=10)

1.0

In [43]:
results[list(results.keys())[0]][1][0]

{'url': 'https://www.rivm.nl/mpox-apenpokken',
 'text': '\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nMpox (apenpokken) | RIVM\n\n\n\n\n\n\n\n\n\n\n\nOverslaan en naar de inhoud gaan\nDirect naar de hoofdnavigatie\n\n\n\n\n\n \n \nRijksinstituut voor Volksgezondheiden Milieu\nMinisterie van Volksgezondheid,Welzijn en Sport\n\n\n\n\n\n\n\n\nNederlands\n\n\nEnglish\n\n\n\n\n\n\n\nRIVM De zorg voor morgen begint vandaag \n          \n\n\n\n\n\n\nMenuMenu ingeklapt\n\n\n\n\nHome\n\n\nOnderwerpen\n\n\nOver RIVM\n\n\nPublicaties\n\n\nInternationaal\n\n\nContact\n\n\nAgenda\n\n\n\n\nNederlands\n\n\nEnglish\n\n\n\nZoekformulier ingeklapt\n\n\n\n\n\n\nZoeken\n\n\n\n\n\nZoeken\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\n\n\n              Mpox (apenpokken)\n          \n\n Mpox (apenpokken)\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n× In dit onderwerp Menu\n\n\n\n\n\n\n\nVaccinatie \n\n\nPrivacy\n\n\n\n\nVaccin\n\n\nVeelgestelde vragen\n\n\n\n\n\n\n\n\n\n\nScience Photo Library / ANP\n\n\n\n\n  Mpox, vaak apenpokken of apenpokkenvir

#### Run to save results

In [15]:
import pickle
with open ('data/results/bm25_results_short.pickle', 'wb') as f:
    pickle.dump(results, f)