In [None]:
# convert dataframe to a list of text? 

In [1]:
%cd /Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/

/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions


In [2]:
import sys
import os
src_dir = os.path.join(os.getcwd(), 'src')
sys.path.append(src_dir)

from read_data.read_data import read_urls_questions, get_questions, get_url_content_tuples, get_relevant_docs
from elasticsearch import Elasticsearch
from retrieval.sparse_retrieval.bm25 import set_index_paragraphs, query_es_index_paragraphs, get_result_tuples

In [3]:
es_client = Elasticsearch("http://localhost:9200")

In [4]:
mappings = {
    "properties": {
        "url": {"type": "text"},
        "text": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
        "h1": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
        "h2": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
        "h3": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
        "h4": {"type": "text", "analyzer": "standard", "similarity": "BM25"}
    }
}

In [5]:
import pandas as pd

In [6]:
collected = pd.read_csv('data/collected/paragraphs.csv')

In [7]:
es_client = set_index_paragraphs(es_client=es_client, collected=collected, mappings=mappings) # 8:44 minutes

In [19]:
col, questions =  read_urls_questions(os.path.join('data/reference_urls/reference_urls_collected.csv'),
                                            os.path.join('data/question_answer/questions_updated_urls.csv'),
                                            clean_url_nan=True) # read collected urls and questions + remove unsuccessful collection

In [20]:
question_list = get_questions(questions)

In [21]:
# Assuming your DataFrame is named 'questions' and the column with URLs is named 'URLs'
filtered_questions = questions[questions['Cleaned_URLs'].apply(lambda urls: any(url.startswith('https://www.amsterdam.nl') for url in urls))]

# The 'filtered_questions' DataFrame will contain only the questions with at least one URL starting with 'https://amsterdam.nl'

In [22]:
col_urls = set(list(collected['url'])) # check how many common items i have - unfortunately only 20
list_of_lists= list(filtered_questions['Cleaned_URLs'])
flattened_list = [item for sublist in list_of_lists for item in sublist]
# Assuming your two lists are named 'list1' and 'list2'
common_items = set(col_urls).intersection(flattened_list)

In [23]:
filtered_questions2 = questions[questions['URLs'].apply(lambda urls: any(url in urls for url in common_items))]

# The 'filtered_questions' DataFrame will contain only the questions with URLs present in the 'url_list'


In [24]:
question_list = get_questions(filtered_questions2)

In [25]:
len(question_list)

23

In [26]:
def get_result_tuples(es_client, questions, index_name='qa_attempt1', n=10):

    results = {}  # To store the results for all questions

    for question in questions:
        doc, res_list = query_es_index_paragraphs(question, es_client, index_name=index_name, n_results=n)
        results[question] = (doc, res_list)  # it took 9 seconds to finish 
    
    return results

In [34]:
results = get_result_tuples(es_client=es_client, questions=question_list, n=250) # 8 seconds

In [35]:
from retrieval.evaluation.evaluate import calculate_recall_at_k, calculate_average_recall

In [71]:
def recall_k(results, relevant_docs, k):
    """
    Compute Recall@K
    Input:
        results: A sorted list of tuples (document_id, score), with the most relevant document in the first position
        relevant_docs: A set of relevant documents.
        k: the cut-off
    Output: Recall@K
    """
    # Get unique document IDs from the results
    unique_results = list(set(result[0] for result in results))

    if k > len(unique_results):
        k = len(unique_results)

    relevant_count = 0
    for i in range(k):
        if unique_results[i] in relevant_docs:  # Check if result is in relevant
            relevant_count += 1

    k_recall = float(relevant_count) / float(len(relevant_docs))

    return k_recall

def get_k_urls(bm25_results):
    urls = []
    counter = 1
    for result in bm25_results:

        urls.append((result['url'], result['score']))
        counter+=1
    return urls

def calculate_recall_at_k(queries, search_results, relevant_docs, k_values, ranking_method):
    """
    Calculate Recall@K for a list of queries and specified values of k.
    Input:
        queries - a list of queries
        collection: a list of tuples (document_id, document_content)
        relevant_docs: a dictionary where the key is the query and the value is a set of relevant document IDs
        k_values: a list of k values for recall calculation
    Output: a dictionary where the key is the query and the value is a dictionary of recall values at each k
    """
    recall_results = {}

    for query in queries:
        if ranking_method=='bm25':
            results = get_k_urls(search_results[query][1])
        elif ranking_method=='tf-idf':
            results = search_results[query]
        relevant = relevant_docs[query]

        recall_values = {}
        for k in k_values:
            recall = recall_k(results, relevant, k)
            recall_values[k] = recall

        recall_results[query] = recall_values

    return recall_results


def calculate_average_recall(recall_results):
    """
    Calculate the average recall values per recall type from a dictionary of recall values.
    Input:
        recall_results: a dictionary where the key is the query and the value is a dictionary of recall values at each k
    Output: a dictionary where the key is the recall type and the value is the average recall value
    """
    average_recall = {}

    for query_recall in recall_results.values():
        for k, recall in query_recall.items():
            if k not in average_recall:
                average_recall[k] = 0.0
            average_recall[k] += recall

    query_count = len(recall_results)
    for k in average_recall:
        average_recall[k] /= query_count

    return average_recall

In [72]:
relevant_docs = get_relevant_docs(questions)
recalls = calculate_recall_at_k(question_list, results, relevant_docs, [5, 10, 20, 30, 40, 50, 250], 'bm25') # think of a metric similarity recall? 
calculate_average_recall(recalls) # recall at 20 is 0,26 which is pretty good considering we query a huge set of documents 

{5: 0.0,
 10: 0.0,
 20: 0.08695652173913043,
 30: 0.08695652173913043,
 40: 0.13043478260869565,
 50: 0.13043478260869565,
 250: 0.2826086956521739}

In [59]:
for item in common_items:
    print(item)

https://www.amsterdam.nl/parkeren/parkeren-reizen/
https://www.amsterdam.nl/veelgevraagd/?caseid=%7BD6E280FB-4A76-40A0-9B88-12B87E446FA6%7D
https://www.amsterdam.nl/bestuur-organisatie/volg-beleid/groen/bomen/
https://www.amsterdam.nl/veelgevraagd/?productid=%7B249D3A8E-ED07-4E4C-BFAD-49F174342FD5%7D
https://www.amsterdam.nl/bestuur-organisatie/organisatie/overige/acvz/verwerking-persoonsgegevens-acvz/
https://www.amsterdam.nl/wonen-leefomgeving/wonen/funderingsloket/
https://www.amsterdam.nl/zorg-ondersteuning/ondersteuning/vluchtelingen/24-uursopvang-ongedocumenteerden/
https://www.amsterdam.nl/wonen-leefomgeving/duurzaam-amsterdam/windmolens-amsterdam/reflectiefase/
https://www.amsterdam.nl/wonen-leefomgeving/zelfbouw/
https://www.amsterdam.nl/verkeer-vervoer/fiets/fietsdepot/fiets-graveren/
https://www.amsterdam.nl/projecten/noorderpark/
https://www.amsterdam.nl/veelgevraagd/?caseid=%7B2A574844-AA85-4A2C-8CD3-8CB494F4997E%7D
https://www.amsterdam.nl/wonen-leefomgeving/zelfbouw/woon

In [40]:
for item in set(flattened_list):
    if item not in col_urls:
        print(item)

https://www.amsterdam.nl/zorg-ondersteuning/ondersteuning/dak-of-thuisloos/?vkurl=dakloos
https://www.amsterdam.nl/sociaaldomein/nederlandse-taal/kwaliteitsconvenant-taal-inburgering/
https://www.amsterdam.nl/komop
https://www.amsterdam.nl/bestuur-organisatie/volg-beleid/coalitieakkoord-uitvoeringsagenda/gezonde-duurzame-stad/klimaatneutraal/
https://www.amsterdam.nl/nieuws/kennisgevingen-bekendmakingen/
https://www.amsterdam.nl/nrga/
https://www.rivm.nl/publicaties/health-effects-related-to-wind-turbine-sound-update
https://www.amsterdam.nl/sociaaldomein/onderwijs-leerplicht/mbo-agenda/
https://www.amsterdam.nl/sociaaldomein/zorgprofessionals/huiselijk-geweld-en/adviseurs-sociale-basis-(preventie-hgkm)/
https://www.amsterdam.nl/wonen-leefomgeving/groene-stad
https://www.amsterdam.nl/projecten/arenapoort
https://www.amsterdam.nl/veelgevraagd/?productid=%7B87FAD1C9-60E9-4CEA-B9AE-6D5594A0E841%7D#case_%7B63E55F58-F93C-4A68-BEAA-896C8F8FBBB1%7D)
https://www.amsterdam.nl/ondernemen/onderst

In [60]:
predictions = []
for question in results.keys():
    urls = []
    for result in results[question][1]:
        urls.append(result['url'])
    predictions.append(list(set(urls)))

In [61]:
true = []
for question in relevant_docs.keys():
    true.append(list(set(relevant_docs[question])))

In [70]:
import numpy as np
from irmetrics.topk import recall

# Calculate the Mean Reciprocal Rank for each question
mrr_values = []
for i in range(len(predictions)):
    true_values = true[i]
    mrr = recall(true_values, predictions[i], k=30)
    mrr_values.append(mrr)

# Calculate the average Mean Reciprocal Rank
average_recall = np.mean(mrr_values)

print("Average Mean Reciprocal Rank:", average_recall)


Average Mean Reciprocal Rank: 0.0
