### Setup CD

In [1]:
%cd  /Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/

/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions


In [2]:
import pandas as pd
import ast
import nltk
import pandas as pd
nltk.download('punkt')
from hashids import Hashids

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data

In [3]:
questions_for_ranking = pd.read_csv('data/question_answer/amsterdam_questions.csv')
collection = pd.read_csv('data/amsterdam/amsterdam_full.csv')

### Prepare Data for ranking

#### Convert Data to Paragraphs

##### Passages function

In [11]:
def create_passages_dataframe(collection):
    all_passages_df = pd.DataFrame(columns=['URL', 'Textual_Content'])

    for index, row in collection.iterrows():
        url = row['URL']
        content = str(row['Textual_Content'])

        sentences = nltk.sent_tokenize(content, language='dutch')

        passages = []
        passage_words = []
        word_count = 0

        for sentence in sentences:
            sentence_words = nltk.word_tokenize(sentence, language='dutch')

            while len(sentence_words) > 0:
                if word_count + len(sentence_words) <= 100:
                    passage_words.extend(sentence_words)
                    word_count += len(sentence_words)
                    sentence_words = []
                else:
                    remaining_words = 100 - word_count
                    passage_words.extend(sentence_words[:remaining_words])
                    passages.append(' '.join(passage_words))
                    passage_words = []
                    word_count = 0
                    sentence_words = sentence_words[remaining_words:]

        if passage_words:
            passages.append(' '.join(passage_words))

        passage_df = pd.DataFrame({'URL': url, 'Textual_Content': passages})
        all_passages_df = pd.concat([all_passages_df, passage_df], ignore_index=True)

    return all_passages_df


##### Create passages

In [12]:
passages_df = create_passages_dataframe(collection) # 50 secs for execution
len(passages_df)

60902

##### Add passage ids

In [13]:
hashids = Hashids()
passages_df["id"] = [hashids.encode(i) for i in range(len(passages_df))]

In [14]:
passages_df.tail()

Unnamed: 0,URL,Textual_Content,id
60897,https://www.amsterdam.nl/nieuws/kennisgevingen...,evenementen . Lijst Over deze site Privacy Coo...,r1vk
60898,https://www.amsterdam.nl/zorg-ondersteuning/on...,Hulp voor dak- of thuislozen - Gemeente Amster...,v1zg
60899,https://www.amsterdam.nl/zorg-ondersteuning/on...,Op zoek naar ( tijdelijke ) woonruimte Opvang ...,w1Am
60900,https://www.amsterdam.nl/zorg-ondersteuning/on...,onverzekerden Collectieve zorgverzekering Amst...,g0GZ
60901,https://www.amsterdam.nl/zorg-ondersteuning/on...,tot inspraakavonden . Wat organiseert de gemee...,j1kY


##### Add paragraphs ids to questions

In [15]:
def add_passages_ids(questions_for_ranking, passages_df):
    matching_ids_column = []
    for index, row in questions_for_ranking.iterrows():
        matching_ids = []
        for url in ast.literal_eval(row['URLs']):
            for index2, row2 in passages_df.iterrows():
                if url == row2['URL']:
                    matching_ids.append(row2['id'])
        if matching_ids:
            matching_ids_column.append(matching_ids)
        else:
            matching_ids_column.append(None)
    questions_for_ranking['passages_ids'] = matching_ids_column
    return questions_for_ranking


In [16]:
questions_for_ranking = add_passages_ids(questions_for_ranking, passages_df)

In [17]:
questions_for_ranking.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Year,Month,Question,Answer,Document,URLs,Cleaned_URLs,passages_ids
0,1,1,2020,6,\n7.\nKan het college de reeds bestaande zwemp...,\nVoor het vinden van de officiële zwemplekken...,https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/veelgevraagd/?casei...,['https://www.amsterdam.nl/veelgevraagd/?casei...,"[y8RV, zm0r, AnyB, Bgzx]"
1,6,7,2021,8,\n \n3. Huisartsen geven aan meer informatie n...,"\nDe uitvoerder van de regeling, het CAK, lij...",https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/zorg-ondersteuning/...,['https://www.amsterdam.nl/zorg-ondersteuning/...,"[nVkY, o9lA, p6mX, qXnD, rNoB, v7vL, w8wg, xNx..."
2,7,8,2021,8,\n \n8. Weten ongedocumenteerden de weg naar m...,"\nDe Kruispost wordt goed bezocht, maar het c...",https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/zorg-ondersteuning/...,['https://www.amsterdam.nl/zorg-ondersteuning/...,"[nVkY, o9lA, p6mX, qXnD, rNoB, v7vL, w8wg, xNx..."
3,9,14,2022,7,\n \n4. Is het college tot nu toe tevreden met...,"\nJa, met de beschikbare middelen is de uitvo...",https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/wonen-leefomgeving/...,['https://www.amsterdam.nl/wonen-leefomgeving/...,"[zkqm, Ap8P, Bq1N, Dvwq, EwK0]"
4,10,16,2019,10,\n \n1. \nKan aan de werkinstructie worden toe...,...,https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/privacy/loket/'],['https://www.amsterdam.nl/privacy/loket/'],"[G0w8, JBEP, KDAY, LEAA]"


##### Add question ids 

In [18]:
hashids = Hashids()
questions_for_ranking["question_id"] = [hashids.encode(i) for i in range(len(questions_for_ranking))]

In [19]:
questions_for_ranking.tail()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Year,Month,Question,Answer,Document,URLs,Cleaned_URLs,passages_ids,question_id
68,124,178,2014,8,\n \n2. Hoe evalueert het college het function...,\nOp 22 september 2009 stemde het toenmalige...,https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/veelgevraagd/?produ...,['https://www.amsterdam.nl/veelgevraagd/?produ...,"[wjXR, g5A6, j2E5, k5GX, lOJ6, mwK0, nZLl, oYM...",Nk6
69,127,181,2022,5,\n3. Is het college bereid om te investeren in...,\nHet college investeert al in de signalerend...,https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/sociaaldomein/zorgp...,['https://www.amsterdam.nl/sociaaldomein/zorgp...,"[0KpG, g0Kl, j1Mz, k1KY, l1LJ, m56A, n1XD]",OYp
70,128,182,2017,5,\n \n2. Deelt het college de mening dat het ee...,"\nNee, het college deelt deze mening van de P...",https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/nrga/'],['https://www.amsterdam.nl/nrga/'],"[o1KN, p1Kr, q1XR, r1Nk, v17g, w18m, xgNn, yjO...",PNw
71,129,183,2020,2,\n \n \n6. Recent heeft de burgemeester aange...,\na) Het verlenen van een vergunning is een b...,https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/nieuws/kennisgeving...,['https://www.amsterdam.nl/nieuws/kennisgeving...,"[j1Vz, k1VY, l1VJ, m5VA, n1VD, o19N, p16r, q1r...",QWl
72,134,193,2020,3,\n \n12. Welke informatievoorziening is er van...,\nZorgaanbieders zijn verantwoordelijk voor h...,https://amsterdam.raadsinformatie.nl/document/...,['https://www.amsterdam.nl/zorg-ondersteuning/...,['https://www.amsterdam.nl/zorg-ondersteuning/...,"[n1GE, o1A3, p19N, q1AG, v1zg, w1Am, g0GZ, j1kY]",R6q


#### remove unused columns

In [20]:
questions_for_ranking.drop('Unnamed: 0.1', axis=1, inplace=True)

In [21]:
questions_for_ranking.drop('Unnamed: 0', axis=1, inplace=True)

In [22]:
questions_for_ranking.drop('Cleaned_URLs', axis=1, inplace=True)

#### prepare dataset

In [23]:
from datasets import Dataset
import pyarrow as pa

In [24]:
arrow_table = pa.Table.from_pandas(questions_for_ranking)
arrow_dict = arrow_table.to_pydict()
questions_for_ranking = Dataset.from_dict(arrow_dict)

In [25]:
questions_for_ranking[0]

{'Year': 2020,
 'Month': 6,
 'Question': '\n7.\nKan het college de reeds bestaande zwemplekken in Amsterdam en de directe \nomgeving, die zich op fietsafstand bevinden, beter communiceren zodat mensen \nweten waar ze allemaal heen kunnen op de fiets om te zwemmen?\n',
 'Answer': '\nVoor het vinden van de officiële zwemplekken is er informatie op de site \nhttps://www.zwemwater.nl/.Hier is ook informatie te vinden over veilig zwemmen en gezondheidsrisico’s en het \nCorona protocol zwemmen en recreëren in en aan oppervlaktewater.\nOp de kaart met zwemwater, stadsstranden en fonteinen geeft de gemeente \nAmsterdam een overzicht van zwemplekken in open water: \nhttps://maps.amsterdam.nl/zwemwater/ . \nZie tevens de site: https://www.amsterdam.nl/veelgevraagd/?caseid=%7BD6E280FB-\n4A76-40A0-9B88-12B87E446FA6%7D\nVoorts is het voor de gezondheidsrisico’s goed kennis te nemen van de site van de \nGGD over het zwemmen in open water: https://www.ggd.amsterdam.nl/gezond-\nwonen/zwemmen-open-wate

In [26]:
arrow_table = pa.Table.from_pandas(passages_df)
arrow_dict = arrow_table.to_pydict()
passages_df = Dataset.from_dict(arrow_dict)

In [27]:
passages_df[1000]

{'URL': 'https://www.amsterdam.nl/veelgevraagd/meer-veel-gezochte-vragen/?view=top&categoryid=%7B11ADEA27-898E-482C-97D4-48CFCEFE550C%7D',
 'Textual_Content': 'Wat mag er in de broodcontainer ? Wat mag er bij het blad- en snoeiafval ? Deel deze pagina Deel deze pagina op Facebook Deel deze pagina op Twitter Deel deze pagina op LinkedIn Deel deze pagina op WhatsApp Print deze pagina Gemeente Amsterdam Contact Hebt u een vraag en kunt u het antwoord niet vinden op deze website ? Neem dan contact met ons op . Contactformulier Bel het telefoonnummer 14 020maandag tot en met vrijdag van 08.00 tot 18.00 uur Contactgegevens en openingstijden Volg de gemeente Nieuwsbrief Amsterdam Twitter Facebook Instagram LinkedIn YouTube Werkenbij Kalender Van buurtactiviteiten tot',
 'id': 'gN3'}

In [28]:
type(passages_df)

datasets.arrow_dataset.Dataset

## Ranking Amsterdam

### TF-IDF search

#### perform search

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
import heapq

In [30]:
def tfidf_search(query, vectorizer, matrix, collection, k):
    """
    Perform a search over all documents with the given query using tf-idf.
    Input:
        query - an (unprocessed) query
        vectorizer: a fitted TfidfVectorizer
        matrix: the document-term matrix obtained from the fitted vectorizer
        collection: a list of tuples (document_id, document_content)
        k: the number of top search results to retrieve
    Output: a list of (document_id, score, document_content), sorted in descending relevance to the given query
    """
    # preprocess the query # could preprocess more?
    preprocessed_query = str(query['Question']).lower().replace('\n', '')

    query_text = query['Question']
    true_documents = query['passages_ids']

    query_vector = vectorizer.transform([preprocessed_query])
    question_id = query['question_id']

    cosine_similarities = matrix.dot(query_vector.T).toarray().flatten()

    results_heap = []  # heap for efficiency to maintain the top-k results

    # iterate over the collection and update the heap with the top-k results
    for i, doc in enumerate(collection):
        doc_id = doc['id']
        score = cosine_similarities[i]
        document_content = doc['Textual_Content']
        heapq.heappush(results_heap, (score, doc_id, document_content))
        if len(results_heap) > k:
            heapq.heappop(results_heap)

    results_heap.sort(reverse=True)  # sort descending
    top_results = results_heap[:k]

    cosine_scores = [result[0] for result in top_results]
    ranked_ids = [result[1] for result in top_results]
    ranked_text = [result[2] for result in top_results]

    # create a dictionary containing the search results
    search_results = {
        'question_id': question_id,
        'question': query['Question'],
        'ranked_ids': ranked_ids,
        'ranked_text': ranked_text,
        'true_passages': true_documents,
        'cosine_scores': cosine_scores, 
        'answer': query['Answer']
    }

    return search_results  # return top k


In [31]:
def perform_tfidf_search(queries, collection, k):
    """
    Perform TF-IDF search for each query in a list of queries.
    Input:
        queries - a list of queries
        collection: a list of tuples (document_id, document_content)
        k: the number of top search results to retrieve
    Output: a dictionary where the key is the query and the value is a list of (document_id, score, document_content) tuples
    """
    # extract document contents from the collection
    document_contents = [str(doc['Textual_Content']).lower().replace('\n', '') for doc in collection]

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(document_contents)

    search_results = []

    for query in queries:
        results = tfidf_search(query, vectorizer, matrix, collection, k=k)
        search_results.append(results)

    return search_results

In [32]:
results = perform_tfidf_search(questions_for_ranking, passages_df, k=100) # 2:18 mins

#### Evaluation

In [33]:
import numpy as np

In [34]:
from irmetrics.topk import recall, ap, ndcg, precision, rr, precision

In [35]:
from irmetrics.relevance import relevant_counts

In [36]:
#relevant_counts(relevant[:, np.newaxis], ranked[:, np.newaxis])

In [37]:
import warnings

### calculate metrics function

In [38]:
def calculate_average_metrics(results, k):
    recall_scores = []
    ndcg_scores = []
    rr_scores = []
    precision_scores = []

    for result in results:
        ranked = result['ranked_ids']
        relevant = result['true_passages']

        recall_k = recall(relevant, ranked, k)
        ndcg_k = calculate_ndcg(relevant, ranked, k)
        rr_k = rr(relevant, ranked, k)
        precision_k = precision(relevant, ranked, k)

        recall_scores.append(recall_k)
        ndcg_scores.append(ndcg_k)
        rr_scores.append(rr_k)
        precision_scores.append(precision_k)

    average_recall = sum(recall_scores) / len(recall_scores)
    average_ndcg = sum(ndcg_scores) / len(ndcg_scores)
    average_rr = sum(rr_scores) / len(rr_scores)
    average_precision = sum(precision_scores) / len(precision_scores)

    average_metrics = {
        'average_recall@{}'.format(k): average_recall,
        'average_ndcg@{}'.format(k): average_ndcg,
        'average_rr@{}'.format(k): average_rr,
        'average_precision@{}'.format(k): average_precision
    }

    return average_metrics


def calculate_ndcg(relevant, ranked, k):
    warnings.filterwarnings('ignore', 'invalid value encountered', RuntimeWarning)
    ndcg_k = ndcg(relevant, ranked, k)
    if np.isnan(ndcg_k) or np.isinf(ndcg_k):
        ndcg_k = 0.0
    return ndcg_k

In [39]:
ks = [5, 10, 100]
# for document 0 the results are quite high
for k in ks:
    print('\nMetrics for k={}:'.format(k))
    average_metrics = calculate_average_metrics(results, k)
    for metric, value in average_metrics.items():
        print('{}: {:.4f}'.format(metric, value))


Metrics for k=5:
average_recall@5: 0.0274
average_ndcg@5: 0.0539
average_rr@5: 0.0422
average_precision@5: 0.0274

Metrics for k=10:
average_recall@10: 0.0192
average_ndcg@10: 0.0577
average_rr@10: 0.0491
average_precision@10: 0.0164

Metrics for k=100:
average_recall@100: 0.0758
average_ndcg@100: 0.1008
average_rr@100: 0.0585
average_precision@100: 0.0078


##### Save TF-IDF Results

In [40]:
import pickle

In [41]:
with open('data/results_ranking/amsterdam_100words_ranked_tfidf.pickle', 'wb') as f:
    pickle.dump(results, f)

### Random

In [42]:
import random

def random_search(query, collection, k):
    """
    Perform random retrieval from the collection.
    Input:
        query - an (unprocessed) query
        collection: a list of tuples (document_id, document_content)
        k: the number of top search results to retrieve
    Output: a dictionary containing the search results
    """
    query_text = query['Question']
    true_documents = query['passages_ids']

    # Randomly select k documents from the collection
    random_results = random.sample(collection.to_list(), k)

    ranked_ids = [result['id'] for result in random_results]
    ranked_text = [result['Textual_Content'] for result in random_results]
    cosine_scores = []  # Assign random scores for demonstration purposes

    # Create a dictionary containing the search results
    search_results = {
        'question_id': query['question_id'],
        'question': query['Question'],
        'ranked_ids': ranked_ids,
        'ranked_text': ranked_text,
        'true_passages': true_documents,
        'cosine_scores': cosine_scores,
        'answer': query['Answer'],
    }

    return search_results


def perform_random_search(queries, collection, k):
    """
    Perform random retrieval search for each query in a list of queries.
    Input:
        queries - a list of queries
        collection: a pandas DataFrame representing the collection
        k: the number of top search results to retrieve
    Output: a list of dictionaries containing the search results
    """
    search_results = []

    for query in queries:
        results = random_search(query, collection, k=k)
        search_results.append(results)

    return search_results


In [43]:
results_random = perform_random_search(questions_for_ranking, passages_df, k=100) # 18 secs

In [44]:
ks = [5, 10, 100]
# for document 0 the results are quite high
for k in ks:
    print('\nMetrics for k={}:'.format(k))
    average_metrics = calculate_average_metrics(results_random, k)
    for metric, value in average_metrics.items():
        print('{}: {:.4f}'.format(metric, value))


Metrics for k=5:
average_recall@5: 0.0000
average_ndcg@5: 0.0000
average_rr@5: 0.0000
average_precision@5: 0.0000

Metrics for k=10:
average_recall@10: 0.0000
average_ndcg@10: 0.0000
average_rr@10: 0.0000
average_precision@10: 0.0000

Metrics for k=100:
average_recall@100: 0.0000
average_ndcg@100: 0.0000
average_rr@100: 0.0000
average_precision@100: 0.0000


### BM25

##### functions

In [45]:
from rank_bm25 import BM25Okapi
from rank_bm25 import BM25L
from rank_bm25 import BM25Plus

In [46]:
def bm25_search(query, corpus, bm25, k):
    """
    Perform a search over all documents with the given query using BM25 ranking.
    Input:
        query - an (unprocessed) query
        collection: a list of document contents
        bm25: initialized BM25Okapi object
        k: the number of top search results to retrieve
    Output: a list of (document_id, score, document_content), sorted in descending relevance to the given query
    """
    # Preprocess the query
    preprocessed_query = str(query['Question']).lower().replace('\n', '')
    query_tokens = preprocessed_query.split()

    scores = bm25.get_scores(query_tokens)

    results_heap = []

    # Iterate over the corpus and update the heap with the top-k results
    for i, doc in enumerate(corpus):
        doc_id = doc['id']
        score = scores[i]
        document_content = doc['Textual_Content']
        heapq.heappush(results_heap, (score, doc_id, document_content))
        if len(results_heap) > k:
            heapq.heappop(results_heap)

    results_heap.sort(reverse=True)  # Sort descending
    top_results = results_heap[:k]

    bm25_scores = [result[0] for result in top_results]
    ranked_ids = [result[1] for result in top_results]
    ranked_text = [result[2] for result in top_results]

    # Create a dictionary containing the search results

    query_text = query['Question']
    true_documents = query['passages_ids']
    question_id = query['question_id']

    search_results = {
    'question_id': question_id,
    'question': query_text,
    'ranked_ids': ranked_ids,
    'ranked_text': ranked_text,
    'true_passages': true_documents,
    'scores': bm25_scores, 
    'answer': query['Answer']
    }

    return search_results


def perform_bm25_search(queries, corpus, k):
    """
    Perform BM25 search for each query in a list of queries.
    Input:
        queries - a list of queries
        corpus: a list of document contents
        k: the number of top search results to retrieve
    Output: a list of dictionaries, where each dictionary contains the search results for a query
    """
    # Preprocess the corpus
    tokenized_corpus = [doc['Textual_Content'].lower().replace('\n', '').split() for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)

    search_results = []

    for query in queries:
        results = bm25_search(query, corpus, bm25, k=k)
        search_results.append(results)

    return search_results


In [47]:
results_bm25 = perform_bm25_search(questions_for_ranking, passages_df, k=100) # 3:22 mins

In [49]:
counter = []
for p in passages_df:
    counter.append(len(p['Textual_Content'].split()))

In [50]:
max_value = max(counter)
max_indices = [index for index, value in enumerate(counter) if value == max_value]

##### evaluate

In [52]:
ks = [5, 10, 100]
# for document 0 the results are quite high
for k in ks:
    print('\nMetrics for k={}:'.format(k))
    average_metrics = calculate_average_metrics(results_bm25, k)
    for metric, value in average_metrics.items():
        print('{}: {:.4f}'.format(metric, value)) #BM25 +


Metrics for k=5:
average_recall@5: 0.0027
average_ndcg@5: 0.0086
average_rr@5: 0.0068
average_precision@5: 0.0027

Metrics for k=10:
average_recall@10: 0.0027
average_ndcg@10: 0.0083
average_rr@10: 0.0068
average_precision@10: 0.0027

Metrics for k=100:
average_recall@100: 0.0211
average_ndcg@100: 0.0347
average_rr@100: 0.0106
average_precision@100: 0.0029


In [55]:
with open('data/results_ranking/amsterdam_100words_ranked_bm25.pickle', 'wb') as f:
    pickle.dump(results_bm25, f)

In [56]:
with open('data/results_ranking/amsterdam_100words_ranked_random.pickle', 'wb') as f:
    pickle.dump(results_random, f)

### Evaluate ROUGE

In [57]:
from nltk.tokenize import word_tokenize

In [58]:
# Set the language to Dutch
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [59]:
ans_len = []
for q in questions_for_ranking:
    ans_len.append(len(word_tokenize(q['Answer'], language='dutch')))

In [60]:
sum(ans_len)/len(ans_len) # 256.87

256.8767123287671

In [61]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import corpus_bleu
from sklearn.metrics import f1_score

In [62]:
from nltk.translate.bleu_score import SmoothingFunction

In [63]:
def calculate_metrics(generated_summaries, reference_summaries):
    # Convert non-string elements to strings
    generated_summaries = [str(summary) for summary in generated_summaries]
    reference_summaries = [str(summary) for summary in reference_summaries]


    # Compute ROUGE scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'])
    rouge_scores = []
    for generated_summary, reference_summary in zip(generated_summaries, reference_summaries):
        scores = scorer.score(generated_summary, reference_summary)
        rouge_scores.append(scores)

    # Compute BLEU score
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_summaries], [gen.split() for gen in generated_summaries], smoothing_function=SmoothingFunction().method1)

    # Compute F1 score
    f1 = f1_score(reference_summaries, generated_summaries, average='micro')  # Adjust 'average' parameter as needed


    # Access the scores as needed
    metrics = {
        "ROUGE-1 (Average)": sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores),
        "ROUGE-2 (Average)": sum([score['rouge2'].fmeasure for score in rouge_scores]) / len(rouge_scores),
        "ROUGE-L (Average)": sum([score['rougeL'].fmeasure for score in rouge_scores]) / len(rouge_scores),
        "BLEU Score": bleu_score,
        "F1 Score": f1
    }

    return metrics


In [64]:
def simulate_answer(list_passages):
    concatenated_string = ""  # Initialize the concatenated string
    for paragraph in list_passages:
        tokens = word_tokenize(paragraph)
        if len(word_tokenize(concatenated_string)) + len(tokens) <= 256:
            concatenated_string += " " + paragraph
        else:
            break  # Stop concatenating if the token count exceeds 256
    return concatenated_string

In [65]:
simulated_answers = []
for result in results:
    simulated_answers.append(simulate_answer(result['ranked_text']))

In [66]:
answers = []
for question in questions_for_ranking:
    answers.append(question['Answer'])


In [67]:
calculate_metrics(simulated_answers, answers)

{'ROUGE-1 (Average)': 0.2313289582763451,
 'ROUGE-2 (Average)': 0.0344184919464686,
 'ROUGE-L (Average)': 0.12278253414413527,
 'BLEU Score': 0.007980492756711336,
 'F1 Score': 0.0}

In [68]:
simulated_answers_random = []
for result in results_random:
    simulated_answers_random.append(simulate_answer(result['ranked_text']))

In [69]:
calculate_metrics(simulated_answers_random, answers)

{'ROUGE-1 (Average)': 0.19952015091584155,
 'ROUGE-2 (Average)': 0.02156878779122388,
 'ROUGE-L (Average)': 0.1092907814352237,
 'BLEU Score': 0.0032005437617806634,
 'F1 Score': 0.0}

In [70]:
simulated_answers_bm25 = []
for result in results_bm25:
    simulated_answers_bm25.append(simulate_answer(result['ranked_text']))

In [71]:
calculate_metrics(simulated_answers_bm25, answers)

{'ROUGE-1 (Average)': 0.2538702529604056,
 'ROUGE-2 (Average)': 0.03811834858547292,
 'ROUGE-L (Average)': 0.1352721216338226,
 'BLEU Score': 0.00818301351999974,
 'F1 Score': 0.0}

## Ranking all with best ranker

### Get BM25 for all questions

#### preprocess data

In [72]:
questions = pd.read_csv('data/question_answer/questions.csv')

In [74]:
hashids = Hashids()
questions["question_id"] = [hashids.encode(i) for i in range(len(questions))]

In [75]:
questions.head()

Unnamed: 0,Year,Month,Question,Answer,Document,URLs,question_id
0,2018,12,\n \n1. Heeft het college kennisgenomen van de...,\nNee.,https://amsterdam.raadsinformatie.nl/document/...,,gY
1,2018,12,\n \n2. Kan het college bevestigen of dit lesm...,"\nNee, het college heeft hier geen zicht op. ...",https://amsterdam.raadsinformatie.nl/document/...,,jR
2,2018,12,\n \n ...,\nHet CIDI is duidelijk over de eigen doelste...,https://amsterdam.raadsinformatie.nl/document/...,,k5
3,2018,12,\n \n4. Is het college bekend met de jaarlijks...,\nHet college heeft hier kennis van genomen.,https://amsterdam.raadsinformatie.nl/document/...,,l5
4,2018,12,\n \na. Is het college van oordeel dat het CID...,vraag 4a: \nHet college is voor een pluriform...,https://amsterdam.raadsinformatie.nl/document/...,,mO


In [76]:
arrow_table = pa.Table.from_pandas(questions)
arrow_dict = arrow_table.to_pydict()
questions = Dataset.from_dict(arrow_dict)

In [77]:
questions[0]

{'Year': '2018',
 'Month': '12',
 'Question': '\n \n1. Heeft het college kennisgenomen van de genoemde publicatie van het CIDI?  \n \n',
 'Answer': ' \nNee.  ',
 'Document': 'https://amsterdam.raadsinformatie.nl/document/7170632/2/1382_19_Schriftelijke+vragen+Yimaz+indoctrinatie+via+lesmateriaal+van+CIDI',
 'URLs': None,
 'question_id': 'gY'}

### Rank Questions with BM25+

In [94]:
def bm25_search(query, corpus, bm25, k, true_documents):
    query_tokens = [token.lower() for token in str(query['Question']).lower().replace('\n', '').split()]
    scores = bm25.get_scores(query_tokens)
    results_heap = [(scores[i], doc['id'], doc['Textual_Content']) for i, doc in enumerate(corpus)]
    results_heap.sort(reverse=True)
    top_results = results_heap[:k]
    
    search_results = {
        'question_id': query['question_id'],
        'question': query['Question'],
        'ranked_ids': [result[1] for result in top_results],
        'ranked_text': [result[2] for result in top_results],
        'true_passages': query['passages_ids'] if true_documents else [],
        'scores': [result[0] for result in top_results],
        'answer': query['Answer']
    }
    
    return search_results


def perform_bm25_search(queries, corpus, k, true_documents=True):
    tokenized_corpus = [[token.lower() for token in doc['Textual_Content'].lower().replace('\n', '').split()] for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    search_results = [bm25_search(query, corpus, bm25, k=k, true_documents=true_documents) for query in queries]
    return search_results



In [96]:
results_bm25_all = perform_bm25_search(questions_for_ranking, passages_df, k=10, true_documents=False) # 3:22 mins

KeyboardInterrupt: 

In [93]:
len(results_bm25_all)

73

In [97]:
with open('passages_pickled.picle', 'wb') as f:
    pickle.dump(passages_df, f)

### Elastic Search