### Setup: change to the project root directory

In [1]:
%cd  /Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/

/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions


In [2]:
import sys
import os
# Make the project's `src_clean` folder (resolved against the current working
# directory) importable. The membership check keeps this cell idempotent, so
# re-running it does not pile up duplicate entries in sys.path.
src_dir = os.path.join(os.getcwd(), 'src_clean')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
import pandas as pd
from hashids import Hashids
from tabulate import tabulate

In [4]:
from src_clean.preprocessing.passages import create_passages_dataframe, add_passages_ids
from src_clean.preprocessing.text_preprocessing import preprocess_text
from src_clean.ranking.sparse_ranking import perform_tfidf_search, perform_random_search, perform_bm25_search
from src_clean.ranking.evaluation import calculate_average_metrics_retrieval, calculate_metrics_answer_similarity, simulate_answer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data

In [5]:
# Load the two input tables; both paths are relative to the current working directory.
# `questions_for_ranking` is later read through its 'Question', 'Answer' and 'URLs' columns.
questions_for_ranking = pd.read_csv('data/amsterdam/amsterdam_questions.csv')
# `collection` is the input from which the passages are built in the next step.
collection = pd.read_csv('data/amsterdam/amsterdam_full.csv')

## Prepare Data for ranking

### Create passages

In [6]:
# Build the passage table from the full collection (author's timing: ~50 seconds).
# The result is later read through its 'URL' and 'Textual_Content' columns.
passages_df = create_passages_dataframe(collection) # 50 secs for execution

### Clean passages

#### Drop duplicates in general; drop duplicates in column 'Textual_Content'

In [7]:
# Remove rows that are identical in every column, keeping the first occurrence of each.
passages_df = passages_df.drop_duplicates() # drop dup.

In [8]:
# Remove passages whose text occurs more than once. With keep=False pandas drops
# EVERY row of a duplicated group, so no copy of a repeated text survives.
# NOTE(review): this looks intended to strip boilerplate shared across pages, but it
# also means repeated content is gone from the collection entirely — confirm that
# keep=False (rather than keep='first') is the desired behaviour.
passages_df = passages_df.drop_duplicates(subset='Textual_Content', keep=False) # drop dup. passages

#### Pre-process Passages Text

In [9]:
# Normalise every passage text with the project's preprocess_text helper.
# Series.apply forwards the keyword arguments to preprocess_text for each element.
# Author's timing: ~40 seconds.
passages_df['Preprocessed_Text'] = passages_df['Textual_Content'].apply(
    preprocess_text,
    stem=True,
    remove_stopwords=True,
    lowercase_text=True,
    remove_punct=True,
)

#### Add Passage ids

In [10]:
# Give each passage a short string id: the Hashids encoding of its row position
# (0 .. n-1) in the current, de-duplicated table.
hashids = Hashids()
passages_df["id"] = list(map(hashids.encode, range(len(passages_df))))

In [11]:
# Visual check of the last rows, including the new 'Preprocessed_Text' and 'id' columns.
passages_df.tail()

Unnamed: 0,URL,Textual_Content,Preprocessed_Text,id
60893,https://www.amsterdam.nl/nieuws/kennisgevingen...,Postcode Toelichting Vul een geldige postcode ...,postcod toelicht vul geldig postcod volgend fo...,Wz8x
60894,https://www.amsterdam.nl/nieuws/kennisgevingen...,Wittenburgergracht 197 06 juni 2023 Besluit ap...,wittenburgergracht juni besluit apv vergunn ve...,XAMg
60895,https://www.amsterdam.nl/nieuws/kennisgevingen...,"19 1071BJ , Vrijheidslaan 25 1079KB , Vrijheid...",1071bj vrijheidslan 1079kb vrijheidslan 1079ke...,YBM2
60896,https://www.amsterdam.nl/nieuws/kennisgevingen...,Hartplein 2A in AMSTERDAM 06 juni 2023 Besluit...,hartplein 2a amsterdam juni besluit apv vergun...,ZDW5
60897,https://www.amsterdam.nl/nieuws/kennisgevingen...,binnentuin en privétuinen van het bouwblok tus...,binnentuin privetuin bouwblok tuss tugelaweg m...,1z9m


#### Check how many URLs are matching after pre-processing
- Removing duplicates in particular may have caused some URLs to no longer be in the collection; however, if there is relevant content, it should still be in the passage collection

In [12]:
import ast

In [13]:
# The 'URLs' column stores each question's reference URLs as the string form of
# a Python list; parse every entry back into a real list.
urls_unformatted = list(questions_for_ranking['URLs'])
reference_list = [ast.literal_eval(url_list_string) for url_list_string in urls_unformatted]

# Flatten the per-question lists into one list of reference URLs (repeats kept).
reference_urls = [url for sublist in reference_list for url in sublist]

collected_urls = list(passages_df['URL'])
# Membership is tested against a set: the passage table has tens of thousands of
# rows, and scanning the list for every reference URL is needlessly quadratic.
collected_url_set = set(collected_urls)

# Reference URLs with no remaining passage, and the number that are still covered.
elements_not_in_collected = [url for url in reference_urls if url not in collected_url_set]
count = len(reference_urls) - len(elements_not_in_collected)


In [14]:
#count # it was 80 before dropping duplicates, so some URLs are no longer in the collection

### Clean Questions

#### Pre-process questions and answers

In [15]:
# Apply the same text normalisation used for the passages to the question and
# answer texts, writing each result to its own 'Preprocessed_*' column.
for source_column, target_column in (
    ('Question', 'Preprocessed_Question'),
    ('Answer', 'Preprocessed_Answer'),
):
    questions_for_ranking[target_column] = questions_for_ranking[source_column].apply(
        lambda text: preprocess_text(
            text,
            stem=True,
            remove_stopwords=True,
            lowercase_text=True,
            remove_punct=True,
        )
    )

In [16]:
#questions_for_ranking.head()

#### add relevant passage ids to questions 
- Meaning: add the passages that are part of referenced URLs

In [17]:
# Attach the relevant passage ids to every question (author's timing: ~1 min 17 s).
# The result carries a 'passages_ids' column, which the following cells filter on.
# Presumably a question gets the ids of all passages from its referenced URLs — verify in add_passages_ids.
questions_for_ranking = add_passages_ids(questions_for_ranking, passages_df) # 1:17 min

In [18]:
# questions_for_ranking.head()

#### remove the questions that have no reference anymore 
- Idea: questions without any remaining reference passage would bring down the retrieval scores and could cause a division by zero in the metrics, so they are removed

In [19]:
# Keep only the questions whose 'passages_ids' value is not missing.
# The explicit .copy() makes the result an independent DataFrame, so adding
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
questions_for_ranking = questions_for_ranking[questions_for_ranking['passages_ids'].notna()].copy()

In [20]:
#len(questions_for_ranking) # 52 samples left out of around 70

##### Add question ids 

In [21]:
# Give each remaining question a short string id: the Hashids encoding of its
# position (0 .. n-1) in the filtered table.
hashids = Hashids()
questions_for_ranking["question_id"] = list(map(hashids.encode, range(len(questions_for_ranking))))

In [22]:
# questions_for_ranking.tail() # check

#### remove unused columns - questions

In [23]:
#questions_for_ranking.drop('Unnamed: 0.1', axis=1, inplace=True)
#questions_for_ranking.drop('Unnamed: 0', axis=1, inplace=True)
#questions_for_ranking.drop('Cleaned_URLs', axis=1, inplace=True)

#### prepare dataset

In [24]:
from datasets import Dataset
import pyarrow as pa

In [25]:
# Convert the questions DataFrame into a Hugging Face Dataset.
# Dataset.from_pandas builds the Arrow table directly, avoiding the detour through
# a Python dict that materialises every cell as a Python object. The default index
# handling is the same as pa.Table.from_pandas, so the resulting columns are unchanged.
# From here on `questions_for_ranking` is a Dataset, not a DataFrame.
questions_for_ranking = Dataset.from_pandas(questions_for_ranking)

In [26]:
# questions_for_ranking[0] # check

In [27]:
# Convert the passages DataFrame into a Hugging Face Dataset.
# Dataset.from_pandas builds the Arrow table directly, avoiding the detour through
# a Python dict that materialises every cell of the table as a Python object.
# The default index handling is the same as pa.Table.from_pandas, so the resulting
# columns are unchanged.
# NOTE: despite its name, `passages_df` is a Dataset (not a DataFrame) from here on.
passages_df = Dataset.from_pandas(passages_df)

In [28]:
# passages_df[1000] # check

In [29]:
# Sanity check: after the conversion `passages_df` is a datasets Dataset, not a pandas DataFrame.
type(passages_df)

datasets.arrow_dataset.Dataset

## Ranking

### TF-IDF search

In [30]:
# Rank passages for every question with TF-IDF, requesting k=100 results per question.
# Each entry of the result is later read through its 'ranked_text' and
# 'ranked_text_preprocessed' fields. Author's timing: ~1 min 13 s.
results_tfidf = perform_tfidf_search(questions_for_ranking, passages_df, k=100, get_true_passages=True) # 1min:13 without NN

### Random

In [31]:
# Random baseline with the same k=100 as the other rankers (author's timing: ~18 s).
# NOTE(review): no seed is passed here; unless perform_random_search seeds its own
# generator, this baseline's scores will differ between runs — confirm.
results_random = perform_random_search(questions_for_ranking, passages_df, k=100, get_true_passages=True) # 18 secs

### BM25

In [32]:
# Rank passages for every question with BM25, requesting k=100 results per question
# (author's timing: ~1 min 24 s).
results_bm25 = perform_bm25_search(questions_for_ranking, passages_df, k=100, get_true_passages=True) # 1min 24

### Check Average Answer length
- Idea: use the average answer length as the token budget when concatenating retrieved passages to 'simulate' an answer

In [33]:
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources (the recorded output shows all three were already up to date).
# These calls do not select a language; Dutch is chosen per call via
# word_tokenize(..., language='dutch').
# NOTE(review): the tagger and tagsets resources are not used by any cell shown in
# this notebook — confirm whether they are still needed.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [34]:
# Token count of every reference answer, using NLTK's Dutch tokenizer.
ans_len = [
    len(word_tokenize(row['Answer'], language='dutch'))
    for row in questions_for_ranking
]
# Mean answer length; the recorded run gives ~252.87 tokens, which motivates a
# limit of 256 tokens for the ranked content.
sum(ans_len) / len(ans_len)

252.8653846153846

## Evaluation

### Retrieval Scores - ranking method

#### Make Table

In [35]:
# Build one table row per (retrieval method, cutoff k) with the averaged retrieval metrics.
ks = [5, 10, 100]
results_lists = [results_tfidf, results_random, results_bm25]
retrieval_names = ['TF-IDF', 'Random', 'BM25']

table = []
# Column headers are the metric names WITHOUT their "@k" suffix. Taking the keys
# of the last computed dict labelled every column "@100", even for the k=5 and
# k=10 rows; the cutoff is already stated in each row label.
metric_names = []

for results, retrieval_name in zip(results_lists, retrieval_names):
    for k in ks:
        average_metrics = calculate_average_metrics_retrieval(results, k)
        if not metric_names:
            metric_names = [str(metric).rsplit('@', 1)[0] for metric in average_metrics]
        row = ['Metrics for k={} (Retrieval: {})'.format(k, retrieval_name)]
        row.extend('{:.4f}'.format(value) for value in average_metrics.values())
        table.append(row)

# Defined even when no metrics were computed (previously a NameError in that case).
headers = [''] + metric_names
# NOTE(review): in the recorded output TF-IDF recall is lower at k=10 (0.0312) than
# at k=5 (0.0397); recall@k should not decrease as k grows, so the metric
# computation in calculate_average_metrics_retrieval is worth verifying.

#### Table

In [36]:
# Render the retrieval-metrics rows built in the preceding cell (`table`, `headers`) as a grid.
print(tabulate(table, headers=headers, tablefmt="grid"))

+---------------------------------------+----------------------+--------------------+------------------+-------------------------+
|                                       |   average_recall@100 |   average_ndcg@100 |   average_rr@100 |   average_precision@100 |
| Metrics for k=5 (Retrieval: TF-IDF)   |               0.0397 |             0.0838 |           0.0663 |                  0.0346 |
+---------------------------------------+----------------------+--------------------+------------------+-------------------------+
| Metrics for k=10 (Retrieval: TF-IDF)  |               0.0312 |             0.0949 |           0.0781 |                  0.0212 |
+---------------------------------------+----------------------+--------------------+------------------+-------------------------+
| Metrics for k=100 (Retrieval: TF-IDF) |               0.102  |             0.0961 |           0.0804 |                  0.009  |
+---------------------------------------+----------------------+-------------------

#### ROUGE Scores - Preprocessed ranked text and answer

#### Make Table

In [37]:
# Reference answers in their preprocessed form, to be compared against the
# preprocessed ranked text in the next cell.
answers = list(questions_for_ranking['Preprocessed_Answer']) # preprocessed

In [38]:
results_lists = [results_tfidf, results_random, results_bm25]
retrieval_names = ['TF-IDF', 'Random', 'BM25']

simulated_answers = {}
metrics = {}

for retrieval_name, results in zip(retrieval_names, results_lists):
    # One simulated answer per question, built by simulate_answer from the
    # preprocessed ranked text (per the author's note: top retrieved passages
    # until the token budget of 256 is reached).
    simulated_answers[retrieval_name] = [
        simulate_answer(result['ranked_text_preprocessed']) for result in results
    ]

    # Similarity of the simulated answers to the reference answers.
    metric_values = calculate_metrics_answer_similarity(simulated_answers[retrieval_name], answers)
    metrics[retrieval_name] = metric_values

# Assemble the table (printed in the next cell). Column names come from the
# metrics of the last retrieval method processed.
metric_names = list(metric_values.keys())
headers = ['Retrieval Method'] + metric_names
table = [
    [retrieval_name] + [metrics[retrieval_name][metric] for metric in metric_names]
    for retrieval_name in retrieval_names
]


#### Table

In [39]:
# Render the answer-similarity rows for the preprocessed texts (`table`, `headers` from the preceding cell).
print(tabulate(table, headers=headers, tablefmt="grid"))

+--------------------+---------------------+---------------------+---------------------+--------------+------------+
| Retrieval Method   |   ROUGE-1 (Average) |   ROUGE-2 (Average) |   ROUGE-L (Average) |   BLEU Score |   F1 Score |
| TF-IDF             |           0.0967257 |          0.0136018  |           0.0571549 |  0.00204189  |          0 |
+--------------------+---------------------+---------------------+---------------------+--------------+------------+
| Random             |           0.0619725 |          0.00709693 |           0.0355553 |  0.000267472 |          0 |
+--------------------+---------------------+---------------------+---------------------+--------------+------------+
| BM25               |           0.095807  |          0.0102444  |           0.0535993 |  0.00191856  |          0 |
+--------------------+---------------------+---------------------+---------------------+--------------+------------+


#### ROUGE Scores - Unpreprocessed ranked text and answer

#### Make table

In [40]:
# Reference answers in their original (unpreprocessed) form; this rebinds the
# `answers` name used for the preprocessed comparison.
answers = list(questions_for_ranking['Answer']) # unpreprocessed

In [41]:
results_lists = [results_tfidf, results_random, results_bm25]
retrieval_names = ['TF-IDF', 'Random', 'BM25']

simulated_answers = {}
metrics = {}

for retrieval_name, results in zip(retrieval_names, results_lists):
    # One simulated answer per question, built by simulate_answer from the raw
    # (unpreprocessed) ranked text (per the author's note: top retrieved
    # passages until the token budget of 256 is reached).
    simulated_answers[retrieval_name] = [
        simulate_answer(result['ranked_text']) for result in results
    ]

    # Similarity of the simulated answers to the reference answers.
    metric_values = calculate_metrics_answer_similarity(simulated_answers[retrieval_name], answers)
    metrics[retrieval_name] = metric_values

# Assemble the table (printed in the next cell). Column names come from the
# metrics of the last retrieval method processed.
metric_names = list(metric_values.keys())
headers = ['Retrieval Method'] + metric_names
table = [
    [retrieval_name] + [metrics[retrieval_name][metric] for metric in metric_names]
    for retrieval_name in retrieval_names
]

#### Table

In [42]:
# Render the answer-similarity rows for the unpreprocessed texts (`table`, `headers` from the preceding cell).
print(tabulate(table, headers=headers, tablefmt="grid"))

+--------------------+---------------------+---------------------+---------------------+--------------+------------+
| Retrieval Method   |   ROUGE-1 (Average) |   ROUGE-2 (Average) |   ROUGE-L (Average) |   BLEU Score |   F1 Score |
| TF-IDF             |            0.216264 |           0.0303644 |            0.112037 |   0.00603454 |          0 |
+--------------------+---------------------+---------------------+---------------------+--------------+------------+
| Random             |            0.202333 |           0.022343  |            0.107343 |   0.00180064 |          0 |
+--------------------+---------------------+---------------------+---------------------+--------------+------------+
| BM25               |            0.229851 |           0.0332365 |            0.122055 |   0.00798236 |          0 |
+--------------------+---------------------+---------------------+---------------------+--------------+------------+


## Save ranked results

In [43]:
import os
import pickle

def _resolve_global_name(obj, explicit_name, role):
    """Return the name used for ``obj`` when building an output filename.

    An explicitly supplied name always wins. Otherwise the module globals are
    searched for a variable bound to ``obj`` itself (identity, not equality).
    Names not starting with an underscore are preferred, so IPython output-cache
    entries such as ``_12`` are only used when nothing else matches.

    Raises:
        IndexError: if no name was given and no global is bound to ``obj``
            (same exception type as the original unguarded ``[0]`` lookup).
    """
    if explicit_name is not None:
        return explicit_name

    candidates = [var_name for var_name, var_val in list(globals().items()) if var_val is obj]
    if not candidates:
        raise IndexError(
            'No global variable is bound to the {} object; '
            'pass its name explicitly.'.format(role)
        )
    public_candidates = [var_name for var_name in candidates if not var_name.startswith('_')]
    return (public_candidates or candidates)[0]


def save_results(results, question_collection, passages_collection, folder_path,
                 result_names=None, question_name=None, passages_name=None):
    """Pickle each result object to ``folder_path``.

    Every file is named ``<result>_<questions>_<passages>.pickle``. By default the
    three name parts are the names of the global variables bound to the respective
    objects; this only works when the function lives in the same namespace as
    those variables (e.g. it is defined in the notebook). Pass the optional
    ``*_name`` arguments to set the names explicitly instead.

    Args:
        results: iterable of result objects; each is pickled to its own file.
        question_collection: object whose variable name forms the second part
            of the filename (not written to disk itself).
        passages_collection: object whose variable name forms the third part
            of the filename (not written to disk itself).
        folder_path: output directory; created if it does not exist.
        result_names: optional sequence with one filename prefix per result.
        question_name: optional explicit name for the question collection.
        passages_name: optional explicit name for the passages collection.

    Raises:
        ValueError: if ``result_names`` is given but its length differs from
            the number of results.
        IndexError: if a name is not given and cannot be found in the globals.
    """
    if result_names is not None:
        results = list(results)
        if len(result_names) != len(results):
            raise ValueError('result_names must contain exactly one name per entry in results')

    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    question_var_name = _resolve_global_name(question_collection, question_name, 'question collection')
    passages_var_name = _resolve_global_name(passages_collection, passages_name, 'passages collection')

    for position, result in enumerate(results):
        explicit_name = None if result_names is None else result_names[position]
        result_var_name = _resolve_global_name(result, explicit_name, 'result')

        filename = '{}_{}_{}.pickle'.format(result_var_name, question_var_name, passages_var_name)
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'wb') as f:
            pickle.dump(result, f)


In [44]:
# Inputs for save_results. `question_collection` and `passages_collection` are
# additional names for the same objects as `questions_for_ranking` and `passages_df`.
results = [results_tfidf, results_bm25, results_random]
question_collection = questions_for_ranking
passages_collection = passages_df
# Output folder, relative to the current working directory.
folder_path = 'data/results_ranking'

In [45]:
# Write one pickle file per result list into `folder_path`; save_results derives
# the file names from the global variable names bound to the objects passed in.
save_results(results, question_collection, passages_collection, folder_path)