### Setup CD
- Ranks all questions and preprocesses them without removing non-factual questions.

In [1]:
# Change into the repository root so the relative `data/...` paths and the
# `src_clean` directory used by later cells resolve.
# NOTE(review): this is an absolute, machine-specific path; it has to be adjusted
# before the notebook can run on any other machine.
%cd  /Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/

/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions


In [2]:
import sys
import os
# Append <current working directory>/src_clean to the module search path.
src_dir = os.path.join(os.getcwd(), 'src_clean')
sys.path.append(src_dir)

In [3]:
import pandas as pd
from hashids import Hashids
from tabulate import tabulate

In [4]:
from src_clean.preprocessing.passages import create_passages_dataframe, add_passages_ids
from src_clean.preprocessing.text_preprocessing import preprocess_text
from src_clean.ranking.sparse_ranking import perform_tfidf_search, perform_random_search, perform_bm25_search
from src_clean.ranking.evaluation import calculate_average_metrics_retrieval, calculate_metrics_answer_similarity, simulate_answer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data

In [5]:
# Load the question/answer table from CSV (path is relative to the current working directory).
questions = pd.read_csv('data/question_answer/questions.csv')


In [6]:
# Number of rows in the freshly loaded question table.
len(questions)

19134

In [7]:
# `questions` has already been loaded from the same CSV earlier in the notebook, so it is
# not read a second time here; only the raw passage source collection is loaded.
collection = pd.read_csv('data/amsterdam/amsterdam_full.csv')

## Prepare Data for ranking

### Create passages

In [8]:
# Build the passage table from the raw collection with the project helper,
# then display how many passages were produced.
passages_df = create_passages_dataframe(collection) # 50 secs for execution
len(passages_df)

60903

### Clean passages

#### Drop duplicates in general; drop duplicates in column 'Textual_Content'

In [9]:
# Remove rows that are identical across all columns (pandas keeps the first occurrence by default).
passages_df = passages_df.drop_duplicates() # drop dup.

In [10]:
# Remove passages whose text occurs more than once. With keep=False pandas drops
# *every* occurrence of a duplicated text, so no copy of such a passage is retained.
# NOTE(review): confirm that discarding all copies (rather than keeping one) is intended.
passages_df = passages_df.drop_duplicates(subset='Textual_Content', keep=False) # drop dup. passages

#### Pre-process Passages Text

In [11]:
# Add a normalised copy of each passage text, produced by the project's
# `preprocess_text` with stemming, stop-word removal, lowercasing and
# punctuation removal all switched on.
passages_df['Preprocessed_Text'] = passages_df['Textual_Content'].apply(
    lambda text: preprocess_text(
        text,
        stem=True,
        remove_stopwords=True,
        lowercase_text=True,
        remove_punct=True,
    )
)  # 40 seconds

#### Add Passage ids

In [12]:
# Give every passage an id obtained by hash-encoding its row position (0..n-1)
# in the de-duplicated frame; the position is used, not the index label.
hashids = Hashids()
passages_df["id"] = list(map(hashids.encode, range(len(passages_df))))

In [13]:
# Preview the last rows to inspect the added `Preprocessed_Text` and `id` columns.
passages_df.tail()

Unnamed: 0,URL,Textual_Content,Preprocessed_Text,id
60893,https://www.amsterdam.nl/nieuws/kennisgevingen...,Postcode Toelichting Vul een geldige postcode ...,postcod toelicht vul geldig postcod volgend fo...,Wz8x
60894,https://www.amsterdam.nl/nieuws/kennisgevingen...,Wittenburgergracht 197 06 juni 2023 Besluit ap...,wittenburgergracht juni besluit apv vergunn ve...,XAMg
60895,https://www.amsterdam.nl/nieuws/kennisgevingen...,"19 1071BJ , Vrijheidslaan 25 1079KB , Vrijheid...",1071bj vrijheidslan 1079kb vrijheidslan 1079ke...,YBM2
60896,https://www.amsterdam.nl/nieuws/kennisgevingen...,Hartplein 2A in AMSTERDAM 06 juni 2023 Besluit...,hartplein 2a amsterdam juni besluit apv vergun...,ZDW5
60897,https://www.amsterdam.nl/nieuws/kennisgevingen...,binnentuin en privétuinen van het bouwblok tus...,binnentuin privetuin bouwblok tuss tugelaweg m...,1z9m


### Clean Questions

#### Pre-process questions and answers

In [14]:
# Clean the question table in one pass:
#   1. drop rows with a missing question or answer,
#   2. drop rows that are identical across all columns,
#   3. keep only the first row for each distinct answer text,
#   4. keep only the first row for each distinct question text.
questions = (
    questions
    .dropna(subset=['Question', 'Answer'])
    .drop_duplicates()
    .drop_duplicates(subset='Answer')
    .drop_duplicates(subset='Question')
)

In [15]:
# Preview the first rows of the cleaned question table.
questions.head()

Unnamed: 0,Year,Month,Question,Answer,Document,URLs
0,2018,12,\n \n1. Heeft het college kennisgenomen van de...,\nNee.,https://amsterdam.raadsinformatie.nl/document/...,
1,2018,12,\n \n2. Kan het college bevestigen of dit lesm...,"\nNee, het college heeft hier geen zicht op. ...",https://amsterdam.raadsinformatie.nl/document/...,
2,2018,12,\n \n ...,\nHet CIDI is duidelijk over de eigen doelste...,https://amsterdam.raadsinformatie.nl/document/...,
3,2018,12,\n \n4. Is het college bekend met de jaarlijks...,\nHet college heeft hier kennis van genomen.,https://amsterdam.raadsinformatie.nl/document/...,
4,2018,12,\n \na. Is het college van oordeel dat het CID...,vraag 4a: \nHet college is voor een pluriform...,https://amsterdam.raadsinformatie.nl/document/...,


In [16]:
# Questions and answers are normalised with identical `preprocess_text` settings:
# stemming, stop-word removal, lowercasing and punctuation removal all enabled.
text_cleaning_options = dict(
    stem=True,
    remove_stopwords=True,
    lowercase_text=True,
    remove_punct=True,
)

questions['Preprocessed_Question'] = questions['Question'].apply(
    lambda text: preprocess_text(text, **text_cleaning_options)
)

questions['Preprocessed_Answer'] = questions['Answer'].apply(
    lambda text: preprocess_text(text, **text_cleaning_options)
)

In [17]:
# Texts that differed in their raw form can coincide after preprocessing, so
# de-duplicate again on the preprocessed question and then on the preprocessed answer
# (first occurrence kept each time).
questions = (
    questions
    .drop_duplicates(subset='Preprocessed_Question')
    .drop_duplicates(subset='Preprocessed_Answer')
)

In [18]:
# Summary statistics (count / unique / top / freq) of the cleaned question table.
questions.describe()

Unnamed: 0,Year,Month,Question,Answer,Document,URLs,Preprocessed_Question,Preprocessed_Answer
count,17374,17374,17374,17374,17374,617,17374,17374
unique,10,12,17374,17374,2948,587,17374,17374
top,2020,8,\n \n1. Heeft het college kennisgenomen van de...,\nNee.,https://amsterdam.raadsinformatie.nl/document/...,https://www.rijksoverheid.nl/documenten/kamers...,colleg kennisgenom genoemd publicatie cidi,nee
freq,2620,1832,1,1,41,5,1,1


In [19]:
# Number of whitespace-separated tokens in each preprocessed question.
question_lengths = [
    len(preprocessed_question.split())
    for preprocessed_question in questions['Preprocessed_Question']
]

In [20]:
import matplotlib.pyplot as plt

In [21]:
# Keep only rows whose preprocessed answer has more than 10 whitespace-separated tokens.
questions = questions.loc[questions['Preprocessed_Answer'].str.split().str.len().gt(10)]

#### Optional

In [22]:
# Dutch cue words/phrases used by the two filtering cells that follow to drop rows.
# Rough English glosses: known, taken note of, aware ("op de hoogte"), position,
# opinion, unknown, thinks/finds, vision, recognises.
# NOTE(review): presumably these mark opinion/awareness (non-factual) questions — confirm.
words_to_filter = ['bekend', 'kennisgenomen', 'op de hoogte', 'standpunt', 'mening', 'onbekend', 'vindt', 'visie',
                   'herkent']

In [23]:
import re

# Drop rows whose answer mentions any cue word/phrase from `words_to_filter`.
# Matching is done on the raw text with a case-insensitive, whole-word regex:
#   * capitalised occurrences ('Bekend') and punctuation-adjacent ones ('bekend?') are caught,
#   * multi-word phrases such as 'op de hoogte' match across any run of whitespace,
#   * word boundaries keep a cue from matching inside a longer word.
# A plain "cue in text.split()" membership test misses all three cases.
if words_to_filter:  # an empty cue list must not filter anything
    cue_pattern = r'\b(?:' + '|'.join(
        r'\s+'.join(re.escape(part) for part in phrase.split())
        for phrase in words_to_filter
    ) + r')\b'
    answer_has_cue = questions['Answer'].str.contains(cue_pattern, case=False, regex=True, na=False)
    questions = questions[~answer_has_cue]

In [24]:
import re

# Drop rows whose question mentions any cue word/phrase from `words_to_filter`.
# Matching is done on the raw text with a case-insensitive, whole-word regex:
#   * capitalised occurrences ('Bekend') and punctuation-adjacent ones ('bekend?') are caught,
#   * multi-word phrases such as 'op de hoogte' match across any run of whitespace,
#   * word boundaries keep a cue from matching inside a longer word.
# A plain "cue in text.split()" membership test misses all three cases.
if words_to_filter:  # an empty cue list must not filter anything
    cue_pattern = r'\b(?:' + '|'.join(
        r'\s+'.join(re.escape(part) for part in phrase.split())
        for phrase in words_to_filter
    ) + r')\b'
    question_has_cue = questions['Question'].str.contains(cue_pattern, case=False, regex=True, na=False)
    questions = questions[~question_has_cue]

#### Optional length filter

In [25]:
# Keep only rows whose preprocessed answer has fewer than 400 whitespace-separated tokens.
questions = questions.loc[questions['Preprocessed_Answer'].str.split().str.len().lt(400)]

##### Add question ids 

In [26]:
# Give every remaining question an id obtained by hash-encoding its row position
# (0..n-1) in the filtered frame; the position is used, not the index label.
hashids = Hashids()
questions["question_id"] = list(map(hashids.encode, range(len(questions))))

#### prepare dataset

In [27]:
from datasets import Dataset
import pyarrow as pa

In [28]:
# Convert the pandas question table into a `datasets.Dataset` by going through
# a pyarrow table and a plain {column name: list of values} dict.
# From here on `questions` is a Dataset, no longer a DataFrame.
# NOTE(review): after the filtering above the frame no longer has a default index, so
# `from_pandas` presumably carries the index along as an extra column — confirm.
arrow_table = pa.Table.from_pandas(questions)
arrow_dict = arrow_table.to_pydict()
questions = Dataset.from_dict(arrow_dict)

In [29]:
# Same conversion for the passages: pandas -> pyarrow table -> column dict -> `datasets.Dataset`.
# Despite its name, `passages_df` is a Dataset (not a DataFrame) after this cell.
arrow_table = pa.Table.from_pandas(passages_df)
arrow_dict = arrow_table.to_pydict()
passages_df = Dataset.from_dict(arrow_dict)

In [30]:
# Sanity check: confirm the conversion produced a Dataset.
type(passages_df)

datasets.arrow_dataset.Dataset

## Ranking

### TF-IDF search

In [31]:
# Run the project's TF-IDF search helper over all questions and passages.
# NOTE(review): k=10 is presumably the number of passages kept per question —
# confirm in src_clean.ranking.sparse_ranking.
results_tfidf = perform_tfidf_search(questions, passages_df, k=10) # 6 mins for whole collection

## Evaluation

#### Table

#### ROUGE Scores - Preprocessed ranked text and answer

#### Make Table

In [32]:
# Reference answers for the similarity metrics: the preprocessed answer of every question.
answers = list(questions['Preprocessed_Answer']) # preprocessed

In [33]:
# Retrieval runs to evaluate, paired by position with their display names.
results_lists = [results_tfidf]
retrieval_names = ['TF-IDF']

simulated_answers = {}
metrics = {}

for results, retrieval_name in zip(results_lists, retrieval_names):
    # One simulated answer per query, built from the preprocessed text of its ranked passages.
    # (Original author's note: simulate_answer takes the top retrieved documents
    # while the token count stays below 256 — not verifiable from this notebook.)
    simulated_answers[retrieval_name] = [
        simulate_answer(ranking['ranked_text_preprocessed']) for ranking in results
    ]

    # Compare the simulated answers with the reference answers
    # (ROUGE, according to the original author's note).
    metric_values = calculate_metrics_answer_similarity(simulated_answers[retrieval_name], answers)
    metrics[retrieval_name] = metric_values

# Assemble the metrics table: one row per retrieval method. The column names are
# taken from the metric dict of the last method evaluated in the loop above.
headers = ['Retrieval Method'] + list(metric_values.keys())
table = [
    [retrieval_name] + [metrics[retrieval_name][metric] for metric in metric_values.keys()]
    for retrieval_name in retrieval_names
]


#### Table

In [34]:
#print(tabulate(table, headers=headers, tablefmt="grid"))

## Save ranked results

In [36]:
import os
import pickle

def save_results(results, question_collection, passages_collection, folder_path, namespace=None):
    """Pickle each result object into ``folder_path``, naming files after variable names.

    Every file is called ``<result>_<questions>_<passages>.pickle``, where each part
    is the name under which the corresponding object is bound in ``namespace``.
    Objects are matched by identity (``is``); if several names refer to the same
    object, the first one in the mapping's iteration order is used.

    All file names are resolved before anything is written, so a failed lookup
    does not leave a partially written set of files behind.

    Args:
        results: Iterable of result objects; one pickle file is written per object.
        question_collection: Object whose variable name forms the second part of the file name.
        passages_collection: Object whose variable name forms the third part of the file name.
        folder_path: Output directory; created (including parents) if missing.
        namespace: Optional mapping of variable names to objects used for the
            name lookup. Defaults to ``globals()`` of the module defining this function.

    Raises:
        IndexError: If one of the objects is not bound to any name in ``namespace``
            (same exception type as the previous bare ``[...][0]`` lookup, but with
            a message that says which argument could not be resolved).
    """
    if namespace is None:
        namespace = globals()
    # Snapshot the bindings once instead of rescanning the namespace per lookup.
    bindings = list(namespace.items())

    def _name_of(obj, role):
        # Return the first name bound to exactly this object.
        for var_name, var_val in bindings:
            if var_val is obj:
                return var_name
        raise IndexError(
            'save_results: no variable in the namespace refers to the object passed as {}'.format(role)
        )

    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    question_var_name = _name_of(question_collection, 'question_collection')
    passages_var_name = _name_of(passages_collection, 'passages_collection')

    # Resolve every target path up front so that nothing is written if a lookup fails.
    targets = []
    for position, result in enumerate(results):
        result_var_name = _name_of(result, 'results[{}]'.format(position))
        filename = '{}_{}_{}.pickle'.format(result_var_name, question_var_name, passages_var_name)
        targets.append((os.path.join(folder_path, filename), result))

    for file_path, result in targets:
        with open(file_path, 'wb') as f:
            pickle.dump(result, f)


In [37]:
# Arguments for save_results. The file names are derived from the *global variable
# names* bound to these objects, so `question_collection` / `passages_collection`
# are aliases of `questions` / `passages_df`; whichever name appears first in
# globals() ends up in the file name.
# NOTE(review): results go to 'with-factual', while the question pickle written at the
# end of the notebook goes to 'with-nonfactual' — confirm which folder is intended.
results = [results_tfidf]
question_collection = questions
passages_collection = passages_df
folder_path = 'data/results_ranking/with-factual'

In [38]:
# Write one pickle file per ranking result into `folder_path`.
save_results(results, question_collection, passages_collection, folder_path)

In [40]:
import pickle
# Persist the final (filtered, preprocessed) question Dataset.
# NOTE(review): this path uses 'with-nonfactual' whereas the ranking results are saved
# under 'with-factual' — confirm which folder is intended.
questions_pickle_path = 'data/results_ranking/with-nonfactual/questions_preprocessed.pickle'

# open() does not create missing parent folders, so make sure the directory exists first.
os.makedirs(os.path.dirname(questions_pickle_path), exist_ok=True)
with open(questions_pickle_path, 'wb') as f:
    pickle.dump(questions, f)
