### Setup CD

In [1]:
%cd  /Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/

/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions


In [2]:
import sys
import os
src_dir = os.path.join(os.getcwd(), 'src_clean')
sys.path.append(src_dir)

In [3]:
import pandas as pd
from hashids import Hashids
from tabulate import tabulate

In [4]:
from src_clean.preprocessing.passages import create_passages_dataframe, add_passages_ids
from src_clean.preprocessing.text_preprocessing import preprocess_text
from src_clean.ranking.sparse_ranking import perform_tfidf_search, perform_random_search, perform_bm25_search
from src_clean.ranking.evaluation import calculate_average_metrics_retrieval, calculate_metrics_answer_similarity, simulate_answer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Read Data

In [5]:
questions = pd.read_csv('data/question_answer/questions.csv')


In [6]:
len(questions)

19134

In [7]:
questions = pd.read_csv('data/question_answer/questions.csv')
collection = pd.read_csv('data/amsterdam/amsterdam_full.csv')

## Prepare Data for ranking

### Create passages

In [8]:
passages_df = create_passages_dataframe(collection) # 50 secs for execution
len(passages_df)

60902

### Clean passages

#### Drop duplicates in general; drop duplictates in column 'Textual_Content'

In [9]:
passages_df = passages_df.drop_duplicates() # drop dup.
len(passages_df)

56850

In [10]:
passages_df = passages_df.drop_duplicates(subset='Textual_Content', keep=False) # drop dup. passages
len(passages_df)

37177

#### Pre-process Passages Text

In [11]:
passages_df['Preprocessed_Text'] = passages_df['Textual_Content'].apply(lambda x: preprocess_text(x,stem=True,
                                                                                      remove_stopwords=True,
                                                                                      lowercase_text=True,
                                                                                      remove_punct=True)) # 40 seconds

#### Add Passage ids

In [12]:
hashids = Hashids()
passages_df["id"] = [hashids.encode(i) for i in range(len(passages_df))]

In [13]:
passages_df.tail()

Unnamed: 0,URL,Textual_Content,Preprocessed_Text,id
60888,https://www.amsterdam.nl/nrga/,Transponeringstabel Transponeringstabel ( PDF ...,transponeringstabel transponeringstabel pdf kb...,RrLE
60892,https://www.amsterdam.nl/nieuws/kennisgevingen...,Postcode Toelichting Vul een geldige postcode ...,postcod toelicht vul geldig postcod volgend fo...,VyNX
60893,https://www.amsterdam.nl/nieuws/kennisgevingen...,Verleend Pieter Calandlaan 339 05 juni 2023 Be...,verleend pieter calandlan juni besluit eveneme...,Wz8x
60894,https://www.amsterdam.nl/nieuws/kennisgevingen...,05 juni 2023 Besluit ( ) exploitatievergunning...,juni besluit exploitatievergunn horecabedrijf ...,XAMg
60895,https://www.amsterdam.nl/nieuws/kennisgevingen...,juni 2023 Besluit apv vergunning Verleend - Ou...,juni besluit apv vergunn verleend oudezijd voo...,YBM2


### Clean Questions

#### Pre-process questions and answers

In [14]:
questions.dropna(subset=['Question', 'Answer'], inplace=True)

In [16]:
questions = questions.drop_duplicates()

In [18]:
questions.drop_duplicates(subset='Answer', inplace=True)

In [20]:
questions.drop_duplicates(subset='Question', inplace=True)

In [22]:
questions.head()

Unnamed: 0,Year,Month,Question,Answer,Document,URLs
0,2018,12,\n \n1. Heeft het college kennisgenomen van de...,\nNee.,https://amsterdam.raadsinformatie.nl/document/...,
1,2018,12,\n \n2. Kan het college bevestigen of dit lesm...,"\nNee, het college heeft hier geen zicht op. ...",https://amsterdam.raadsinformatie.nl/document/...,
2,2018,12,\n \n ...,\nHet CIDI is duidelijk over de eigen doelste...,https://amsterdam.raadsinformatie.nl/document/...,
3,2018,12,\n \n4. Is het college bekend met de jaarlijks...,\nHet college heeft hier kennis van genomen.,https://amsterdam.raadsinformatie.nl/document/...,
4,2018,12,\n \na. Is het college van oordeel dat het CID...,vraag 4a: \nHet college is voor een pluriform...,https://amsterdam.raadsinformatie.nl/document/...,


In [23]:
questions['Preprocessed_Question'] = questions['Question'].apply(lambda x: preprocess_text(x,stem=True,
                                                                                      remove_stopwords=True,
                                                                                      lowercase_text=True,
                                                                                      remove_punct=True)) 

questions['Preprocessed_Answer'] = questions['Answer'].apply(lambda x: preprocess_text(x,stem=True,
                                                                                      remove_stopwords=True,
                                                                                      lowercase_text=True,
                                                                                      remove_punct=True)) 

In [24]:
questions.drop_duplicates(subset='Preprocessed_Question', inplace=True)

In [25]:
questions.drop_duplicates(subset='Preprocessed_Answer', inplace=True)

In [27]:
questions.describe()

Unnamed: 0,Year,Month,Question,Answer,Document,URLs,Preprocessed_Question,Preprocessed_Answer
count,17374,17374,17374,17374,17374,617,17374,17374
unique,10,12,17374,17374,2948,587,17374,17374
top,2020,8,\n \n1. Heeft het college kennisgenomen van de...,\nNee.,https://amsterdam.raadsinformatie.nl/document/...,https://www.rijksoverheid.nl/documenten/kamers...,colleg kennisgenom genoemd publicatie cidi,nee
freq,2620,1832,1,1,41,5,1,1


In [37]:
questions=questions[questions['Preprocessed_Answer'].str.split().str.len() > 10]

In [38]:
words_to_filter = ['bekend', 'kennisgenomen', 'op de hoogte', 'standpunt', 'mening', 'onbekend', 'vindt', 'visie',
                   'herkent']

In [39]:
questions = questions[~questions['Answer'].str.split().apply(lambda x: any(word.lower() in x for word in words_to_filter))]

In [40]:
questions = questions[~questions['Question'].str.split().apply(lambda x: any(word.lower() in x for word in words_to_filter))]

In [41]:
len(questions)

10971

In [42]:
questions=questions[questions['Preprocessed_Answer'].str.split().str.len() < 400]

In [43]:
#questions.head()

##### Add question ids 

In [44]:
hashids = Hashids()
questions["question_id"] = [hashids.encode(i) for i in range(len(questions))]

In [45]:
# questions.tail() # check

#### prepare dataset

In [46]:
from datasets import Dataset
import pyarrow as pa

In [47]:
arrow_table = pa.Table.from_pandas(questions)
arrow_dict = arrow_table.to_pydict()
questions = Dataset.from_dict(arrow_dict)

In [48]:
# questions[0] # check

In [49]:
arrow_table = pa.Table.from_pandas(passages_df)
arrow_dict = arrow_table.to_pydict()
passages_df = Dataset.from_dict(arrow_dict)

In [50]:
# passages_df[1000] # check

In [51]:
type(passages_df)

datasets.arrow_dataset.Dataset

## Ranking

### TF-IDF search

In [52]:
results_tfidf = perform_tfidf_search(questions, passages_df, k=100) # 6 mins for whole collection

### Random

In [57]:
import numpy as np

def random_search(query, collection, k, get_true_passages=False):
    if get_true_passages:
        true_documents = query['passages_ids']
    else:
        true_documents = []

    # Randomly select k documents from the collection
    collection_indices = np.arange(len(collection))
    random_indices = np.random.choice(collection_indices, size=k, replace=False)
    random_results = [collection[int(i)] for i in random_indices]

    ranked_ids = [result['id'] for result in random_results]
    ranked_text = [result['Textual_Content'] for result in random_results]
    ranked_preprocessed = [result['Preprocessed_Text'] for result in random_results]
    scores = []  # Assign random scores for demonstration purposes

    # Dictionary with search results
    search_results = {
        'question_id': query['question_id'],
        'question': query['Question'],
        'ranked_ids': ranked_ids,
        'ranked_text_preprocessed': ranked_preprocessed,
        'ranked_text': ranked_text,
        'true_passages': true_documents,
        'scores': scores,
        'answer': query['Answer'],
        'preprocessed_question': query['Preprocessed_Question'],
        'preprocessed_answer': query['Preprocessed_Answer']
    }

    return search_results


In [58]:
def perform_random_search(queries, collection, k, get_true_passages= False):
    """
    Perform random retrieval search for each query in a list of queries.
    Input:
        queries - a list of queries
        collection: a pandas DataFrame representing the collection
        k: the number of top search results to retrieve
    Output: a list of dictionaries containing the search results
    """
    search_results = []

    for query in queries:
        results = random_search(query, collection, k=k, get_true_passages=get_true_passages)
        search_results.append(results)

    return search_results

In [59]:
results_random = perform_random_search(questions, passages_df, k=100) # 18 secs

## Save ranked results

In [99]:
import os
import pickle

def save_results(results, question_collection, passages_collection, folder_path):
    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)
    
    # Get the variable names of question_collection and passages_collection
    question_var_name = [var_name for var_name, var_val in globals().items() if var_val is question_collection][0]
    passages_var_name = [var_name for var_name, var_val in globals().items() if var_val is passages_collection][0]

    # Save each result variable with its corresponding filename
    for result in results:
        # Get the variable name of the result variable
        result_var_name = [var_name for var_name, var_val in globals().items() if var_val is result][0]
        
        # Generate the filename based on variable names
        filename = '{}_{}_{}.pickle'.format(result_var_name, question_var_name, passages_var_name)
        
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'wb') as f:
            pickle.dump(result, f)


In [94]:
results = [results_tfidf, results_random]
question_collection = questions
passages_collection = passages_df
folder_path = 'data/results_ranking'

In [101]:
save_results(results, question_collection, passages_collection, folder_path)

In [102]:
import pickle
with open('data/results_ranking/questions_preprocessed.pickle', 'wb') as f:
    pickle.dump(questions, f)

with open('data/results_ranking/passages_preprocessed.pickle', 'wb') as f:
    pickle.dump(passages_df, f)

In [103]:
import pickle

with open('/Users/natalipeeva/Documents/GitHub/Automatic-Answering-of-City-Council-Questions/data/results_ranking/results_tfidf_questions_passages_df.pickle', 'rb') as f:
    test = pickle.load(f)