####

# TF-IDF - smaller sample
This notebook will perform TF-IDF & cosine similarity based retrieval, where the sample would include only paths that have been collected.

In [1]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [85]:
from collections import Counter

In [113]:
from sklearn.metrics import classification_report

In [119]:
import requests

In [131]:
import time
import random

In [2]:
%cd '/Users/natalipeeva/Desktop'

/Users/natalipeeva/Desktop


In [5]:
with open('Data Samples/amsterdam_corpus.pickle', 'rb') as f:
    html_contents = pickle.load(f)

with open('amsterdam_questions.pickle', "rb") as f:
     amsterdam_questions = pickle.load(f)

with open('/Users/natalipeeva/Desktop/TF-IDF Ranking/afval-hergebruik.pickle', 'rb') as f:
    afval_contents = pickle.load(f)

with open('/Users/natalipeeva/Desktop/TF-IDF Ranking/bestur-organisatie.pickle', 'rb') as f:
    bestur_contents = pickle.load(f)

#### Combine supporting documents data

In [16]:
combined = afval_contents + bestur_contents + html_contents
len(combined)

11660

#### Get URLs and URL paths

In [9]:
def extract_path(link, start):
    """
    Where start is the domain name; e.g. 'www.amsterdam.nl/'
    """
    # start = 'www.amsterdam.nl/'
    end = '/'

    start_index = link.index(start) + len(start)
    end_index = link.index(end, start_index)

    result = link[start_index:end_index]

    return result

In [10]:
def get_all_paths(l):
    url_paths = []
    for link in l:
        try: 
            url_paths.append((link, extract_path(link, 'www.amsterdam.nl/')))
        except:
            try: 
                url_paths.append((link, link.split('http://www.amsterdam.nl/')[1]))
            except:
                url_paths.append((link, link))
    return url_paths

In [None]:
def remove_path_and_index(url):
    """
    Function that cleans the folder path to transform it to URL
    """
    first_slash_index = url.find('/') + 1  # Find index of first slash and add 1 to exclude it
    index_index = url.find('index')  # Find index of "index"
    if index_index == -1:  # If "index" not found, return the original string
        return url
    else:
        return url[first_slash_index:index_index]  # Return the string between the first slash and "index"

In [None]:
def clean_url(url):
    """Keep the URL part that only contains www.amsterdam onwards."""
    pattern = r'(www\.amsterdam.*)'
    match = re.search(pattern, url)
    if match:
        url = match.group(1)
        return url

In [None]:
def modify_link(link):
    # Remove '/index.html' and '.html' using regex
    modified_link = re.sub(r'(/index)?\.html', '', link)
    
    if 'veelgevraagd' in link:
        # Remove curly braces around categoryid using regex
        modified_link = re.sub(r'{(.+?)}', r'\1', modified_link)
    
    # Replace '?' with '/?'
    modified_link = modified_link.replace('?', '/?')
    return modified_link

In [44]:
def get_path(URL):
    match = re.search(r"(?<=amsterdam\.nl/)[^/]+", url)

    if match:
        result = match.group(0)
        return match.group()
    else:
        return str(URL)

#### Get text and pre-process

In [51]:
def get_doc_text(html_content):
    """
    Input: HTML
    Output: text
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text()
    
    return text

In [52]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize text into words
    words = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('dutch'))
    words = [word for word in words if word not in stop_words]

    # Stem words
    stemmer = nltk.stem.snowball.DutchStemmer()
    words = [stemmer.stem(word) for word in words]

    # Join words back into a string
    text = ' '.join(words)

    return text

#### Pre-process + Make a supporting documents DataFrame
The following cells will make a DataFrame for the supporting documents which consists of the URL, HTML, Text, Pre-processed text as well as the URL Path of the documents.

In [45]:
supporting_documents = pd.DataFrame(combined, columns=['URL', 'HTML'])

In [1]:
supporting_documents.head()

NameError: name 'supporting_documents' is not defined

In [47]:
paths = []
for url in supporting_documents['URL']:
    paths.append(get_path(url))

In [50]:
supporting_documents['Paths'] = paths
supporting_documents.head()

Unnamed: 0,URL,HTML,Paths
0,https://www.amsterdam.nl/afval-en-hergebruik,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik
1,https://www.amsterdam.nl/afval-en-hergebruik/?...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik
2,https://www.amsterdam.nl/afval-en-hergebruik/?...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik
3,https://www.amsterdam.nl/afval-en-hergebruik/?...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik
4,https://www.amsterdam.nl/afval-en-hergebruik?p...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik?print=true


In [53]:
text = [get_doc_text(html) for html in supporting_documents['HTML']]
pre_processed_text = [preprocess_text(t) for t in text]

In [54]:
supporting_documents['Text'] = text
supporting_documents['Pre-processed Text'] = pre_processed_text

In [55]:
supporting_documents.head()

Unnamed: 0,URL,HTML,Paths,Text,Pre-processed Text
0,https://www.amsterdam.nl/afval-en-hergebruik,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik,\n\n\n\nAfval - Gemeente Amsterdam\n\n\n\n\n\n...,afval gemeent amsterdam direct inhoud gemeente...
1,https://www.amsterdam.nl/afval-en-hergebruik/?...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik,\n\n\n\nAfval - Gemeente Amsterdam\n\n\n\n\n\n...,afval gemeent amsterdam direct inhoud gemeente...
2,https://www.amsterdam.nl/afval-en-hergebruik/?...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik,\n\n\n\nAfval - Gemeente Amsterdam\n\n\n\n\n\n...,afval gemeent amsterdam direct inhoud gemeente...
3,https://www.amsterdam.nl/afval-en-hergebruik/?...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik,\n\n\n\nAfval - Gemeente Amsterdam\n\n\n\n\n\n...,afval gemeent amsterdam gemeenteamsterdam amst...
4,https://www.amsterdam.nl/afval-en-hergebruik?p...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,afval-en-hergebruik?print=true,\n\n\n\nAfval - Gemeente Amsterdam\n\n\n\n\n\n...,afval gemeent amsterdam gemeenteamsterdam amst...


In [60]:
# Check for duplicates 
print(len(supporting_documents['Pre-processed Text']))
print(len(set(supporting_documents['Pre-processed Text'])))
print('=> There are duplicates')

11660
9107
=> There are duplicates


In [61]:
supporting_documents = supporting_documents.drop_duplicates(subset='Pre-processed Text', keep='first')

In [65]:
print('Our collection has the following paths', set(supporting_documents['Paths']))

Our collection has the following paths {'subsidies', 'kunst-cultuur', 'afval-en-hergebruik', 'sport', 'zorg-ondersteuning', 'diversiteit', 'afval-hergebruik', 'veelgevraagd', 'onderwijs-jeugd', 'bestuur-organisatie', 'bestuur-en-organisatie', 'wonen-leefomgeving', 'projecten', 'parkeren', 'burgerzaken', 'belastingen-heffingen', 'toerisme-vrije-tijd', 'verkeer-vervoer', 'ondernemen', 'stadsdelen', 'werk-inkomen'}


In [90]:
keep_urls = list(set(supporting_documents['Paths']))

In [None]:
# chcek which questions are amsterdam.nl & have one of the distinct paths
# remove URLs that 

### Pre-process Q&A and restructure DataFrame

In [66]:
with open('amsterdam_questions.pickle', "rb") as f:
     amsterdam_questions = pickle.load(f)

In [69]:
questions = pd.read_csv(open('URL Analysis/questions.csv', 'r'))

In [71]:
questions.head()

Unnamed: 0,Year,Month,Question,Answer,Document,URLs
0,2018,12,\n \n1. Heeft het college kennisgenomen van de...,\nNee.,https://amsterdam.raadsinformatie.nl/document/...,
1,2018,12,\n \n2. Kan het college bevestigen of dit lesm...,"\nNee, het college heeft hier geen zicht op. ...",https://amsterdam.raadsinformatie.nl/document/...,
2,2018,12,\n \n ...,\nHet CIDI is duidelijk over de eigen doelste...,https://amsterdam.raadsinformatie.nl/document/...,
3,2018,12,\n \n4. Is het college bekend met de jaarlijks...,\nHet college heeft hier kennis van genomen.,https://amsterdam.raadsinformatie.nl/document/...,
4,2018,12,\n \na. Is het college van oordeel dat het CID...,vraag 4a: \nHet college is voor een pluriform...,https://amsterdam.raadsinformatie.nl/document/...,


In [77]:
mask = questions['URLs'].astype(str).str.contains('www.amsterdam.nl')
amsterdam_questions = questions[mask]

In [81]:
amsterdam_questions['URLs'].iloc[0]

'https://www.zwemwater.nl/.Hier\nhttps://maps.amsterdam.nl/zwemwater/\nhttps://www.amsterdam.nl/veelgevraagd/?caseid=%7BD6E280FB-4A76-40A0-9B88-12B87E446FA6%7D\nhttps://www.ggd.amsterdam.nl/gezond-wonen/zwemmen-open-water/'

In [83]:
reference_paths = []
for url in amsterdam_questions['URLs']:
    reference_paths.append(get_path(url))

In [87]:
Counter(reference_paths).most_common(5)

[('wonen-leefomgeving', 12),
 ('bestuur-organisatie', 10),
 ('publish', 7),
 ('veelgevraagd', 6),
 ('nieuwsbrieven', 6)]

In [88]:
amsterdam_questions['Paths'] = reference_paths

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  amsterdam_questions['Paths'] = reference_paths


In [93]:
keep_urls[:3]

['subsidies', 'kunst-cultuur', 'afval-en-hergebruik']

In [94]:
filtered_amsterdam = amsterdam_questions[amsterdam_questions['Paths'].isin(keep_urls)]

In [97]:
# Check if filtering worked 
set(filtered_amsterdam['Paths'])

{'bestuur-organisatie',
 'ondernemen',
 'projecten',
 'sport',
 'subsidies',
 'toerisme-vrije-tijd',
 'veelgevraagd',
 'werk-inkomen',
 'wonen-leefomgeving',
 'zorg-ondersteuning'}

In [117]:
Counter(filtered_amsterdam['Paths']).most_common()

[('wonen-leefomgeving', 12),
 ('bestuur-organisatie', 10),
 ('veelgevraagd', 6),
 ('zorg-ondersteuning', 5),
 ('projecten', 5),
 ('werk-inkomen', 2),
 ('toerisme-vrije-tijd', 1),
 ('ondernemen', 1),
 ('sport', 1),
 ('subsidies', 1)]

In [98]:
pre_processed_questions = [preprocess_text(q) for q in filtered_amsterdam['Question']]
filtered_amsterdam['Pre-processed Question'] = pre_processed_questions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_amsterdam['Pre-processed Question'] = pre_processed_questions


In [109]:
len(filtered_amsterdam)

44

### TF-IDF

In [102]:
documents = list(supporting_documents['Pre-processed Text'])
queries = list(filtered_amsterdam['Pre-processed Question'])

In [103]:
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(documents)
query_vectors = vectorizer.transform(queries)

# Calculate cosine similarity
similarity_matrix = cosine_similarity(query_vectors, document_vectors)

# Sort documents by cosine similarity score and return top k
k = 1
top_k_docs = []
for i, query in enumerate(queries):
    doc_indices = similarity_matrix[i].argsort()[::-1][:k]
    #top_k_docs.append((query, [documents[j] for j in doc_indices]))
    
    top_k_docs.append((query, [documents[j] for j in doc_indices]))
    
#print(top_k_docs)


In [105]:
ranking = pd.DataFrame()
ranking['Question'] = [pair[0] for pair in top_k_docs]
ranking['Rank1'] = [pair[1][0] for pair in top_k_docs]

In [106]:
ranking = pd.merge(ranking, supporting_documents, left_on='Rank1', right_on='Pre-processed Text', how='left')

### Predictions

In [114]:
ranking['Paths'][:5]

0    bestuur-organisatie
1     zorg-ondersteuning
2           veelgevraagd
3     wonen-leefomgeving
4           veelgevraagd
Name: Paths, dtype: object

In [115]:
filtered_amsterdam['Paths'][:5]

615     zorg-ondersteuning
620     zorg-ondersteuning
856     wonen-leefomgeving
1237    wonen-leefomgeving
1587          veelgevraagd
Name: Paths, dtype: object

In [118]:
print(Counter(filtered_amsterdam['Paths']).most_common())

[('wonen-leefomgeving', 12), ('bestuur-organisatie', 10), ('veelgevraagd', 6), ('zorg-ondersteuning', 5), ('projecten', 5), ('werk-inkomen', 2), ('toerisme-vrije-tijd', 1), ('ondernemen', 1), ('sport', 1), ('subsidies', 1)]


In [116]:
print(classification_report(filtered_amsterdam['Paths'], ranking['Paths'])) # predictions without removing the not working URls

                     precision    recall  f1-score   support

bestuur-organisatie       0.40      0.40      0.40        10
      kunst-cultuur       0.00      0.00      0.00         0
         ondernemen       0.00      0.00      0.00         1
           parkeren       0.00      0.00      0.00         0
          projecten       0.33      0.20      0.25         5
              sport       0.00      0.00      0.00         1
         stadsdelen       0.00      0.00      0.00         0
          subsidies       0.00      0.00      0.00         1
toerisme-vrije-tijd       0.00      0.00      0.00         1
       veelgevraagd       0.21      0.50      0.30         6
       werk-inkomen       0.50      0.50      0.50         2
 wonen-leefomgeving       0.56      0.42      0.48        12
 zorg-ondersteuning       1.00      0.20      0.33         5

           accuracy                           0.34        44
          macro avg       0.23      0.17      0.17        44
       weighted avg   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Filter out non-existent URLs 

In [123]:
urls = list(filtered_amsterdam['URLs'])
urls[0]

'https://www.amsterdam.nl/zorg-ondersteuning/ondersteuning/vluchtelingen/ongedocumenteerden/'

In [132]:
urls = list(filtered_amsterdam['URLs'])

html = []
for url in urls:
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        
        html.append(soup)
    except:
        html.append('error')
        
    time.sleep(random.uniform(2, 8))


#### Remove 'Helaas. De pagina waar u naar op zoek was bestaat niet (meer).' and error

In [140]:
filtered_amsterdam['HTML'] = html
filtered_amsterdam['HTML Text'] = [get_doc_text(str(t)) for t in html]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_amsterdam['HTML'] = html
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_amsterdam['HTML Text'] = [get_doc_text(str(t)) for t in html]


In [145]:
filtered_amsterdam_2 = filtered_amsterdam[~filtered_amsterdam["HTML"].astype(str).str.contains("error")]


  filtered_amsterdam_2 = filtered_amsterdam[~filtered_amsterdam["HTML Text"].astype(str).str.contains("Helaas. De pagina waar u naar op zoek was bestaat niet (meer).")]


In [154]:
filtered_amsterdam_2 = filtered_amsterdam_2[~filtered_amsterdam_2["HTML Text"].astype(str).str.contains("Fout - Pagina niet gevonden - Gemeente Amsterdam")]


In [156]:
len(filtered_amsterdam_2)

30

### Check predictions on second filtering of amsterdam.nl referenced

In [157]:
queries = list(filtered_amsterdam_2['Pre-processed Question'])

In [158]:
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(documents)
query_vectors = vectorizer.transform(queries)

# Calculate cosine similarity
similarity_matrix = cosine_similarity(query_vectors, document_vectors)

# Sort documents by cosine similarity score and return top k
k = 1
top_k_docs = []
for i, query in enumerate(queries):
    doc_indices = similarity_matrix[i].argsort()[::-1][:k]
    #top_k_docs.append((query, [documents[j] for j in doc_indices]))
    
    top_k_docs.append((query, [documents[j] for j in doc_indices]))
    
#print(top_k_docs)


In [159]:
ranking_2 = pd.DataFrame()
ranking_2['Question'] = [pair[0] for pair in top_k_docs]
ranking_2['Rank1'] = [pair[1][0] for pair in top_k_docs]

In [160]:
ranking_2 = pd.merge(ranking_2, supporting_documents, left_on='Rank1', right_on='Pre-processed Text', how='left')

In [161]:
len(ranking_2)

30

In [163]:
ranking_2['Paths'][:5]

0    bestuur-organisatie
1     zorg-ondersteuning
2           veelgevraagd
3             stadsdelen
4           veelgevraagd
Name: Paths, dtype: object

In [164]:
filtered_amsterdam_2['Paths'][:5]

615     zorg-ondersteuning
620     zorg-ondersteuning
1587          veelgevraagd
1934             projecten
2641    wonen-leefomgeving
Name: Paths, dtype: object

In [165]:
print(classification_report(filtered_amsterdam_2['Paths'], ranking_2['Paths'])) # predictions without removing the not working URls

                     precision    recall  f1-score   support

bestuur-organisatie       0.25      0.33      0.29         6
      kunst-cultuur       0.00      0.00      0.00         0
         ondernemen       0.00      0.00      0.00         1
           parkeren       0.00      0.00      0.00         0
          projecten       0.00      0.00      0.00         4
              sport       0.00      0.00      0.00         1
         stadsdelen       0.00      0.00      0.00         0
          subsidies       0.00      0.00      0.00         1
       veelgevraagd       0.20      0.50      0.29         4
 wonen-leefomgeving       0.67      0.50      0.57         8
 zorg-ondersteuning       1.00      0.20      0.33         5

           accuracy                           0.30        30
          macro avg       0.19      0.14      0.13        30
       weighted avg       0.42      0.30      0.30        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Check if any of the three retrieved appears

In [168]:
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(documents)
query_vectors = vectorizer.transform(queries)

# Calculate cosine similarity
similarity_matrix = cosine_similarity(query_vectors, document_vectors)

# Sort documents by cosine similarity score and return top k
k = 3
top_k_docs = []
for i, query in enumerate(queries):
    doc_indices = similarity_matrix[i].argsort()[::-1][:k]
    #top_k_docs.append((query, [documents[j] for j in doc_indices]))
    
    top_k_docs.append(([documents[j] for j in doc_indices]))
    
#print(top_k_docs)

In [174]:
supporting_documents.loc[supporting_documents['Pre-processed Text']=='nieuw b w april gemeent amsterdam direct inhoud gemeenteamsterdam aaa amsterdam english sit menuzoek onderwerp nieuw contact zoek amsterdamnl zoek zoek verberg browser ondersteund gebruik recent versie edg chrom firefox pad huidig pagina hom bestur organisatie colleg burgemeester wethouder nieuw b w nieuw b w april abesluit algemen zak colleg b w kennisgenom uitkomst landelijk voorgeschrev zelfevaluaties vier basisregistraties colleg bijhoudt basisregistraties adress gebouw grootschal topograﬁ ondergrond waard onroer zak uitkomst aangebod landelijk toezichthouder basisregistraties ter kennisnam geagendeerd vergader raadscommissie financien kunst diversiteit colleg b w stelt open ruimt geleg marktkwartier stadsdel west amsterdam vast betreft peterseliekad kurkumakad saﬀraanlan kardemomstrat vanillestrat laosstrat oreganostrat steranijsstrat gemberstrat dragonstrat basilicumstrat garam masalastrat nootmuskaatstrat kruidnagelstrat jeneverbesstrat salieplein expeditiestrat oost expeditiestrat west groothandelsmarktplein juridisch zak colleg b w ingestemd opdrachtverlen programmamanager open informatiehuishoud uurtarief euro daarmee afgewek maximal bedrag opgenom protocol extern inzet afwijk voorgelegd colleg onderwijs colleg b w verleent eenmal aanvull subsidie uitvoer werkplat onderwijsonderzoek amsterdam voortgezet middel beroepsonderwijs werkplat schol opleid kennisinstell onderzoek bijdraagt professionaliser docent kwaliteit amsterdam onderwijs aanvull subsidie onderzoek eind huidig schooljar voortgezet social zak colleg b w ingestemd evaluatie participatierad elk drie jar functioner participatierad geevalueerd laatst ker colleg ziet aanleid jar evaluatie lat voer onafhank advies onderzoeksbureau evaluatie verwacht zomerreces afgerond daarna rad hierover geinformeerd gemeent vastgoed colleg b w ingestemd actualisatie procedur gemeent vastgoed vrijkomt doel hiervan vrijkom vastgoed efficient zet maatschapp beleidsdoel ondersteun faciliter grond ontwikkel colleg b w stemt ophog bestaand corporatiesubsidie duurzam nieuwbouwwon buiksloterham strandeiland afschaﬀ beng bijna energieneutral gebouw vervalt del beoogd dekking corporaties buiksloterham strandeiland ophog bestaand corporatiesubsidie duurzam nieuwbouwwon buiksloterham strandeiland hersteld verker vervoer luchtkwaliteitopen ruimt groen colleg b w stuurt brief gemeenterad afhandel motie raadslid ernsting groenlink oproept fietsknelpunt kaart breng bestaand dashboard verkeersveil uitgebreid fietsknelpunt basis melding gemeent fietsersbond colleg b w stuurt brief gemeenterad programma aanpak wegtunnel vierd kwartal hierin stat belangrijkst ontwikkel uitstel openstell piet heintunnel besluit vernieuw verkeerscentral amsterdam colleg b w besluit weesperstrat tuss juni juli af sluit autoverker effect daarvan omgev stad onderzoek del pagina del pagina facebok del pagina twitter del pagina linkedin del pagina whatsapp print pagina gemeent amsterdam contact hebt vrag kunt antwoord vind websit nem contact contactformulier bel telefoonnummer maandag vrijdag uur contactgegeven openingstijd volg gemeent nieuwsbrief amsterdam twitter facebok instagram linkedin youtub werkenbij kalender buurtactiviteit inspraakavond organiseert gemeent kijk kalender amsterdam amsterdam benieuwd allemal stad iamsterdamcom vindt best tip gebied cultur uitgan evenement lijst sit privacy cookies sit webarchief']

Unnamed: 0,URL,HTML,Paths,Text,Pre-processed Text
42,https://www.amsterdam.nl/bestuur-organisatie/c...,b'<!DOCTYPE html>\n<html lang=nl>\n<head prefi...,bestuur-organisatie,\n\n\n\nNieuws uit B en W 5 april 2023 - Gemee...,nieuw b w april gemeent amsterdam direct inhou...


In [180]:
def add_url_path(results_k, supporting_documents):
    results = []
    for result in results_k:
        #supporting_documents.loc[supporting_documents[str(result)]]
        results.append(supporting_documents.loc[supporting_documents[str(result)]])

    return results

In [170]:
def check_results(predicted_results, true_labels):
    results = []
    for result in predicted_results:
        if result in true_labels:
            results.append(1)
        else:
            results.append(0)
    
    return results

In [182]:
with open('supporting_docs_df.pickle', 'wb') as f:
    pickle.dump(supporting_documents, f)

In [187]:
filtered_amsterdam.to_csv('filtered_ams_questions.csv')