### Read Documents

In [1]:
import os
from bs4 import BeautifulSoup
import pickle
import json

In [2]:
from collections import Counter

In [22]:
import pandas as pd

In [3]:
# NOTE(review): absolute, machine-specific directory. Every relative path used
# in the cells below is resolved against it, so the notebook only runs as-is
# on a machine that has this folder.
%cd '/Users/natalipeeva/Desktop'

/Users/natalipeeva/Desktop


#### Supporting documents

In [4]:
# Load the supporting-documents corpus from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open('Data Samples/amsterdam_corpus.pickle', 'rb') as f:
    html_contents = pickle.load(f)

In [5]:
# Number of entries in the loaded corpus.
len(html_contents)

11432

#### Answer References

In [6]:
# Load the answer references (a JSON object whose keys are used as URLs below).
# The encoding is given explicitly so decoding does not depend on the
# machine's locale; JSON text is UTF-8 by specification.
with open('Data Samples/refernces_dict.json', 'r', encoding='utf-8') as f:
    data_dict = json.load(f)

##### Get reference URL path

In [7]:
# Turn the mapping into a list of (key, value) tuples, matching the format of
# the supporting-documents data.
my_list = [(url, value) for url, value in data_dict.items()]

In [8]:
# Keep only the references whose URL mentions the amsterdam.nl domain.
filtered_links = [
    (url, value)
    for url, value in my_list
    if 'www.amsterdam.nl' in url
]

In [9]:
def extract_path(link, start):
    """Return the first path segment that follows ``start`` in ``link``.

    Args:
        link: URL string to inspect.
        start: Text marking where the path begins, typically the domain name
            with a trailing slash, e.g. 'www.amsterdam.nl/'.

    Returns:
        The text between the end of the first occurrence of ``start`` and the
        next '/' after it.

    Raises:
        ValueError: If ``start`` does not occur in ``link``, or if no '/'
            follows it.
    """
    segment_begin = link.index(start) + len(start)
    # str.index (unlike str.find) raises when there is no closing '/'.
    segment_stop = link.index('/', segment_begin)
    return link[segment_begin:segment_stop]

In [10]:
# example
# Example: the text between the domain prefix and the next '/' is returned.
extract_path('https://www.amsterdam.nl/veelgevraagd/?caseid=%7BD6E280FB-4A76-40A0-9B88-12B87E446FA6%7D', 'www.amsterdam.nl/')

'veelgevraagd'

In [11]:
# Keep just the URL (first element) of every filtered reference.
ref_urls = [url for url, _value in filtered_links]

In [12]:
# Pair every reference URL with its first path segment on amsterdam.nl.
paths = []
for link in ref_urls:
    try:
        # Normal case: "...www.amsterdam.nl/<segment>/...".
        segment = extract_path(link, 'www.amsterdam.nl/')
    except ValueError:
        # No '/' closes the segment (or the prefix with trailing slash is
        # missing), so extract_path's str.index call raised.
        try:
            # Fall back to whatever follows the plain-http domain prefix.
            segment = link.split('http://www.amsterdam.nl/')[1]
        except IndexError:
            # Prefix not present either (e.g. an https link): keep the whole
            # URL as its own "path" so the link is not lost.
            segment = link
    paths.append((link, segment))

#### Most common amsterdam.nl paths

In [13]:
# Ten most frequent path segments among the reference URLs.
Counter(segment for _url, segment in paths).most_common(10)

[('wonen-leefomgeving', 12),
 ('bestuur-organisatie', 8),
 ('veelgevraagd', 7),
 ('publish', 7),
 ('projecten', 6),
 ('zorg-ondersteuning', 3),
 ('sociaaldomein', 3),
 ('parkeren-verkeer', 3),
 ('nieuws', 3),
 ('nieuwsbrieven', 2)]

##### Get corpus URL path

In [14]:
# Collect the first element of every corpus entry.
# NOTE(review): presumably that element is the document URL — confirm against
# the layout of the pickled corpus.
corpus_urls = [t[0] for t in html_contents]

In [18]:
# Pair every distinct corpus URL with its first path segment on amsterdam.nl.
paths_corpus = []
# dict.fromkeys de-duplicates like set() but keeps first-seen order, so the
# resulting list is ordered the same way on every kernel restart.
for link in dict.fromkeys(corpus_urls):
    try:
        # Normal case: "...www.amsterdam.nl/<segment>/...".
        segment = extract_path(link, 'www.amsterdam.nl/')
    except Exception:
        # Best-effort on purpose: the corpus entries' types are not checked
        # here. Unlike a bare `except:`, this still lets KeyboardInterrupt and
        # SystemExit propagate.
        try:
            # Fall back to whatever follows the plain-http domain prefix.
            segment = link.split('http://www.amsterdam.nl/')[1]
        except Exception:
            # Nothing could be extracted: keep the entry itself as its "path".
            segment = link
    paths_corpus.append((link, segment))

In [19]:
# Ten most frequent path segments among the corpus URLs.
Counter(segment for _url, segment in paths_corpus).most_common(10)

[('veelgevraagd', 3671),
 ('stadsdelen', 1808),
 ('wonen-leefomgeving', 1243),
 ('projecten', 1021),
 ('kunst-cultuur', 689),
 ('parkeren', 381),
 ('ondernemen', 373),
 ('toerisme-vrije-tijd', 369),
 ('zorg-ondersteuning', 361),
 ('werk-inkomen', 288)]

# Combine Data into pandas.DataFrame

In [23]:
# Load the question/answer table. The file is opened in a `with` block so the
# handle is closed once pandas has read it.
with open('URL Analysis/questions.csv', 'r') as questions_file:
    questions = pd.read_csv(questions_file)

# Flatten the non-null 'URLs' cells, which may hold several newline-separated
# URLs, into one flat list. A nested comprehension avoids the repeated list
# copying of sum(..., []).
urls = [
    url
    for cell in questions['URLs'].dropna()
    for url in cell.split('\n')
]
# Give entries that do not start with 'http' an explicit http:// prefix.
urls = [url if url.startswith('http') else f'http://{url}' for url in urls]

In [24]:
# Preview the first rows of the questions table.
questions.head()

Unnamed: 0,Year,Month,Question,Answer,Document,URLs
0,2018,12,\n \n1. Heeft het college kennisgenomen van de...,\nNee.,https://amsterdam.raadsinformatie.nl/document/...,
1,2018,12,\n \n2. Kan het college bevestigen of dit lesm...,"\nNee, het college heeft hier geen zicht op. ...",https://amsterdam.raadsinformatie.nl/document/...,
2,2018,12,\n \n ...,\nHet CIDI is duidelijk over de eigen doelste...,https://amsterdam.raadsinformatie.nl/document/...,
3,2018,12,\n \n4. Is het college bekend met de jaarlijks...,\nHet college heeft hier kennis van genomen.,https://amsterdam.raadsinformatie.nl/document/...,
4,2018,12,\n \na. Is het college van oordeel dat het CID...,vraag 4a: \nHet college is voor een pluriform...,https://amsterdam.raadsinformatie.nl/document/...,


In [28]:
# Tabulate the (URL, path segment) pairs; the 'URLs' column name matches the
# questions table so the two can be joined on it.
ref_urls_df = pd.DataFrame(paths, columns=['URLs', 'URL_path'])

In [29]:
# Left join keeps every question; URL_path stays NaN where the 'URLs' value has
# no exact match among the reference URLs.
# NOTE(review): 'URLs' cells can hold several newline-separated URLs (they are
# split on '\n' when `urls` is built); such a cell only matches if the whole
# string equals a reference URL — confirm this is intended.
merged_questions = pd.merge(questions, ref_urls_df, on='URLs', how='left')

In [32]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): without `subset`, dropna() also discards rows whose only gap is
# in a column other than URL_path (e.g. a missing Answer) — confirm that is
# intended rather than dropna(subset=['URL_path']).
amsterdam_questions = merged_questions.dropna()

In [33]:
# Number of questions that survive the filter.
len(amsterdam_questions)

54

In [34]:
import pickle

In [35]:
# Persist the filtered questions as a pickle. The relative path is resolved
# against the current working directory.
with open('amsterdam_questions.pickle', "wb") as f:
    pickle.dump(amsterdam_questions, f)