In [1]:
# NOTE(review): hardcoded, user-specific absolute working directory. The relative
# paths opened by later cells resolve against it, so it has to be adapted on any
# other machine.
%cd '/Users/natalipeeva/Desktop'

/Users/natalipeeva/Desktop


#### Imports

In [22]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
import pandas as pd

In [28]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources used further down: the stop-word lists (read in
# preprocess_text) and, presumably for word_tokenize, the Punkt tokenizer data.
# NOTE(review): recent NLTK releases look up 'punkt_tab' rather than 'punkt' when
# tokenizing — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/natalipeeva/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Loading Data

In [4]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only point
# these paths at files that come from a trusted source.

# Crawled corpus: later cells read element [0] of each entry as the URL and
# element [1] as the page HTML.
with open('Data Samples/amsterdam_corpus.pickle', 'rb') as corpus_file:
    html_contents = pickle.load(corpus_file)

# Question set: later cells read its 'Question' and 'URL_path' entries.
with open('amsterdam_questions.pickle', 'rb') as questions_file:
    amsterdam_questions = pickle.load(questions_file)

In [18]:
def get_doc_text(html_content):
    """
    Strip the markup from an HTML document.

    Input: HTML source as a string
    Output: the text BeautifulSoup extracts from it via ``get_text()``,
            parsed with the built-in 'html.parser' backend
    """
    return BeautifulSoup(html_content, 'html.parser').get_text()

In [41]:
# Raw question strings; they are normalised with preprocess_text further down.
queries = list(amsterdam_questions['Question'])

In [21]:
# One (url, plain_text, raw_html) triple per crawled page.
documents = [
    (page[0], get_doc_text(page[1]), page[1])
    for page in html_contents
]

In [24]:
# Tabulate the (url, text, html) triples under named columns.
supporting_documents = pd.DataFrame(documents, columns=['URL', 'Text', 'HTML'])

In [27]:
# NOTE(review): `documents` is rebound here from the list of triples to the
# 'Text' column only, and a later cell rebinds it again to the pre-processed
# strings — the meaning of the name depends on which cells have been run.
documents = supporting_documents['Text']

In [25]:
# Persist the supporting-documents table. At this point in the notebook it holds
# the URL / Text / HTML columns; 'Pre-processed Text' is only added further down.
with open('supporting_documents.pickle', 'wb') as out_file:
    pickle.dump(supporting_documents, out_file)

#### Pre-process queries and documents

In [29]:
# Building the Dutch stop-word set and the stemmer does not depend on the input
# text, so both are created once and reused instead of being rebuilt for every
# document and query.
_DUTCH_RESOURCES = {}


def _dutch_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _DUTCH_RESOURCES:
        _DUTCH_RESOURCES['stop_words'] = frozenset(stopwords.words('dutch'))
        _DUTCH_RESOURCES['stemmer'] = nltk.stem.snowball.DutchStemmer()
    return _DUTCH_RESOURCES['stop_words'], _DUTCH_RESOURCES['stemmer']


def preprocess_text(text):
    """
    Normalise a Dutch text for bag-of-words retrieval.

    Steps, in order: lowercase; drop every token starting with 'http'; drop
    digits; drop punctuation (anything that is neither a word character nor
    whitespace); tokenize; remove Dutch stop words; stem each remaining word;
    join the stems with single spaces.

    Input: text as a string
    Output: the normalised text as a single space-separated string
    """
    stop_words, stemmer = _dutch_resources()

    # Convert text to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize, drop stop words, then stem what is left
    stems = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    # Join words back into a string
    return ' '.join(stems)

In [30]:
# Normalise every document with the same routine that is applied to the queries.
preprocessed_docs = [preprocess_text(doc) for doc in documents]

In [None]:
# From here on `documents` holds the pre-processed strings. The TF-IDF cell
# vectorises whatever `documents` is bound to, so this cell must run before it.
documents = preprocessed_docs

In [31]:
# Keep the normalised text next to each document; it is the join key used when
# the ranking is merged back onto this table.
supporting_documents['Pre-processed Text'] = preprocessed_docs

In [44]:
# Normalise every question with the same routine that is applied to the documents.
preprocessed_q = [preprocess_text(question) for question in queries]

In [46]:
# NOTE(review): overwrites the raw questions with their normalised form, so the
# 'Question' column of the ranking table built below holds pre-processed text.
queries = preprocessed_q

#### TF-IDF Ranking

In [78]:
# Fit the TF-IDF vocabulary on the documents only, then project the queries into
# that same vector space.
vectorizer = TfidfVectorizer()
document_vectors = vectorizer.fit_transform(documents)
query_vectors = vectorizer.transform(queries)

# One row per query, one column per document.
similarity_matrix = cosine_similarity(query_vectors, document_vectors)

# For every query keep the k highest-scoring documents, best first.
# NOTE(review): a query sharing no vocabulary with the corpus presumably scores 0
# against every document, in which case its "top" document is arbitrary.
k = 1
top_k_docs = [
    (query, [documents[j] for j in scores.argsort()[::-1][:k]])
    for query, scores in zip(queries, similarity_matrix)
]


In [82]:
# One row per query: the (pre-processed) question and its best-ranked document.
ranking = pd.DataFrame({
    'Question': [question for question, _ in top_k_docs],
    'Rank1': [top_docs[0] for _, top_docs in top_k_docs],
})

In [83]:
ranking.head()

Unnamed: 0,Question,Rank1
0,huisarts gev informatie nodig declaratiemog ge...,windmolen amsterdam gemeent amsterdam direct i...
1,wet ongedocumenteerd weg medisch instell zoal ...,ongedocumenteerd gemeent amsterdam gemeenteams...
2,spug speciaal daartoe getraind opsporingsambte...,mag buitengewon opsporingsambtenar boa handhav...
3,gat wethouder ontslag zorg voorkom enkel noodzak,hulp zorg betal gemeent amsterdam gemeenteamst...
4,reactie rapport schrijft colleg klopt all sign...,melding open ruimt overlast gemeent amsterdam ...


In [95]:
len(ranking)

54

In [88]:
# Attach URL / raw text / HTML to each question's top-ranked document by joining
# on the pre-processed text. Different pages can normalise to the exact same
# string, which would make the join fan out into several rows per question, so
# the lookup side is reduced to one row per key first (keeping the first page).
# `validate` makes pandas raise if the lookup side is ever not unique.
merged_df = pd.merge(
    ranking,
    supporting_documents.drop_duplicates(subset='Pre-processed Text'),
    left_on='Rank1',
    right_on='Pre-processed Text',
    how='left',
    validate='many_to_one',
)

In [102]:
# Keep a single row per question.
# NOTE(review): this also drops genuinely repeated questions, which would leave
# the frame shorter than the question list that the predictions are compared
# against by position further down — confirm the questions are unique.
merged_df= merged_df.drop_duplicates(subset='Question')

In [103]:
len(merged_df)

54

In [105]:
merged_df.head(2)

Unnamed: 0,Question,Rank1,URL,Text,HTML,Pre-processed Text
0,huisarts gev informatie nodig declaratiemog ge...,windmolen amsterdam gemeent amsterdam direct i...,/onderwepen/amsterdam-wonen-leefomgeving/www.a...,\n\n\n\nWindmolens in Amsterdam - Gemeente Ams...,<!DOCTYPE html>\n<html lang=nl>\n<head prefix=...,windmolen amsterdam gemeent amsterdam direct i...
2,wet ongedocumenteerd weg medisch instell zoal ...,ongedocumenteerd gemeent amsterdam gemeenteams...,/onderwepen/amsterdam-wonen-leefomgeving/www.a...,\n\n\n\nOngedocumenteerden - Gemeente Amsterda...,<!DOCTYPE html>\n<html lang=nl>\n<head prefix=...,ongedocumenteerd gemeent amsterdam gemeenteams...


### Check predictions 

In [106]:
def extract_path(link, start):
    """
    Return the first path segment that follows `start` inside `link`.

    `start` is the domain name including its trailing slash,
    e.g. 'www.amsterdam.nl/'. The segment runs from the end of the first
    occurrence of `start` up to (not including) the next '/'.

    Raises ValueError when `start` does not occur in `link`, or when no '/'
    follows the segment.
    """
    segment_begin = link.index(start) + len(start)
    segment_end = link.index('/', segment_begin)
    return link[segment_begin:segment_end]

In [113]:
# Derive a comparable path for every predicted URL, falling back step by step:
#   1. first path segment after the domain (extract_path);
#   2. everything after 'http://www.amsterdam.nl/';
#   3. the link itself, unchanged.
# Only the failures these fallbacks are meant for are caught; a bare `except:`
# would also hide unrelated problems (e.g. a NameError or a keyboard interrupt).
# NOTE(review): step 2 only recognises the 'http://' scheme — confirm whether
# 'https://' links without a trailing slash should be handled as well.
url_paths = []
for link in merged_df['URL']:
    try:
        path = extract_path(link, 'www.amsterdam.nl/')
    except (ValueError, AttributeError, TypeError):
        # Domain or closing '/' not found, or the link is not a string
        # (e.g. a missing value left by the left join).
        try:
            path = link.split('http://www.amsterdam.nl/')[1]
        except (IndexError, AttributeError):
            path = link
    url_paths.append((link, path))

In [114]:
# Keep only the derived path of each (link, path) pair.
predicted_paths = [url_path for _, url_path in url_paths]

In [120]:
predicted_paths[:3]

['wonen-leefomgeving', 'zorg-ondersteuning', 'veelgevraagd']

In [119]:
# Reference path per question; compared position-by-position with
# predicted_paths in the next cell.
paths = list(amsterdam_questions['URL_path'])

In [122]:
# Score each prediction against the reference path at the same position:
# 1 when they are equal, 0 otherwise.
# The two lists are compared purely by position, so a length mismatch means they
# are misaligned; report that explicitly instead of failing with a bare
# IndexError or silently ignoring surplus predictions.
if len(paths) != len(predicted_paths):
    raise ValueError(
        'Cannot compare {} expected paths with {} predicted paths; '
        'the lists must be aligned.'.format(len(paths), len(predicted_paths))
    )
matches = [
    int(expected == predicted)
    for expected, predicted in zip(paths, predicted_paths)
]
