# Introduction

This notebook is for developing and experimenting with functions related to the search engine project.

In [3]:
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import io
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import os
import json
import re

## Parse a .pdf file

In [2]:
def pdf_to_text(input_file):
    i_f = open(input_file,'rb')
    resMgr = PDFResourceManager()
    retData = io.StringIO()
    TxtConverter = TextConverter(resMgr,retData, laparams= LAParams())
    interpreter = PDFPageInterpreter(resMgr,TxtConverter)
    for page in PDFPage.get_pages(i_f):
        interpreter.process_page(page)
 
    txt = retData.getvalue()
    return txt

## Preprocess parsed text

In [12]:
# The Snowball Stemmer requires that you pass a language parameter

def preprocess_document(document):
    stemmer = SnowballStemmer(language='english')
    stop_words = stopwords.words('english')
    number_token_name = "special_number_token"
    
    tokens = word_tokenize(document)
    tokens = [t for t in tokens if t.isalnum() and len(t) > 1 and t not in stop_words]
    tokens = [t.lower() for t in tokens]
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [number_token_name if t.isnumeric() else t for t in tokens]
    return tokens

## Examine vocabulary

Sometimes it occurs that words get cut into two tokens because there is a dash and a newline somewhere in the middle of the word. This results in some noisy tokens being created. Additionally, there are several tokens, which contain both numeric and alpha characters. These could be further examined and parsed with some regex manipulation.

In [4]:
print("Vocabulary size:", len(set(tokens)), "Corpus size:", len(tokens))

NameError: name 'tokens' is not defined

In [None]:
counts = pd.Series(tokens).value_counts()

print(counts[:60])
print("-" * 100)
print(counts[-60:])

In [None]:
from matplotlib import pyplot as plt

# Drop the first five elements to better see the distribution of tokens
plt.hist([counts.values[5:]], bins=100)
plt.show()

## Bring together the components into a function that preprocesses a pdf document

In [10]:
def preprocess_pdf_document(pdf_path, stemmer=None, stop_word_list=None, NUMBER_TOKEN_NAME=None):
    # Parse pdf
    parsed_text = pdf_to_text(pdf_path)
    
    if stemmer is None:
        stemmer = SnowballStemmer(language='english')
    
    if stop_word_list is None:
        stop_word_list = stopwords.words('english')
    
    if NUMBER_TOKEN_NAME is None:
        NUMBER_TOKEN_NAME = "special_number_token"

    preprocessed_document = preprocess_document(parsed_text)
    
    return preprocessed_document

## Create simple index for a document

This function creates an index for a list, which is essentially a dictionary containing the counts for each token.

In [6]:
def create_index(preprocessed_document):
    return pd.Series(preprocessed_document).value_counts().to_dict()

## Combine to obtain a function, which indexes a pdf file

In [7]:
def index_pdf_document(input_pdf):
    preprocessed_file = preprocess_pdf_document(input_pdf)
    return create_index(preprocessed_file)

## Compute similarity between two indices

This function computes the intersection of the value counts divided by the minimum sum of value counts. Under the assumption that one index is significantly smaller than the two, this should quickly give a score based on how many of the token occurrences in the smaller document are found in the larger document.

In [8]:
def index_similarity(idx1, idx2):
    sum_1 = sum(idx1.values())
    sum_2 = sum(idx2.values())
    
    intersection = 0
    
    if sum_1 < sum_2:
        min_idx = idx1
        max_idx = idx2
        min_sum = sum_1
    else:
        min_idx = idx2
        max_idx = idx1
        min_sum = sum_2
    
    for k, v in min_idx.items():
        intersection += min(max_idx.get(k, 0), v)
    
    return intersection / min_sum

## Iterate over pdf documents, compute indices and write them to a json file as an array.

In [13]:
def compute_indices(path_to_files, output_path="./test_data/indices.json"):
    
    result = []
    
    for file in os.listdir(path_to_files):
        filename = os.fsdecode(file)
        if filename.endswith(".pdf"): 
            index = index_pdf_document(os.path.join(path_to_files, filename))
            result.append({"file_name": filename, "index": index})
    with open(output_path, "w") as f:
        json.dump(result, f)

test_path = "./test_data"
compute_indices(test_path)

## Write function that retrieves the indices from the json files

In [14]:
def read_indices_from_json(indices_path=None):
    try:
        with open(indices_path) as f:
            return json.load(f)
    except Exception as e:
        print("JSON loading failed with exception:", e)
        return None

json_path = "./test_data/indices.json"
    
    
indices = read_indices_from_json(indices_path=json_path)

## Retrieve the most relevant documents given a query string

This function takes the similarity function as an argument, and also takes a function as an argument which retrieves the documents in a list where elements are in the form {"file_name": example_file.pdf, "index": {"example": 1, "token": 1}}. This way the indices can later be obtained from eg. local json files, an S3 bucket or MongoDB.

In [15]:
def most_similar_documents(query_string, compute_similarity, get_indices, n=3, 
                           indices_path="./test_data/indices.json"):
    
    preprocessed_query = preprocess_document(query_string)
    query_index = create_index(preprocessed_query)
    
    indices = get_indices(indices_path=indices_path)
    
    similarities = [compute_similarity(index["index"], query_index) for index in indices]
    
    zipped = sorted([(idx, sim) for idx, sim in enumerate(similarities)], key=lambda t: -t[1])
    
    indices_to_return = [t[0] for t in zipped[:n]]
        
    return [indices[idx]["file_name"] for idx in indices_to_return]

most_similar_documents("what is the revenue of danske bank", index_similarity, read_indices_from_json)

['swedbank_annual_2017.pdf',
 'danske_bank_Outlook_January 2019.pdf',
 'swedbank_mortgage_2018.pdf']

# Sentence embedding

Sentence embeddings are meant to transform sentences and documents from sequences of tokens into a d-dimensional vector space. A successful embedding algorithm maps semantically similar inputs close to each other in the embedding space wrt. some similarity or distance function.

Sentence embeddings could be utilized in this project to, for instance, map all of the documents into a vector space in an offline manner. When the user sends a query, the query would be mapped into that vector space. The service could then return documents with the greatest similarity to the query vector.

## Motivation

Why do this instead of / in addition to the existing lexical model? One reason is that two words or sentences can be semantically similar while being disjoint in terms of exact word matches. As an example:

- A king holds dominion over Sweden.
- A monarch rules in Spain.

These sentences are semantically very close to each other. However, the current similarity function would most likely assign them a similarity of 0 due to stop word removal. An effective sentence embedding model would ideally produce sentence vectors for these sentences, which are similar on a given similarity score.

In [13]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [17]:
#Our sentences we like to encode
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.76214278e-01  1.20601267e-01 -2.93623865e-01 -2.29857773e-01
 -8.22927803e-02  2.37709224e-01  3.39985371e-01 -7.80964255e-01
  1.18127957e-01  1.63374022e-01 -1.37714952e-01  2.40282476e-01
  4.25125450e-01  1.72417805e-01  1.05279475e-01  5.18164098e-01
  6.22214638e-02  3.99286091e-01 -1.81652337e-01 -5.85578680e-01
  4.49717082e-02 -1.72750488e-01 -2.68443465e-01 -1.47386283e-01
 -1.89218074e-01  1.92150757e-01 -3.83842677e-01 -3.96006912e-01
  4.30648983e-01 -3.15319538e-01  3.65949363e-01  6.05158247e-02
  3.57325971e-01  1.59736261e-01 -3.00983787e-01  2.63250351e-01
 -3.94311160e-01  1.84855491e-01 -3.99549007e-01 -2.67889708e-01
 -5.45117259e-01 -3.13406922e-02 -4.30644006e-01  1.33278042e-01
 -1.74793795e-01 -4.35465485e-01 -4.77378786e-01  7.12556243e-02
 -7.37002566e-02  5.69137096e-01 -2.82579750e-01  5.24976403e-02
 -8.20008039e-01  1.98297009e-01  1.69511795e-01  2.71779984e-01
  2.64610

In [18]:
pdf_path = "./test_data/folksam_report_mid_2019.pdf"

In [19]:
def clean_document_for_embedding(pdf_path):
    document = pdf_to_text(pdf_path)
    sentences = sent_tokenize(document)
    result = []
    for s in sentences:
        cleaned = s.replace("-\n", "")
        cleaned = cleaned.replace("\n", " ")
        cleaned = cleaned.replace("\x0c", "")
        cleaned = cleaned.replace("\t", " ")
        cleaned = cleaned.replace("  ", " ")
        result.append(cleaned)
    return result

sentences = clean_document_for_embedding(pdf_path)

In [20]:
embeddings = model.encode(sentences)

In [21]:
query = ["Folksam business performance."]

embedded_query = model.encode(query)

In [22]:
from utils.similarity_functions import cosine_similarity

In [30]:
for idx, embedded_document_sentence in enumerate(embeddings):
    s = cosine_similarity(embedded_document_sentence, embedded_query[0])
    if s > 0.5:
        print(sentences[idx])
        print("-" * 100)
        print("HIGH SIMILARITY", s)
        print("-" * 100)
    elif s < 0:
        print(sentences[idx])
        print("-" * 100)
        print("LOW SIMILARITY", s)
        print("-" * 100)

Interim Report January - June 2019 Q1 Q2 Q3 Q4 The Folksam Group 2019 1 Overview: The Folksam Group Folksam overall Folksam Customer Index (FCI), % Premiums, SEK million1 Assets under management, at period-end, SEK million 2,3,4 Unit-linked insurance assets, at period-end, SEK million3,4,5 Number of full-time positions6 Jan – Jun 2019 747 35,051 441,124.
----------------------------------------------------------------------------------------------------
HIGH SIMILARITY 0.5282776
----------------------------------------------------------------------------------------------------
2)  Konsumentkooperationens Pensionsstiftelse is not included.
----------------------------------------------------------------------------------------------------
LOW SIMILARITY -0.11705731
----------------------------------------------------------------------------------------------------
The blue bond was issued by the Nordic Investment Bank (NIB) and is a Baltic Blue Bond, which is used for investments aimed

## Create a function to compute embeddings for all documents in a dir

In [32]:
def compute_embeddings(path_to_files, embedding_model, output_path="./test_data/embeddings.json"):
    
    
    result = []
    
    for file in os.listdir(path_to_files):
        filename = os.fsdecode(file)
        if filename.endswith(".pdf"): 
            preprocessed_document = clean_document_for_embedding(os.path.join(path_to_files, filename))
            sentence_embeddings = embedding_model.encode(preprocessed_document)
            result.append({"file_name": filename, "embeddings": sentence_embeddings.tolist()})
    with open(output_path, "w") as f:
        json.dump(result, f)

test_path = "./test_data"
compute_embeddings(test_path, model)

In [4]:
embeddings_path = "./test_data/embeddings.json"

with open(embeddings_path) as f:
    embeddings_json = json.load(f)

## Implement sentence-document similarity as average of k most similar sentences

In [29]:
from utils.similarity_functions import cosine_similarity

def semantic_similarity(document_embeddings, query_embedding, k=5):
    similarities = [cosine_similarity(doc_sent, query_embedding) for doc_sent in document_embeddings]
    return sum(sorted(similarities)[-k:]) / k


query = model.encode(["Scandinavian economic outlook in 2017"])[0]

for doc in embeddings_json:
    print(doc["file_name"], semantic_similarity(doc["embeddings"], query))
    print("-" * 50)

swedbank_mortgage_2018.pdf 0.4944541973397848
--------------------------------------------------
folksam_report_mid_2019.pdf 0.45705148123363043
--------------------------------------------------
seb_no_1901_en.pdf 0.6164372044735378
--------------------------------------------------
swedbank_annual_2017.pdf 0.5603944852043308
--------------------------------------------------
danske_bank_Outlook_January 2019.pdf 0.5956994259024004
--------------------------------------------------
