# Introduction

This notebook is for developing and experimenting with functions related to the search engine project.

In [14]:
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import io
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import os
import json

## Parse a .pdf file

In [None]:
def pdf_to_text(input_file):
    """Extract the text of a PDF file with pdfminer.

    Args:
        input_file: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The text written by the converter for every page yielded by
        ``PDFPage.get_pages``, concatenated into a single string.
    """
    resource_manager = PDFResourceManager()
    text_buffer = io.StringIO()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(resource_manager, converter)
        # The context manager guarantees the file handle is released even if
        # parsing a page raises.
        with open(input_file, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file):
                interpreter.process_page(page)
        # Read the buffer before the `finally` clause closes it.
        return text_buffer.getvalue()
    finally:
        converter.close()
        text_buffer.close()

## Preprocess parsed text

In [13]:
# The Snowball Stemmer requires that you pass a language parameter

def preprocess_document(document):
    """Turn raw text into a list of normalized tokens.

    Steps, in order:
      1. Tokenize with ``word_tokenize`` and lowercase every token.
      2. Keep only alphanumeric tokens longer than one character that are not
         English stop words.
      3. Stem each remaining token with the English Snowball stemmer.
      4. Replace every purely numeric token with a shared placeholder token.

    Args:
        document (str): Text to preprocess.

    Returns:
        list[str]: The preprocessed tokens, in document order.
    """
    stemmer = SnowballStemmer(language='english')
    # A set makes the per-token membership test O(1) instead of scanning a list.
    stop_words = set(stopwords.words('english'))
    number_token_name = "special_number_token"

    # Lowercase BEFORE filtering: filtering on the original casing lets
    # capitalised stop words such as "The" slip through.
    tokens = [t.lower() for t in word_tokenize(document)]
    tokens = [t for t in tokens if t.isalnum() and len(t) > 1 and t not in stop_words]
    tokens = [stemmer.stem(t) for t in tokens]
    # Use the local placeholder name; the previous reference to the undefined
    # name NUMBER_TOKEN_NAME raised NameError.
    return [number_token_name if t.isnumeric() else t for t in tokens]

## Examine vocabulary

Sometimes a word gets cut into two tokens because a hyphen followed by a newline occurs in the middle of the word. This creates some noisy tokens. Additionally, there are several tokens that contain both numeric and alphabetic characters. These could be further examined and parsed with some regex manipulation.

In [6]:
# NOTE(review): `tokens` is not assigned in any visible cell, so this line raises
# NameError on a fresh kernel. It presumably should hold the output of
# preprocess_document for a sample document — confirm and define it first.
print("Vocabulary size:", len(set(tokens)), "Corpus size:", len(tokens))

NameError: name 'tokens' is not defined

In [None]:
# Number of occurrences of each distinct token in `tokens`.
counts = pd.Series(tokens).value_counts()

# Print the first 60 and the last 60 entries, separated by a dashed rule.
print(counts.head(60), "-" * 100, counts.tail(60), sep="\n")

In [None]:
from matplotlib import pyplot as plt

# Skip the first five entries of `counts` so the rest of the distribution of
# token counts is easier to see.
fig, ax = plt.subplots()
ax.hist(counts.values[5:], bins=100)
ax.set_title("Distribution of token counts (first five entries omitted)")
ax.set_xlabel("Occurrences of a token")
ax.set_ylabel("Number of distinct tokens")
plt.show()

## Bring together the components into a function that preprocesses a pdf document

In [7]:
def preprocess_pdf_document(pdf_path, stemmer=None, stop_word_list=None, NUMBER_TOKEN_NAME=None):
    """Parse a PDF file and preprocess its text into a list of tokens.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdf_to_text``.
        stemmer: Currently unused; kept so existing callers keep working.
        stop_word_list: Currently unused; kept so existing callers keep working.
        NUMBER_TOKEN_NAME: Currently unused; kept so existing callers keep working.

    Returns:
        The value returned by ``preprocess_document`` for the parsed text.
    """
    # The three optional arguments are intentionally not resolved here:
    # ``preprocess_document`` is called with the text only, so any defaults
    # computed in this function would be dead code. (Resolving them previously
    # read the misspelled local `stop_words_list` and raised UnboundLocalError
    # on every call.)
    # TODO: forward the options once preprocess_document accepts them.
    parsed_text = pdf_to_text(pdf_path)
    return preprocess_document(parsed_text)

## Create simple index for a document

This function creates an index for a list of tokens. The index is a dictionary that maps each distinct token to the number of times it occurs in the list.

In [16]:
def create_index(preprocessed_document):
    """Count how often each token occurs in a preprocessed document.

    Args:
        preprocessed_document: Sequence of tokens.

    Returns:
        dict: Mapping from each distinct token to its number of occurrences.
    """
    token_counts = pd.Series(preprocessed_document).value_counts()
    return token_counts.to_dict()

## Combine to obtain a function, which indexes a pdf file

In [9]:
def index_pdf_document(input_pdf):
    """Build the token-count index of a single PDF file.

    Args:
        input_pdf: Path of the PDF file, passed to ``preprocess_pdf_document``.

    Returns:
        The index produced by ``create_index`` for the preprocessed PDF.
    """
    return create_index(preprocess_pdf_document(input_pdf))

## Compute similarity between two indices

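This function computes the intersection of the value counts divided by the minimum sum of value counts: $\text{sim}(a, b) = \dfrac{\sum_t \min(a_t, b_t)}{\min\left(\sum_t a_t, \sum_t b_t\right)}$, where $a_t$ and $b_t$ are the counts of token $t$ in the two indices. Under the assumption that one index is significantly smaller than the other, this should quickly give a score based on how many of the token occurrences in the smaller document are found in the larger document.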

In [10]:
def index_similarity(idx1, idx2):
    """Score the overlap of two token-count indices.

    The score is the sum over tokens of the smaller of the two counts, divided
    by the total count of whichever index has the smaller total.

    Args:
        idx1 (dict): Mapping from token to count.
        idx2 (dict): Mapping from token to count.

    Returns:
        float: ``intersection / min_sum``; ``0.0`` when the smaller index has a
        total count of zero (e.g. an empty index), where the ratio is undefined.
    """
    sum_1 = sum(idx1.values())
    sum_2 = sum(idx2.values())

    # Iterate over the index with the smaller total; on a tie idx2 is used.
    if sum_1 < sum_2:
        min_idx, max_idx, min_sum = idx1, idx2, sum_1
    else:
        min_idx, max_idx, min_sum = idx2, idx1, sum_2

    # Guard against ZeroDivisionError: an empty index shares nothing.
    if min_sum == 0:
        return 0.0

    intersection = sum(min(max_idx.get(k, 0), v) for k, v in min_idx.items())
    return intersection / min_sum

## Iterate over pdf documents, compute indices and write them to a json file as an array.

In [33]:
def compute_indices(path_to_files, output_path="./test_data/indices.json"):
    """Index every ``.pdf`` file in a directory and dump the result as JSON.

    Args:
        path_to_files: Directory that is listed for files ending in ".pdf".
        output_path: File the JSON array is written to.

    The written array holds one object per PDF with the keys "file_name" and
    "index" (the value returned by ``index_pdf_document``).
    """
    pdf_names = [
        name
        for name in map(os.fsdecode, os.listdir(path_to_files))
        if name.endswith(".pdf")
    ]
    records = [
        {"file_name": name, "index": index_pdf_document(os.path.join(path_to_files, name))}
        for name in pdf_names
    ]
    with open(output_path, "w") as out_file:
        json.dump(records, out_file)

# Index every PDF found in the test data directory; the result is written to
# compute_indices' default output path ("./test_data/indices.json").
test_path = "./test_data"
compute_indices(test_path)

## Write function that retrieves the indices from the json files

In [29]:
def read_indices_from_json(indices_path=None):
    """Load previously computed indices from a JSON file.

    Loading is best-effort: failures are reported on stdout and signalled with
    ``None`` instead of an exception.

    Args:
        indices_path: Path of the JSON file. The default ``None`` is reported
            as a failure.

    Returns:
        The decoded JSON content, or ``None`` if no path was given, the file
        could not be opened or read, or its content is not valid JSON.
    """
    if indices_path is None:
        # Report the real problem instead of the TypeError text open(None) yields.
        print("JSON loading failed with exception:", "no indices_path was given")
        return None
    try:
        with open(indices_path) as f:
            return json.load(f)
    # OSError: missing or unreadable file. ValueError: invalid JSON
    # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
    # Anything else is a programming error and is allowed to propagate.
    except (OSError, ValueError) as e:
        print("JSON loading failed with exception:", e)
        return None

# Load the indices from the default location compute_indices writes to.
# `indices` is None if loading failed (see read_indices_from_json).
json_path = "./test_data/indices.json"
    
    
indices = read_indices_from_json(indices_path=json_path)

## Retrieve the most relevant documents given a query string

This function takes two functions as arguments: the similarity function, and a function that retrieves the indexed documents as a list whose elements have the form `{"file_name": "example_file.pdf", "index": {"example": 1, "token": 1}}`. This way the indices can later be obtained from, e.g., local JSON files, an S3 bucket or MongoDB.

In [43]:
def most_similar_documents(query_string, compute_similarity, get_indices, n=3, 
                           indices_path="./test_data/indices.json"):
    """Return the file names of the documents most similar to a query.

    Args:
        query_string (str): Free-text query; preprocessed with
            ``preprocess_document`` and indexed with ``create_index``.
        compute_similarity: Callable ``(document_index, query_index) -> score``;
            higher scores rank first.
        get_indices: Callable accepting the keyword ``indices_path`` and
            returning a list of ``{"file_name": ..., "index": ...}`` entries.
        n (int): Number of file names taken from the top of the ranking.
        indices_path: Forwarded to ``get_indices``.

    Returns:
        list: File names of the best-scoring documents, best first. Documents
        with equal scores keep the order in which ``get_indices`` returned
        them. Empty if the query yields no tokens or no indices are available.
    """
    query_index = create_index(preprocess_document(query_string))
    # A query with no tokens left after preprocessing (e.g. only stop words)
    # cannot be ranked, and an empty index makes ratio-based scores undefined.
    if not query_index:
        return []

    indices = get_indices(indices_path=indices_path)
    # A loader may return None on failure; treat that like "no documents".
    if not indices:
        return []

    similarities = [compute_similarity(entry["index"], query_index) for entry in indices]

    # Stable sort on the negated score: descending by similarity, ties keep
    # their original position.
    ranked_positions = sorted(range(len(similarities)), key=lambda i: -similarities[i])

    return [indices[i]["file_name"] for i in ranked_positions[:n]]

# Example query: rank the indexed documents with index_similarity, loading the
# indices from the default JSON path, and show the top three file names.
most_similar_documents("what is the revenue of danske bank", index_similarity, read_indices_from_json)

['swedbank_annual_2017.pdf',
 'danske_bank_Outlook_January 2019.pdf',
 'swedbank_mortgage_2018.pdf']