## Imports

In [18]:
import math
import random
from collections import Counter
import re

import elasticsearch
from elasticsearch import Elasticsearch, helpers

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

## Create the Elasticsearch connection and index structure

In [14]:
# Connect with the client's default settings; no host is passed here
# (presumably a local node -- confirm for your setup).
es = Elasticsearch()
# Request cluster name/version details; also serves as a quick connectivity check.
es.info()

{'name': 'ULTIMECIA',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'MHYEAbcOS_i6ybp0d4NE2A',
 'version': {'number': '7.9.1',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': '083627f112ba94dffc1232e8b42b73492789ef91',
  'build_date': '2020-09-01T21:22:21.964974Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [15]:
# Names of the text fields stored for every entity.
FIELDS = ["abstract", "subject", "instance"]
INDEX_NAME = 'not_so_fast_test'

# All fields share one mapping: English-analysed text with term vectors
# enabled, so the per-field entries are generated from FIELDS.
INDEX_SETTINGS = {
    'mappings': {
        'properties': {
            field: {
                'type': 'text',
                'term_vector': 'yes',
                'analyzer': 'english',
            }
            for field in FIELDS
        }
    }
}

In [16]:
# Start from a clean slate: drop any existing index with this name, then
# create it again with the mappings defined in INDEX_SETTINGS.
# The index name is passed by keyword in every call, matching delete/create.
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)
es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'not_so_fast_test'}

## Processing

In [64]:
# English stop words from NLTK, stored as a set.
# NOTE(review): this presumably needs the NLTK "stopwords" corpus to be
# downloaded beforehand -- confirm on a fresh environment.
stop_words = set(stopwords.words("english"))

## Loading the DBpedia files

In [124]:
def loadCategories(path='datasets/DBpedia/article_categories_en.ttl'):
    """Load the entity -> categories mapping from a DBpedia triples file.

    Every line of the form
    ``<http://dbpedia.org/resource/ENTITY> <predicate> <http://dbpedia.org/resource/PREFIX:NAME>``
    contributes one category; lines that do not match are ignored.

    Args:
        path: Location of the categories file. Defaults to the path used
            throughout this notebook.

    Returns:
        dict mapping the lower-cased entity name (underscores replaced by
        spaces) to the list of its lower-cased category names, in file order.
    """
    pattern = re.compile(
        r'<http://dbpedia.org/resource/([^>]*)> <[^>]*> <http://dbpedia.org/resource/([^>]*)>'
    )
    kv = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            m = pattern.match(line)
            if not m:
                continue
            entity = m.group(1).replace("_", " ").lower()
            resource = m.group(2).replace("_", " ")
            # Strip only the leading namespace prefix (presumably "Category:").
            # Splitting on every colon would truncate names that themselves
            # contain one, e.g. "Category:Star Trek: Voyager episodes".
            _, sep, name = resource.partition(":")
            # Without any colon, keep the whole resource name instead of failing.
            category = (name if sep else resource).lower()
            kv.setdefault(entity, []).append(category)
    return kv


def loadAbstracts(path='datasets/DBpedia/long_abstracts_en.ttl'):
    """Load the entity -> abstract mapping from a DBpedia triples file.

    Every line of the form
    ``<http://dbpedia.org/resource/ENTITY> <predicate> "TEXT"@en ...``
    contributes one abstract; lines that do not match are ignored. If an
    entity occurs more than once, the last occurrence wins.

    Args:
        path: Location of the abstracts file. Defaults to the path used
            throughout this notebook.

    Returns:
        dict mapping the lower-cased entity name (underscores replaced by
        spaces) to the lower-cased abstract text. Backslash escape sequences
        present in the file are kept verbatim (they are not decoded).
    """
    # The literal is captured greedily up to the last '"@en ' on the line, so
    # abstracts that contain '>' or escaped quotes are kept rather than dropped.
    pattern = re.compile(r'<http://dbpedia.org/resource/([^>]*)> <[^>]*> "(.*)"@en ')
    kv = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            m = pattern.match(line)
            if m:
                entity = m.group(1).replace("_", " ").lower()
                kv[entity] = m.group(2).lower()
    return kv


def loadInstances(path='datasets/DBpedia/instance_types_en.ttl'):
    """Load the entity -> instance type mapping from a DBpedia triples file.

    Every line of the form
    ``<http://dbpedia.org/resource/ENTITY> <predicate> <http://TYPE_URI>``
    contributes one type; lines that do not match are ignored. If an entity
    occurs more than once, the last occurrence wins.

    Args:
        path: Location of the instance-types file. Defaults to the path used
            throughout this notebook.

    Returns:
        dict mapping the lower-cased entity name (underscores replaced by
        spaces) to the lower-cased last path segment of the type URI.
    """
    pattern = re.compile(r'<http://dbpedia.org/resource/([^>]*)> <[^>]*> <http://([^>]*)')
    kv = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            m = pattern.match(line)
            if not m:
                continue
            entity = m.group(1).replace("_", " ").lower()
            # Only the text after the final "/" is kept, so a fragment URI such
            # as ".../owl#Thing" is stored as "owl#thing".
            type_name = m.group(2).rsplit("/", 1)[-1]
            kv[entity] = type_name.replace("_", " ").lower()
    return kv


In [125]:
# Parse the three DBpedia files into in-memory dicts, each keyed by the
# lower-cased entity name (underscores replaced by spaces).
categories = loadCategories()  # entity -> list of category names
abstracts = loadAbstracts()    # entity -> abstract text
instances = loadInstances()    # entity -> instance type name

In [126]:
# Number of entities in each mapping, one per line:
# categories, abstracts, instances.
print(len(categories), len(abstracts), len(instances), sep="\n")

5161974
4929821
5043467


In [129]:
# NOTE(review): the saved output of this cell is a long abstract text, which
# looks like the value of abstracts["achilles"] rather than an instance type.
# The cell was presumably edited after it was last run -- re-run it so the
# displayed output matches the expression.
instances["achilles"]

"in greek mythology, achilles (/əˈkɪliːz/; ancient greek: ἀχιλλεύς, akhilleus, pronounced [akʰilːéu̯s]) was a greek hero of the trojan war and the central character and greatest warrior of homer's iliad. his mother was the nymph thetis, and his father, peleus, was the king of the myrmidons. achilles’ most notable feat during the trojan war was the slaying of the trojan hero hector outside the gates of troy. although the death of achilles is not presented in the iliad, other sources concur that he was killed near the end of the trojan war by paris, who shot him in the heel with an arrow. later legends (beginning with a poem by statius in the 1st century ad) state that achilles was invulnerable in all of his body except for his heel. because of his death from a small wound in the heel, the term achilles' heel has come to mean a person's point of weakness."

## Indexing

In [None]:
def indexData(size = 5000):
    for key in abstracts.keys():
        