In [97]:
import math
import random
import time
import numpy as np
import traceback
import elasticsearch
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from elasticsearch import Elasticsearch, helpers

In [98]:
# Create the client with no explicit connection arguments (library defaults)
# and display the node/cluster info returned by the server.
es = Elasticsearch()
es.info()

{'name': 'BERNTA-PC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'IP06yo9vScKZA1ZTb8R9HA',
 'version': {'number': '7.9.2',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e',
  'build_date': '2020-09-23T00:45:33.626720Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [99]:
FIELDS = ['abstract', 'subject', 'instance']
INDEX_NAME = 'fasttest'

# Every field listed in FIELDS gets the same mapping: a text field using the
# 'english' analyzer with term vectors enabled. A fresh dict is built per
# field so the mappings do not share state.
INDEX_SETTINGS = {
    'mappings': {
        'properties': {
            field_name: {
                'type': 'text',
                'term_vector': 'yes',
                'analyzer': 'english'
            }
            for field_name in FIELDS
        }
    }
}

In [195]:
def createTheIndex():
    """
    (Re)create the index named INDEX_NAME using INDEX_SETTINGS.

    If an index with that name already exists it is deleted first, so every
    document previously stored in it is lost.
    """
    # Pass the index name by keyword, exactly like the delete/create calls
    # below; newer elasticsearch-py clients reject positional API arguments.
    if es.indices.exists(index=INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)
    es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

In [196]:
#createTheIndex() # Uncomment to (re)create the index. WARNING: an existing index with this name is deleted first!

In [110]:
#stop_words = set(stopwords.words('english'))
#print(stop_words) # Needed?

In [199]:
def peek(filename, size, enc='utf-8'):
    """
    Print the leading lines of a file, skipping its very first line.

    At most `size` lines are considered. Line 0 (the header) is never
    printed, so up to `size - 1` lines are shown, each stripped of
    surrounding whitespace. A non-positive `size` only prints a warning.
    """
    if size <= 0:
        print("Size must be greater than zero!")
        return

    with open(filename, encoding=enc) as handle:
        for lineno, text in enumerate(handle):
            if lineno >= size:
                break
            if lineno > 0: # Line 0 is the header, skip it.
                print(text.strip())

In [200]:
# Ids of the entities indexed from the abstracts file. Starts as None;
# indexData() rebinds it to a fresh set, parseAbstract() adds to it while
# DEBUGGING is on, and getBulkData() uses it to filter update actions.
ENTITIES_PROCESSED = None
DEBUGGING = False # If true, only test (index) on a small subset!

In [201]:
def parseAbstract(data, line):
    """
    Parse one line of the long-abstracts dump and append an index action.

    `data` is the list that receives the action; `line` is the raw text line
    (None and '#' comment lines are ignored, as are lines with fewer than
    three space-separated tokens). The document id is the last path segment
    of the subject URI with underscores turned into spaces.
    """
    if line is None or line[0] == '#':
        return

    # Strip quotes, apostrophes and the trailing language tag before tokenising.
    cleaned = line.strip()
    for needle, substitute in (('"', ''), ('\'', ''), ('@en .', ''), ('/>', '>')):
        cleaned = cleaned.replace(needle, substitute)
    tokens = cleaned.split(' ')
    if len(tokens) < 3:
        return # Invalid line.

    subject_uri = tokens[0][1:-1] # Drop the surrounding '<' and '>'.
    entity = subject_uri.rsplit('/', 1)[-1].replace('_', ' ')
    # TODO, long abstracts might need some more preprocessing, like removing symbols except for ',.- etc.. ??
    abstract_text = ' '.join(tokens[2:])
    action = {
        "_id": entity,
        "_source": {'abstract': abstract_text.replace('\\', ''), 'subject': '', 'instance': ''}
    }
    data.append(action)
    if DEBUGGING:
        ENTITIES_PROCESSED.add(entity) # Testing

def parseSubject(data, line):
    """
    Parse one line of the article-categories dump and merge it into `data`.

    `data` maps entity id -> bulk update action. The first category seen for
    an entity creates the action; further categories are appended to its
    'subject' string, separated by ', '.

    Returns a tuple (upper-cased first character of the entity id, entity id),
    or (None, None) when the line is skipped: None/empty line, '#' comment,
    fewer than three tokens, or an empty entity id.
    """
    if not line or line[0] == '#':
        return None, None
    tokens = line.strip().replace('/>', '>').split(' ')
    if len(tokens) < 3:
        return None, None # Invalid line.

    # NOTE: only the last path segment of the subject URI is used as id, the
    # same way parseAbstract derives the id of the document being updated.
    entity = tokens[0][1:-1].split('/')[-1].replace('_', ' ')
    if not entity:
        # No usable id: skip instead of storing an action under '' and then
        # failing on entity[0] below.
        return None, None

    # Take everything after the 'Category:' marker so that category names
    # containing '/' (e.g. 'Category:AC/DC_albums') are kept intact. Without
    # the marker, fall back to the last path segment.
    marker = 'Category:'
    object_uri = tokens[2][1:-1]
    if marker in object_uri:
        value = object_uri.split(marker, 1)[1]
    else:
        value = object_uri.split('/')[-1]
    value = value.replace('_', ' ')

    if entity not in data:
        data[entity] = {
            "_id": entity,
            "_source": {"doc": {'subject': value}},
            "_op_type": "update"
        }
    else:
        doc = data[entity]['_source']['doc']
        doc['subject'] = doc['subject'] + ', ' + value
    return entity[0].upper(), entity

def parseType(data, line):
    """Parse a line from instances."""
    if (line is None) or (line[0] == '#'):
        return
    line = line.strip().replace('/>', '>').split(' ')
    if len(line) < 3:
        return # Invalid line.
    entity = line[0][1:-1].split('/')[-1].replace('_', ' ')
    value = line[2][1:-1].split('/')[-1].replace('owl#', '').replace('_', ' ')
    data.append({
                "_id": entity, 
                "_source": {"doc": {'instance': value}},
                "_op_type": "update"
    })
    
def getBulkData(data):
    """
    Select the actions that should be sent to the bulk helper.

    Outside of debugging `data` is returned untouched. While DEBUGGING is on,
    only the actions whose '_id' is in ENTITIES_PROCESSED (i.e. entities that
    were indexed from the abstracts) are kept, returned as a new list.
    """
    if not DEBUGGING:
        return data
    return [action for action in data if action['_id'] in ENTITIES_PROCESSED]

In [203]:
def _bulkIndex(actions):
    """Send `actions` to INDEX_NAME through the bulk helper without raising on failed items."""
    helpers.bulk(es, actions, index=INDEX_NAME, raise_on_error=False, raise_on_exception=False)


def indexData(size=5000):
    """
    Index the DBpedia dumps into INDEX_NAME.

    The abstracts are indexed first (one document per entity). Afterwards the
    category and instance-type files are read side by side and sent as
    update actions against those documents.

    size = how many entities to collect before a bulk request is sent.
    size should not be much bigger than 20000, due to bulk index size limitations @ elasticsearch!

    Any exception is caught and printed together with its traceback rather
    than propagated. The opened files are always closed and the module-level
    ENTITIES_PROCESSED set is emptied when the function ends.
    """
    global ENTITIES_PROCESSED
    ENTITIES_PROCESSED = set()
    sources = [
        ('datasets/DBpedia/long_abstracts_en.ttl', 'utf-8'),
        ('datasets/DBpedia/article_categories_en.ttl', 'utf-8'),
        ('datasets/DBpedia/instance_types_en.ttl', 'utf-8')
    ]
    # Everything the `finally` block touches is created before the `try`, and
    # the handles are collected one by one. If opening a later file fails, the
    # files that were already opened are still closed and the original error
    # is reported instead of a secondary NameError/AttributeError.
    files = []
    listAbstract, listSubject, listType = [], {}, []
    try:
        for path, enc in sources:
            files.append(open(path, 'r', encoding=enc)) # Datasets to index.
        abstractFile, categoriesFile, instancesFile = files
        start_time = time.time()

        # Process abstracts first! (bulk)
        for i, line in enumerate(abstractFile):
            if i == 0: # Skip top line.
                continue
            parseAbstract(listAbstract, line)
            if len(listAbstract) > size:
                _bulkIndex(listAbstract)
                listAbstract.clear()
                if DEBUGGING: # Only consider a small subset during test.
                    break

        if len(listAbstract): # Still have some remaining items? Bulk them now.
            _bulkIndex(listAbstract)
            listAbstract.clear()

        print("Indexed abstracts.")
        print("Time Elapsed: {:.4f} sec.".format((time.time()-start_time)))

        lineB, lineC = next(categoriesFile, None), next(instancesFile, None) # Skip top lines!
        test1, test2 = False, False

        # Walk both files in lock-step; a file that is exhausted yields None,
        # which the parse functions ignore.
        while (lineB or lineC):
            if DEBUGGING and test1 and test2: # Limit to a small subset during testing.
                break

            if lineB:
                lineB = next(categoriesFile, None)

            if lineC:
                lineC = next(instancesFile, None)

            parseType(listType, lineC)
            currSubjectChar, ent = parseSubject(listSubject, lineB)

            # When we have at least 'size' subjects (entities), keep reading
            # category lines until the first letter of the entity changes.
            # A change of first letter implies a change of entity, so the
            # categories of one entity are never split over two bulk requests
            # (a later update would overwrite the earlier 'subject' value).
            if (len(listSubject) > size):
                lastSubjectChar = currSubjectChar # Find the next first letter of ent. which differs from this. Then bulk.
                newValue = None
                while True:
                    if lineB:
                        lineB = next(categoriesFile, None)
                    if lineB is None:
                        break
                    currSubjectChar, ent = parseSubject(listSubject, lineB)
                    if currSubjectChar and (currSubjectChar != lastSubjectChar):
                        newValue = listSubject[ent] # This value belongs to the next 'group', save it for that group.
                        del listSubject[ent]
                        break
                _bulkIndex(getBulkData(listSubject.values()))
                listSubject.clear()
                if newValue: # Add the newest value back again.
                    listSubject[ent] = newValue
                test2 = True

            if (len(listType) > size):
                _bulkIndex(getBulkData(listType))
                listType.clear()
                test1 = True

        # If there are remaining elements left, be sure to bulk index them!
        if len(listSubject):
            _bulkIndex(getBulkData(listSubject.values()))

        if len(listType):
            _bulkIndex(getBulkData(listType))

        print("Finished indexing successfully!")
        print("Time Elapsed: {:.4f} sec.".format((time.time()-start_time)))
    except Exception as e:
        print('Error:', e)
        print(traceback.format_exc())
    finally:
        for f in files:
            f.close()
        listAbstract.clear()
        listSubject.clear()
        listType.clear()
        ENTITIES_PROCESSED.clear()

In [204]:
#indexData(15000) # Uncomment to index the DBpedia dumps, collecting 15000 entities per bulk request.

In [188]:
# Preview the abstracts dump: one <entity> <abstract> "text"@en triple per line.
peek('datasets/DBpedia/long_abstracts_en.ttl', 3)

<http://dbpedia.org/resource/Animalia_(book)> <http://dbpedia.org/ontology/abstract> "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket."@en .
<http://dbpedia.org/resource/Actrius> <http://dbpedia.org/ontology/abstract> "Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play E.R. by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996."@en .


In [189]:
# Preview the categories dump: one <entity> <subject> <Category:...> triple per line.
peek('datasets/DBpedia/article_categories_en.ttl', 3)

<http://dbpedia.org/resource/A> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:ISO_basic_Latin_letters> .
<http://dbpedia.org/resource/A> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Vowel_letters> .


In [208]:
# Preview the instance-types dump: one <entity> <rdf type> <class> triple per line.
peek('datasets/DBpedia/instance_types_en.ttl', 3)

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Achilles> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .


In [191]:
# Preview the literal-properties dump (not among the files read by indexData).
peek('datasets/DBpedia/mappingbased_literals_en.ttl', 3)

<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/icd10> "F84.0" .
<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/icd9> "299.00" .


In [192]:
# Preview the object-properties dump (not among the files read by indexData).
peek('datasets/DBpedia/mappingbased_objects_en.ttl', 3)

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/2000/01/rdf-schema#seeAlso> <http://dbpedia.org/resource/Anarchist_terminology> .
<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/2000/01/rdf-schema#seeAlso> <http://dbpedia.org/resource/Anarchism> .


In [207]:
# Sanity check: match query for 'heron' on the abstract field, asking for
# at most 10 hits including their _source.
es.search(index=INDEX_NAME, body={'query': {'match': {'abstract': 'heron'}}}, _source=True, size=10)

{'took': 149,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1227, 'relation': 'eq'},
  'max_score': 15.82955,
  'hits': [{'_index': 'fasttest',
    '_type': '_doc',
    '_id': 'Chinese pond heron',
    '_score': 15.82955,
    '_source': {'abstract': 'The Chinese pond heron (Ardeola bacchus) is an East Asian freshwater bird of the heron family, (Ardeidae). It is one of six species of birds known as pond herons (genus Ardeola). It is parapatric (or nearly so) with the Indian pond heron (A. grayii) to the west and the Javan pond heron (A. speciosa) to the south, and these three are presumed to form a superspecies. As a group they are variously affiliated with the squacco heron (A. ralloides) or the Malagasy pond heron (A. idae). As of mid-2011 there are no published molecular analyses of pond heron interrelationships and osteological data is likewise not analyzed for all relevant comparison taxa.',
     'subject': '

In [109]:
# 4931948 abstracts
# 850298+ cat

In [194]:
# Inspect the term vectors (per-field terms and statistics) of the document with id 'Acquire'.
es.termvectors(index=INDEX_NAME, id='Acquire')

{'_index': 'fasttest',
 '_type': '_doc',
 '_id': 'Acquire',
 '_version': 3,
 'found': True,
 'took': 115,
 'term_vectors': {'instance': {'field_statistics': {'sum_doc_freq': 4703524,
    'doc_count': 4700543,
    'sum_ttf': 4703524},
   'terms': {'game': {'term_freq': 1}}},
  'subject': {'field_statistics': {'sum_doc_freq': 51470334,
    'doc_count': 4757672,
    'sum_ttf': 73561889},
   'terms': {'1962': {'term_freq': 1},
    '3m': {'term_freq': 1},
    'avalon': {'term_freq': 1},
    'board': {'term_freq': 3},
    'bookshelf': {'term_freq': 1},
    'econom': {'term_freq': 1},
    'game': {'term_freq': 7},
    'hill': {'term_freq': 1},
    'introduc': {'term_freq': 1},
    'lai': {'term_freq': 1},
    'multiplay': {'term_freq': 1},
    'sackson': {'term_freq': 1},
    'sid': {'term_freq': 1},
    'simul': {'term_freq': 1},
    'tile': {'term_freq': 1}}},
  'abstract': {'field_statistics': {'sum_doc_freq': 304406661,
    'doc_count': 7065416,
    'sum_ttf': 401041953},
   'terms': {'19