In [34]:
import traceback
import elasticsearch
import time
import string
import re
from elasticsearch import Elasticsearch, helpers

In [35]:
es = Elasticsearch()
es.info()

{'name': 'BERNTA-PC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'IP06yo9vScKZA1ZTb8R9HA',
 'version': {'number': '7.9.2',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e',
  'build_date': '2020-09-23T00:45:33.626720Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [36]:
INDEX_NAME = 'smart'
INDEX_SETTINGS = {
    'mappings': {
            'properties': {
                'abstract': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'instance': {
                    'type': 'text'
                }
            }
        }
    }

In [37]:
def createTheIndex():
    if es.indices.exists(INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)    
    print(es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS))

In [38]:
#createTheIndex()

In [39]:
def peek(filename, size, enc='utf-8'):
    """
    Print out the first X lines in the file.
    """
    if size <= 0:
        print("Size must be greater than zero!")
        return

    with open(filename, encoding=enc) as f:
        for i,line in enumerate(f):
            if (size >= 0) and (i >= size):
                break
            if i == 0: # Skip top line.
                continue
            print(line.strip())

In [40]:
ENTITIES_PROCESSED = None
DEBUGGING = False # If true, only test (index) on a small subset!

In [41]:
stop_words = set(['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was', 'will', 'with'])
print(stop_words, '-', string.punctuation) # Default in ElasticSearch, + punctuation

{'at', 'into', 'and', 'on', 'such', 'not', 'an', 'there', 'as', 'their', 'was', 'are', 'it', 'these', 'with', 'is', 'for', 'if', 'of', 'but', 'no', 'that', 'will', 'a', 'then', 'to', 'the', 'by', 'in', 'or', 'they', 'this', 'be'} - !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [42]:
def preprocess(text, remove_stopwords=False):
    text = text.replace('_', ' ').replace('-', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation stuff.
    text = re.sub('\s\s+', ' ', text) # Replace consequtive whitespace with a single space.
    if remove_stopwords:
        return ' '.join([v for v in text.split(' ') if not v in stop_words])
    return text

def parseAbstract(data, line):
    """Parse a line from long_abstract."""
    if (line is None) or (line[0] == '#'):
        return
    line = line.lower().strip()[:-5].replace('/>', '>').split(' ')
    if len(line) < 3:
        return # Invalid line.
    entity = preprocess(line[0][1:-1].split('/')[-1])
    value = preprocess(' '.join(line[2:]).replace('\\', ''), True)
    data.append({
                "_id": entity, 
                "_source": {'abstract': value, 'instance': 'thing'}
    })
    if DEBUGGING:
        ENTITIES_PROCESSED.add(entity) # Testing

def parseType(data, line):
    """Parse a line from instances."""
    if (line is None) or (line[0] == '#'):
        return
    line = line.lower().strip().replace('/>', '>').split(' ')
    if (len(line) < 3) or ('__' in line[0]):
        return # Invalid line.
    entity = preprocess(line[0][1:-1].split('/')[-1])
    value = preprocess(line[2][1:-1].split('/')[-1].replace('owl#', ''))
    data.append({
                "_id": entity, 
                "_source": {"doc": {'instance': value}},
                "_op_type": "update"
    })
    
def getBulkData(data):
    """
    To prevent issues when debugging,
    we only bulk data which was indexed @ abstract.
    """
    if DEBUGGING:
        return [d for d in data if (d['_id'] in ENTITIES_PROCESSED)]
    else:
        return data

In [43]:
def indexData(size=5000):
    """
    Index the data, size = how many entities to parse at a time.
    size should not be much bigger than 20000, due to bulk index size limitations @ elasticsearch!
    """
    global ENTITIES_PROCESSED
    ENTITIES_PROCESSED = set()
    files = [
        ('datasets/DBpedia/long_abstracts_en.ttl', 'utf-8'),
        ('datasets/DBpedia/instance_types_en.ttl', 'utf-8')
    ]
    try:
        files = [open(f, 'r', encoding=e) for f, e in files] # Datasets to index.
        listAbstract, listType = [], []
        start_time = time.time()
        
        # Process abstracts first! (bulk)
        for i, line in enumerate(files[0]):
            if i == 0: # Skip top line.
                continue
            parseAbstract(listAbstract, line)
            if (len(listAbstract) > size):
                helpers.bulk(es, listAbstract, index=INDEX_NAME, raise_on_error=False, raise_on_exception=False)
                listAbstract.clear()
                if DEBUGGING: # Only consider a small subset during test.
                    break
                
        if len(listAbstract): # Still have some remaining items? Bulk them now.
            helpers.bulk(es, listAbstract, index=INDEX_NAME, raise_on_error=False, raise_on_exception=False)
            listAbstract.clear()
            
        print("Indexed abstracts.")
        print("Time Elapsed: {:.4f} sec.".format((time.time()-start_time)))

        for i, line in enumerate(files[1]):
            if i == 0: # Skip top line.
                continue
            parseType(listType, line)
            if (len(listType) > size):
                helpers.bulk(es, getBulkData(listType), index=INDEX_NAME, raise_on_error=False, raise_on_exception=False)
                listType.clear()
                if DEBUGGING: # Only consider a small subset during test.
                    break
        
        if len(listType):
            helpers.bulk(es, getBulkData(listType), index=INDEX_NAME, raise_on_error=False, raise_on_exception=False)
        
        print("Finished indexing successfully!")
        print("Time Elapsed: {:.4f} sec.".format((time.time()-start_time)))
    except Exception as e:
        print('Error:', e)
        print(traceback.format_exc())
    finally:
        for f in files:
            f.close()
        listAbstract.clear()
        listType.clear()
        ENTITIES_PROCESSED.clear()

In [62]:
# Takes approx 1hr and 30 min!
#indexData(10000)

In [44]:
peek('datasets/DBpedia/long_abstracts_en.ttl', 5)

<http://dbpedia.org/resource/Animalia_(book)> <http://dbpedia.org/ontology/abstract> "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket."@en .
<http://dbpedia.org/resource/Actrius> <http://dbpedia.org/ontology/abstract> "Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play E.R. by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996."@en .
<http://dbpedia.org/resource/Alain_Connes> <http://dbpedia.org/ontology/abstract> "Alain Connes (French: [alɛ̃ kɔn]; born 1 April 1947) is a French mathematician, currently Professor at the Collège de Fra

In [45]:
peek('datasets/DBpedia/instance_types_en.ttl', 5)

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Achilles> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Autism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Disease> .
<http://dbpedia.org/resource/Alabama> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/AdministrativeRegion> .
