In [1]:
import math
import random
import numpy as np
import elasticsearch
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from elasticsearch import Elasticsearch, helpers

In [2]:
# Create the client with no arguments (library defaults) and display the
# cluster info as a quick connectivity check.
es = Elasticsearch()
es.info()

{'name': 'BERNTA-PC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'IP06yo9vScKZA1ZTb8R9HA',
 'version': {'number': '7.9.2',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e',
  'build_date': '2020-09-23T00:45:33.626720Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [3]:
# Text fields stored for every indexed entity.
FIELDS = ['abstract', 'subject']

# Name of the Elasticsearch index this notebook creates and queries.
INDEX_NAME = 'fasttest'

# Every field in FIELDS gets the same mapping: text, term vectors enabled,
# 'english' analyzer.
INDEX_SETTINGS = {
    'mappings': {
        'properties': {
            field: {'type': 'text', 'term_vector': 'yes', 'analyzer': 'english'}
            for field in FIELDS
        }
    }
}

In [4]:
# Start from a clean slate: drop the index if it already exists, then
# recreate it with INDEX_SETTINGS.
# NOTE: re-running this cell deletes everything previously indexed under INDEX_NAME.
# The index is passed by keyword everywhere, matching delete()/create(); newer
# client versions no longer accept it positionally.
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)
es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'fasttest'}

In [5]:
# English stop-word list from NLTK, held as a set.
# NOTE(review): no other cell visible in this notebook reads stop_words, and the
# index mapping already declares the 'english' analyzer — confirm whether this
# cell is still required.
stop_words = set(stopwords.words('english'))
print(stop_words) # Needed?

{'have', 'for', 'about', 'those', "wasn't", 'than', 'because', 'you', 'their', 'should', 'between', 's', 'some', 'own', 't', "isn't", 'too', 'yours', 'under', "doesn't", 'and', 'at', 'was', 'we', 'what', 'ma', 'themselves', 'whom', 'during', 'both', 'does', 'had', 'where', 'shouldn', 'out', 'they', 'am', 'as', 'by', 'all', 'other', 'him', 'll', 'himself', "mightn't", 'my', 'doesn', "you'd", 'myself', 'itself', 'yourselves', 'mightn', 'into', 'are', "that'll", 'no', "won't", 'our', 'been', 'weren', 'is', 'such', 'a', 'with', "haven't", 'any', "weren't", 'his', 'but', 'down', "hadn't", 'doing', "don't", 'this', 'same', 'i', 'on', 'an', 'why', 'just', 'wouldn', 'up', 'hers', "mustn't", 'over', 'has', 'don', 'to', "it's", 'while', 'will', 'that', 'he', 'above', "you'll", 'hadn', 'were', 'didn', 'your', 'it', 'ain', 'further', 'again', 'before', 'm', 'do', 'd', 'off', 'theirs', 'who', 'its', 'y', "you're", 'ours', 'did', 'here', 'isn', "shouldn't", 'of', 'herself', "couldn't", 'these', 'aga

In [6]:
def peek(filename, size, enc='utf-8'):
    """
    Print a short preview of a text file.

    Only the first `size` lines of the file are considered, and the very first
    line is always skipped, so at most size - 1 lines are printed. Each printed
    line has its surrounding whitespace stripped.

    filename: path of the file to preview.
    size:     number of leading lines to consider; must be greater than zero,
              otherwise a message is printed and nothing is read.
    enc:      text encoding used to open the file.
    """
    if size <= 0:
        print("Size must be greater than zero!")
        return

    with open(filename, encoding=enc) as handle:
        for lineNo, text in enumerate(handle):
            if lineNo >= size:
                break
            if lineNo > 0: # Line 0 is the top line, which is never shown.
                print(text.strip())

In [7]:
def loadCategories(size=None):
    """
    Load the entity -> categories mapping from the DBpedia article categories file.

    size: optional line limit with the same meaning as in loadAbstracts(): reading
          stops once the zero-based line index reaches `size` (the skipped top
          line counts towards it). None, the default, reads the whole file.

    Returns a dict mapping each entity name to the list of its category names,
    in file order. Underscores are replaced by spaces in both.
    """
    prefix = 'Category:'
    kv = {}
    with open('datasets/DBpedia/article_categories_en.ttl', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if size is not None and i >= size:
                break
            if i == 0: # Skip top line.
                continue
            line = line.strip().replace('"', '').replace('\'', '').split(' ')
            if len(line) < 3: # Blank or truncated line: no subject/predicate/object to read.
                continue
            entity = line[0].replace('<', '').replace('>', '').split('/')[-1].replace('_', ' ')
            subject = line[2].replace('<', '').replace('>', '').split('/')[-1]
            # Only drop the prefix when it is really there, instead of blindly
            # cutting off the first nine characters.
            if subject.startswith(prefix):
                subject = subject[len(prefix):]
            subject = subject.replace('_', ' ')
            kv.setdefault(entity, []).append(subject)
    return kv

def loadAbstracts(size=5000):
    """
    Read abstracts from the DBpedia long abstracts file into a dict.

    size: reading stops once the zero-based line index reaches `size`. The top
          line is skipped but still counts, so at most size - 1 abstracts are read.

    Returns a dict mapping entity name (underscores replaced by spaces) to its
    abstract text. Double and single quote characters are removed from the
    whole line before parsing, and every '@en .' marker is removed from the text.
    """
    abstracts = {}
    with open('datasets/DBpedia/long_abstracts_en.ttl', encoding='utf-8') as source:
        for lineNo, raw in enumerate(source):
            if lineNo >= size:
                break
            if lineNo == 0: # Top line is not data.
                continue

            cleaned = raw.strip().replace('"', '').replace('\'', '')
            tokens = cleaned.split(' ')
            # Token 0 is the entity URI; its last path segment is the entity name.
            resource = tokens[0].replace('<', '').replace('>', '')
            name = resource.split('/')[-1].replace('_', ' ')
            # Token 1 (the predicate) is ignored; everything after it is the abstract.
            text = ' '.join(tokens[2:])
            abstracts[name] = text.replace('@en .', '')
    return abstracts

In [15]:
#SUBJECTS_CATEGORIES = loadCategories() # Might take some time to run!

In [16]:
#len(SUBJECTS_CATEGORIES) # 5 mill junk

In [17]:
def parseLine(data, line):
    """
    Parse one '<subject> <predicate> <object-or-literal> .' line and merge it
    into the key value dict.

    data: dict mapping entity name -> {'abstract': str, 'subject': list, ...};
          it is modified in place. The field written is named after the last
          path segment of the predicate URI.
    line: raw text line. Lines with fewer than three space-separated tokens
          (blank or truncated lines) are ignored.

    When the third token is a URI ('<...>'), the last path segment of that URI is
    appended to the field's list, with a leading 'Category:' removed if present.
    Otherwise the rest of the line is stored as the field's text, with every
    '@en .' marker removed. Double and single quote characters are stripped
    from the line before parsing, and underscores in names become spaces.
    """
    tokens = line.strip().replace('"', '').replace('\'', '').split(' ')
    if len(tokens) < 3:
        # Nothing to parse; previously this raised IndexError.
        return

    a = tokens[0].replace('<', '').replace('>', '').split('/')[-1].replace('_', ' ')
    b = tokens[1].replace('<', '').replace('>', '').split('/')[-1].replace('_', ' ')
    # Default dict. structure for an entity (fields are joined to text before indexing).
    entry = data.setdefault(a, {'abstract': '', 'subject': []})

    if tokens[2].startswith('<'):
        c = tokens[2].replace('<', '').replace('>', '').split('/')[-1]
        # Remove the prefix only when it is actually there, so that other
        # object URIs are not truncated by nine characters.
        if c.startswith('Category:'):
            c = c[len('Category:'):]
        # setdefault: a predicate outside the default structure gets its own list
        # instead of raising KeyError.
        entry.setdefault(b, []).append(c.replace('_', ' '))
    else:
        entry[b] = ' '.join(tokens[2:]).replace('@en .', '')

def _bulkIndexEntities(entities):
    """
    Send every entity in the dict to INDEX_NAME in a single bulk request.
    """
    if not entities:
        return
    actions = [
        {
            "_id": name,
            "_source": {'abstract': fields['abstract'], 'subject': ', '.join(fields['subject'])}
        }
        for name, fields in entities.items()
    ]
    response = helpers.bulk(es, actions, index=INDEX_NAME)
    print("BulkResponse:", response)


def indexData(size=5000, maxBatches=1):
    """
    Index the abstracts and categories files into INDEX_NAME in bulk.

    Both files are read in lockstep, one line from each per step, and every line
    is merged into a per-entity dict by parseLine(). After every `size` steps the
    accumulated entities are sent in one bulk request and the dict is emptied.

    size:       how many line pairs to process per bulk request; must be > 0.
    maxBatches: stop after this many full batches. The default of 1 indexes only
                the first batch (a quick trial run); pass None to read both files
                to the end, including a final partial batch.

    Raises ValueError when size is not positive. Errors from reading, parsing or
    the bulk request are no longer printed and swallowed; they propagate to the
    caller so a failed run is visible. Both files are closed in every case.

    NOTE(review): the actions carry only _id/_source, so helpers.bulk presumably
    uses its default 'index' operation, which replaces an existing document with
    the same _id. An entity whose abstract and categories end up in different
    batches would then keep only the fields of the later batch — confirm, and
    consider update/upsert actions before indexing the full files.
    """
    if size <= 0:
        raise ValueError("size must be greater than zero")

    kv = {}
    linesProcessed = 0
    batchesSent = 0
    with open('datasets/DBpedia/long_abstracts_en.ttl', 'r', encoding='utf-8') as abstractFile, \
         open('datasets/DBpedia/article_categories_en.ttl', 'r', encoding='utf-8') as categoriesFile:
        # Skip top lines! (A default avoids StopIteration on an empty file.)
        next(abstractFile, None)
        next(categoriesFile, None)

        while maxBatches is None or batchesSent < maxBatches:
            lineA = next(abstractFile, None)
            lineB = next(categoriesFile, None)
            if lineA is None and lineB is None:
                break # Both files are exhausted.
            # The files differ in length, so keep consuming the longer one.
            if lineA is not None:
                parseLine(kv, lineA)
            if lineB is not None:
                parseLine(kv, lineB)
            linesProcessed += 1

            # Have we processed enough? Batch index these entries if so.
            if (linesProcessed % size) == 0:
                _bulkIndexEntities(kv)
                kv.clear()
                batchesSent += 1

        # Entities left over when the files ended before a full batch was reached.
        # (Empty when the loop stopped because of maxBatches.)
        _bulkIndexEntities(kv)

In [18]:
# Run the indexer with 10,000 lines per bulk request.
indexData(10000)

BulkResponse: (10024, [])


In [166]:
# Preview the infobox properties file (peek skips the top line, so size=4 shows three lines).
peek('datasets/DBpedia/infobox_properties_en.ttl', 4)

<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/property/voy> "no"^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#langString> .
<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/property/n> "no"^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#langString> .
<http://dbpedia.org/resource/Anarchism> <http://dbpedia.org/property/v> "no"^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#langString> .


In [141]:
# Preview the long abstracts file (peek skips the top line, so size=4 shows three lines).
peek('datasets/DBpedia/long_abstracts_en.ttl', 4)

<http://dbpedia.org/resource/Animalia_(book)> <http://dbpedia.org/ontology/abstract> "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary edition was also published in 1996, with an embossed gold jacket."@en .
<http://dbpedia.org/resource/Actrius> <http://dbpedia.org/ontology/abstract> "Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play E.R. by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996."@en .
<http://dbpedia.org/resource/Alain_Connes> <http://dbpedia.org/ontology/abstract> "Alain Connes (French: [alɛ̃ kɔn]; born 1 April 1947) is a French mathematician, currently Professor at the Collège de Fra

In [142]:
# Preview the anchor text file (peek skips the top line, so size=4 shows three lines).
peek('datasets/DBpedia/anchor_text_en.ttl', 4)

<http://dbpedia.org/resource/Computer_accessibility> <http://dbpedia.org/ontology/wikiPageWikiLinkText> "Computer accessibility"@en .
<http://dbpedia.org/resource/History_of_Afghanistan> <http://dbpedia.org/ontology/wikiPageWikiLinkText> "History of Afghanistan"@en .
<http://dbpedia.org/resource/Geography_of_Afghanistan> <http://dbpedia.org/ontology/wikiPageWikiLinkText> "Geography of Afghanistan"@en .


In [143]:
# Preview the article categories file (peek skips the top line, so size=4 shows three lines).
peek('datasets/DBpedia/article_categories_en.ttl', 4)

<http://dbpedia.org/resource/A> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:ISO_basic_Latin_letters> .
<http://dbpedia.org/resource/A> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Vowel_letters> .
<http://dbpedia.org/resource/Achilles> <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Characters_in_the_Iliad> .


In [144]:
# Preview the instance types file (peek skips the top line, so size=4 shows three lines).
peek('datasets/DBpedia/instance_types_en.ttl', 4)

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Achilles> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .
<http://dbpedia.org/resource/Autism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Disease> .


In [145]:
# Preview the mapping-based literals file (peek skips the top line, so size=4 shows three lines).
peek('datasets/DBpedia/mappingbased_literals_en.ttl', 4)

<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/icd10> "F84.0" .
<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/icd9> "299.00" .
<http://dbpedia.org/resource/Autism> <http://dbpedia.org/ontology/omim> "209850"^^<http://www.w3.org/2001/XMLSchema#integer> .


In [146]:
# Preview the mapping-based objects file (peek skips the top line, so size=4 shows three lines).
peek('datasets/DBpedia/mappingbased_objects_en.ttl', 4)

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/2000/01/rdf-schema#seeAlso> <http://dbpedia.org/resource/Anarchist_terminology> .
<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/2000/01/rdf-schema#seeAlso> <http://dbpedia.org/resource/Anarchism> .
<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/2000/01/rdf-schema#seeAlso> <http://dbpedia.org/resource/France> .


In [168]:
#es.update(index=INDEX_NAME, id='Achilles', body={"doc": {'abstract':kv['Achilles']['abstract'][0]}})
#es.update(index=INDEX_NAME, id='Achilles', body={"doc": {'subject':" ".join(kv['Achilles']['subject'])}})
#es.update(index=INDEX_NAME, id='Achilles', body={"doc": {'subject':" ".join(['meep', 'beep'])}})

In [19]:
# Sanity-check the index: match query for 'mythology' on the abstract field,
# asking for up to 15 hits with their _source included.
es.search(index=INDEX_NAME, body={'query': {'match': {'abstract': 'mythology'}}}, _source=True, size=15)

{'took': 1460,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 264, 'relation': 'eq'},
  'max_score': 8.064013,
  'hits': [{'_index': 'fasttest',
    '_type': '_doc',
    '_id': 'Religion and mythology',
    '_score': 8.064013,
    '_source': {'abstract': 'Religion and mythology differ in scope but have overlapping aspects. Both terms refer to systems of concepts that are of high importance to a certain community, making statements concerning the supernatural or sacred. Generally, mythology is considered one component or aspect of religion. Religion is the broader term: besides mythological aspects, it includes aspects of ritual, morality, theology, and mystical experience. A given mythology is almost always associated with a certain religion such as Greek mythology with Ancient Greek religion. Disconnected from its religious system, a myth may lose its immediate relevance to the community and evolve—away from sacr

In [206]:
x = loadCategories(5000000)
len(x)

850298

In [208]:
x['Apollo']

['Apollo',
 'Arts gods',
 'Deities in the Iliad',
 'Dragonslayers',
 'Health gods',
 'Knowledge gods',
 'LGBT themes in mythology',
 'Muses',
 'Temples of Apollo',
 'Mythological Greek archers',
 'Mythological rapists',
 'Oracular gods',
 'Roman gods',
 'Solar gods']

In [215]:
y = loadAbstracts(5000000)
len(y)

4931948

In [213]:
y['Achilles']

'In Greek mythology, Achilles (/əˈkɪliːz/; Ancient Greek: Ἀχιλλεύς, Akhilleus, pronounced [akʰilːéu̯s]) was a Greek hero of the Trojan War and the central character and greatest warrior of Homers Iliad. His mother was the nymph Thetis, and his father, Peleus, was the king of the Myrmidons. Achilles’ most notable feat during the Trojan War was the slaying of the Trojan hero Hector outside the gates of Troy. Although the death of Achilles is not presented in the Iliad, other sources concur that he was killed near the end of the Trojan War by Paris, who shot him in the heel with an arrow. Later legends (beginning with a poem by Statius in the 1st century AD) state that Achilles was invulnerable in all of his body except for his heel. Because of his death from a small wound in the heel, the term Achilles heel has come to mean a persons point of weakness.'

In [216]:
print(x['Achilles'], y['Achilles'])

['Characters in the Iliad', 'Demigods of Classical mythology', 'Kings of the Myrmidons', 'Greek mythological hero cult', 'People of the Trojan War', 'Thessalians in the Trojan War'] In Greek mythology, Achilles (/əˈkɪliːz/; Ancient Greek: Ἀχιλλεύς, Akhilleus, pronounced [akʰilːéu̯s]) was a Greek hero of the Trojan War and the central character and greatest warrior of Homers Iliad. His mother was the nymph Thetis, and his father, Peleus, was the king of the Myrmidons. Achilles’ most notable feat during the Trojan War was the slaying of the Trojan hero Hector outside the gates of Troy. Although the death of Achilles is not presented in the Iliad, other sources concur that he was killed near the end of the Trojan War by Paris, who shot him in the heel with an arrow. Later legends (beginning with a poem by Statius in the 1st century AD) state that Achilles was invulnerable in all of his body except for his heel. Because of his death from a small wound in the heel, the term Achilles heel ha

In [217]:
x.clear()
y.clear()

In [218]:
# 4931948 abstracts

In [232]:
SUBJECTS_CATEGORIES['Achilles']

['Characters in the Iliad',
 'Demigods of Classical mythology',
 'Kings of the Myrmidons',
 'Greek mythological hero cult',
 'People of the Trojan War',
 'Thessalians in the Trojan War']