## Imports

In [124]:
import json
from collections import Counter

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import elasticsearch
from elasticsearch import Elasticsearch

In [187]:
# Dataset locations, relative to the notebook's working directory.
# Forward slashes are used on purpose: they work on Windows as well as POSIX,
# and avoid the invalid "\D" / "\s" escape sequences of backslash-separated paths.
QUERY_TRAIN_FILEPATH = 'datasets/DBpedia/smarttask_dbpedia_train.json'
QUERY_TEST_FILEPATH = 'datasets/DBpedia/smarttask_dbpedia_test_questions.json'

# Elasticsearch index that is queried, and the document fields referenced below.
INDEX_NAME = 'fasttest'
FIELDS = ['abstract', 'subject', "instance"]

In [65]:
# English stop words from the NLTK stopwords corpus, kept as a set.
stop_words = {word for word in stopwords.words('english')}


## Elasticsearch

In [24]:
# Create the client with its default connection settings (no arguments are passed).
# `es` is used as a module-level client by the retrieval functions further down.
es = Elasticsearch()
# Last expression of the cell: shows the info reported by the cluster.
es.info()

{'name': 'ULTIMECIA',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'MHYEAbcOS_i6ybp0d4NE2A',
 'version': {'number': '7.9.1',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': '083627f112ba94dffc1232e8b42b73492789ef91',
  'build_date': '2020-09-01T21:22:21.964974Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

## Load Queries

In [191]:
def load_train_queries(filepath):
    """
    Load training queries from a JSON file.

    The file is expected to hold a JSON array of objects with the keys
    "id", "question", "category" and "type". Entries that lack one of these
    keys, or whose question is not a string (e.g. null), are skipped.

    Arguments:
        filepath: string, path to the JSON file

    Returns:
        Dictionary with the query ID as key and, as value, a dictionary with
        the lowercased question ("query"), its "category" and its "type".
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        queries = json.load(f)

    query_dicts = {}
    for query in queries:
        try:
            query_dicts[query["id"]] = {
                "query": query["question"].lower(),
                "category": query["category"],
                "type": query["type"],
            }
        except (KeyError, TypeError, AttributeError):
            # Malformed entry (missing key, non-dict entry, non-string question):
            # skip it, but let any other, unexpected error surface.
            continue
    return query_dicts


def load_test_queries(filepath):
    """
    Load test queries from a JSON file.

    The file is expected to hold a JSON array of objects with the keys "id"
    and "question". Entries that lack one of these keys, or whose question is
    not a string (e.g. null), are skipped.

    Arguments:
        filepath: string, path to the JSON file

    Returns:
        Dictionary with the query ID as key and, as value, a dictionary
        holding the lowercased question under the key "query".
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        queries = json.load(f)

    query_dicts = {}
    for query in queries:
        try:
            query_dicts[query["id"]] = {"query": query["question"].lower()}
        except (KeyError, TypeError, AttributeError):
            # Malformed entry (missing key, non-dict entry, non-string question):
            # skip it, but let any other, unexpected error surface.
            continue
    return query_dicts

In [201]:
# Load both query sets from the configured dataset files.
training_queries = load_train_queries(QUERY_TRAIN_FILEPATH)
test_queries = load_test_queries(QUERY_TEST_FILEPATH)

# Sanity check: report the size of each set together with one example entry.
print(
    "# training queries:",
    len(training_queries),
    "\n\tExample key'dbpedia_17655' returns:",
    training_queries['dbpedia_17655'],
)
print(
    "# test queries:",
    len(test_queries),
    "\n\tExample key'dbpedia_21099' contain:",
    test_queries['dbpedia_21099'],
)



# training queries: 17254 
	Example key'dbpedia_17655' returns: {'query': 'what town is the birthplace of joseph greenberg?', 'category': 'resource', 'type': ['dbo:City', 'dbo:Settlement', 'dbo:PopulatedPlace', 'dbo:Place', 'dbo:Location']}
# test queries: 4369 
	Example key'dbpedia_21099' contain: {'query': 'under which president did some politicians live in kensington?'}


## Baseline Retrieval
Baseline retrieval with Okapi BM25, using Elasticsearch's built-in implementation.

In [205]:
def internal_BM25(index_name, query, field, k = 100):
    """
    Perform baseline retrieval on an index using the inbuilt BM25 ranking and
    count the "instance" labels of the retrieved documents.

    Arguments:
        index_name: string, name of the index to search
        query: string, space separated terms
        field: string, document field the query is matched against
        k: integer, maximum number of hits to retrieve

    Returns:
        List of (label, count) tuples, most frequent label first, as produced
        by Counter.most_common(). Empty when no hit carries an "instance" value.
    """
    # Request the "instance" field as part of the search itself, so no
    # additional per-document lookup is needed afterwards.
    response = es.search(
        index=index_name,
        body={'query': {'match': {field: query}}},
        _source=["instance"],
        size=k,
    )
    hits = response.get('hits', {}).get('hits', [])

    # NOTE(review): "instance" is assumed to be a single ", "-separated string
    # of labels -- confirm against the index mapping.
    # Missing or null values are treated as "no labels".
    instance_values = [hit.get("_source", {}).get("instance") or "" for hit in hits]
    labels = [label for label in ", ".join(instance_values).split(", ") if label != ""]
    return Counter(labels).most_common()
    


def internal_BM25_score(index_name, query, field, k = 100):
    """
    Run a match query against one field of an index, using the inbuilt BM25
    ranking, and report the score of every retrieved document.

    Arguments:
        index_name: string, name of the index to search
        query: string, space separated terms
        field: string, document field the query is matched against
        k: integer, maximum number of hits requested

    Returns:
        Dictionary mapping the ID(string) of each of the k first hits to its
        score(double)
    """
    search_body = {'query': {'match': {field: query}}}
    response = es.search(index=index_name, body=search_body, _source=False, size=k)
    ranked_hits = response.get('hits', {}).get('hits', {})

    scores_by_id = {}
    for hit in ranked_hits:
        scores_by_id[hit["_id"]] = hit["_score"]
    return scores_by_id


In [207]:
internal_BM25(index_name=INDEX_NAME, query="Mythology", field= FIELDS[0])
# Returns an empty list: the "instance" field doesn't exist in the current version of the index.

[]