## Imports

In [42]:
import json
import elasticsearch
import string
import re
from collections import Counter
from elasticsearch import Elasticsearch

In [43]:
QUERY_TRAIN_FILEPATH = 'datasets\DBpedia\smarttask_dbpedia_train.json'
QUERY_TEST_FILEPATH = 'datasets\DBpedia\smarttask_dbpedia_test_questions.json'

INDEX_NAME = 'fasttest'
FIELDS = ['abstract', 'subject', "instance"]

In [44]:
stop_words = set(['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was', 'will', 'with'])
print(stop_words) # Default in ElasticSearch

{'by', 'not', 'they', 'there', 'in', 'then', 'with', 'was', 'their', 'be', 'on', 'and', 'it', 'of', 'an', 'are', 'into', 'that', 'these', 'if', 'for', 'to', 'but', 'no', 'the', 'this', 'is', 'such', 'as', 'at', 'or', 'a', 'will'}


In [45]:
punctuation = string.punctuation.replace("'", "")
def preprocess(text):
    text = text.strip().lower().translate(str.maketrans('', '', punctuation)) # Remove punctuation stuff.
    text = re.sub('\s\s+', ' ', text).split(' ') # Replace consequtive whitespace with a single space.
    return ' '.join([v for v in text if not v in stop_words])

## Elastic search

In [46]:
es = Elasticsearch()
es.info()

{'name': 'BERNTA-PC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'IP06yo9vScKZA1ZTb8R9HA',
 'version': {'number': '7.9.2',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e',
  'build_date': '2020-09-23T00:45:33.626720Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

## Load Queries

In [47]:
def load_train_queries(filepath):
    """
    Load training queries from a file. 
    Returns a dictoinary with queryID as key and corresponding query, category and type.
    """
    query_dicts = {}
    with open(filepath, "r") as f:
      queries = f.read()
    
    for query in json.loads(queries):
        try:
            qID, qText, qCat, qType = query["id"].lower(), query["question"].lower(), query["category"].lower(), ' '.join(query["type"]).lower()
            if not 'dbo:' in qType: # Skip queries without a dbo: type.
                continue
            query_dicts[qID] = {"query": preprocess(qText), "category": qCat, "type": qType}
        except Exception as e:
            # print("Query: {}\n\tThrew an exception: {}\n".format(query, e))
            continue
    return query_dicts

def load_test_queries(filepath):
    """
    Load test queries from a file.
    Returns a dictionary with queryID as key, and corresponding query as a string.
    """
    query_dicts = {}
    with open(filepath, "r") as f:
      queries = f.read()
    
    for query in json.loads(queries):
        try:
            query_dicts[query["id"].lower()] = {"query": preprocess(query["question"].lower())}
        except Exception as e:
            # print("Query: {}\n\tThrew an exception: {}\n".format(query, e))
            continue
    return query_dicts

In [48]:
training_queries = load_train_queries(QUERY_TRAIN_FILEPATH)
test_queries = load_test_queries(QUERY_TEST_FILEPATH)

print("# training queries:", len(training_queries), "\n\tExample key'dbpedia_17655' returns:", training_queries['dbpedia_17655'])
print("# test queries:", len(test_queries), "\n\tExample key'dbpedia_21099' contain:", test_queries['dbpedia_21099'])

# training queries: 9557 
	Example key'dbpedia_17655' returns: {'query': 'what town birthplace joseph greenberg', 'category': 'resource', 'type': 'dbo:city dbo:settlement dbo:populatedplace dbo:place dbo:location'}
# test queries: 4369 
	Example key'dbpedia_21099' contain: {'query': 'under which president did some politicians live kensington'}


## Baseline Retrieval
Implements Okapi BM25, uses the Elastic search inbuilt implementation

In [49]:
def internal_BM25(query, k = 100, field = 'abstract', index = INDEX_NAME):
    """
    Perform baseline retrieval on a index using the inbuilt BM25 index

    Arguments:
        index: string
        query: string, space separated terms
        k: integer
    
    Returns:
        List of k first entity IDs(string)
    """
    hits = es.search(index=index, body={'query': {'match': {field: query}}}, _source=False, size=k).get('hits', {}).get('hits', {})
    hits_ids = [obj['_id'] for obj in hits]
    hits_types = [es.get(index=index, id=doc)["_source"].get("instance", "Thing") for doc in hits_ids]
    return Counter([obj for obj in hits_types if len(obj) > 0]).most_common()
    
def internal_BM25_score(query, k = 100, field = 'abstract', index = INDEX_NAME):
    """
    Perform baseline retrieval on a index using the inbuilt BM25 index

    Arguments:
        index: string
        query: string, space separated terms
        k: integer
    
    Returns:
        List of k first entity IDs(string), and corresponding score(double)
    """
    hits = es.search(index=index, body={'query': {'match': {field: query}}}, _source=False, size=k).get('hits', {}).get('hits', {})
    hits.sort(key = lambda x: x['_score'], reverse=True)
    return {obj['_id']:obj['_score'] for obj in hits}

In [50]:
internal_BM25("civil rights")

[('Person', 22),
 ('Thing', 12),
 ('Organisation', 7),
 ('OfficeHolder', 4),
 ('GovernmentAgency', 3),
 ('Politician', 2),
 ('AcademicJournal', 2),
 ('Non-ProfitOrganisation', 2),
 ('Company', 1),
 ('Museum', 1),
 ('Saint', 1),
 ('PoliticalParty', 1),
 ('Writer', 1),
 ('Ambassador', 1)]

In [51]:
internal_BM25_score("civil rights", k=5)

{'Leadership Conference on Civil and Human Rights': 14.707036,
 'Civil Rights Commission (Puerto Rico)': 14.573362,
 'Lawyers Committee for Civil Rights Under Law': 14.516139,
 'Chicano Movement': 14.269186,
 'Civil Rights Act': 14.151209}

In [90]:
def analyze_query(es, query, index=INDEX_NAME, field = 'abstract'):
    """Analyzes a query with respect to the relevant index. 
    
    Arguments:
        es: Elasticsearch object instance.
        query: String of query terms.
        field: The field with respect to which the query is analyzed. 
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A list of query terms that exist in the specified field among the documents in the index. 
    """
    tokens = es.indices.analyze(index=index, body={'text': query})['tokens']
    query_terms = []
    for t in sorted(tokens, key=lambda x: x['position']):
        ## Use a boolean query to find at least one document that contains the term.
        hits = es.search(index=index, body={'query': {'match': {field: t['token']}}}, 
                                   _source=False, size=1).get('hits', {}).get('hits', {})
        doc_id = hits[0]['_id'] if len(hits) > 0 else None
        if doc_id is None:
            continue
        query_terms.append(t['token'])
    return query_terms

def extract_features(query_terms, doc_id, es, index=INDEX_NAME):
    """Extracts query features, document features and query-document features of a query and document pair.
    
        Arguments:
            query_terms: List of analyzed query terms.
            doc_id: Document identifier of indexed document.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service. 
            
        Returns:
            List of extracted feature values in a fixed order.
    """
    feature_vect = []
    # TODO
    return feature_vect

def prepare_ltr_training_data(es, k=100, amount=0, index=INDEX_NAME):
    """Prepares feature vectors and labels for query and document pairs found in the training data.
    
        Arguments:
            query_ids: List of query IDs.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service. 
            
        Returns:
            X: List of feature vectors extracted for each pair of query and retrieved or relevant document. 
            y: List of corresponding labels.
    """
    X, y = [], []
    progress, N = 0, len(training_queries)
    for queryObject in training_queries.values():
        query = analyze_query(es, queryObject['query'], index)
        hits = es.search(index=index, q=' '.join(query), _source=True, size=k)['hits']['hits']
        for obj in hits:
            dId, types = obj['_id'], obj['_source']['instance'].lower()
            if len(types) == 0:
                types = 'thing' # Default

            y.append((1 if (types in queryObject['type']) else 0))
            X.append(extract_features(query, dId, es, index))

            #if types != 'thing':
            #print('Query:', ' '.join(query), ', Expected type:', queryObject['type'], ', Returned type:' , types)

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return X, y

def evaluate_baseline(es, k=100, amount=0, index=INDEX_NAME):
    """
    Evaluate the BM25 baseline on our train queries.
    """
    matches, progress, N = 0, 0, len(training_queries)
    for queryObject in training_queries.values():
        query = analyze_query(es, queryObject['query'], index)
        hits = es.search(index=index, q=' '.join(query), _source=True, size=k)['hits']['hits']
        for obj in hits:
            types = obj['_source']['instance'].lower()
            if len(types) == 0:
                types = 'thing' # Default

            if (types in queryObject['type']):
                matches += 1
                break

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return (matches / progress)

In [199]:
x,y = prepare_ltr_training_data(es, k=100, amount=50)

Progress - 50/17254 queries handled.


In [55]:
acc = evaluate_baseline(es, k=200, amount=500) # Eval. X queries.
acc

Progress - 50/9557 queries handled.
Progress - 100/9557 queries handled.
Progress - 150/9557 queries handled.
Progress - 200/9557 queries handled.
Progress - 250/9557 queries handled.
Progress - 300/9557 queries handled.
Progress - 350/9557 queries handled.
Progress - 400/9557 queries handled.
Progress - 450/9557 queries handled.
Progress - 500/9557 queries handled.


0.724