## Imports

In [31]:
import json
import elasticsearch
import string
import re
import math
from collections import Counter
from elasticsearch import Elasticsearch

In [2]:
QUERY_TRAIN_FILEPATH = 'datasets\DBpedia\smarttask_dbpedia_train.json'
QUERY_TEST_FILEPATH = 'datasets\DBpedia\smarttask_dbpedia_test_questions.json'
INDEX_NAME = 'smart'

In [3]:
stop_words = set(['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was', 'will', 'with'])
print(stop_words) # Default in ElasticSearch

{'there', 'in', 'at', 'these', 'this', 'but', 'that', 'no', 'was', 'with', 'or', 'on', 'not', 'such', 'if', 'as', 'it', 'then', 'they', 'of', 'a', 'by', 'for', 'the', 'to', 'their', 'are', 'into', 'will', 'be', 'is', 'and', 'an'}


In [4]:
def preprocess(text):
    text = text.strip().lower()
    text = text.replace('_', ' ').replace('-', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation stuff.
    text = re.sub('\s\s+', ' ', text).split(' ') # Replace consequtive whitespace with a single space.
    return ' '.join([v for v in text if not v in stop_words])

## Elastic search

In [5]:
es = Elasticsearch()
es.info()

{'name': 'BERNTA-PC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'IP06yo9vScKZA1ZTb8R9HA',
 'version': {'number': '7.9.2',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e',
  'build_date': '2020-09-23T00:45:33.626720Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [6]:
es.indices.refresh(INDEX_NAME)
count = es.cat.count(INDEX_NAME, params={"format": "json"})
print('Docs:', int(count[0]['count']))

Docs: 4926217


## Load Queries

In [7]:
def load_train_queries(filepath):
    """
    Load training queries from a file. 
    Returns a dictoinary with queryID as key and corresponding query, category and type.
    """
    query_dicts = {}
    with open(filepath, "r") as f:
      queries = f.read()
    
    for query in json.loads(queries):
        try:
            qID, qText, qCat, qType = query["id"].lower(), query["question"].lower(), query["category"].lower(), ' '.join(query["type"]).lower()
            if not 'dbo:' in qType: # Skip queries without a dbo: type.
                continue
            query_dicts[qID] = {"query": preprocess(qText), "category": qCat, "type": qType}
        except Exception as e:
            # print("Query: {}\n\tThrew an exception: {}\n".format(query, e))
            continue
    return query_dicts

def load_test_queries(filepath):
    """
    Load test queries from a file.
    Returns a dictionary with queryID as key, and corresponding query as a string.
    """
    query_dicts = {}
    with open(filepath, "r") as f:
      queries = f.read()
    
    for query in json.loads(queries):
        try:
            query_dicts[query["id"].lower()] = {"query": preprocess(query["question"].lower())}
        except Exception as e:
            # print("Query: {}\n\tThrew an exception: {}\n".format(query, e))
            continue
    return query_dicts

In [8]:
training_queries = load_train_queries(QUERY_TRAIN_FILEPATH)
test_queries = load_test_queries(QUERY_TEST_FILEPATH)

print("# training queries:", len(training_queries), "\n\tExample key'dbpedia_17655' returns:", training_queries['dbpedia_17655'])
print("# test queries:", len(test_queries), "\n\tExample key'dbpedia_21099' contain:", test_queries['dbpedia_21099'])

# training queries: 9557 
	Example key'dbpedia_17655' returns: {'query': 'what town birthplace joseph greenberg', 'category': 'resource', 'type': 'dbo:city dbo:settlement dbo:populatedplace dbo:place dbo:location'}
# test queries: 4369 
	Example key'dbpedia_21099' contain: {'query': 'under which president did some politicians live kensington'}


## Baseline Retrieval
Implements Okapi BM25, uses the Elastic search inbuilt implementation

In [9]:
def internal_BM25(query, k = 100, field = 'abstract', index = INDEX_NAME):
    """
    Perform baseline retrieval on a index using the inbuilt BM25 index

    Arguments:
        index: string
        query: string, space separated terms
        k: integer
    
    Returns:
        List of k first entity IDs(string)
    """
    hits = es.search(index=index, body={'query': {'match': {field: query}}}, _source=False, size=k).get('hits', {}).get('hits', {})
    hits_ids = [obj['_id'] for obj in hits]
    hits_types = [es.get(index=index, id=doc)["_source"].get("instance", "thing") for doc in hits_ids]
    return Counter([obj for obj in hits_types if len(obj) > 0]).most_common()
    
def internal_BM25_score(query, k = 100, field = 'abstract', index = INDEX_NAME):
    """
    Perform baseline retrieval on a index using the inbuilt BM25 index

    Arguments:
        index: string
        query: string, space separated terms
        k: integer
    
    Returns:
        List of k first entity IDs(string), and corresponding score(double)
    """
    hits = es.search(index=index, body={'query': {'match': {field: query}}}, _source=False, size=k).get('hits', {}).get('hits', {})
    hits.sort(key = lambda x: x['_score'], reverse=True)
    return {obj['_id']:obj['_score'] for obj in hits}

In [10]:
internal_BM25("civil rights")

[('thing', 50),
 ('person', 21),
 ('organisation', 9),
 ('officeholder', 4),
 ('governmentagency', 3),
 ('academicjournal', 2),
 ('politician', 2),
 ('politicalparty', 2),
 ('non profitorganisation', 2),
 ('company', 1),
 ('saint', 1),
 ('museum', 1),
 ('writer', 1),
 ('ambassador', 1)]

In [11]:
internal_BM25_score("civil rights", k=5)

{'leadership conference on civil and human rights': 14.634237,
 'civil rights commission puerto rico': 14.519391,
 'lawyers committee for civil rights under law': 14.449045,
 'chicano movement': 14.209293,
 'civil rights act': 14.079039}

In [48]:
def analyze_query(es, query, index=INDEX_NAME, field = 'abstract'):
    """Analyzes a query with respect to the relevant index. 
    
    Arguments:
        es: Elasticsearch object instance.
        query: String of query terms.
        field: The field with respect to which the query is analyzed. 
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A list of query terms that exist in the specified field among the documents in the index. 
    """
    tokens = es.indices.analyze(index=index, body={'text': query})['tokens']
    query_terms = []
    for t in sorted(tokens, key=lambda x: x['position']):
        ## Use a boolean query to find at least one document that contains the term.
        hits = es.search(index=index, body={'query': {'match': {field: t['token']}}}, 
                                   _source=False, size=1).get('hits', {}).get('hits', {})
        doc_id = hits[0]['_id'] if len(hits) > 0 else None
        if doc_id is None:
            continue
        query_terms.append(t['token'])
    return query_terms

def extract_features(query_terms, doc_id, es, index=INDEX_NAME):
    """Extracts query features, document features and query-document features of a query and document pair.
    
        Arguments:
            query_terms: List of analyzed query terms.
            doc_id: Document identifier of indexed document.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service. 
            
        Returns:
            List of extracted feature values in a fixed order.
    """
    feature_vect = []
    # TODO
    return feature_vect

def prepare_ltr_training_data(es, k=100, amount=0, index=INDEX_NAME):
    """Prepares feature vectors and labels for query and document pairs found in the training data.
    
        Arguments:
            query_ids: List of query IDs.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service. 
            
        Returns:
            X: List of feature vectors extracted for each pair of query and retrieved or relevant document. 
            y: List of corresponding labels.
    """
    X, y = [], []
    progress, N = 0, len(training_queries)
    for queryObject in training_queries.values():
        query = analyze_query(es, queryObject['query'], index)
        hits = es.search(
            index=index, 
            _source=True, 
            size=k, 
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']
        for obj in hits:
            score = (1 if (obj['_source']['instance'] in queryObject['type']) else 0)
            y.append(score)
            X.append(extract_features(query, obj['_id'], es, index))

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return X, y

def evaluate_baseline(es, k=100, amount=0, index=INDEX_NAME):
    """
    Evaluate the BM25 baseline on our train queries.
    """
    progress, N = 0, len(training_queries)
    results = []
    for qId, queryObject in training_queries.items():
        query = analyze_query(es, queryObject['query'], index)
        hits = es.search(
            index=index, 
            _source=True, 
            size=k, 
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']        
        relevancy = []
        hasAMatch = False
        for i, obj in enumerate(hits):
            score = (1 if (obj['_source']['instance'] in queryObject['type']) else 0)
            
            if score: # Indicate that there was at least 1 match!
                hasAMatch = True                

            if len(relevancy) < 10: # Add the 10 first document scores relevancy. 0 = not relevant, 1 = relevant.
                relevancy.append(score)
            
            # If we had a match, and we have filled our relevancy list, break it off.
            if hasAMatch and (len(relevancy) >= 10):
                break

        results.append({
            'id': qId,
            'relevancy': relevancy,
            'match': (1 if hasAMatch else 0)
        })

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return results

In [49]:
def dcg(gains, k=5):
    """Computes DCG for a given ranking.
    Traditional DCG formula: DCG_k = sum_{i=1}^k gain_i / log_2(i+1).
    """
    dcg = 0
    for i in range(0, min(k, len(gains))):
        dcg += gains[i] / math.log(i + 2, 2)
    return dcg

def ndcg(gains, ideal_gains, k=5):
    """Computes NDCG given gains for a ranking as well as the ideal gains."""
    try:
        return dcg(gains, k) / dcg(ideal_gains, k)
    except:
        return 0

In [50]:
x,y = prepare_ltr_training_data(es, amount=50)

Progress - 50/9557 queries handled.


In [43]:
res = evaluate_baseline(es, k=150, amount=1000) # Eval. X queries.

Progress - 50/9557 queries handled.
Progress - 100/9557 queries handled.
Progress - 150/9557 queries handled.
Progress - 200/9557 queries handled.
Progress - 250/9557 queries handled.
Progress - 300/9557 queries handled.
Progress - 350/9557 queries handled.
Progress - 400/9557 queries handled.
Progress - 450/9557 queries handled.
Progress - 500/9557 queries handled.
Progress - 550/9557 queries handled.
Progress - 600/9557 queries handled.
Progress - 650/9557 queries handled.
Progress - 700/9557 queries handled.
Progress - 750/9557 queries handled.
Progress - 800/9557 queries handled.
Progress - 850/9557 queries handled.
Progress - 900/9557 queries handled.
Progress - 950/9557 queries handled.
Progress - 1000/9557 queries handled.


In [44]:
acc = sum([v['match'] for v in res]) / len(res)
print('Accuracy ->', acc)

Accuracy -> 0.775


In [47]:
acc_ndcg10 = [ndcg(v['relevancy'], sorted(v['relevancy'], reverse=True), k=10) for v in res]
print('nDCG10 ->', sum(acc_ndcg10) / len(acc_ndcg10))

nDCG10 -> 0.3078854550734691


In [46]:
acc_ndcg5 = [ndcg(v['relevancy'], sorted(v['relevancy'], reverse=True), k=5) for v in res]
print('nDCG5 ->', sum(acc_ndcg5) / len(acc_ndcg5))

nDCG5 -> 0.21778122899477148


In [58]:
v = es.search(index=INDEX_NAME, body={"query": {"bool": {"must": {"match": {"abstract": "who is george carlin"}}, "must_not": {"match": {"instance": "thing"}}}}})['hits']['hits']
set([x['_source']['instance'] for x in v])

{'album', 'book', 'recordlabel', 'televisionshow'}

In [59]:
v = es.search(index=INDEX_NAME, body={"query": {"bool": {"must": {"match": {"abstract": "who is george carlin"}}}}})['hits']['hits']
set([x['_source']['instance'] for x in v])

{'album', 'book', 'recordlabel', 'televisionshow', 'thing'}