## Imports

In [1]:
import json
import elasticsearch
import string
import re
import math
import time
import os
import numpy as np
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from elasticsearch import Elasticsearch
from sklearn.ensemble import RandomForestClassifier

In [2]:
QUERY_TRAIN_FILEPATH = 'datasets\DBpedia\smarttask_dbpedia_train.json'
QUERY_TEST_FILEPATH = 'datasets\DBpedia\smarttask_dbpedia_test_questions.json'
INDEX_NAME = 'smart'

#### Elasticsearch Default Stop Words

In [3]:
stop_words = set(['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was', 'will', 'with'])
print(stop_words) # Default in ElasticSearch

{'will', 'as', 'was', 'if', 'a', 'at', 'an', 'is', 'these', 'are', 'there', 'but', 'in', 'into', 'it', 'this', 'or', 'their', 'by', 'that', 'and', 'the', 'then', 'such', 'of', 'not', 'to', 'with', 'no', 'they', 'for', 'be', 'on'}


In [4]:
def preprocess(text):
    """Preprocess some query, lower, remove punctuation stuff, stopwords, etc."""
    text = text.strip().lower()
    text = text.replace('_', ' ').replace('-', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation stuff.
    text = re.sub('\s\s+', ' ', text).split(' ') # Replace consequtive whitespace with a single space.
    return ' '.join([v for v in text if not v in stop_words]).strip()

## Word2Vec - Convert GloVe to Gensim

In [5]:
EMBEDDING_FILE = datapath(os.getcwd()+'/datasets/gensim/gensim.6B.100d.txt')
def convertGloveToGensim(target, output):
    _ = glove2word2vec(datapath(os.getcwd()+target), datapath(os.getcwd()+output))
#convertGloveToGensim('/datasets/glove/glove.6B.100d.txt', '/datasets/gensim/gensim.6B.100d.txt')

## Elastic search

In [6]:
es = Elasticsearch()
es.info()

{'name': 'BERNTA-PC',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'IP06yo9vScKZA1ZTb8R9HA',
 'version': {'number': '7.9.2',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'd34da0ea4a966c4e49417f2da2f244e3e97b4e6e',
  'build_date': '2020-09-23T00:45:33.626720Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [7]:
es.indices.refresh(INDEX_NAME)
count = es.cat.count(INDEX_NAME, params={"format": "json"})
print('Docs:', int(count[0]['count']))

Docs: 4926217


In [8]:
NUM_DOCS = int(count[0]['count'])

## Load Queries

In [9]:
def load_train_queries(filepath):
    """
    Load training queries from a file. 
    Returns a dictoinary with queryID as key and corresponding query, category and type.
    """
    query_dicts = {}
    queries = None
    with open(filepath, "r") as f:
        queries = f.read()
    
    for query in json.loads(queries):
        try:
            qID, qText, qCat, qType = query["id"].lower(), query["question"].lower(), query["category"].lower(), ' '.join(query["type"]).lower()
            if not 'dbo:' in qType: # Skip queries without a dbo: type.
                continue
            query_dicts[qID] = {"query": preprocess(qText), "category": qCat, "type": qType.replace('dbo:', '')}
        except Exception as e:
            # print("Query: {}\n\tThrew an exception: {}\n".format(query, e))
            continue
    return query_dicts

def load_test_queries(filepath):
    """
    Load test queries from a file.
    Returns a dictionary with queryID as key, and corresponding query as a string.
    """
    query_dicts = {}
    queries = None
    with open(filepath, "r") as f:
        queries = f.read()
    
    for query in json.loads(queries):
        try:
            query_dicts[query["id"].lower()] = {"query": preprocess(query["question"].lower())}
        except Exception as e:
            # print("Query: {}\n\tThrew an exception: {}\n".format(query, e))
            continue
    return query_dicts

In [10]:
training_queries = load_train_queries(QUERY_TRAIN_FILEPATH)
test_queries = load_test_queries(QUERY_TEST_FILEPATH)

print("# training queries:", len(training_queries), "\n\tExample key 'dbpedia_17655' returns:", training_queries['dbpedia_17655'])
print("# test queries:", len(test_queries), "\n\tExample key 'dbpedia_21099' contain:", test_queries['dbpedia_21099'])

# training queries: 9557 
	Example key 'dbpedia_17655' returns: {'query': 'what town birthplace joseph greenberg', 'category': 'resource', 'type': 'city settlement populatedplace place location'}
# test queries: 4369 
	Example key 'dbpedia_21099' contain: {'query': 'under which president did some politicians live kensington'}


## Cache analyze query
For each train / test query -> cache the respective analyze query terms.
This will speed up the evaluation later.
#### This will take 2-8 minutes! (depends on elasticsearch caching)

In [11]:
def analyze_query(es, query, index=INDEX_NAME):
    """Analyzes a query with respect to the relevant index. 
    
    Arguments:
        es: Elasticsearch object instance.
        query: String of query terms.
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A list of query terms that exist in the abstract field among the documents in the index. 
    """
    tokens = es.indices.analyze(index=index, body={'text': query})['tokens']
    query_terms = []
    for t in sorted(tokens, key=lambda x: x['position']):
        ## Use a boolean query to find at least one document that contains the term.
        hits = es.search(index=index, body={'query': {'match': {'abstract': t['token']}}}, 
                                   _source=False, size=1).get('hits', {}).get('hits', {})
        doc_id = hits[0]['_id'] if len(hits) > 0 else None
        if doc_id is None:
            continue
        query_terms.append(t['token'])
    return query_terms

In [12]:
start = time.time()

for qId, queryObject in training_queries.items():
    training_queries[qId]['analyzed'] = analyze_query(es, queryObject['query'], INDEX_NAME)
    
for qId, queryObject in test_queries.items():
    test_queries[qId]['analyzed'] = analyze_query(es, queryObject['query'], INDEX_NAME)
    
print("Time Elapsed:", (time.time()-start))

Time Elapsed: 303.1173372268677


In [13]:
training_queries['dbpedia_17655']['analyzed']

['what', 'town', 'birthplace', 'joseph', 'greenberg']

## Load evaluation types

In [14]:
def loadDBPediaTypes():
    kv = {}
    max_depth = 0
    with open('./evaluation/dbpedia/dbpedia_types.tsv', 'r') as f:
        for i, line in enumerate(f):
            if i == 0: # Skip header
                continue
            line = line.strip().lower().split('\t')
            if len(line) != 3:
                continue
            type_name, depth, parent_type = line[0].split(':')[-1], int(line[1]), line[-1].split(':')[-1]
            if (len(type_name) == 0) or (len(parent_type) == 0):
                continue
            kv[type_name] = {'depth':depth, 'parent':parent_type}
            max_depth = max(depth, max_depth)
    return kv, max_depth

def getTypeHierarchy(kv, items, target):
    if not target in kv:
        return
    items.append(target)
    getTypeHierarchy(kv, items, kv[target]['parent'])

def buildDBPediaTypeHierarchy(kv, target, reverse=True):
    items = [] # List of types, representing the hierarchy of the types related to the target.
    getTypeHierarchy(kv, items, target)
    if reverse:
        return items[::-1] # Reverse the order to return the correct hierarchy where the first item = top level.
    return items

def cacheDBPediaPaths():
    """Simplify Evaluation Path Computations"""
    for k in type_hierarchy.keys():
        type_hierarchy[k]['path'] = buildDBPediaTypeHierarchy(type_hierarchy, k, False)

In [15]:
type_hierarchy, max_depth = loadDBPediaTypes()
print(list(type_hierarchy.keys())[:4], 'Max Depth', max_depth)

['basketballleague', 'naturalevent', 'province', 'lunarcrater'] Max Depth 7


In [16]:
buildDBPediaTypeHierarchy(type_hierarchy, 'comic') # Example hierarchy

['work', 'writtenwork', 'comic']

In [17]:
start = time.time()
cacheDBPediaPaths()
print("Time Elapsed:", (time.time()-start))

Time Elapsed: 0.0010001659393310547


## Baseline Retrieval
Implements Okapi BM25, uses the Elastic search inbuilt implementation

In [18]:
def evaluate_baseline(es, amount=0, index=INDEX_NAME):
    """
    Evaluate the BM25 baseline on our train queries.
    
    Arguments:
        es: Elasticsearch object instance.
        amount: How many queries to run, 0 = all.
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A dictionary containing the queryIDs - list of retrieved instance types.
    """
    progress, N = 0, len(training_queries)
    results = {}
    for qId, queryObject in training_queries.items():
        query = queryObject['analyzed']
        hits = es.search(index=index, _source=True, size=10,
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']
        results[qId] = [obj['_source']['instance'] for obj in hits]

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return results

def tokens_to_vec(tokens, model):
    """
    Convert a list of tokens to some word 2 vec representation which conforms to our model.
    
    Arguments:
        tokens: A list of words.
        model: A word2vec model.
    
    Returns:
        A D-dim vector which represents the tokens - the embedding
    """
    size = model.vectors.shape[1]
    if len(tokens) == 0:
        return np.zeros(size)        
    embeddings = []
    for v in tokens:
        embeddings.append((model[v] if (v in model) else np.random.rand(size)))
    return np.mean(embeddings, axis=0).reshape(1, -1) # Take the mean of our matrix and return it as a D-size vector.

def evaluate_word2vec(es, model, k=1000, amount=0, index=INDEX_NAME):
    """
    Evaluate ranking using word2vec method.
    We are using pre-trained embeddings. Convert each query and related doc to word2vec format,
    compare the similarity and re-rank the entries.
    
    Arguments:
        es: Elasticsearch object instance.
        model: word2vec model.
        k: How many documents to handle per query.
        amount: How many queries to run, 0 = all.
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A dictionary containing the queryIDs - list of retrieved instance types.
    """
    progress, N = 0, len(training_queries)
    results = {}
    for qId, queryObject in training_queries.items():
        query = queryObject['analyzed']
        hits = es.search(index=index, _source=True, size=k, 
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']
        queryEmbedding = tokens_to_vec(query, model)
        rerank = []
        for obj in hits:
            docEmbedding = tokens_to_vec(obj['_source']['abstract'].split(), model)
            sim = cosine_similarity(queryEmbedding, docEmbedding).item()
            rerank.append((obj['_source']['instance'], sim))
            
        rerank.sort(key=lambda x:x[-1], reverse=True) # Re-rank the initial hits using our word2vec mdl.
        results[qId] = [v for v,_ in rerank[:10]]

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return results

def evaluate_simple(es, k=1000, amount=0, index=INDEX_NAME):
    """
    A test evaluation, simply re-rank using relevancy,
    0 = Not relevant
    1 = Partially relevant
    2 = Relevant
    
    Arguments:
        es: Elasticsearch object instance.
        k: How many documents to handle per query.
        amount: How many queries to run, 0 = all.
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A dictionary containing the queryIDs - list of retrieved instance types.
    """
    progress, N = 0, len(training_queries)
    results = {}
    for qId, queryObject in training_queries.items():
        type_relevancy = {}
        for typ in queryObject['type'].split(' '):
            if not typ in type_hierarchy:
                continue
            hierarchy = buildDBPediaTypeHierarchy(type_hierarchy, typ)
            for v in hierarchy:
                type_relevancy[v] = 1 # Relevant, its in the same hierarchy but in a diff pos.            

        for typ in queryObject['type'].split(' '):
            type_relevancy[typ] = 2 # This is the type we want. Give it the highest weight.
            
        if len(type_relevancy) == 0:
            continue

        query = queryObject['analyzed']
        hits = es.search(index=index, _source=True, size=k, 
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']

        rerank = []
        for obj in hits:
            instanceType = obj['_source']['instance']
            if not instanceType in type_hierarchy:
                rerank.append((instanceType, 0))
                continue
            if instanceType in type_relevancy:                
                rerank.append((instanceType, type_relevancy[instanceType]))
                continue                
            weight = buildDBPediaTypeHierarchy(type_hierarchy, instanceType)
            weight = [(1 if (t in type_relevancy) else 0) for t in weight] + [0]
            rerank.append((instanceType, max(weight)))

        rerank.sort(key=lambda x:x[-1], reverse=True) # Re-rank the initial hits based on their relevancy.
        results[qId] = [v for v,_ in rerank[:10]]

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return results

#### Balog's Evaluation code, with some minor edits

In [19]:
def dcg(gains, k=5):
    """
    Computes DCG for a given ranking.
    Traditional DCG formula: DCG_k = sum_{i=1}^k gain_i / log_2(i+1).
    """
    dcg = 0
    for i in range(0, min(k, len(gains))):
        dcg += gains[i] / math.log(i + 2, 2)
    return dcg

def ndcg(gains, ideal_gains, k=5):
    """Computes NDCG given gains for a ranking as well as the ideal gains."""
    try:
        return dcg(gains, k) / dcg(ideal_gains, k)
    except:
        return 0

def get_type_path(type, type_hierarchy):
    """
    Gets the type's path in the hierarchy (excluding the root type, like owl:Thing).
    The path for each type is computed only once then cached in type_hierarchy,
    to save computation.
    """
    if not type in type_hierarchy:
        type_hierarchy[type] = {'depth':1, 'parent':'', 'path':[type]}
    return type_hierarchy[type]['path']

def get_type_distance(type1, type2, type_hierarchy):
    """
    Computes the distance between two types in the hierarchy.
    Distance is defined to be the number of steps between them in the hierarchy,
    if they lie on the same path (which is 0 if the two types match), and
    infinity otherwise.
    """
    type1_path = get_type_path(type1, type_hierarchy)
    type2_path = get_type_path(type2, type_hierarchy)
    distance = math.inf
    if type1 in type2_path:
        distance = type2_path.index(type1)
    if type2 in type1_path:
        distance = min(type1_path.index(type2), distance)
    return distance

def get_most_specific_types(types, type_hierarchy):
    """Filters a set of input types to most specific types w.r.t the type
    hierarchy; i.e., super-types are removed."""
    filtered_types = set(types)
    for type in types:
        type_path = get_type_path(type, type_hierarchy)
        for supertype in type_path[1:]:
            if supertype in filtered_types:
                filtered_types.remove(supertype)
    return filtered_types

def get_expanded_types(types, type_hierarchy):
    """Expands a set of types with both more specific and more generic types
    (i.e., all super-types and sub-types)."""
    expanded_types = set()
    for type in types:
        # Adding all supertypes.
        expanded_types.update(get_type_path(type, type_hierarchy))
        # Adding all subtypes (NOTE: this bit could be done more efficiently).
        for type2 in type_hierarchy:
            if type_hierarchy[type2]['depth'] <= type_hierarchy[type]['depth']:
                continue
            type2_path = get_type_path(type2, type_hierarchy)
            if type in type2_path:
                expanded_types.update(type2_path)
    return expanded_types

def compute_type_gains(predicted_types, gold_types, type_hierarchy, max_depth):
    """Computes gains for a ranked list of type predictions.

    Following the definition of Linear gain in (Balog and Neumayer, CIKM'12),
    the gain for a given predicted type is 0 if it is not on the same path with
    any of the gold types, and otherwise it's $1-d(t,t_q)/h$ where $d(t,t_q)$ is
    the distance between the predicted type and the closest matching gold type
    in the type hierarchy and h is the maximum depth of the type hierarchy.

    Args:
        predicted_types: Ranked list of predicted types.
        gold_types: List/set of gold types (i.e., perfect answers).
        type_hierarchy: Dict with type hierarchy.
        max_depth: Maximum depth of the type hierarchy.

    Returns:
        List with gain values corresponding to each item in predicted_types.
    """
    gains = []
    expanded_gold_types = get_expanded_types(gold_types, type_hierarchy)
    for predicted_type in predicted_types:
        if predicted_type in expanded_gold_types:
            # Since not all gold types may lie on the same branch, we take the
            # closest gold type for determining distance.
            min_distance = math.inf
            for gold_type in gold_types:
                min_distance = min(get_type_distance(predicted_type, gold_type,
                                                     type_hierarchy),
                                   min_distance)
            gains.append(1 - min_distance / max_depth)
        else:
            gains.append(0)
    return gains

def evaluate(result):
    """
    Evaluate the resulting dictionary, compute accuracy, strict and fuzzy ndcg_5, ndcg_10 where
    ndcg_5 and ndcg_10 is computed using lenient NDCG@k with a Linear decay.
    
    Arguments:
        result: A dictionary with queryIDs: List of retrieved types from the top 10 docs, 
        and a bool indicating if there was a perfect match.
    """
    accuracy = []
    strict_ndcg_5, strict_ndcg_10 = [], []
    fuzzy_ndcg_5, fuzzy_ndcg_10 = [], []
    for qId, obj in training_queries.items():
        if qId not in result:
            continue

        qTypes = obj['type'].split(' ')
        if len(qTypes) == 0:
            continue

        predicted_type = result[qId]
        predicted_type_strict = [(1 if (t in obj['type']) else 0) for t in predicted_type]        
        exact_match = max(predicted_type_strict + [0]) # Yes / No was there an explicit match?
        
        # Filters obj types to most specific ones in the hierarchy.
        obj_types = get_most_specific_types(qTypes, type_hierarchy)
        gains = compute_type_gains(predicted_type, obj_types, type_hierarchy, max_depth)
        ideal_gains = sorted(gains, reverse=True)

        accuracy.append(exact_match)
        
        strict_ndcg_5.append(ndcg(predicted_type_strict, sorted(predicted_type_strict, reverse=True), k=5))
        strict_ndcg_10.append(ndcg(predicted_type_strict, sorted(predicted_type_strict, reverse=True), k=10))
        
        fuzzy_ndcg_5.append(ndcg(gains, ideal_gains, k=5))
        fuzzy_ndcg_10.append(ndcg(gains, ideal_gains, k=10))
        
    print('Evaluation results (based on {} questions):'.format(len(accuracy)))
    print('-------------------')
    
    print('Exact Type Prediction')
    print('  Accuracy: {:5.3f}'.format(sum(accuracy) / len(accuracy)))
    
    print('Strict Type ranking')
    print('  NDCG@5:  {:5.3f}'.format(sum(strict_ndcg_5) / len(strict_ndcg_5)))
    print('  NDCG@10: {:5.3f}'.format(sum(strict_ndcg_10) / len(strict_ndcg_10)))
    
    print('Fuzzy Type ranking')
    print('  NDCG@5:  {:5.3f}'.format(sum(fuzzy_ndcg_5) / len(fuzzy_ndcg_5)))
    print('  NDCG@10: {:5.3f}'.format(sum(fuzzy_ndcg_10) / len(fuzzy_ndcg_10)))

#### Writing/Reading evaluation results to/from a file.

In [20]:
def write_result_to_file(res, file):
    with open('./results/{}.csv'.format(file), 'w') as f:
        for qId, obj in res.items():
            f.write('{},{}\n'.format(qId, ' '.join(obj)))

def read_result_from_file(file):
    result = {}
    with open('./results/{}.csv'.format(file), 'r') as f:
        for line in f:
            line = line.strip().split(',')
            if len(line) != 2:
                continue
            result[line[0]] = [v for v in line[-1].split(' ') if len(v) > 0]
    return result

### Evaluate baseline

In [23]:
start = time.time()
res_baseline = evaluate_baseline(es)
print("Time Elapsed:", (time.time()-start))
write_result_to_file(res_baseline, 'baseline')

In [22]:
res_baseline = read_result_from_file('baseline')
evaluate(res_baseline)

Evaluation results (based on 9557 questions):
-------------------
Exact Type Prediction
  Accuracy: 0.492
Strict Type ranking
  NDCG@5:  0.237
  NDCG@10: 0.323
Fuzzy Type ranking
  NDCG@5:  0.312
  NDCG@10: 0.414


### Evaluate advanced - Word2Vec ~ 45 min

In [None]:
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE)

In [None]:
start = time.time()
res_word2vec = evaluate_word2vec(es, model, k=300)
print("Time Elapsed:", time.time()-start)
write_result_to_file(res_word2vec, 'advanced_word2vec')

In [23]:
res_word2vec = read_result_from_file('advanced_word2vec')
evaluate(res_word2vec)

Evaluation results (based on 9557 questions):
-------------------
Exact Type Prediction
  Accuracy: 0.522
Strict Type ranking
  NDCG@5:  0.280
  NDCG@10: 0.367
Fuzzy Type ranking
  NDCG@5:  0.364
  NDCG@10: 0.455


### Advanced Method, Pointwise - Classifier
Declare documents as relevant: 
* 0 - Not relevant
* 1 - Kinda relevant
* 2 - Totally relevant!

In [24]:
GLOB_TERM_DOC_FREQ = {} # Save computations by storing term->docFrequency in a dict. Every query will run this K times so.. be clever!

In [25]:
def extract_features(query_terms, doc_id, es, index=INDEX_NAME):
    """
    Extracts query features, document features and query-document features of a query and document pair.
    
        Arguments:
            query_terms: List of analyzed query terms.
            doc_id: Document identifier of indexed document.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service. 
            
        Returns:
            List of extracted feature values in a fixed order.
    """
    query = dict(Counter(query_terms))    
    doc_term_freqs = {} # Term frequencies in the document.
    tv = es.termvectors(index=index, id=doc_id, fields='abstract', term_statistics=False)    
    for term, term_stat in tv['term_vectors']['abstract']['terms'].items():
        doc_term_freqs[term] = term_stat['term_freq']
    
    idf = []
    for term in query_terms:
        if not term in GLOB_TERM_DOC_FREQ:
            n = 0
            hits = es.search(
                index=index, 
                body={"query": {"bool": {"must": {"match": {"abstract": term}}, "must_not": {"match": {"instance": "thing"}}}}}, 
                _source=False, size=1).get('hits',{}).get('hits',{})
            doc_id = (hits[0]['_id'] if (len(hits) > 0) else None)
            if doc_id is not None:
                tv = es.termvectors(index=index, id=doc_id, fields='abstract', term_statistics=True)['term_vectors']['abstract']['terms']
                if term in tv:
                    n = tv[term]['doc_freq']                    
            GLOB_TERM_DOC_FREQ[term] = n
            
        n = GLOB_TERM_DOC_FREQ[term]
        if n: # Must be greater than > 0
            idf.append(math.log(NUM_DOCS/n))

    terms_doc_unique = [v for k,v in doc_term_freqs.items() if k in query] # Unique to query and doc.
    
    return [
        len(query_terms),
        sum(idf),
        max([0] + idf),
        (sum(idf) / max(len(idf),1)),
        sum(doc_term_freqs.values()),
        len(terms_doc_unique),
        sum(terms_doc_unique),
        max([0] + terms_doc_unique),
        (sum(terms_doc_unique) / max(len(query.keys()),1))
    ]

def evaluate_l2r(es, k=200, amount=0, index=INDEX_NAME):
    """
    Train a model, generate X - feature vectors and y - relevance labels.
    
    Relevancy is defined as such,
    0 = Not relevant
    1 = Partially relevant
    2 = Relevant
    
    Arguments:
        es: Elasticsearch object instance.
        k: How many documents to handle per query.
        amount: How many queries to run, 0 = all.
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        Returns a list of feature vectors, a list of their respective labels (relevance) and a list of 
        instances (type instances) noted from every retrieved doc.
    """
    progress, N = 0, len(training_queries)
    X, y, instances = [], [], []
    for qId, queryObject in training_queries.items():
        type_relevancy = {}        
        for typ in queryObject['type'].split(' '):
            if not typ in type_hierarchy:
                continue
            hierarchy = buildDBPediaTypeHierarchy(type_hierarchy, typ)
            for v in hierarchy:
                type_relevancy[v] = 1 # Relevant, its in the same hierarchy but in a diff pos.            
                
        for typ in queryObject['type'].split(' '):
            type_relevancy[typ] = 2 # This is the type we want. Give it the highest weight.
            
        if len(type_relevancy) == 0:
            continue

        query = queryObject['analyzed']
        hits = es.search(index=index, _source=True, size=k, 
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']

        for obj in hits:
            relevancy = 0 # Default = not relevant
            instanceType = obj['_source']['instance']
            if instanceType in type_relevancy:
                relevancy = type_relevancy[instanceType]
            elif instanceType in type_hierarchy:                
                relevancy = buildDBPediaTypeHierarchy(type_hierarchy, instanceType)
                relevancy = max([(1 if (t in type_relevancy) else 0) for t in relevancy] + [0])
            y.append(relevancy)
            X.append(extract_features(query, obj['_id'], es, index))
            instances.append(instanceType)

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return X, y, instances

def evaluate_l2r_rerank(es, model, X, instances, k=200, amount=0, index=INDEX_NAME):
    """
    Evaluate our l2r model. Re-rank predicting relevancy. Sort by relevancy. High -> Low
    
    Relevancy is defined as such,
    0 = Not relevant
    1 = Partially relevant
    2 = Relevant
    
    Arguments:
        es: Elasticsearch object instance.
        model: L2R classifier.
        instances: A list of the type instances.
        k: How many documents to handle per query.
        amount: How many queries to run, 0 = all.
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A dictionary containing the queryIDs - list of retrieved instance types.
    """
    progress, N = 0, len(training_queries)
    results, idx = {}, 0
    predictions = (model.predict(X[:(k*amount)]) if amount else model.predict(X))
    
    for qId, queryObject in training_queries.items():
        query = queryObject['analyzed']
        hits = es.search(index=index, _source=False, size=k, 
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']
        rerank = []
        for _ in hits:
            rerank.append((instances[idx], predictions[idx]))
            idx += 1
        rerank.sort(key=lambda x:x[-1], reverse=True) # Re-rank the initial hits based on their relevancy.
        results[qId] = [v for v,_ in rerank[:10]]

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return results

def evaluate_l2r_test(es, model, k=100, amount=0, index=INDEX_NAME):
    """
    Predict the types for test queries. Re-rank the k docs and pick the majority vote out of the top 10.
    If no majority vote, use the top doc type.
    
    Arguments:
        es: Elasticsearch object instance.
        model: L2R classifier.
        k: How many documents to handle per query.
        amount: How many queries to run, 0 = all.
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A dictionary containing the queryIDs and its predicted type.
    """
    progress, N = 0, len(test_queries)
    results = {}
    for qId, queryObject in test_queries.items():
        query = queryObject['analyzed']
        hits = es.search(index=index, _source=True, size=k, 
            body={"query": {"bool": {"must": {"match": {"abstract": ' '.join(query)}}, "must_not": {"match": {"instance": "thing"}}}}}
        )['hits']['hits']
        feat_vecs, instances = [], []
        for obj in hits:
            feat_vecs.append(extract_features(query, obj['_id'], es, index))
            instances.append(obj['_source']['instance'])
        if len(instances) == 0: # No hits!
            results[qId] = 'N/A'
            continue
        instances_rerank = [instances[idx] for idx in np.argsort(model.predict(feat_vecs))[::-1]]
        results[qId] = Counter(instances_rerank[:10]).most_common(1)[0][0]

        progress += 1
        if (progress % 50) == 0:
            print('Progress - {}/{} queries handled.'.format(progress, N))

        if amount and (progress >= amount):
            break

    return results

### Evaluate Pointwise Method (50+ min!)

In [None]:
start = time.time()
X, y, instances = evaluate_l2r(es)
print("Time Elapsed:", time.time()-start)

l2r_model = RandomForestClassifier(n_estimators = 100)
_ = l2r_model.fit(X, y)

start = time.time()
res_advanced_pntwse = evaluate_l2r_rerank(es, l2r_model, X, instances)
print("Time Elapsed:", time.time()-start)

write_result_to_file(res_advanced_pntwse, 'advanced')

In [26]:
res_advanced_pntwse = read_result_from_file('advanced')
evaluate(res_advanced_pntwse)

Evaluation results (based on 9557 questions):
-------------------
Exact Type Prediction
  Accuracy: 0.776
Strict Type ranking
  NDCG@5:  0.731
  NDCG@10: 0.754
Fuzzy Type ranking
  NDCG@5:  0.753
  NDCG@10: 0.780


### Misc, type prediction using test queries. Predict the type of a query.

In [None]:
test_type_res = evaluate_l2r_test(es, l2r_model)

In [27]:
def write_test_types(res, file):
    with open('./results/{}.csv'.format(file), 'w') as f:
        for qId, obj in res.items():
            f.write('{},{}\n'.format(qId, obj))
            
def read_test_types(file):
    result = {}
    with open('./results/{}.csv'.format(file), 'r') as f:
        for line in f:
            line = line.strip().split(',')
            if len(line) != 2:
                continue
            result[line[0]] = line[1]
    return result

In [None]:
write_test_types(test_type_res, 'test_type_predictions')

In [28]:
test_type_res = read_test_types('test_type_predictions')
print('Query:', test_queries['dbpedia_687']['query'], '\nType:', test_type_res['dbpedia_687'])

Query: how many platforms does tomb raider have 
Type: videogame
