In [17]:
import os
import time
import zipfile

import lxml.etree as et
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk

from document import *
from query import *
from vector import *

# Name of the Elasticsearch index that the actions built below are addressed to.
INDEX = "ind"


def create_action(doc_id, doc_json):
    """Build a single bulk action that stores ``doc_json`` under ``doc_id``.

    Args:
        doc_id: Value placed in the action's ``_id`` field.
        doc_json: Value placed in the action's ``_source`` field.

    Returns:
        A dict with the keys ``_index`` (always ``INDEX``), ``_id`` and
        ``_source``.
    """
    action = dict(_index=INDEX, _id=doc_id, _source=doc_json)
    return action


def action_generator(docs_folder="documents"):
    """Yield one bulk action per member of every zip archive in ``docs_folder``.

    Each directory entry is opened as a zip archive. Every member is read,
    decoded as UTF-8 and handed to ``create_action`` together with an id made
    from the member name with a trailing ``.txt`` removed.

    If a member cannot be read or decoded, its name is printed and the
    generator stops: no further members or archives are processed.

    Args:
        docs_folder: Directory that holds the zip archives. Defaults to
            ``"documents"``.

    Yields:
        Whatever ``create_action(doc_id, text)`` returns for each member.
    """
    suffix = ".txt"
    for archive_name in os.listdir(docs_folder):
        archive_path = os.path.join(docs_folder, archive_name)
        # The context manager closes the archive even if the generator stops
        # early or is abandoned by its consumer.
        with zipfile.ZipFile(archive_path, 'r') as archive:
            for member in archive.filelist:
                try:
                    doc_json = archive.read(member).decode('utf-8')
                except Exception:
                    # Only read/decode failures are reported here; the yield is
                    # kept outside the try so GeneratorExit is not swallowed.
                    print(member.orig_filename)
                    return
                member_name = member.orig_filename
                # str.strip(".txt") would remove any of '.', 't', 'x' from BOTH
                # ends ("text.txt" -> "e"); cut off the suffix only.
                if member_name.endswith(suffix):
                    doc_id = member_name[:-len(suffix)]
                else:
                    doc_id = member_name
                yield create_action(doc_id, doc_json)


In [2]:
# Field mappings: three analyzed text fields plus one rank_feature field.
_FIELD_MAPPINGS = {
    'content': {'type': 'text', 'analyzer': 'russian_plain'},
    'stemmed': {'type': 'text', 'analyzer': 'russian_stemmed'},
    'titles': {'type': 'text', 'analyzer': 'russian_plain'},
    'pagerank': {'type': 'rank_feature'},
}

# Custom analysis chain. Both analyzers apply the 'yo' char filter and the
# 'lowercase' token filter; they differ only in the tokenizer they use.
_ANALYSIS = {
    'analyzer': {
        'russian_plain': {
            'char_filter': ['yo'],
            'tokenizer': 'alphanum',
            'filter': ['lowercase'],
        },
        'russian_stemmed': {
            'char_filter': ['yo'],
            'tokenizer': 'whitespace',
            'filter': ['lowercase'],
        },
    },
    # Character mapping applied before tokenization.
    'char_filter': {
        'yo': {'type': 'mapping', 'mappings': ['ё => е']},
    },
    # Tokenizer that splits on the listed character groups.
    'tokenizer': {
        'alphanum': {
            'type': 'char_group',
            'tokenize_on_chars': ["whitespace", "punctuation", "symbol", "\n"],
        },
    },
}

_INDEX_OPTIONS = {
    'blocks': {'read_only_allow_delete': 'false'},
}

# Index definition (mappings + settings) for the notebook's index.
SETTINGS = {
    'mappings': {'properties': _FIELD_MAPPINGS},
    'settings': {'analysis': _ANALYSIS, 'index': _INDEX_OPTIONS},
}


def recreate_index():
    """Drop the ``INDEX`` index if it exists, then create it from ``SETTINGS``.

    Uses the module-level ``es`` client. Any error raised by the client
    (for example a failed delete or an unreachable server) propagates to the
    caller instead of being silently discarded.
    """
    # An explicit existence check replaces the former bare ``except: pass``,
    # which hid every failure of the delete call, not just "index not found".
    if es.indices.exists(index=INDEX):
        es.indices.delete(index=INDEX)
    es.indices.create(index=INDEX, body=SETTINGS)

In [5]:
# Shared client used by the cells below. The single host entry targets
# localhost:9200 and also carries 'timeout' and 'maxsize' values.
# NOTE(review): how 'timeout'/'maxsize' inside the host dict are interpreted
# depends on the installed client version — confirm.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

# The string literal below is disabled code, not documentation: it holds the
# one-off indexing run (recreate the index, bulk-index via parallel_bulk, then
# print the elapsed time and the index size). Because it is the last expression
# of the cell, the notebook echoes it as the cell's output.
"""
recreate_index()
start_indexing = time.time()
for ok, result in parallel_bulk(es, action_generator(), queue_size=4, thread_count=4, chunk_size=500):
    if not ok:
        print(result)
print("Indexing time: ", time.time() - start_indexing)
print("Index size in bytes: ", es.indices.stats()['_all']['primaries']['store']['size_in_bytes'])
"""


'\nrecreate_index()\nstart_indexing = time.time()\nfor ok, result in parallel_bulk(es, action_generator(), queue_size=4, thread_count=4, chunk_size=500):\n    if not ok:\n        print(result)\nprint("Indexing time: ", time.time() - start_indexing)\nprint("Index size in bytes: ", es.indices.stats()[\'_all\'][\'primaries\'][\'store\'][\'size_in_bytes\'])\n'

In [6]:
"""pagerank_file = open("pageranks.txt", "r")
line = pagerank_file.readline()
while line:
    doc_id, pr = line.strip().split(":")
    es.update(index=INDEX, id=doc_id, body={'doc': {'pagerank': pr}})
    line = pagerank_file.readline()
"""
QUERIES_FILE = "web2008_adhoc.xml"
RELEVANCE_FILE = "or_relevant-minus_table.xml"
# NOTE(review): this is the same path as RELEVANCE_FILE, so the same judgements
# are read twice — confirm which file was intended for the 2008 judgements.
RELEVANCE_FILE_2008 = "or_relevant-minus_table.xml"


def _load_queries(path):
    """Parse the XML file at ``path`` into a ``{task id: Query}`` dict.

    For every ``task`` element the text of its first child and its ``id``
    attribute are passed to ``Query``; the element is cleared afterwards.
    """
    root = et.parse(path).getroot()
    loaded = {}
    for task in root.iterfind('task', namespaces=root.nsmap):
        text = task[0].text
        query_id = task.attrib.get('id')
        task.clear()
        loaded[query_id] = Query(query_id, text)
    return loaded


def _add_vital_documents(path, queries):
    """Record every document marked ``relevance="vital"`` in ``path``.

    The document id is added to ``queries[task id].relevant``. A task id that
    is missing from ``queries`` raises ``KeyError``, but only when that task
    actually contains a vital document.
    """
    root = et.parse(path).getroot()
    for task in root.iterfind('task', namespaces=root.nsmap):
        query_id = task.attrib.get('id')
        for document in task.iterfind('document', namespaces=root.nsmap):
            doc_id = document.attrib.get('id')
            relevance = document.attrib.get('relevance')
            document.clear()
            if relevance == 'vital':
                queries[query_id].relevant.add(doc_id)
        task.clear()


queries = _load_queries(QUERIES_FILE)
# The original repeated the relevance loop inline and referenced the undefined
# name RELEVANCE_FILE_200 for the second pass (NameError on a fresh kernel).
for relevance_path in (RELEVANCE_FILE, RELEVANCE_FILE_2008):
    _add_vital_documents(relevance_path, queries)

# Keep only the queries that ended up with at least one relevant document.
queries = {query_id: query for query_id, query in queries.items() if len(query.relevant) > 0}
print("Total number of queries: ", len(queries))

Total number of queries:  495


In [18]:
def search(query_text, query_result_size=100):
    """Run a ``match`` query for ``query_text`` against the ``content`` field.

    The match clause is wrapped in ``bool``/``should`` and sent to the ``INDEX``
    index through the module-level ``es`` client.

    Args:
        query_text: Text to match against ``content``.
        query_result_size: Value passed as the ``size`` argument of the search
            call. Defaults to 100.

    Returns:
        The value found under ``['hits']['hits']`` of the search response.
    """
    match_clause = {'match': {'content': query_text}}
    bool_clause = {'bool': {'should': match_clause}}
    request_body = {'query': bool_clause}
    response = es.search(index=INDEX, body=request_body, size=query_result_size)
    hits = response['hits']
    return hits['hits']

In [19]:
def get_vectors(query_text):
    """Build one ``Vector`` for every hit returned by ``search(query_text, 100)``.

    Each vector receives, in order: the hit's ``_score``, the constant
    ``False``, the constant ``1``, ``len(query_text)``, the length of the hit's
    ``content`` source field, the hit's ``pagerank`` source field, and the
    constant ``0``. The constants are placeholders for features that are still
    marked TODO below.

    Returns:
        A list of ``Vector`` objects in the order the hits were returned.
    """
    def hit_to_vector(hit):
        score = hit['_score']
        # TODO title content match: need to actually store plain title?
        # TODO span
        query_size = len(query_text)
        source = hit['_source']
        content_size = len(source['content'])
        pagerank = source['pagerank']
        # TODO url length
        return Vector(score, False, 1, query_size, content_size, pagerank, 0)

    return [hit_to_vector(hit) for hit in search(query_text, 100)]

In [21]:
# for query_id in query get vectors for query_id and 