In [17]:
import os
import zipfile
import json

import lxml.etree as et
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem

from query import *
from vector import *

# Name of the Elasticsearch index that the indexing and search code in this notebook works on.
INDEX = "ind"
# Single pymystem3 lemmatizer instance, created once and reused by get_vectors().
# NOTE(review): disambiguation=False presumably switches off Mystem's context-based
# homonym resolution -- confirm against the pymystem3 documentation.
mystem = Mystem(disambiguation=False)


def create_action(doc_id, doc_json):
    """Wrap one document into an action dict for the Elasticsearch bulk helpers.

    Args:
        doc_id: Value stored under the ``_id`` key.
        doc_json: Document body stored under the ``_source`` key.

    Returns:
        dict with the keys ``_index`` (the module-level ``INDEX``), ``_id``
        and ``_source``.
    """
    action = dict(_index=INDEX, _id=doc_id, _source=doc_json)
    return action


def action_generator(docs_folder="documents"):
    """Yield one bulk-index action per document found in the zip archives of a folder.

    Every entry of every archive in ``docs_folder`` is decoded as UTF-8 JSON,
    gets a ``pagerank`` key taken from the module-level ``pageranks`` dict
    (looked up by the document's ``id``) and is registered in the module-level
    ``url_to_id`` dict (``url`` -> ``id``).

    Args:
        docs_folder: Directory containing the zip archives. Defaults to
            ``"documents"``, the folder that used to be hard-coded.

    Yields:
        Action dicts built by ``create_action``; the ``_id`` is the archive
        member name without a trailing ``.txt`` and the ``_source`` is the
        document re-serialised with ``json.dumps``.

    Notes:
        A document that cannot be decoded, parsed or enriched is reported
        (member name plus the error) and skipped; the remaining documents are
        still produced instead of ending the whole run at the first failure.
    """
    txt_suffix = ".txt"
    for archive_name in os.listdir(docs_folder):
        archive_path = os.path.join(docs_folder, archive_name)
        # The context manager closes the archive even when the consumer stops early.
        with zipfile.ZipFile(archive_path, 'r') as zip_file:
            for member in zip_file.infolist():
                # Only the preparation of the document is guarded; the yield stays
                # outside so that closing the generator is not mistaken for a bad
                # document.
                try:
                    doc_string = zip_file.read(member).decode('utf-8')
                    doc_json = json.loads(doc_string)
                    doc_json['pagerank'] = pageranks[doc_json['id']]
                    doc_url = doc_json['url']
                except (UnicodeDecodeError, ValueError, KeyError, TypeError,
                        zipfile.BadZipFile) as error:
                    print(member.orig_filename, repr(error))
                    continue
                # Register the url only for documents that are actually indexed.
                url_to_id[doc_url] = doc_json['id']

                # str.strip(".txt") would remove any leading/trailing '.', 't' or 'x'
                # characters (e.g. "text1.txt" -> "ext1"); cut the suffix only.
                doc_id = member.orig_filename
                if doc_id.endswith(txt_suffix):
                    doc_id = doc_id[:-len(txt_suffix)]
                yield create_action(doc_id, json.dumps(doc_json))


In [2]:
# Mapping shared by the two analysed text fields ('stemmed' and 'titles').
_STEMMED_TEXT_FIELD = {
    'type': 'text',
    'analyzer': 'russian_stemmed'
}

# Field mappings of the index.
_MAPPINGS = {
    'properties': {
        'stemmed': dict(_STEMMED_TEXT_FIELD),
        'titles': dict(_STEMMED_TEXT_FIELD),
        'url': {'type': 'text'},
        'pagerank': {'type': 'rank_feature'}
    }
}

# Custom analysis chain: the 'russian_stemmed' analyzer applies the 'yo' char filter,
# the built-in 'whitespace' tokenizer and the 'lowercase' token filter.
# The 'alphanum' tokenizer is declared here but no analyzer in this dict refers to it.
_ANALYSIS = {
    'analyzer': {
        'russian_stemmed': {
            'char_filter': ['yo'],
            'tokenizer': 'whitespace',
            'filter': ['lowercase']
        }
    },
    'char_filter': {
        'yo': {
            'type': 'mapping',
            'mappings': ['ё => е']
        }
    },
    'tokenizer': {
        'alphanum': {
            'type': 'char_group',
            'tokenize_on_chars': ["whitespace", "punctuation", "symbol", "\n"]
        }
    }
}

# Index-level settings; 'read_only_allow_delete' is sent as the string 'false'.
_INDEX_SETTINGS = {
    'blocks': {
        'read_only_allow_delete': 'false'
    }
}

# Body passed to es.indices.create() by recreate_index().
SETTINGS = {
    'mappings': _MAPPINGS,
    'settings': {
        'analysis': _ANALYSIS,
        'index': _INDEX_SETTINGS
    }
}


def recreate_index():
    """Drop the index named ``INDEX`` if it exists and create it again from ``SETTINGS``.

    Uses the module-level ``es`` client. Nothing is returned.

    Raises:
        Whatever the Elasticsearch client raises for a failed delete (other
        than "index not found") or a failed create.
    """
    # ignore=[404]: a missing index is the normal state on a first run, so only that
    # status is tolerated. Other failures (connection, permissions, ...) are no longer
    # swallowed by a bare except and surface here instead of as a confusing error
    # from the create call below.
    es.indices.delete(index=INDEX, ignore=[404])
    es.indices.create(index=INDEX, body=SETTINGS)


In [5]:
# Client used by recreate_index(), the bulk indexing below and search().
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

# url -> document id; filled by action_generator() while documents are indexed.
url_to_id = {}
# document id -> pagerank, kept as the string read from the file.
pageranks = {}

# Each non-empty line of pageranks.txt is split on ":" into (doc_id, pagerank).
# Iterating over the file object advances through the lines (the previous
# `while line:` loop never read the next line and therefore never terminated),
# and the context manager closes the file afterwards.
with open("pageranks.txt", "r") as pagerank_file:
    for line in pagerank_file:
        line = line.strip()
        if not line:
            # Tolerate blank lines, e.g. a trailing newline at the end of the file.
            continue
        doc_id, pr = line.split(":")
        pageranks[doc_id] = pr

# Rebuild the index from scratch and report every action the bulk helper flags as failed.
recreate_index()
for ok, result in parallel_bulk(es, action_generator(), queue_size=4, thread_count=4, chunk_size=500):
    if not ok:
        print(result)


'\nrecreate_index()\nstart_indexing = time.time()\nfor ok, result in parallel_bulk(es, action_generator(), queue_size=4, thread_count=4, chunk_size=500):\n    if not ok:\n        print(result)\nprint("Indexing time: ", time.time() - start_indexing)\nprint("Index size in bytes: ", es.indices.stats()[\'_all\'][\'primaries\'][\'store\'][\'size_in_bytes\'])\n'

In [6]:
# Input files: the query list and the two relevance tables (2009 and 2008).
QUERIES_FILE = "web2008_adhoc.xml"
RELEVANCE_FILE_2009 = "or_relevant-minus_table2009.xml"
RELEVANCE_FILE_2008 = "or_relevant-minus_table2008.xml"
# task id -> Query object.
# NOTE(review): Query is not defined in this cell; presumably it comes from one of
# the star imports at the top of the notebook -- confirm.
queries = {}

# Pass 1: create one Query per <task> element of the queries file.
root = et.parse(QUERIES_FILE).getroot()
for element in root.iterfind('task', namespaces=root.nsmap):
    # The text of the task's first child element is used as the query text.
    text = element[0].text
    # NOTE(review): `id` shadows the builtin id() in the notebook namespace from here on.
    id = element.attrib.get('id')
    # clear() is called only after the values needed from the element were read.
    element.clear()
    queries[id] = Query(id, text)

# Pass 2: 2009 relevance table -> relevant_test.
# The document's 'id' attribute is stored as is; only documents whose 'relevance'
# attribute equals 'vital' are kept.
# queries[id] raises KeyError for a task id that was not present in QUERIES_FILE.
root = et.parse(RELEVANCE_FILE_2009).getroot()
for element in root.iterfind('task', namespaces=root.nsmap):
    id = element.attrib.get('id')
    for document in element.iterfind('document', namespaces=root.nsmap):
        doc_id = document.attrib.get('id')
        relevance = document.attrib.get('relevance')
        document.clear()
        if relevance == 'vital':
            queries[id].relevant_test.add(doc_id)
    element.clear()

# Pass 3: 2008 relevance table -> relevant_train.
# Unlike pass 2, the document's 'id' attribute is translated through url_to_id
# (so it is presumably a URL here -- TODO confirm). url_to_id is populated by
# action_generator(), i.e. this cell needs the indexing cell to have run first.
# The lookup is done for every document, not only the 'vital' ones, so any
# document missing from url_to_id raises KeyError.
root = et.parse(RELEVANCE_FILE_2008).getroot()
for element in root.iterfind('task', namespaces=root.nsmap):
    id = element.attrib.get('id')
    for document in element.iterfind('document', namespaces=root.nsmap):
        doc_id = url_to_id[document.attrib.get('id')]
        relevance = document.attrib.get('relevance')
        document.clear()
        if relevance == 'vital':
            queries[id].relevant_train.add(doc_id)
    element.clear()

# Keep only the queries whose `relevant` collection is non-empty.
# NOTE(review): this cell only fills relevant_test / relevant_train; `relevant` must be
# provided by Query itself -- confirm what it contains.
queries = {query_id: queries[query_id] for query_id in queries if len(queries[query_id].relevant) > 0}  
print("Total number of queries: ", len(queries)) 


Total number of queries:  495


In [18]:
def search(query_text, query_result_size=100):
    """Run a two-clause ``bool``/``should`` match query against the index.

    The query text is matched against the ``stemmed`` field (with a boost of
    ``'5.0'``) and against the ``titles`` field.

    Args:
        query_text: Text passed as the ``query`` of both match clauses.
        query_result_size: Value passed as ``size`` to the search call.

    Returns:
        The ``['hits']['hits']`` list of the Elasticsearch response.
    """
    stemmed_clause = {'match': {'stemmed': {'query': query_text, 'boost': '5.0'}}}
    titles_clause = {'match': {'titles': {'query': query_text}}}
    request_body = {'query': {'bool': {'should': [stemmed_clause, titles_clause]}}}

    response = es.search(index=INDEX, body=request_body, size=query_result_size)
    return response['hits']['hits']


In [19]:
def count_query_coverage(query_words, text):
    """Return the fraction of query words that occur in ``text``.

    Args:
        query_words: Sequence of query terms. A term that appears several
            times is counted each time, in both the matches and the total.
        text: Container tested with the ``in`` operator. For a ``str`` this
            is a substring test, not a whole-word test.

    Returns:
        float between 0 and 1. An empty ``query_words`` yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    if len(query_words) == 0:
        return 0.0
    matches = sum(1 for word in query_words if word in text)
    return matches / len(query_words)


def count_span(query_words, text):
    """Score how tightly the query words are packed together in ``text``.

    ``text`` is split on whitespace and the shortest run of consecutive words
    that contains every distinct query word at least once is searched for with
    a sliding window.

    Args:
        query_words: Iterable of query terms; duplicates are ignored.
        text: String to scan; words are compared exactly as produced by
            ``str.split()``.

    Returns:
        ``len(distinct query words) / length of the shortest window`` -- 1.0
        when the words are adjacent, smaller the further apart they are.
        ``0`` when the query is empty or the text does not contain every
        query word.
    """
    text_words = text.split()
    unique_query_words = set(query_words)
    if not unique_query_words or len(unique_query_words) > len(text_words):
        return 0

    # Occurrences of each query word inside the current window [left, right].
    window_counts = dict.fromkeys(unique_query_words, 0)
    covered = 0  # number of distinct query words present in the window
    left = 0
    min_span = None
    for right, word in enumerate(text_words):
        if word in window_counts:
            window_counts[word] += 1
            if window_counts[word] == 1:
                covered += 1

        if covered == len(unique_query_words):
            # Shrink from the left while the leftmost word is not needed: it is either
            # not a query word or occurs again further right inside the window.
            # The loop stops at the first word whose only occurrence is at `left`,
            # so the window never loses coverage and `left` never passes `right`.
            while True:
                left_word = text_words[left]
                if left_word not in window_counts:
                    left += 1
                elif window_counts[left_word] > 1:
                    window_counts[left_word] -= 1
                    left += 1
                else:
                    break
            span = right - left + 1
            if min_span is None or span < min_span:
                min_span = span

    if min_span is None:
        # At least one query word never occurs in the text.
        return 0
    return len(unique_query_words) / min_span


def get_vectors(query_text):
    """Build one ``Vector`` per search hit for the given query.

    The query is lemmatized with the shared ``mystem`` instance; the lemmas
    joined with single spaces are sent to ``search`` (100 hits requested) and
    the lower-cased alphanumeric lemmas are used as query words for the
    coverage and span features.

    Args:
        query_text: Raw query string.

    Returns:
        List of ``Vector`` objects, one per hit and in hit order, built from:
        hit score, title coverage, content coverage, span score, query length,
        document length, pagerank and url length.
    """
    lemmas = mystem.lemmatize(query_text)
    query_words = [lemma.lower() for lemma in lemmas if lemma.isalnum()]
    joined_lemmas = " ".join(lemmas)
    # Character length of the joined lemma string; identical for every hit.
    query_length = len(joined_lemmas)

    def build_vector(hit):
        # Turns a single search hit into its feature vector.
        score = hit['_score']
        source = hit['_source']
        # NOTE(review): the index mapping and search() use the field name 'titles',
        # while 'title' is read here -- confirm the documents carry both fields.
        return Vector(
            score,
            count_query_coverage(query_words, source['title']),
            count_query_coverage(query_words, source['stemmed']),
            count_span(query_words, source['stemmed']),
            query_length,
            len(source['stemmed']),
            source['pagerank'],
            len(source['url']),
        )

    return [build_vector(hit) for hit in search(joined_lemmas, 100)]


In [21]:
# for query_id in query get vectors for query_id and 