In [1]:
import os
import zipfile
import json

import lxml.etree as et
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from pymystem3 import Mystem

from query import *
from vector import *

# Name of the Elasticsearch index used for index creation, bulk indexing and search.
INDEX = "ind"
# Shared pymystem3 lemmatizer, created with disambiguation switched off.
mystem = Mystem(disambiguation=False)


def create_action(doc_id, doc_json):
    """Build one bulk action that stores ``doc_json`` under ``doc_id`` in ``INDEX``.

    Returns a dict with the ``_index``, ``_id`` and ``_source`` keys.
    """
    action = dict(_index=INDEX, _id=doc_id, _source=doc_json)
    return action


def action_generator():
    """Yield one bulk action per document stored in the zip archives under ``documents``.

    Every archive member is decoded as UTF-8 JSON. The document id is the member
    name without a trailing ``.txt``. For each document the function looks up its
    PageRank in the module-level ``pageranks`` dict, records ``url -> id`` in the
    module-level ``url_to_id`` dict, stores the PageRank under the ``pagerank``
    key and yields ``create_action(doc_id, <document re-serialised as JSON>)``.

    A member that cannot be processed (bad encoding, invalid JSON, missing
    ``url`` or missing PageRank) is reported with its error and skipped, so one
    broken document no longer silently ends the whole indexing run.
    """
    DOCS_FOLDER = "documents"
    suffix = ".txt"
    for archive_name in os.listdir(DOCS_FOLDER):
        archive_path = os.path.join(DOCS_FOLDER, archive_name)
        # The context manager closes the archive even if the consumer stops early.
        with zipfile.ZipFile(archive_path, 'r') as zip_file:
            for member in zip_file.infolist():
                if member.is_dir():
                    # Directory entries have no JSON payload.
                    continue
                try:
                    doc_json = json.loads(zip_file.read(member).decode('utf-8'))
                    doc_id = member.orig_filename
                    # str.strip(".txt") removes any of the characters '.', 't', 'x'
                    # from both ends; only the literal suffix must go.
                    if doc_id.endswith(suffix):
                        doc_id = doc_id[:-len(suffix)]
                    # Look the PageRank up first so url_to_id only ever refers
                    # to documents that are actually emitted.
                    pagerank = pageranks[doc_id]
                    url_to_id[doc_json['url']] = doc_id
                    doc_json['pagerank'] = pagerank
                    action = create_action(doc_id, json.dumps(doc_json))
                except Exception as error:
                    print(f'{member.orig_filename}: {error!r}')
                    continue
                yield action


In [2]:
# Body passed to es.indices.create(): field mappings plus analysis and index settings.
SETTINGS = {
    'mappings': {
        'properties': {
            # Both text fields are analysed with the custom 'russian_stemmed' analyzer below.
            'stemmed': {
                'type': 'text',
                'analyzer': 'russian_stemmed'
            },
            'titles': {
                'type': 'text',
                'analyzer': 'russian_stemmed'
            },
            'url': {
                'type': 'text'
            },
            # PageRank is mapped as a rank_feature field.
            'pagerank': {
                'type': 'rank_feature'
            }
        }
    },
    'settings': {
        'analysis': {
            'analyzer': {
                # Despite its name this analyzer has no stemming filter: it only applies
                # the 'yo' char filter, splits on whitespace and lowercases.
                # NOTE(review): presumably the text is lemmatised before indexing - confirm.
                'russian_stemmed': {
                    'char_filter': ['yo'],
                    'tokenizer': 'whitespace',
                    'filter': ['lowercase']
                }
            },
            'char_filter': {
                # Replaces the letter 'ё' with 'е' before tokenisation.
                'yo': {
                    'type': 'mapping',
                    'mappings': ['ё => е']
                }
            },
            'tokenizer': {
                # NOTE(review): 'alphanum' is defined here but no analyzer in this dict
                # references it ('russian_stemmed' uses the 'whitespace' tokenizer).
                'alphanum': {
                    'type': 'char_group',
                    'tokenize_on_chars': ["whitespace", "punctuation", "symbol", "\n"]
                }
            }
        },
        'index': {
            'blocks': {
                # Explicitly sets the read_only_allow_delete block to false on the new index.
                'read_only_allow_delete': 'false'
            }
        }
    }
}


def recreate_index():
    """Drop ``INDEX`` if it exists and create it again from ``SETTINGS``.

    Only a missing index is tolerated when deleting. Any other failure
    (connection problems, authorisation, ...) propagates instead of being
    hidden by a bare ``except``.
    """
    # Imported locally so the function does not rely on an import in another cell.
    from elasticsearch import NotFoundError

    try:
        es.indices.delete(index=INDEX)
    except NotFoundError:
        # Nothing to delete, e.g. on the very first run.
        pass
    es.indices.create(index=INDEX, body=SETTINGS)


In [3]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])


# Document url -> document id; filled while action_generator() runs.
url_to_id = {}
# Document id -> PageRank, kept as the string read from the file.
pageranks = {}

# Every non-empty line is expected to look like "<doc_id>:<pagerank>".
with open("pageranks.txt", "r") as pagerank_file:
    for line in pagerank_file:
        line = line.strip()
        if not line:
            # A blank line carries no record; unpacking it below would raise ValueError.
            continue
        doc_id, pr = line.split(":")
        pageranks[doc_id] = pr

recreate_index()



In [4]:

# By default the bulk helper raises as soon as a document fails, which makes the
# reporting branch below unreachable. With raise_on_error=False failed items are
# yielded as (False, info) so they can be printed while indexing continues.
for ok, result in parallel_bulk(es, action_generator(), queue_size=4, thread_count=4, chunk_size=500,
                                raise_on_error=False):
    if not ok:
        print(result)


In [16]:
QUERIES_FILE = "web2008_adhoc.xml"
RELEVANCE_FILE_2009 = "or_relevant-minus_table2009.xml"
RELEVANCE_FILE_2008 = "or_relevant-minus_table2008.xml"

# Query id -> Query object, one per <task> element of the queries file.
queries = {}
root = et.parse(QUERIES_FILE).getroot()
for element in root.iterfind('task', namespaces=root.nsmap):
    text = element[0].text
    # Named task_id rather than id so the builtin id() stays usable in later cells.
    task_id = element.attrib.get('id')
    element.clear()
    queries[task_id] = Query(task_id, text)

# 2008 judgements become training data. Judged documents are translated through
# url_to_id, and documents missing from that map are ignored.
root = et.parse(RELEVANCE_FILE_2008).getroot()
for element in root.iterfind('task', namespaces=root.nsmap):
    task_id = element.attrib.get('id')
    for document in element.iterfind('document', namespaces=root.nsmap):
        if document.attrib.get('id') in url_to_id:
            doc_id = url_to_id[document.attrib.get('id')]
            relevance = document.attrib.get('relevance')
            document.clear()
            # Only 'vital' counts as relevant; every other label is treated as irrelevant.
            if relevance == 'vital':
                queries[task_id].relevant_train.add(doc_id)
            else:
                queries[task_id].irrelevant_train.add(doc_id)
    element.clear()

# 2009 judgements become test data, but only for queries without any training judgement.
root = et.parse(RELEVANCE_FILE_2009).getroot()
for element in root.iterfind('task', namespaces=root.nsmap):
    task_id = element.attrib.get('id')
    if len(queries[task_id].relevant_train) > 0 or len(queries[task_id].irrelevant_train) > 0:
        continue
    for document in element.iterfind('document', namespaces=root.nsmap):
        # NOTE(review): unlike the 2008 loop, this id is used as-is and is not mapped
        # through url_to_id - confirm that the 2009 file really uses index document ids.
        doc_id = document.attrib.get('id')
        relevance = document.attrib.get('relevance')
        document.clear()
        if relevance == 'vital':
            queries[task_id].relevant_test.add(doc_id)
        else:
            queries[task_id].irrelevant_test.add(doc_id)
    element.clear()


print("Total number of queries: ", len(queries)) 


Total number of queries:  29231


In [17]:
def search(query_text, relevant_docs, irrelevant_docs, query_result_size=100, scroll_on=False):
    """Score the judged documents of one query against ``query_text``.

    The search is restricted (via an ``ids`` filter) to the documents listed in
    ``relevant_docs`` and ``irrelevant_docs``. ``query_text`` is matched against
    the ``stemmed`` field (boost 5.0) and the ``titles`` field.

    Args:
        query_text: text to match.
        relevant_docs: iterable of document ids judged relevant.
        irrelevant_docs: iterable of document ids judged irrelevant.
        query_result_size: page size passed to Elasticsearch as ``size``.
        scroll_on: when false, return only the first page; when true, page
            through every matching document with the scroll API.

    Returns:
        A list of raw Elasticsearch hit dicts.
    """
    ids = list(relevant_docs)
    ids.extend(irrelevant_docs)

    query = {
        'query': {
            'bool': {
                'should': [
                    {
                        'match': {
                            'stemmed': {
                                'query': query_text,
                                'boost': '5.0'
                            }
                        }
                    },
                    {
                        'match': {
                            'titles': {
                                'query': query_text
                            }
                        }
                    }
                ],
                "filter": [
                    {
                        "ids": {
                            "values": ids
                        }
                    }
                ]
            }
        }
    }

    if not scroll_on:
        query_result = es.search(index=INDEX, body=query, size=query_result_size)
        return query_result['hits']['hits']

    result = []
    query_result = es.search(index=INDEX, body=query, scroll='10m', size=query_result_size)
    scroll_id = query_result['_scroll_id']
    try:
        hits = query_result['hits']['hits']
        while len(hits) > 0:
            result.extend(hits)
            query_result = es.scroll(scroll_id=scroll_id, scroll='10m')
            scroll_id = query_result['_scroll_id']
            hits = query_result['hits']['hits']
    finally:
        # Release the server-side scroll context right away. Otherwise every call
        # keeps one open for 10 minutes and a long run can exhaust the limit.
        es.clear_scroll(scroll_id=scroll_id)

    return result
    

In [18]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np


def count_query_coverage(query_words, text):
    """Return the fraction of ``query_words`` that occur in ``text``.

    Occurrence is tested with ``in``: a substring test when ``text`` is a string,
    an element test when it is a list of words. Each entry of ``query_words`` is
    counted separately, so duplicates count more than once.

    Returns 0.0 for an empty query instead of dividing by zero.
    """
    if not query_words:
        return 0.0
    matches = sum(1 for word in query_words if word in text)
    return matches / len(query_words)


def count_span(query_words, text_words):
    """Return ``<number of distinct query words> / <shortest window length>``.

    The window is the shortest contiguous run of ``text_words`` that contains
    every distinct query word at least once. The result is 1.0 when the words
    appear side by side and shrinks as they spread out.

    Returns 0 when the query is empty, when the text has fewer items than the
    query has distinct words, or when some query word never occurs in the text.
    """
    distinct_words = set(query_words)
    # An empty query previously ran the shrinking loop past the end of the text.
    # The length check uses distinct words so a repeated query word cannot cause
    # a spurious early exit.
    if not distinct_words or len(distinct_words) > len(text_words):
        return 0

    # Occurrences of each query word inside the current window [left, right].
    words_cnt = dict.fromkeys(distinct_words, 0)
    unique_words_cnt = 0
    left = 0
    min_span = len(text_words) + 1  # sentinel: longer than any real window
    for right, word in enumerate(text_words):
        if word in words_cnt:
            words_cnt[word] += 1
            if words_cnt[word] == 1:
                unique_words_cnt += 1

        if unique_words_cnt == len(distinct_words):
            # Drop leading items that are not query words or are still covered
            # elsewhere in the window. The window stays complete, so the loop
            # stops at a query word with exactly one occurrence.
            head = text_words[left]
            while head not in words_cnt or words_cnt[head] > 1:
                left += 1
                if head in words_cnt:
                    words_cnt[head] -= 1
                head = text_words[left]
            min_span = min(min_span, right - left + 1)

    if min_span == len(text_words) + 1:
        return 0
    return len(distinct_words) / min_span


def get_vectors(query_text, relevant_docs, irrelevant_docs, scroll_on=False):
    """Build one labelled feature vector per judged document returned by ``search``.

    The query is lemmatised with ``mystem``; only alphanumeric tokens are kept,
    lowercased and joined with spaces before searching (page size 100).

    Each vector is laid out as::

        [label, _score, title coverage, content coverage, span,
         query length in characters, summed length of the 'stemmed' items,
         pagerank, url length]

    ``label`` is 1 for ids in ``relevant_docs`` and 0 for ids in
    ``irrelevant_docs`` (or for every other hit when ``irrelevant_docs`` is
    empty). Hits that belong to neither set are dropped.
    """
    query_words = [token.lower() for token in mystem.lemmatize(query_text) if token.isalnum()]
    lemmatized_query = " ".join(query_words)
    hits = search(lemmatized_query, relevant_docs, irrelevant_docs, 100, scroll_on)

    vectors = []
    for hit in hits:
        hit_id = hit['_id']
        if hit_id in relevant_docs:
            label = 1
        elif len(irrelevant_docs) == 0 or hit_id in irrelevant_docs:
            label = 0
        else:
            continue

        vectors.append([
            label,
            hit['_score'],
            count_query_coverage(query_words, hit['_source']['titles']),
            count_query_coverage(query_words, hit['_source']['stemmed']),
            count_span(query_words, hit['_source']['stemmed']),
            len(lemmatized_query),
            sum([len(item) for item in hit['_source']['stemmed']]),
            hit['_source']['pagerank'],
            len(hit['_source']['url']),
        ])
    return vectors


def vector_to_string(array):
    """Format a vector as ``<label> 1:<v1> 2:<v2> ...``.

    The first item is written as an integer label; every following item is
    written as ``<position>:<value>`` with positions starting at 1.
    """
    parts = [f'{int(array[0])}']
    parts.extend(f'{position}:{array[position]}' for position in range(1, len(array)))
    return ' '.join(parts)


def write_vectors(vectors, file_name):
    """Append ``vectors`` to ``file_name``, one ``vector_to_string`` line per vector.

    The file is opened in append mode, so existing content is kept.

    The vectors are written exactly as given. The label already sits at
    position 0, so nothing has to be restored. The former restore loop read
    ``vectors[i].relevant``, an attribute plain lists do not have, and raised
    ``AttributeError`` for any non-empty input.
    """
    with open(file_name, 'a') as file:
        file.writelines(vector_to_string(vector) + '\n' for vector in vectors)


In [19]:
# Feature vectors for the ranker, collected query by query.
train_vectors = []
test_vectors = []
test_qids = []
train_qids = []
for query_id, query in queries.items():
    # Queries with at least one relevant test judgement feed the test set
    # (first result page only).
    if len(query.relevant_test) > 0:
        test_batch = get_vectors(query.text, query.relevant_test, query.irrelevant_test)
        test_vectors.extend(test_batch)
    # Queries with at least one relevant training judgement feed the training set
    # (all judged documents, fetched with scrolling).
    if len(query.relevant_train) > 0:
        train_batch = get_vectors(query.text, query.relevant_train, query.irrelevant_train, scroll_on=True)
        train_vectors.extend(train_batch)

In [None]:
def _overwrite_vectors(vectors, file_name):
    """Empty ``file_name`` and then store ``vectors`` in it via ``write_vectors``."""
    # write_vectors appends. Without truncating first, every ablation round
    # (and every re-run of this cell) would pile rows with a different feature
    # set on top of the previous ones.
    open(file_name, 'w').close()
    write_vectors(vectors, file_name)


def _without_feature(vectors, feature):
    """Return copies of ``vectors`` with the item at position ``feature`` removed."""
    return [vector[:feature] + vector[feature + 1:] for vector in vectors]


# Baseline: all features.
_overwrite_vectors(train_vectors, "train.txt")
_overwrite_vectors(test_vectors, "test.txt")
# Rank here

# Ablation: drop one feature at a time. Position 0 is the label and always stays.
vector_length = len(train_vectors[0]) if train_vectors else 0
for ignored_feature in range(1, vector_length):
    print(f'Ignored feature: {ignored_feature}')
    _overwrite_vectors(_without_feature(train_vectors, ignored_feature), "train.txt")
    _overwrite_vectors(_without_feature(test_vectors, ignored_feature), "test.txt")
    # Rank here
