# Prepare

In [1]:
from IPython.core.interactiveshell import InteractiveShell
import json
import os, re
import pandas as pd
import numpy as np
import time
import random
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import ast
import math
from jupyterthemes import jtplot

jtplot.style(theme='chesterish')

In [2]:
def load_jsonl(jsonl):
    with open(jsonl, 'r') as file:
        return [json.loads(line) for line in list(file)]


def load_json(file):
    with open(file, 'r') as f:
        return json.load(f)

In [3]:
cat_features = []

# Load

In [4]:
raw_validation, raw_test = load_jsonl('data/val.jsonl'), load_jsonl('data/test.jsonl')

In [5]:
for corpus in raw_validation:
    corpus['is_answered'] = True
for corpus in raw_test:
    corpus['is_answered'] = False

In [6]:
data = raw_validation + raw_test

In [7]:
for idx, item in enumerate(data):
    item['idx'] = idx
    for q in item['qas']:
        q['idx'] = idx

# Translate

In [8]:
def standardize(txt):
    return txt.replace('\n', '').replace('@highlight', '')

In [9]:
translated_answers, translated_questions, translated_texts = load_json(
    'data/answers_all.json'), load_json('data/questions_validation.json'), load_json(
        'data/texts_test.json')
translated_texts.update(load_json('data/texts_validation.json'))
translated_questions.update(load_json('data/questions_test.json'))

In [10]:
def translate(text, kind='text'):
    if kind == 'text':
        return translated_texts[text]
    elif kind == 'question':
        return translated_questions[text]
    elif kind == 'answer':
      return translated_answers[text]

In [11]:
for k, v in list(translated_answers.items()):
  translated_answers[k.strip()] = v.strip()

In [12]:
def fillthegaps():
    global translated_texts
    translated_texts['Обвинения в причастности к парижским терактам предъявлены в Бельгии уже восьми задержанным. Один из организаторов взрывов был перед терактами в Будапеште. В Бельгии задержаны еще два подозреваемых в связи с терактами в Париже, один из них гражданин Франции, второй - подданный Бельгии. Их обвиняют в участии в действиях, связанных с терроризмом, сообщили в четверг, 3 декабря, в генпрокуратуре в Брюсселе. Они останутся под арестом в течение месяца. Олланд постарался убедить мировых лидеров в необходимости более интенсивной борьбы с ИГ.В Германии за сутки выявлено более 100 новых заражений коронавирусомТысячи демонстрантов в Гамбурге выступили за прием беженцевКомментарий: Россия накануне эпидемии - виноватые назначены заранее'] = "Already eight detainees have been charged with involvement in the Paris terrorist attacks in Belgium. One of the organizers of the explosions was before the attacks in Budapest. In Belgium, two more suspects were detained in connection with the terrorist attacks in Paris, one of them is a citizen of France, the other is a citizen of Belgium. They are accused of participation in acts related to terrorism, reported on Thursday, December 3, at the Prosecutor General's Office in Brussels. They will remain under arrest for a month. Hollande tried to convince world leaders of the need for a more intense fight against IS. More than 100 new coronavirus infections were detected in Germany per day Thousands of demonstrators in Hamburg spoke in favor of accepting refugees."
    translated_texts['В Саитаме (Япония) пожилую женщину госпитализировали с подозрением на заражение вирусом Эбола. Об этом сообщает РИА Новости со ссылкой на агентство Киодо. Уточняется, что ранее 70-летняя японка посещала Демократическую Республику Конго. У нее была зафиксирована температура более 38 градусов. Пенсионерку доставили в одну из больниц Токио для выявления в ее крови вируса Эбола. В кризисном штабе при канцелярии премьер-министра Японии Синдзо Абэ уже была создана группа по сбору и анализу информации. Смертельный вирус достиг международного масштаба Смертельный вирус проник в еще одну страну Африки «Русал» и Минздрав завершили вакцинацию против Эболы в Гвинее'] = "In Saitama, Japan, an elderly woman was hospitalized with suspected Ebola infection. This was reported by RIA Novosti with reference to the Kyodo agency. It is specified that the 70-year-old Japanese woman previously visited the Democratic Republic of the Congo. She had a temperature of more than 38 degrees. The pensioner was taken to a hospital in Tokyo to be diagnosed with the Ebola virus in her blood. A group for collecting and analyzing information has already been set up in the crisis headquarters under the office of Japanese Prime Minister Shinzo Abe. The deadly virus has reached an international scale The deadly virus has penetrated another African country Rusal and the Ministry of Health completed vaccination against Ebola in Guinea"
    translated_texts['Физики из Университета Глазго обнаружили парадокс теории Эйнштейна, который может нарушать основополагающий принцип равноправия инерциальных систем отсчета (ИСО) и тем самым ставит под угрозу физическую картину мира. Объяснение противоречия было предложено в Journal of Modern Optics. Кратко об исследовании рассказывает издание Gizmodo. Согласно принципу относительности для разных ИСО, представленных покоящимися и двигающимися телами, законы физики проявляются одинаково. Однако ученые выяснили, что этот принцип может не выполняться для движущихся возбужденных атомов. Парадокс заключается в том, что атомы могут спонтанно испускать фотоны, но при этом на частицы света распространяется эффект Доплера. Представлено новое подтверждение общей теории относительности Существование машины времени доказали математически Физики запутали рекордное количество атомов'] = "Physicists from the University of Glasgow discovered the paradox of Einstein's theory, which can violate the fundamental principle of equality of inertial reference systems (IRF) and thereby endanger the physical picture of the world. An explanation for the controversy was suggested in the Journal of Modern Optics. The research is briefly described by the publication Gizmodo. According to the principle of relativity for different IFRs, represented by bodies at rest and moving bodies, the laws of physics manifest themselves in the same way. However, scientists have found that this principle may not hold for moving excited atoms. The paradox is that atoms can spontaneously emit photons, but the Doppler effect applies to light particles. New confirmation of general relativity presented The existence of a time machine has been proven mathematically Physicists have entangled a record number of atoms"
    translated_texts['Бывший президент Франции Николя Саркози подверг критике подход нынешнего главы государства Франсуа Олланда к отношениям с Москвой. Его слова приводит ТАСС. «Я сожалею о политике, которая проводится ныне в отношении России», — заявил он журналистам, комментируя известие об отмене визита в Париж президента Владимира Путина. Саркози, намеревающийся баллотироваться в 2017 году на пост президента, предостерег от «вступления в новую холодную войну». По его словам, «долг и обязанность Франции и Европы — вести диалог с Россией». «Как можно урегулировать ситуацию, если общаться друг с другом лишь посредством пресс-релизов?» — сказал Саркози.Олланд отказался считать отмену визита Путина помехой для дискуссии по СирииВо Франции сообщили об отмене визита Путина в ПарижПесков объяснил отмену визита Путина в Париж'] = """Former French President Nicolas Sarkozy criticized the current head of state François Hollande's approach to relations with Moscow. His words are quoted by TASS. “I regret the current policy towards Russia,” he told reporters, commenting on the news of the cancellation of President Vladimir Putin’s visit to Paris. Sarkozy, intending to run for president in 2017, has warned against "entering a new cold war." According to him, "the duty and obligation of France and Europe is to conduct a dialogue with Russia." "How can you resolve the situation if you communicate with each other only through press releases?" - Sarkozy said. Hollande refused to consider the cancellation of Putin's visit as a hindrance to the discussion on Syria France announced the cancellation of Putin's visit to Paris Sands explained the cancellation of Putin's visit to Paris"""
    translated_texts['Президент России Владимир Путин заявил, что выступает за бессрочный закон об особом статусе Донбасса. Об этом он заявил на пресс-конференции после переговоров с канцлером Германии Ангелой Меркель в Кремле, передает «Интерфакс». Глава государства призвал руководство Украины начать реализацию минских соглашений, несмотря на внутриполитические проблемы. «Мы понимаем все сложности внутриполитического процесса на Украине, но, если все заинтересованные стороны хотят добиться окончательного урегулирования, нужно идти по реализации минских соглашений», — сказал Путин. Он также отметил, что на переговорах с канцлером Германии «подробно обсуждалось» урегулирование внутриукраинского кризиса. Ранее министр иностранных дел Украины Вадим Пристайко заявил, что принятие «формулы Штайнмайера» не означает проведения выборов в Донбассе любой ценой.Украина отказалась проводить выборы в Донбассе любой ценойОписаны пять сценариев развития ситуации в ДонбассеНа Украине раскрыли план по введению «формулы Штайнмайера» в законодательство'] = """Russian President Vladimir Putin said he was in favor of an unlimited law on the special status of Donbass. He stated this at a press conference after talks with German Chancellor Angela Merkel in the Kremlin, Interfax reports. The head of state called on the Ukrainian leadership to start implementing the Minsk agreements, despite the internal political problems. “We understand all the complexities of the internal political process in Ukraine, but if all interested parties want to achieve a final settlement, we need to follow the implementation of the Minsk agreements,” Putin said. He also noted that during the talks with the German Chancellor, the settlement of the internal Ukrainian crisis was "discussed in detail". Earlier, the Minister of Foreign Affairs of Ukraine Vadim Prystaiko said that the adoption of the "Steinmeier formula" does not mean holding elections in Donbass at any cost."""

fillthegaps()

In [13]:
for item in data:
    passage = item['passage']
    question = standardize(item['qas'][0]['query'])
    item['qas'][0]['query_en'] = translate(standardize(item['qas'][0]['query']), kind='question')
    for entity in passage['entities']:
        entity['text'] = passage['text'][entity['start']:entity['end']]
        entity['text_en'] = translate(passage['text'][entity['start']:entity['end']].strip(), kind='answer')
        if 'answers' in item['qas'][0]:
            entity['label'] = 0
    if 'answers' in item['qas'][0]:
        for entity in item['qas'][0]['answers']:
            for e in passage['entities']:
                if entity['start'] == e['start'] and entity['end'] == e['end']:
                    e['label'] = 1
    passage['text_en'] = translate(standardize(passage['text']), kind='text')
    passage['text'] = passage['text']

# Make flatten

In [14]:
def simplify_data(d, split_answers=False, use_en=True):
    simplified = []
    temp = {}
    for item in d:
        temp['context_ru'] = item['passage']['text']
        temp['question_ru'] = item['qas'][0]['query']
        if use_en:
            temp['context_en'] = item['passage']['text_en']
            temp['question_en'] = item['qas'][0]['query_en']
        temp['is_answered'] = item['is_answered']
        if split_answers:
            for entity in set([tuple(x.items()) for x in item['passage']['entities']]):
                entity = dict(entity)
                temp['answer_ru'] = entity['text']
                temp['start'] = entity['start']
                temp['end'] = entity['end']
                temp['old_idx'] = item['idx']
                if use_en:
                    temp['answer_en'] = entity['text_en']
                if item['is_answered']:
                    temp['label'] = entity['label']
                simplified.append(temp.copy())
        else:
            temp['answers'] = item['passage']['entities']
            simplified.append(temp.copy())
    return simplified

In [15]:
train = pd.DataFrame(list(simplify_data(data, split_answers=True)))

In [16]:
train = train.drop_duplicates(['context_ru', 'question_ru', 'answer_ru'])

In [17]:
train = train.reset_index().drop(columns=['index'])

In [18]:
train.head()

Unnamed: 0,context_ru,question_ru,context_en,question_en,is_answered,answer_ru,start,end,old_idx,answer_en,label
0,Главной темой переговоров наряду с прежними сп...,"В него вошли @placeholder, Россия, Украина и Ф...","The main topic of the talks, along with previo...","It includes @placeholder, Russia, Ukraine and ...",True,Донбасса,173,181,0,Donbass,0
1,Главной темой переговоров наряду с прежними сп...,"В него вошли @placeholder, Россия, Украина и Ф...","The main topic of the talks, along with previo...","It includes @placeholder, Russia, Ukraine and ...",True,Украину,409,416,0,Ukraine,0
2,Главной темой переговоров наряду с прежними сп...,"В него вошли @placeholder, Россия, Украина и Ф...","The main topic of the talks, along with previo...","It includes @placeholder, Russia, Ukraine and ...",True,Мариуполе,429,438,0,Mariupol,0
3,Главной темой переговоров наряду с прежними сп...,"В него вошли @placeholder, Россия, Украина и Ф...","The main topic of the talks, along with previo...","It includes @placeholder, Russia, Ukraine and ...",True,Россия,1246,1252,0,Russia,0
4,Главной темой переговоров наряду с прежними сп...,"В него вошли @placeholder, Россия, Украина и Ф...","The main topic of the talks, along with previo...","It includes @placeholder, Russia, Ukraine and ...",True,Маса,881,885,0,Masa,0


# Yammy Features

## General

In [19]:
def harmonic_mean(arr):
    n = len(arr)
    if n == 0:
        return 0
    s = 0
    for i in arr:
        if i == 0:
            i = 10 ** -10
        s += 1 / i
    return n / s if s != 0 else 10 ** -10


def geometric_mean(arr):
    n = len(arr)
    if n == 0:
        return 0
    res = 1
    for i in arr:
        res = res * (i**(1 / n))
    return res


def square_mean(arr):
    n = len(arr)
    if n == 0:
        return 0
    s = 0
    for i in arr:
        s += i * i
    return math.sqrt(s / n)


def mmmm(feature, where=None, by=None, function=None):
    if where is None:
        where = train
    if by == 'context_idx':
        return where
    assert by != None, 'Missing required \'by\' parameter!'
    temp = where[[feature, by]].groupby([by]).agg(
        ['mean', 'std', 'median', 'max', 'min', 'sum', square_mean, geometric_mean, harmonic_mean])
    temp.columns = [
        '_'.join(col).strip() + '_by_' + by.split('_')[0]
        for col in temp.columns.values
    ]
    return pd.merge(temp, where, on=by)

## Dumb features

In [22]:
for feature in ['context', 'question', 'answer']:
    for locale in ['ru', 'en']:
        train['words_amount_in_' + feature + '_' + locale] = [t.count(' ') + 1 for t in train[feature + '_' + locale]]

In [23]:
for feature in ['context', 'question', 'answer']:
    train['words_amount_in_' + feature +
          '_difference'] = train['words_amount_in_' + feature +
                                 '_ru'] - train['words_amount_in_' + feature +
                                                '_en']

In [24]:
for feature in ['context', 'question', 'answer']:
    for locale in ['ru', 'en']:
        for by in ['old_idx']:
            train = mmmm('words_amount_in_' + feature + '_' + locale, by=by)

In [20]:
for feature in ['context', 'question']:
    for locale in ['ru', 'en']:
        train['amount_answers_in_' + feature + '_' + locale] = [t.count(a) + 1 for t, a in train[[feature + '_' + locale, 'answer_' + locale]].values]

In [21]:
for feature in ['context', 'question']:
    for locale in ['ru', 'en']:
        for by in ['old_idx']:
            train = mmmm('amount_answers_in_' + feature + '_' + locale, by=by)

In [43]:
alphabet = [chr(i) for i in range(ord('А'),ord('А')+64)] + list('1234567890') + [' ']

In [44]:
from langdetect import detect
result = []
for answer in tqdm(train['answer_ru'].values):
    if all(map(lambda x: True if x in alphabet else False, answer)):
        result.append('ru')
    else:
        result.append('en')

HBox(children=(FloatProgress(value=0.0, max=143064.0), HTML(value='')))




In [45]:
train['answer_language_manual'] = result

In [39]:
cat_features.append('answer_language_manual')

In [46]:
train[['answer_language', 'answer_language_manual', 'answer_ru']]

Unnamed: 0,answer_language,answer_language_manual,answer_ru
0,ru,ru,Донбасса
1,uk,ru,Украину
2,mk,ru,Мариуполе
3,ru,ru,Россия
4,bg,ru,Маса
...,...,...,...
143059,mk,ru,Синиша Мали
143060,ru,ru,РИА Новости
143061,ru,ru,Александр Вучич
143062,ru,ru,Сербия


# Tokenizing

In [217]:
!pip install nltk



In [47]:
import nltk
nltk.download("stopwords")
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [48]:
stemmers = {'ru': SnowballStemmer('russian'), 'en': PorterStemmer()}

In [49]:
stop_words = {'ru': set(stopwords.words("russian")), 'en': set(stopwords.words("english"))}

In [50]:
tokenize = lambda text: nltk.word_tokenize(text)

In [51]:
stemmerize = lambda text, locale: set([stemmers[locale].stem(word) for word in nltk.word_tokenize(text) if not word.lower() in stop_words[locale] and word[0].isalpha()])

In [52]:
stemmed = {}

for locale in ['ru', 'en']:
    temp = []
    for corpus in tqdm(simplify_data(data, split_answers=False)):
        words_in_context = stemmerize(corpus['context_' + locale], locale)
        words_in_question = stemmerize(corpus['question_' + locale], locale)
        for answer in corpus['answers']:
            words_in_answer = stemmerize(answer['text' + ('_en' if locale == 'en' else '')], locale)
            temp.append({'stemmed_context_' + locale: words_in_context, 
                             'stemmed_question_' + locale: words_in_question, 
                             'stemmed_answer_' + locale: words_in_answer})
    stemmed[locale] = temp

HBox(children=(FloatProgress(value=0.0, max=14834.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=14834.0), HTML(value='')))




In [53]:
train = pd.concat([train, pd.DataFrame(stemmed['ru'])], axis=1)
train = pd.concat([train, pd.DataFrame(stemmed['en'])], axis=1)

In [54]:
for feature in ['context', 'question', 'answer']:
    for locale in ['ru', 'en']:
        train['stems_amount_in_' + feature + '_' + locale] = [len(t) for t in train['stemmed_' + feature + '_' + locale]]

In [55]:
for locale in ['ru', 'en']:
    train = pd.concat([train, pd.DataFrame([{'same_words_answer_question_' + locale: len(a & q), 
      'same_words_context_quesiton_' + locale: len(q & c), 
      'same_words_context_answer_' + locale: len(a & c)} for c, q, a in train[['stemmed_context_' + locale, 'stemmed_question_' + locale, 'stemmed_answer_' + locale]].values])], axis=1)

In [56]:
for feature in ['answer']:
    for locale in ['ru', 'en']:
        train['stems_same_stems_ratio_' + feature + '_' + locale] = train['stems_amount_in_' + feature + '_' + locale] / train['stems_amount_in_' + feature + '_' + locale]

In [57]:
for locale in ['ru', 'en']:
    for feature in ['same_words_answer_question', 'same_words_context_quesiton', 'same_words_context_answer', 'stems_same_stems_ratio_answer']:
        train = mmmm(feature + '_' + locale, by='old_idx')

In [58]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [59]:
result = {}
for i, answer in enumerate(tqdm(train['answer_ru'].values)):
    parsed = morph.parse(answer)[0]
    result[i] = {'answer_ru_part_of_speach': parsed.tag.POS}
    for k, v in result[i].items():
        if v == None:
            result[i][k] = 'NONE'

HBox(children=(FloatProgress(value=0.0, max=143064.0), HTML(value='')))




In [60]:
train = pd.concat([train, pd.DataFrame(result).T], axis=1)

In [61]:
cat_features += list(result[0].keys())
#cat_features.remove('answer_ru_morph_score')

In [49]:
train.to_csv('data.csv')

# Windows

In [62]:
result = {}
for locale in 'ru',:
    for index, (start, end, text) in tqdm(list(enumerate(train[['start', 'end', 'context_' + locale]].values))):
        start, end = int(start), int(end)
        i = len(text[:start].split())
        new_text = [''.join(filter(lambda x: x.isalpha() or x == ' ' or x == '-', y)) for y in (text[:start] + text[end:]).split()]
        r = {'word_behind_entity_' + locale + '_in_context': 
             (new_text[i-1] if 0 < i < len(new_text) and new_text[i-1] != 'highlight' and new_text[i-1] != 'header' and text[start-2] not in '.,?!\n' else '-'), 'word_after_entity_' + locale + '_in_context': (new_text[i] if i < len(new_text) and new_text[i] != 'highlight' and new_text[i] != 'header' and (end == len(text)-1 or text[end+1] not in '.,?!\n') else '-')}
        if index not in result:
            result[index] = r
        else:
            result[index].update(r)


HBox(children=(FloatProgress(value=0.0, max=143064.0), HTML(value='')))




In [63]:
for locale in 'ru',:
    for index, (start, end, text) in tqdm(list(enumerate(train[['start', 'end', 'context_' + locale]].values))):
        start, end = int(start), int(end)
        i = len(text[:start].split())
        new_text = [''.join(filter(lambda x: x.isalpha() or x == ' ' or x == '-', y)) for y in (text[:start] + text[end:]).split()]
        r = {'word_behind_behind_entity_' + locale + '_in_context': 
             (new_text[i-2] if 1 < i < len(new_text) and new_text[i-2] != 'highlight' and new_text[i-2] != 'header' and text[start-2] not in '.,?!\n' else '-'), 
             'word_after_after_entity_' + locale + '_in_context': 
             (new_text[i+1] if i+1 < len(new_text) and new_text[i+1] != 'highlight' and new_text[i+1] != 'header' and (end == len(text)-1 or text[end+1] not in '.,?!\n') else '-')}
        if index not in result:
            result[index] = r
        else:
            result[index].update(r)

HBox(children=(FloatProgress(value=0.0, max=143064.0), HTML(value='')))




In [64]:
for locale in 'ru',:
    for index, (text,) in tqdm(list(enumerate(train[['question_' + locale]].values))):
        start, end = text.find('@placeholder'), text.find('@placeholder') + len('@placeholder')
        i = len(text[:start].split())
        new_text = [''.join(filter(lambda x: x.isalpha() or x == ' ' or x == '-', y)) for y in (text[:start] + text[end:]).split()]
        r = {'word_behind_entity_' + locale + '_in_question': 
             (new_text[i-1] if 0 < i < len(new_text) and new_text[i-1] != 'highlight' and new_text[i-1] != 'header' and text[start-2] not in '.,?!\n' else '-'), 
             'word_after_entity_' + locale + '_in_question': (new_text[i] if i < len(new_text) and new_text[i] != 'highlight' and new_text[i] != 'header' and (end == len(text)-1 or text[end+1] not in '.,?!\n') else '-')}
        if index not in result:
            result[index] = r
        else:
            result[index].update(r)

HBox(children=(FloatProgress(value=0.0, max=143064.0), HTML(value='')))




In [65]:
for locale in 'ru',:
    for index, (text,) in tqdm(list(enumerate(train[['question_' + locale]].values))):
        start, end = text.find('@placeholder'), text.find('@placeholder') + len('@placeholder')
        i = len(text[:start].split())
        new_text = [''.join(filter(lambda x: x.isalpha() or x == ' ' or x == '-', y)) for y in (text[:start] + text[end:]).split()]
        r = {'word_behind_behind_entity_' + locale + '_in_question': 
             (new_text[i-2] if 1 < i < len(new_text) and new_text[i-2] != 'highlight' and new_text[i-2] != 'header' and text[start-2] not in '.,?!\n' else '-'), 
             'word_after_after_entity_' + locale + '_in_question': (new_text[i+1] if i+1 < len(new_text) and new_text[i+1] != 'highlight' and new_text[i+1] != 'header' and (end == len(text)-1 or text[end+1] not in '.,?!\n') else '-')}
        if index not in result:
            result[index] = r
        else:
            result[index].update(r)

HBox(children=(FloatProgress(value=0.0, max=143064.0), HTML(value='')))




In [66]:
text_features = []

In [67]:
text_features += list(list(result.items())[2][1].keys())

In [80]:
cat_features += list(list(result.items())[2][1].keys())

In [68]:
train = pd.concat([train, pd.DataFrame(result).T], axis=1)

In [69]:
glossary = set()
for i in list(result.items()):
    for j in list(i[1].values()):
        glossary.add(j.lower())

In [70]:
dictionary = {}
for k in glossary:
    parsed = morph.parse(k)[0]
    dictionary[k] = parsed.tag.POS

In [None]:
dictionary

In [71]:
answers = {}
for k, d in result.items():
    for f, w in d.items():
        if k in answers:
            answers[k][f + '_part_of_speach'] = dictionary[w.lower()] if dictionary[w.lower()] != None else 'NONE'
        else:
            answers[k] = {f + '_part_of_speach': dictionary[w.lower()] if dictionary[w.lower()] != None else 'NONE'}

In [72]:
train = pd.concat([train, pd.DataFrame(answers).T], axis=1)

In [None]:
train.head()

In [73]:
cat_features += list(list(answers.items())[2][1].keys())

# Encoding

In [74]:
import gc
gc.collect()

15

In [75]:
from sentence_transformers import util
from sentence_transformers import SentenceTransformer, util

In [76]:
cosine_similarity = lambda a, b: util.pytorch_cos_sim(a, b)[0][0].item()

In [77]:
encoder = SentenceTransformer('stsb-roberta-large')

# MultiRC

In [78]:
val_mrc = list(np.load('data/multircformat_val.npy'))
test_mrc = list(np.load('data/multircformat_test_first8832.npy')) + list(np.load('data/multircformat_testfrom8896.npy'))

In [79]:
multirc = val_mrc + test_mrc

In [80]:
result = {}
counter = 0
for old_idx in tqdm(set(train['old_idx'].values)):
    answers = len(list(set([str(i) for i in train[train['old_idx'] == old_idx]['answer_ru'].values])))
    result[old_idx] = sum(multirc[counter:counter+answers])
    counter+=answers

HBox(children=(FloatProgress(value=0.0, max=14834.0), HTML(value='')))




In [81]:
train = pd.merge(train, pd.Series(result).reset_index().rename(columns={'index': 'old_idx', 0: 'multirc'}), on='old_idx')

In [82]:
for feature in 'multirc',:
    train = mmmm(feature, by='old_idx')

# T5

In [83]:
from fuzzywuzzy import fuzz

In [84]:
test_beatlesfan = [i.strip('\n') for i in open('beatlesfan.txt', 'r').readlines()]
val_beatlesfan = [i.strip('\n') for i in open('beatlesfan_val.txt', 'r').readlines()]

In [86]:
result = {}
for text_idx, answer in tqdm(list(enumerate(val_beatlesfan + test_beatlesfan))):
    encoded_answer = encoded_answers[answer] if answer in encoded_answers else encoder.encode(answer)
    for k, v in dict(train[train['old_idx']==text_idx]['answer_en']).items():
        try:
            encoded_variant = encoded_answers[v]
            result[k] = {'t5_beatles_record_fuzz_ratio':  (fuzz.ratio(v, answer) / 100), 
                     't5_beatles_record_fuzz_partial_ratio':  (fuzz.partial_ratio(v, answer) / 100),
                     't5_beatles_record_roberta_simmilarity':  cosine_similarity(encoded_answer, encoded_variant)}
        except:
            print('exception!')

HBox(children=(FloatProgress(value=0.0, max=14834.0), HTML(value='')))

exception!
exception!



In [87]:
train = pd.concat([train, pd.DataFrame(result).T], axis=1)

In [88]:
for feature in result[0].keys():
    train = mmmm(feature, by='old_idx')

  return result.astype(dtype)


# Top

In [89]:
val_top3 = np.load('data/val_top3.npy', allow_pickle=True)
test_top3 = np.load('data/test_top3.npy', allow_pickle=True)

In [74]:
opa = list(set([str(i) for i in train['answer_en'].values]))

In [None]:
encodes = encoder.encode(opa, show_progress_bar=True)

In [49]:
encoded_answers = {}
for i in range(len(opa)):
    encoded_answers[opa[i]] = encodes[i]

In [85]:
encoded_answers = np.load('encoded_answers.npy', allow_pickle=True).item()

In [None]:
encoded_answers

In [90]:
result = {}
for text_idx, top in tqdm(list(enumerate(list(val_top3)[:len(raw_validation)] + list(test_top3)[:len(raw_test)]))):
    encoded_entities = [encoded_answers[i] if i in encoded_answers else encoder.encode(top) for i in top]
    for k, v in dict(train[train['old_idx']==text_idx]['answer_en']).items():
        try:
            encoded_answer = encoded_answers[v]
            result[k] = {'t5_3b_record_top1_fuzz_ratio':  (fuzz.ratio(top[0], v) / 100) if top else 0, 
                     't5_3b_record_top2_fuzz_ratio':  (fuzz.ratio(top[1], v) / 100) if len(top) > 1 else 0, 
                     't5_3b_record_top3_fuzz_ratio':  (fuzz.ratio(top[2], v) / 100) if len(top) == 3 else 0,
                     't5_3b_record_top1_fuzz_partial_ratio':  (fuzz.partial_ratio(top[0], v) / 100) if top else 0, 
                     't5_3b_record_top2_fuzz_partial_ratio':  (fuzz.partial_ratio(top[1], v) / 100) if len(top) > 1 else 0, 
                     't5_3b_record_top3_fuzz_partial_ratio':  (fuzz.partial_ratio(top[2], v) / 100) if len(top) == 3 else 0,
                     't5_3b_record_top1_roberta_simmilarity':  cosine_similarity(encoded_entities[0], encoded_answer) if top else 0,
                     't5_3b_record_top2_roberta_simmilarity':  cosine_similarity(encoded_entities[1], encoded_answer) if len(top) > 1 else 0,
                     't5_3b_record_top3_roberta_simmilarity':  cosine_similarity(encoded_entities[2], encoded_answer) if len(top) == 3 else 0}


        except:
            print('exception')

HBox(children=(FloatProgress(value=0.0, max=14834.0), HTML(value='')))

exception
exception



In [91]:
train = pd.concat([train, pd.DataFrame(result).T], axis=1)

In [78]:
train.head()

Unnamed: 0,old_idx,t5_beatles_record_roberta_simmilarity_mean_by_old,t5_beatles_record_roberta_simmilarity_std_by_old,t5_beatles_record_roberta_simmilarity_median_by_old,t5_beatles_record_roberta_simmilarity_max_by_old,t5_beatles_record_roberta_simmilarity_min_by_old,t5_beatles_record_roberta_simmilarity_sum_by_old,t5_beatles_record_roberta_simmilarity_square_mean_by_old,t5_beatles_record_roberta_simmilarity_geometric_mean_by_old,t5_beatles_record_roberta_simmilarity_harmonic_mean_by_old,...,t5_beatles_record_roberta_simmilarity,t5_3b_record_top1_fuzz_ratio,t5_3b_record_top2_fuzz_ratio,t5_3b_record_top3_fuzz_ratio,t5_3b_record_top1_fuzz_partial_ratio,t5_3b_record_top2_fuzz_partial_ratio,t5_3b_record_top3_fuzz_partial_ratio,t5_3b_record_top1_roberta_simmilarity,t5_3b_record_top2_roberta_simmilarity,t5_3b_record_top3_roberta_simmilarity
0,0,0.350735,0.283964,0.29663,1.0,-0.031259,7.014695,0.446787,0.201527,0.030703,...,0.001868,0.12,0.18,0.3,0.29,0.5,0.32,0.001868,0.101491,0.295024
1,0,0.350735,0.283964,0.29663,1.0,-0.031259,7.014695,0.446787,0.201527,0.030703,...,0.32133,0.14,0.15,0.31,0.14,0.17,0.43,0.32133,0.437414,0.388387
2,0,0.350735,0.283964,0.29663,1.0,-0.031259,7.014695,0.446787,0.201527,0.030703,...,1.0,1.0,0.46,0.38,1.0,0.5,0.43,1.0,0.741638,-0.031259
3,0,0.350735,0.283964,0.29663,1.0,-0.031259,7.014695,0.446787,0.201527,0.030703,...,0.599747,0.14,0.15,0.23,0.14,0.17,0.29,0.599747,0.566922,0.019302
4,0,0.350735,0.283964,0.29663,1.0,-0.031259,7.014695,0.446787,0.201527,0.030703,...,0.303906,0.0,0.0,0.0,0.0,0.0,0.0,0.303906,0.261405,0.081897


In [92]:
for feature in result[0].keys():
    train = mmmm(feature, by='old_idx')

  return result.astype(dtype)


# Crazy simillarity

In [93]:
all_questions = list(set([str(i) for i in train['question_en'].values]))

In [69]:
encodes = encoder.encode(all_questions, show_progress_bar=True)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=464.0, style=ProgressStyle(description_widt…




In [83]:
encoded_questions = {}
for i in range(len(all_questions)):
    encoded_questions[all_questions[i]] = encodes[i]

In [105]:
encoded_questions = np.load('encoded_questions.npy', allow_pickle=True).item()

In [None]:
encoded_questions

In [None]:
list(encoded_questions.items())

In [None]:
counter = 0
result = {}
for answer, question in tqdm(train[['answer_en', 'question_en']].values):
    try:
        result[counter] = {'answer_question_en_simiilarity_by_roberta': cosine_similarity(encoded_questions[question], encoded_answers[answer])}
    except:
        result[counter] = {'answer_question_en_simiilarity_by_roberta': 0}
        print('woops')
    counter += 1

In [92]:
train['answer_question_en_simiilarity_by_roberta'] = pd.DataFrame(result).T['answer_question_en_simiilarity_by_roberta']

In [93]:
train = mmmm('answer_question_en_simiilarity_by_roberta', by='old_idx')

  return result.astype(dtype)


In [29]:
train = pd.read_csv('admin.csv')

## Google Encoder

In [109]:
import tensorflow_hub as hub

In [110]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [111]:
all_answers_ru = list(set([str(i) for i in train['answer_ru'].values]))

In [112]:
encoded_answers_use_raw = embed(all_answers_ru)

In [113]:
encoded_answers_ru = {}
for i in range(len(all_answers_ru)):
    encoded_answers_ru[all_answers_ru[i]] = encoded_answers_use_raw[i]

In [114]:
all_questions_ru = list(set([str(i) for i in train['question_ru'].values]))

In [115]:
encoded_questions_use_raw = embed(all_questions_ru)

In [116]:
encoded_questions_ru = {}
for i in range(len(all_questions_ru)):
    encoded_questions_ru[all_questions_ru[i]] = encoded_questions_use_raw[i]

In [117]:
counter = 0
result = {}
for answer, question in tqdm(train[['answer_ru', 'question_ru']].values):
    result[counter] = {'answer_question_ru_simiilarity_by_use': cosine_similarity(np.array(encoded_questions_ru[question]), np.array(encoded_answers_ru[answer]))}
    result[counter] = {'answer_question_ru_simiilarity_by_use': 0}
    counter += 1

HBox(children=(FloatProgress(value=0.0, max=143064.0), HTML(value='')))




In [118]:
train['answer_question_ru_simiilarity_by_use'] = pd.DataFrame(result).T['answer_question_ru_simiilarity_by_use']





In [119]:
train = mmmm('answer_question_ru_simiilarity_by_use', by='old_idx')

# NER

In [120]:
ners = {}
for k, v in load_json('data/answers_ners.json').items():
    if all(map(lambda x: x in ['B-PER', 'I-PER'], v)):
        ners[k] = 'PER'
    elif all(map(lambda x: x in ['B-LOC', 'I-LOC'], v)):
        ners[k] = 'LOC'
    else:
        ners[k] = 'ORG'

In [121]:
train['answer_ru_ner'] = [ners[answer] for answer in train['answer_ru'].values]

In [122]:
result = {}
for k, v in dict(train.groupby(['old_idx', 'answer_ru_ner']).size()).items():
    if k[0] not in result:
        result[k[0]] = {'answer_ru_ner_' + k[1].lower() + '_amount_in_context': v}
    else:
        result[k[0]]['answer_ru_ner_' + k[1].lower() + '_amount_in_context'] = v

temp = pd.DataFrame(result).T.fillna(0)
temp['old_idx'] = temp.index
train = pd.merge(train, temp, on='old_idx')

In [123]:
cat_features += ['answer_ru_ner']

In [169]:
fuzya = pd.read_csv('fuzyawuzya.csv')

In [170]:
fuzya = fuzya.fillna(0)

In [171]:
fuzya = fuzya.drop(columns=['Unnamed: 0', 'old_idx'])

In [174]:
train = pd.concat([fuzya, train], axis=1)

# CatBoost Forever!

In [140]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import f1_score, accuracy_score

In [191]:
temp = train.drop(columns=['old_idx', 'start', 'end', 'stemmed_context_ru', 'stemmed_question_ru', 'stemmed_answer_ru', 'stemmed_context_en', 'stemmed_question_en', 'stemmed_answer_en', 'context_ru', 'question_ru', 'answer_ru', 'context_en', 'question_en', 'answer_en'])

In [192]:
temp = temp.loc[:,~temp.columns.duplicated()]

In [193]:
temp = temp.fillna(0)

In [194]:
ttrain, validation = temp[train['is_answered'] == True], temp[train['is_answered'] == False]

In [195]:
X_train, y_train = ttrain.drop(columns=['label']), ttrain['label']

In [196]:
train_size = 0.80
X_train, X_test, y_train, y_test = X_train.iloc[:int(len(X_train.index) * train_size)], X_train.iloc[int(len(X_train.index) * train_size) + 1:], y_train.iloc[:int(len(y_train.index) * train_size)], y_train.iloc[int(len(y_train.index) * train_size) + 1:] 

In [197]:
catboost_classifier = CatBoostClassifier(iterations=3000, depth=3, cat_features=cat_features+text_features)

In [198]:
catboost_classifier.fit(X_train,
          y_train,
          verbose=True, 
          eval_set=(X_test, y_test))

Learning rate set to 0.054077
0:	learn: 0.6214642	test: 0.6271792	best: 0.6271792 (0)	total: 50.1ms	remaining: 2m 30s
1:	learn: 0.5554205	test: 0.5657714	best: 0.5657714 (1)	total: 93.3ms	remaining: 2m 19s
2:	learn: 0.5036370	test: 0.5181298	best: 0.5181298 (2)	total: 131ms	remaining: 2m 11s
3:	learn: 0.4546620	test: 0.4733591	best: 0.4733591 (3)	total: 164ms	remaining: 2m 3s
4:	learn: 0.4147905	test: 0.4361501	best: 0.4361501 (4)	total: 195ms	remaining: 1m 56s
5:	learn: 0.3778964	test: 0.4011842	best: 0.4011842 (5)	total: 225ms	remaining: 1m 52s
6:	learn: 0.3520818	test: 0.3776438	best: 0.3776438 (6)	total: 253ms	remaining: 1m 48s
7:	learn: 0.3275469	test: 0.3553921	best: 0.3553921 (7)	total: 290ms	remaining: 1m 48s
8:	learn: 0.3065755	test: 0.3362762	best: 0.3362762 (8)	total: 320ms	remaining: 1m 46s
9:	learn: 0.2891476	test: 0.3205455	best: 0.3205455 (9)	total: 347ms	remaining: 1m 43s
10:	learn: 0.2736181	test: 0.3066302	best: 0.3066302 (10)	total: 379ms	remaining: 1m 42s
11:	learn:

93:	learn: 0.1522843	test: 0.2081319	best: 0.2081319 (93)	total: 3.7s	remaining: 1m 54s
94:	learn: 0.1521545	test: 0.2081623	best: 0.2081319 (93)	total: 3.75s	remaining: 1m 54s
95:	learn: 0.1520288	test: 0.2081337	best: 0.2081319 (93)	total: 3.77s	remaining: 1m 54s
96:	learn: 0.1518699	test: 0.2080304	best: 0.2080304 (96)	total: 3.82s	remaining: 1m 54s
97:	learn: 0.1517788	test: 0.2079887	best: 0.2079887 (97)	total: 3.85s	remaining: 1m 53s
98:	learn: 0.1516700	test: 0.2079167	best: 0.2079167 (98)	total: 4.3s	remaining: 2m 6s
99:	learn: 0.1515330	test: 0.2078536	best: 0.2078536 (99)	total: 4.33s	remaining: 2m 5s
100:	learn: 0.1510143	test: 0.2077419	best: 0.2077419 (100)	total: 4.4s	remaining: 2m 6s
101:	learn: 0.1509471	test: 0.2077499	best: 0.2077419 (100)	total: 4.42s	remaining: 2m 5s
102:	learn: 0.1508429	test: 0.2076275	best: 0.2076275 (102)	total: 4.46s	remaining: 2m 5s
103:	learn: 0.1502416	test: 0.2075578	best: 0.2075578 (103)	total: 4.52s	remaining: 2m 5s
104:	learn: 0.1497915	

190:	learn: 0.1419777	test: 0.2017008	best: 0.2017008 (190)	total: 9.48s	remaining: 2m 19s
191:	learn: 0.1419259	test: 0.2016871	best: 0.2016871 (191)	total: 9.52s	remaining: 2m 19s
192:	learn: 0.1418893	test: 0.2017048	best: 0.2016871 (191)	total: 9.55s	remaining: 2m 18s
193:	learn: 0.1418204	test: 0.2016292	best: 0.2016292 (193)	total: 9.61s	remaining: 2m 18s
194:	learn: 0.1417501	test: 0.2014729	best: 0.2014729 (194)	total: 9.64s	remaining: 2m 18s
195:	learn: 0.1417247	test: 0.2014408	best: 0.2014408 (195)	total: 9.67s	remaining: 2m 18s
196:	learn: 0.1417243	test: 0.2014356	best: 0.2014356 (196)	total: 9.7s	remaining: 2m 18s
197:	learn: 0.1416619	test: 0.2014576	best: 0.2014356 (196)	total: 9.74s	remaining: 2m 17s
198:	learn: 0.1416123	test: 0.2014500	best: 0.2014356 (196)	total: 9.78s	remaining: 2m 17s
199:	learn: 0.1416122	test: 0.2014475	best: 0.2014356 (196)	total: 9.81s	remaining: 2m 17s
200:	learn: 0.1415543	test: 0.2014192	best: 0.2014192 (200)	total: 9.84s	remaining: 2m 17s


283:	learn: 0.1364314	test: 0.1988214	best: 0.1988214 (283)	total: 13.5s	remaining: 2m 9s
284:	learn: 0.1362265	test: 0.1986351	best: 0.1986351 (284)	total: 13.6s	remaining: 2m 9s
285:	learn: 0.1362232	test: 0.1986320	best: 0.1986320 (285)	total: 13.6s	remaining: 2m 9s
286:	learn: 0.1362020	test: 0.1986095	best: 0.1986095 (286)	total: 13.6s	remaining: 2m 8s
287:	learn: 0.1361442	test: 0.1985312	best: 0.1985312 (287)	total: 13.7s	remaining: 2m 8s
288:	learn: 0.1360012	test: 0.1984188	best: 0.1984188 (288)	total: 13.7s	remaining: 2m 8s
289:	learn: 0.1359157	test: 0.1983509	best: 0.1983509 (289)	total: 13.8s	remaining: 2m 8s
290:	learn: 0.1358092	test: 0.1983363	best: 0.1983363 (290)	total: 13.8s	remaining: 2m 8s
291:	learn: 0.1358004	test: 0.1983258	best: 0.1983258 (291)	total: 13.8s	remaining: 2m 8s
292:	learn: 0.1356777	test: 0.1981564	best: 0.1981564 (292)	total: 13.9s	remaining: 2m 8s
293:	learn: 0.1356067	test: 0.1981681	best: 0.1981564 (292)	total: 13.9s	remaining: 2m 8s
294:	learn

377:	learn: 0.1319874	test: 0.1968644	best: 0.1968475 (366)	total: 17.5s	remaining: 2m 1s
378:	learn: 0.1319251	test: 0.1968706	best: 0.1968475 (366)	total: 17.5s	remaining: 2m 1s
379:	learn: 0.1318625	test: 0.1969458	best: 0.1968475 (366)	total: 17.6s	remaining: 2m 1s
380:	learn: 0.1318337	test: 0.1968853	best: 0.1968475 (366)	total: 17.6s	remaining: 2m 1s
381:	learn: 0.1318054	test: 0.1969047	best: 0.1968475 (366)	total: 17.7s	remaining: 2m 1s
382:	learn: 0.1317636	test: 0.1969512	best: 0.1968475 (366)	total: 17.7s	remaining: 2m 1s
383:	learn: 0.1317378	test: 0.1969615	best: 0.1968475 (366)	total: 17.8s	remaining: 2m 1s
384:	learn: 0.1317058	test: 0.1969859	best: 0.1968475 (366)	total: 17.8s	remaining: 2m
385:	learn: 0.1316875	test: 0.1969911	best: 0.1968475 (366)	total: 17.8s	remaining: 2m
386:	learn: 0.1316547	test: 0.1969629	best: 0.1968475 (366)	total: 17.9s	remaining: 2m
387:	learn: 0.1316460	test: 0.1969648	best: 0.1968475 (366)	total: 17.9s	remaining: 2m
388:	learn: 0.1316237	

468:	learn: 0.1291157	test: 0.1963046	best: 0.1962669 (437)	total: 21.3s	remaining: 1m 55s
469:	learn: 0.1290842	test: 0.1963171	best: 0.1962669 (437)	total: 21.4s	remaining: 1m 55s
470:	learn: 0.1290593	test: 0.1963221	best: 0.1962669 (437)	total: 21.4s	remaining: 1m 54s
471:	learn: 0.1290361	test: 0.1962957	best: 0.1962669 (437)	total: 21.5s	remaining: 1m 54s
472:	learn: 0.1290064	test: 0.1962380	best: 0.1962380 (472)	total: 21.5s	remaining: 1m 54s
473:	learn: 0.1290021	test: 0.1962372	best: 0.1962372 (473)	total: 21.5s	remaining: 1m 54s
474:	learn: 0.1288981	test: 0.1962439	best: 0.1962372 (473)	total: 21.6s	remaining: 1m 54s
475:	learn: 0.1288068	test: 0.1962458	best: 0.1962372 (473)	total: 21.7s	remaining: 1m 55s
476:	learn: 0.1287239	test: 0.1962494	best: 0.1962372 (473)	total: 21.8s	remaining: 1m 55s
477:	learn: 0.1286679	test: 0.1962137	best: 0.1962137 (477)	total: 21.9s	remaining: 1m 55s
478:	learn: 0.1285979	test: 0.1962256	best: 0.1962137 (477)	total: 22s	remaining: 1m 56s
4

564:	learn: 0.1258473	test: 0.1959754	best: 0.1959593 (560)	total: 26s	remaining: 1m 51s
565:	learn: 0.1258290	test: 0.1959843	best: 0.1959593 (560)	total: 26s	remaining: 1m 51s
566:	learn: 0.1258186	test: 0.1959833	best: 0.1959593 (560)	total: 26.1s	remaining: 1m 51s
567:	learn: 0.1258110	test: 0.1959787	best: 0.1959593 (560)	total: 26.1s	remaining: 1m 51s
568:	learn: 0.1258110	test: 0.1959786	best: 0.1959593 (560)	total: 26.1s	remaining: 1m 51s
569:	learn: 0.1257796	test: 0.1959549	best: 0.1959549 (569)	total: 26.1s	remaining: 1m 51s
570:	learn: 0.1257508	test: 0.1959908	best: 0.1959549 (569)	total: 26.2s	remaining: 1m 51s
571:	learn: 0.1257081	test: 0.1959996	best: 0.1959549 (569)	total: 26.2s	remaining: 1m 51s
572:	learn: 0.1256817	test: 0.1960049	best: 0.1959549 (569)	total: 26.3s	remaining: 1m 51s
573:	learn: 0.1256804	test: 0.1960040	best: 0.1959549 (569)	total: 26.4s	remaining: 1m 51s
574:	learn: 0.1256347	test: 0.1960013	best: 0.1959549 (569)	total: 26.4s	remaining: 1m 51s
575

659:	learn: 0.1234517	test: 0.1957922	best: 0.1957575 (656)	total: 30.2s	remaining: 1m 47s
660:	learn: 0.1234273	test: 0.1958013	best: 0.1957575 (656)	total: 30.3s	remaining: 1m 47s
661:	learn: 0.1233480	test: 0.1957534	best: 0.1957534 (661)	total: 30.3s	remaining: 1m 47s
662:	learn: 0.1232404	test: 0.1957712	best: 0.1957534 (661)	total: 30.4s	remaining: 1m 47s
663:	learn: 0.1231810	test: 0.1957813	best: 0.1957534 (661)	total: 30.4s	remaining: 1m 46s
664:	learn: 0.1231397	test: 0.1957365	best: 0.1957365 (664)	total: 30.4s	remaining: 1m 46s
665:	learn: 0.1231301	test: 0.1957464	best: 0.1957365 (664)	total: 30.5s	remaining: 1m 46s
666:	learn: 0.1231099	test: 0.1957630	best: 0.1957365 (664)	total: 30.5s	remaining: 1m 46s
667:	learn: 0.1230988	test: 0.1957638	best: 0.1957365 (664)	total: 30.6s	remaining: 1m 46s
668:	learn: 0.1230983	test: 0.1957624	best: 0.1957365 (664)	total: 30.6s	remaining: 1m 46s
669:	learn: 0.1230726	test: 0.1957674	best: 0.1957365 (664)	total: 30.7s	remaining: 1m 46s

750:	learn: 0.1215518	test: 0.1956609	best: 0.1955759 (729)	total: 34.4s	remaining: 1m 43s
751:	learn: 0.1215451	test: 0.1956716	best: 0.1955759 (729)	total: 34.5s	remaining: 1m 43s
752:	learn: 0.1215348	test: 0.1956767	best: 0.1955759 (729)	total: 34.5s	remaining: 1m 42s
753:	learn: 0.1215276	test: 0.1956793	best: 0.1955759 (729)	total: 34.5s	remaining: 1m 42s
754:	learn: 0.1215194	test: 0.1956659	best: 0.1955759 (729)	total: 34.5s	remaining: 1m 42s
755:	learn: 0.1215116	test: 0.1956421	best: 0.1955759 (729)	total: 34.6s	remaining: 1m 42s
756:	learn: 0.1214339	test: 0.1956434	best: 0.1955759 (729)	total: 34.6s	remaining: 1m 42s
757:	learn: 0.1214302	test: 0.1956509	best: 0.1955759 (729)	total: 34.7s	remaining: 1m 42s
758:	learn: 0.1214156	test: 0.1956748	best: 0.1955759 (729)	total: 34.7s	remaining: 1m 42s
759:	learn: 0.1213905	test: 0.1956423	best: 0.1955759 (729)	total: 34.7s	remaining: 1m 42s
760:	learn: 0.1213521	test: 0.1956567	best: 0.1955759 (729)	total: 34.8s	remaining: 1m 42s

843:	learn: 0.1199506	test: 0.1958823	best: 0.1955759 (729)	total: 38.2s	remaining: 1m 37s
844:	learn: 0.1199489	test: 0.1958852	best: 0.1955759 (729)	total: 38.2s	remaining: 1m 37s
845:	learn: 0.1199350	test: 0.1958740	best: 0.1955759 (729)	total: 38.3s	remaining: 1m 37s
846:	learn: 0.1199211	test: 0.1958536	best: 0.1955759 (729)	total: 38.3s	remaining: 1m 37s
847:	learn: 0.1199129	test: 0.1958522	best: 0.1955759 (729)	total: 38.3s	remaining: 1m 37s
848:	learn: 0.1198981	test: 0.1958584	best: 0.1955759 (729)	total: 38.4s	remaining: 1m 37s
849:	learn: 0.1198794	test: 0.1958503	best: 0.1955759 (729)	total: 38.4s	remaining: 1m 37s
850:	learn: 0.1198563	test: 0.1958511	best: 0.1955759 (729)	total: 38.4s	remaining: 1m 37s
851:	learn: 0.1197960	test: 0.1958592	best: 0.1955759 (729)	total: 38.5s	remaining: 1m 36s
852:	learn: 0.1197959	test: 0.1958585	best: 0.1955759 (729)	total: 38.5s	remaining: 1m 36s
853:	learn: 0.1197831	test: 0.1958828	best: 0.1955759 (729)	total: 38.6s	remaining: 1m 36s

936:	learn: 0.1183719	test: 0.1957725	best: 0.1955759 (729)	total: 41.9s	remaining: 1m 32s
937:	learn: 0.1183697	test: 0.1957737	best: 0.1955759 (729)	total: 42s	remaining: 1m 32s
938:	learn: 0.1183315	test: 0.1957844	best: 0.1955759 (729)	total: 42.1s	remaining: 1m 32s
939:	learn: 0.1183218	test: 0.1957572	best: 0.1955759 (729)	total: 42.1s	remaining: 1m 32s
940:	learn: 0.1183125	test: 0.1957669	best: 0.1955759 (729)	total: 42.1s	remaining: 1m 32s
941:	learn: 0.1182919	test: 0.1957732	best: 0.1955759 (729)	total: 42.2s	remaining: 1m 32s
942:	learn: 0.1182688	test: 0.1957845	best: 0.1955759 (729)	total: 42.2s	remaining: 1m 32s
943:	learn: 0.1182548	test: 0.1957878	best: 0.1955759 (729)	total: 42.3s	remaining: 1m 32s
944:	learn: 0.1182238	test: 0.1957917	best: 0.1955759 (729)	total: 42.3s	remaining: 1m 31s
945:	learn: 0.1181926	test: 0.1958073	best: 0.1955759 (729)	total: 42.3s	remaining: 1m 31s
946:	learn: 0.1181814	test: 0.1957945	best: 0.1955759 (729)	total: 42.4s	remaining: 1m 31s
9

1030:	learn: 0.1168424	test: 0.1957841	best: 0.1955759 (729)	total: 46.3s	remaining: 1m 28s
1031:	learn: 0.1168413	test: 0.1957844	best: 0.1955759 (729)	total: 46.4s	remaining: 1m 28s
1032:	learn: 0.1168380	test: 0.1957751	best: 0.1955759 (729)	total: 46.5s	remaining: 1m 28s
1033:	learn: 0.1168350	test: 0.1957686	best: 0.1955759 (729)	total: 46.5s	remaining: 1m 28s
1034:	learn: 0.1168346	test: 0.1957680	best: 0.1955759 (729)	total: 46.5s	remaining: 1m 28s
1035:	learn: 0.1168323	test: 0.1957706	best: 0.1955759 (729)	total: 46.6s	remaining: 1m 28s
1036:	learn: 0.1168316	test: 0.1957707	best: 0.1955759 (729)	total: 46.6s	remaining: 1m 28s
1037:	learn: 0.1168295	test: 0.1957678	best: 0.1955759 (729)	total: 46.7s	remaining: 1m 28s
1038:	learn: 0.1168288	test: 0.1957674	best: 0.1955759 (729)	total: 46.7s	remaining: 1m 28s
1039:	learn: 0.1168264	test: 0.1957728	best: 0.1955759 (729)	total: 46.7s	remaining: 1m 28s
1040:	learn: 0.1168039	test: 0.1957746	best: 0.1955759 (729)	total: 46.8s	remain

1121:	learn: 0.1155765	test: 0.1959081	best: 0.1955759 (729)	total: 50.6s	remaining: 1m 24s
1122:	learn: 0.1155388	test: 0.1959339	best: 0.1955759 (729)	total: 50.6s	remaining: 1m 24s
1123:	learn: 0.1155245	test: 0.1959280	best: 0.1955759 (729)	total: 50.6s	remaining: 1m 24s
1124:	learn: 0.1154925	test: 0.1959366	best: 0.1955759 (729)	total: 50.7s	remaining: 1m 24s
1125:	learn: 0.1154815	test: 0.1959308	best: 0.1955759 (729)	total: 50.7s	remaining: 1m 24s
1126:	learn: 0.1154475	test: 0.1959481	best: 0.1955759 (729)	total: 50.7s	remaining: 1m 24s
1127:	learn: 0.1154452	test: 0.1959437	best: 0.1955759 (729)	total: 50.8s	remaining: 1m 24s
1128:	learn: 0.1154286	test: 0.1959489	best: 0.1955759 (729)	total: 50.8s	remaining: 1m 24s
1129:	learn: 0.1154005	test: 0.1959584	best: 0.1955759 (729)	total: 50.9s	remaining: 1m 24s
1130:	learn: 0.1153998	test: 0.1959580	best: 0.1955759 (729)	total: 50.9s	remaining: 1m 24s
1131:	learn: 0.1153939	test: 0.1959520	best: 0.1955759 (729)	total: 50.9s	remain

1212:	learn: 0.1143935	test: 0.1960007	best: 0.1955759 (729)	total: 54.2s	remaining: 1m 19s
1213:	learn: 0.1143914	test: 0.1960125	best: 0.1955759 (729)	total: 54.3s	remaining: 1m 19s
1214:	learn: 0.1143880	test: 0.1960128	best: 0.1955759 (729)	total: 54.3s	remaining: 1m 19s
1215:	learn: 0.1143860	test: 0.1960140	best: 0.1955759 (729)	total: 54.4s	remaining: 1m 19s
1216:	learn: 0.1143734	test: 0.1960139	best: 0.1955759 (729)	total: 54.4s	remaining: 1m 19s
1217:	learn: 0.1143665	test: 0.1960132	best: 0.1955759 (729)	total: 54.4s	remaining: 1m 19s
1218:	learn: 0.1143440	test: 0.1960252	best: 0.1955759 (729)	total: 54.5s	remaining: 1m 19s
1219:	learn: 0.1143261	test: 0.1960173	best: 0.1955759 (729)	total: 54.5s	remaining: 1m 19s
1220:	learn: 0.1143236	test: 0.1960178	best: 0.1955759 (729)	total: 54.5s	remaining: 1m 19s
1221:	learn: 0.1142928	test: 0.1960474	best: 0.1955759 (729)	total: 54.6s	remaining: 1m 19s
1222:	learn: 0.1142905	test: 0.1960523	best: 0.1955759 (729)	total: 54.6s	remain

1307:	learn: 0.1132184	test: 0.1963206	best: 0.1955759 (729)	total: 59.4s	remaining: 1m 16s
1308:	learn: 0.1132184	test: 0.1963204	best: 0.1955759 (729)	total: 59.5s	remaining: 1m 16s
1309:	learn: 0.1132135	test: 0.1963260	best: 0.1955759 (729)	total: 59.5s	remaining: 1m 16s
1310:	learn: 0.1132040	test: 0.1963205	best: 0.1955759 (729)	total: 59.6s	remaining: 1m 16s
1311:	learn: 0.1131773	test: 0.1963139	best: 0.1955759 (729)	total: 59.6s	remaining: 1m 16s
1312:	learn: 0.1131631	test: 0.1963293	best: 0.1955759 (729)	total: 59.6s	remaining: 1m 16s
1313:	learn: 0.1131623	test: 0.1963261	best: 0.1955759 (729)	total: 59.7s	remaining: 1m 16s
1314:	learn: 0.1131574	test: 0.1963290	best: 0.1955759 (729)	total: 59.7s	remaining: 1m 16s
1315:	learn: 0.1131545	test: 0.1963300	best: 0.1955759 (729)	total: 59.8s	remaining: 1m 16s
1316:	learn: 0.1131465	test: 0.1963396	best: 0.1955759 (729)	total: 59.8s	remaining: 1m 16s
1317:	learn: 0.1131343	test: 0.1963405	best: 0.1955759 (729)	total: 59.8s	remain

1401:	learn: 0.1121589	test: 0.1964872	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 12s
1402:	learn: 0.1121544	test: 0.1964874	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 12s
1403:	learn: 0.1121453	test: 0.1964890	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1404:	learn: 0.1121330	test: 0.1964905	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1405:	learn: 0.1121328	test: 0.1964902	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1406:	learn: 0.1121297	test: 0.1964881	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1407:	learn: 0.1121074	test: 0.1964391	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1408:	learn: 0.1121057	test: 0.1964368	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1409:	learn: 0.1120930	test: 0.1964395	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1410:	learn: 0.1120835	test: 0.1964545	best: 0.1955759 (729)	total: 1m 3s	remaining: 1m 11s
1411:	learn: 0.1120793	test: 0.1964626	best: 0.1955759 (729)	total: 1m 3s	remain

1495:	learn: 0.1111343	test: 0.1963019	best: 0.1955759 (729)	total: 1m 8s	remaining: 1m 9s
1496:	learn: 0.1111072	test: 0.1963084	best: 0.1955759 (729)	total: 1m 8s	remaining: 1m 9s
1497:	learn: 0.1110722	test: 0.1962815	best: 0.1955759 (729)	total: 1m 8s	remaining: 1m 9s
1498:	learn: 0.1110590	test: 0.1962745	best: 0.1955759 (729)	total: 1m 8s	remaining: 1m 9s
1499:	learn: 0.1110482	test: 0.1962812	best: 0.1955759 (729)	total: 1m 8s	remaining: 1m 8s
1500:	learn: 0.1110439	test: 0.1962780	best: 0.1955759 (729)	total: 1m 9s	remaining: 1m 8s
1501:	learn: 0.1110383	test: 0.1962789	best: 0.1955759 (729)	total: 1m 9s	remaining: 1m 8s
1502:	learn: 0.1110246	test: 0.1962924	best: 0.1955759 (729)	total: 1m 9s	remaining: 1m 8s
1503:	learn: 0.1110206	test: 0.1962893	best: 0.1955759 (729)	total: 1m 9s	remaining: 1m 8s
1504:	learn: 0.1110200	test: 0.1962931	best: 0.1955759 (729)	total: 1m 9s	remaining: 1m 8s
1505:	learn: 0.1110165	test: 0.1962947	best: 0.1955759 (729)	total: 1m 9s	remaining: 1m 8s

1589:	learn: 0.1101603	test: 0.1961644	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1590:	learn: 0.1101573	test: 0.1961625	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1591:	learn: 0.1101425	test: 0.1961308	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1592:	learn: 0.1101380	test: 0.1961339	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1593:	learn: 0.1101376	test: 0.1961331	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1594:	learn: 0.1101342	test: 0.1961282	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1595:	learn: 0.1101114	test: 0.1960796	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1596:	learn: 0.1101049	test: 0.1960789	best: 0.1955759 (729)	total: 1m 12s	remaining: 1m 4s
1597:	learn: 0.1101004	test: 0.1960755	best: 0.1955759 (729)	total: 1m 13s	remaining: 1m 4s
1598:	learn: 0.1100895	test: 0.1960947	best: 0.1955759 (729)	total: 1m 13s	remaining: 1m 4s
1599:	learn: 0.1100862	test: 0.1961052	best: 0.1955759 (729)	total: 1m 13s	remai

1684:	learn: 0.1094405	test: 0.1962958	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.7s
1685:	learn: 0.1094395	test: 0.1962953	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.7s
1686:	learn: 0.1094305	test: 0.1962941	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.6s
1687:	learn: 0.1094302	test: 0.1962928	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.6s
1688:	learn: 0.1094255	test: 0.1962900	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.5s
1689:	learn: 0.1094158	test: 0.1962972	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.5s
1690:	learn: 0.1094144	test: 0.1962970	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.4s
1691:	learn: 0.1094008	test: 0.1962936	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.4s
1692:	learn: 0.1093958	test: 0.1962861	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.3s
1693:	learn: 0.1093946	test: 0.1962872	best: 0.1955759 (729)	total: 1m 16s	remaining: 59.2s
1694:	learn: 0.1093909	test: 0.1962915	best: 0.1955759 (729)	total: 1m 16s	remai

1777:	learn: 0.1085507	test: 0.1962853	best: 0.1955759 (729)	total: 1m 20s	remaining: 55s
1778:	learn: 0.1085421	test: 0.1962920	best: 0.1955759 (729)	total: 1m 20s	remaining: 55s
1779:	learn: 0.1085263	test: 0.1962995	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.9s
1780:	learn: 0.1085225	test: 0.1962976	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.8s
1781:	learn: 0.1085060	test: 0.1963039	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.8s
1782:	learn: 0.1084951	test: 0.1963121	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.7s
1783:	learn: 0.1084918	test: 0.1963107	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.7s
1784:	learn: 0.1084692	test: 0.1963126	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.6s
1785:	learn: 0.1084682	test: 0.1963105	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.6s
1786:	learn: 0.1084649	test: 0.1963024	best: 0.1955759 (729)	total: 1m 20s	remaining: 54.5s
1787:	learn: 0.1084589	test: 0.1963053	best: 0.1955759 (729)	total: 1m 20s	remaining

1870:	learn: 0.1077824	test: 0.1966175	best: 0.1955759 (729)	total: 1m 23s	remaining: 50.3s
1871:	learn: 0.1077704	test: 0.1966109	best: 0.1955759 (729)	total: 1m 23s	remaining: 50.2s
1872:	learn: 0.1077475	test: 0.1966270	best: 0.1955759 (729)	total: 1m 23s	remaining: 50.2s
1873:	learn: 0.1077220	test: 0.1966468	best: 0.1955759 (729)	total: 1m 23s	remaining: 50.1s
1874:	learn: 0.1077178	test: 0.1966437	best: 0.1955759 (729)	total: 1m 23s	remaining: 50.1s
1875:	learn: 0.1077050	test: 0.1966374	best: 0.1955759 (729)	total: 1m 23s	remaining: 50s
1876:	learn: 0.1076920	test: 0.1966381	best: 0.1955759 (729)	total: 1m 23s	remaining: 50s
1877:	learn: 0.1076701	test: 0.1966111	best: 0.1955759 (729)	total: 1m 23s	remaining: 49.9s
1878:	learn: 0.1076620	test: 0.1966022	best: 0.1955759 (729)	total: 1m 23s	remaining: 49.9s
1879:	learn: 0.1076567	test: 0.1965987	best: 0.1955759 (729)	total: 1m 23s	remaining: 49.8s
1880:	learn: 0.1076466	test: 0.1966207	best: 0.1955759 (729)	total: 1m 23s	remaining

1960:	learn: 0.1068976	test: 0.1968942	best: 0.1955759 (729)	total: 1m 29s	remaining: 47.6s
1961:	learn: 0.1068949	test: 0.1969003	best: 0.1955759 (729)	total: 1m 29s	remaining: 47.5s
1962:	learn: 0.1068883	test: 0.1968902	best: 0.1955759 (729)	total: 1m 29s	remaining: 47.5s
1963:	learn: 0.1068802	test: 0.1968896	best: 0.1955759 (729)	total: 1m 29s	remaining: 47.4s
1964:	learn: 0.1068764	test: 0.1968892	best: 0.1955759 (729)	total: 1m 29s	remaining: 47.4s
1965:	learn: 0.1068746	test: 0.1968891	best: 0.1955759 (729)	total: 1m 29s	remaining: 47.3s
1966:	learn: 0.1068713	test: 0.1968979	best: 0.1955759 (729)	total: 1m 30s	remaining: 47.3s
1967:	learn: 0.1068681	test: 0.1968979	best: 0.1955759 (729)	total: 1m 30s	remaining: 47.2s
1968:	learn: 0.1068656	test: 0.1968966	best: 0.1955759 (729)	total: 1m 30s	remaining: 47.2s
1969:	learn: 0.1068654	test: 0.1968960	best: 0.1955759 (729)	total: 1m 30s	remaining: 47.1s
1970:	learn: 0.1068543	test: 0.1968925	best: 0.1955759 (729)	total: 1m 30s	remai

2054:	learn: 0.1060546	test: 0.1969377	best: 0.1955759 (729)	total: 1m 33s	remaining: 43s
2055:	learn: 0.1060543	test: 0.1969369	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.9s
2056:	learn: 0.1060505	test: 0.1969389	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.9s
2057:	learn: 0.1060468	test: 0.1969389	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.8s
2058:	learn: 0.1060406	test: 0.1969474	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.8s
2059:	learn: 0.1060299	test: 0.1969442	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.7s
2060:	learn: 0.1060102	test: 0.1969599	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.7s
2061:	learn: 0.1060061	test: 0.1969617	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.6s
2062:	learn: 0.1059983	test: 0.1969535	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.6s
2063:	learn: 0.1059980	test: 0.1969530	best: 0.1955759 (729)	total: 1m 33s	remaining: 42.5s
2064:	learn: 0.1059953	test: 0.1969492	best: 0.1955759 (729)	total: 1m 33s	remaini

2145:	learn: 0.1053099	test: 0.1969915	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.7s
2146:	learn: 0.1052854	test: 0.1969847	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.6s
2147:	learn: 0.1052822	test: 0.1969846	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.6s
2148:	learn: 0.1052546	test: 0.1969728	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.5s
2149:	learn: 0.1052333	test: 0.1969797	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.5s
2150:	learn: 0.1052205	test: 0.1969644	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.5s
2151:	learn: 0.1052186	test: 0.1969642	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.4s
2152:	learn: 0.1052168	test: 0.1969648	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.4s
2153:	learn: 0.1052116	test: 0.1969699	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.3s
2154:	learn: 0.1051980	test: 0.1969650	best: 0.1955759 (729)	total: 1m 37s	remaining: 38.3s
2155:	learn: 0.1051971	test: 0.1969685	best: 0.1955759 (729)	total: 1m 37s	remai

2238:	learn: 0.1044556	test: 0.1970965	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.9s
2239:	learn: 0.1044479	test: 0.1970807	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.9s
2240:	learn: 0.1044303	test: 0.1971058	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.8s
2241:	learn: 0.1044279	test: 0.1971037	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.8s
2242:	learn: 0.1044196	test: 0.1971119	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.7s
2243:	learn: 0.1044175	test: 0.1971131	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.7s
2244:	learn: 0.1044147	test: 0.1971140	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.6s
2245:	learn: 0.1044072	test: 0.1971222	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.6s
2246:	learn: 0.1044053	test: 0.1971239	best: 0.1955759 (729)	total: 1m 42s	remaining: 34.5s
2247:	learn: 0.1044030	test: 0.1971203	best: 0.1955759 (729)	total: 1m 43s	remaining: 34.5s
2248:	learn: 0.1043983	test: 0.1971343	best: 0.1955759 (729)	total: 1m 43s	remai

2330:	learn: 0.1036816	test: 0.1974400	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.9s
2331:	learn: 0.1036683	test: 0.1974289	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.8s
2332:	learn: 0.1036602	test: 0.1974412	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.8s
2333:	learn: 0.1036583	test: 0.1974428	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.7s
2334:	learn: 0.1036530	test: 0.1974542	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.7s
2335:	learn: 0.1036306	test: 0.1974432	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.6s
2336:	learn: 0.1036285	test: 0.1974466	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.6s
2337:	learn: 0.1036138	test: 0.1974267	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.6s
2338:	learn: 0.1036102	test: 0.1974313	best: 0.1955759 (729)	total: 1m 47s	remaining: 30.5s
2339:	learn: 0.1036037	test: 0.1974235	best: 0.1955759 (729)	total: 1m 48s	remaining: 30.5s
2340:	learn: 0.1035811	test: 0.1974256	best: 0.1955759 (729)	total: 1m 48s	remai

2424:	learn: 0.1029535	test: 0.1974644	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.7s
2425:	learn: 0.1029427	test: 0.1974804	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.6s
2426:	learn: 0.1029405	test: 0.1974844	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.6s
2427:	learn: 0.1029386	test: 0.1974872	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.5s
2428:	learn: 0.1029382	test: 0.1974880	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.5s
2429:	learn: 0.1029341	test: 0.1974898	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.4s
2430:	learn: 0.1029257	test: 0.1974890	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.4s
2431:	learn: 0.1029226	test: 0.1974893	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.3s
2432:	learn: 0.1029211	test: 0.1974885	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.3s
2433:	learn: 0.1029022	test: 0.1974581	best: 0.1955759 (729)	total: 1m 52s	remaining: 26.2s
2434:	learn: 0.1028972	test: 0.1974597	best: 0.1955759 (729)	total: 1m 52s	remai

2516:	learn: 0.1022488	test: 0.1974330	best: 0.1955759 (729)	total: 1m 56s	remaining: 22.3s
2517:	learn: 0.1022464	test: 0.1974255	best: 0.1955759 (729)	total: 1m 56s	remaining: 22.3s
2518:	learn: 0.1022421	test: 0.1974192	best: 0.1955759 (729)	total: 1m 56s	remaining: 22.2s
2519:	learn: 0.1022390	test: 0.1974304	best: 0.1955759 (729)	total: 1m 56s	remaining: 22.2s
2520:	learn: 0.1022387	test: 0.1974310	best: 0.1955759 (729)	total: 1m 56s	remaining: 22.1s
2521:	learn: 0.1022266	test: 0.1974319	best: 0.1955759 (729)	total: 1m 56s	remaining: 22.1s
2522:	learn: 0.1022257	test: 0.1974287	best: 0.1955759 (729)	total: 1m 56s	remaining: 22s
2523:	learn: 0.1022148	test: 0.1974489	best: 0.1955759 (729)	total: 1m 56s	remaining: 22s
2524:	learn: 0.1022134	test: 0.1974496	best: 0.1955759 (729)	total: 1m 56s	remaining: 21.9s
2525:	learn: 0.1022022	test: 0.1974602	best: 0.1955759 (729)	total: 1m 56s	remaining: 21.9s
2526:	learn: 0.1022003	test: 0.1974613	best: 0.1955759 (729)	total: 1m 56s	remaining

2607:	learn: 0.1016546	test: 0.1975769	best: 0.1955759 (729)	total: 1m 59s	remaining: 18s
2608:	learn: 0.1016509	test: 0.1975919	best: 0.1955759 (729)	total: 2m	remaining: 18s
2609:	learn: 0.1016296	test: 0.1976134	best: 0.1955759 (729)	total: 2m	remaining: 17.9s
2610:	learn: 0.1016135	test: 0.1976189	best: 0.1955759 (729)	total: 2m	remaining: 17.9s
2611:	learn: 0.1016074	test: 0.1976195	best: 0.1955759 (729)	total: 2m	remaining: 17.8s
2612:	learn: 0.1015957	test: 0.1976877	best: 0.1955759 (729)	total: 2m	remaining: 17.8s
2613:	learn: 0.1015755	test: 0.1977042	best: 0.1955759 (729)	total: 2m	remaining: 17.8s
2614:	learn: 0.1015709	test: 0.1976848	best: 0.1955759 (729)	total: 2m 3s	remaining: 18.2s
2615:	learn: 0.1015696	test: 0.1976848	best: 0.1955759 (729)	total: 2m 3s	remaining: 18.1s
2616:	learn: 0.1015520	test: 0.1976639	best: 0.1955759 (729)	total: 2m 3s	remaining: 18.1s
2617:	learn: 0.1015506	test: 0.1976661	best: 0.1955759 (729)	total: 2m 3s	remaining: 18s
2618:	learn: 0.1015464

2700:	learn: 0.1008784	test: 0.1978626	best: 0.1955759 (729)	total: 2m 6s	remaining: 14s
2701:	learn: 0.1008680	test: 0.1978527	best: 0.1955759 (729)	total: 2m 6s	remaining: 14s
2702:	learn: 0.1008646	test: 0.1978609	best: 0.1955759 (729)	total: 2m 6s	remaining: 13.9s
2703:	learn: 0.1008641	test: 0.1978626	best: 0.1955759 (729)	total: 2m 6s	remaining: 13.9s
2704:	learn: 0.1008497	test: 0.1978985	best: 0.1955759 (729)	total: 2m 6s	remaining: 13.8s
2705:	learn: 0.1008398	test: 0.1978966	best: 0.1955759 (729)	total: 2m 6s	remaining: 13.8s
2706:	learn: 0.1008328	test: 0.1978868	best: 0.1955759 (729)	total: 2m 6s	remaining: 13.7s
2707:	learn: 0.1008322	test: 0.1978866	best: 0.1955759 (729)	total: 2m 6s	remaining: 13.7s
2708:	learn: 0.1008297	test: 0.1978896	best: 0.1955759 (729)	total: 2m 7s	remaining: 13.6s
2709:	learn: 0.1008287	test: 0.1978897	best: 0.1955759 (729)	total: 2m 7s	remaining: 13.6s
2710:	learn: 0.1008267	test: 0.1978944	best: 0.1955759 (729)	total: 2m 7s	remaining: 13.5s
271

2794:	learn: 0.1003051	test: 0.1979624	best: 0.1955759 (729)	total: 2m 10s	remaining: 9.61s
2795:	learn: 0.1002841	test: 0.1979122	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.56s
2796:	learn: 0.1002806	test: 0.1979142	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.51s
2797:	learn: 0.1002769	test: 0.1979079	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.46s
2798:	learn: 0.1002757	test: 0.1979105	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.42s
2799:	learn: 0.1002735	test: 0.1979085	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.37s
2800:	learn: 0.1002717	test: 0.1979108	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.32s
2801:	learn: 0.1002715	test: 0.1979105	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.27s
2802:	learn: 0.1002621	test: 0.1979151	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.22s
2803:	learn: 0.1002593	test: 0.1979110	best: 0.1955759 (729)	total: 2m 11s	remaining: 9.18s
2804:	learn: 0.1002467	test: 0.1979153	best: 0.1955759 (729)	total: 2m 11s	remai

2884:	learn: 0.0996429	test: 0.1981089	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.4s
2885:	learn: 0.0996271	test: 0.1981217	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.36s
2886:	learn: 0.0996254	test: 0.1981181	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.31s
2887:	learn: 0.0996233	test: 0.1981175	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.26s
2888:	learn: 0.0996152	test: 0.1981191	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.22s
2889:	learn: 0.0996100	test: 0.1981052	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.17s
2890:	learn: 0.0995958	test: 0.1981023	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.12s
2891:	learn: 0.0995933	test: 0.1981050	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.08s
2892:	learn: 0.0995881	test: 0.1981159	best: 0.1955759 (729)	total: 2m 15s	remaining: 5.03s
2893:	learn: 0.0995824	test: 0.1981247	best: 0.1955759 (729)	total: 2m 16s	remaining: 4.98s
2894:	learn: 0.0995820	test: 0.1981234	best: 0.1955759 (729)	total: 2m 16s	remain

2975:	learn: 0.0989262	test: 0.1982023	best: 0.1955759 (729)	total: 2m 19s	remaining: 1.12s
2976:	learn: 0.0989231	test: 0.1981997	best: 0.1955759 (729)	total: 2m 19s	remaining: 1.08s
2977:	learn: 0.0989173	test: 0.1981979	best: 0.1955759 (729)	total: 2m 19s	remaining: 1.03s
2978:	learn: 0.0989108	test: 0.1981953	best: 0.1955759 (729)	total: 2m 19s	remaining: 984ms
2979:	learn: 0.0989029	test: 0.1981906	best: 0.1955759 (729)	total: 2m 19s	remaining: 937ms
2980:	learn: 0.0988862	test: 0.1981936	best: 0.1955759 (729)	total: 2m 19s	remaining: 890ms
2981:	learn: 0.0988852	test: 0.1981954	best: 0.1955759 (729)	total: 2m 19s	remaining: 843ms
2982:	learn: 0.0988846	test: 0.1981958	best: 0.1955759 (729)	total: 2m 19s	remaining: 796ms
2983:	learn: 0.0988843	test: 0.1981947	best: 0.1955759 (729)	total: 2m 19s	remaining: 749ms
2984:	learn: 0.0988746	test: 0.1982089	best: 0.1955759 (729)	total: 2m 19s	remaining: 702ms
2985:	learn: 0.0988692	test: 0.1982173	best: 0.1955759 (729)	total: 2m 19s	remai

<catboost.core.CatBoostClassifier at 0x7f9382114df0>

In [187]:
f1_score(catboost_classifier.predict(X_test), y_test) # 0.8655 0.8591

0.8555972952667167

In [None]:
predictions = catboost_classifier.predict_proba(validation.drop(columns=['label']))

In [122]:
answers = list(map(lambda x: x[1], predictions))
answers = pd.concat([pd.Series(answers), train[train['is_answered'] != True]['old_idx'].reset_index()], axis=1)
answers = answers.rename(columns={0: 'proba'})
result = {}
for proba, index, context_index in answers.values:
    if context_index in result:
        if result[context_index][0] < proba:
            result[context_index][0] = proba
            result[context_index][1] = int(index)
    else:
        result[context_index] = [proba, int(index)]
result = [i[1] for i in result.values()]
result = list(train.iloc[result]['answer_ru'])

In [294]:
answers = list(map(lambda x: x[1], predictions))

In [295]:
answers = pd.concat([pd.Series(answers), train[train['is_answered'] != True]['old_idx'].reset_index()], axis=1)

In [296]:
answers = answers.rename(columns={0: 'proba'})

In [304]:
result = {}
for proba, index, context_index in answers.values:
    if context_index in result:
        if result[context_index][0] < proba:
            result[context_index][0] = proba
            result[context_index][1] = int(index)
    else:
        result[context_index] = [proba, int(index)]

In [306]:
result = [i[1] for i in result.values()]

In [311]:
result = list(train.iloc[result]['answer_ru'])

In [312]:
result

['Косово',
 'Sunday Mirror',
 'Греции',
 'Амир',
 'Германия',
 'Мултана',
 'Украине',
 'Россию',
 'НАТО',
 'Финляндии',
 'Эквадора',
 'Вашингтона',
 'Рихарда Зорге',
 'Сирии',
 'Штайнбрюк',
 'Пакистане',
 'AP',
 'Южного Кавказа',
 'Трампом',
 'Газпром',
 'ГДР',
 'Украины',
 'ОБСЕ',
 'Греции',
 'Гаити',
 'Саудовской Аравии',
 'Берлина',
 'Кирилл',
 'Корбан',
 'Haribo',
 'Киева',
 'Другой России',
 'Россия',
 'СМИ',
 'НДПГ',
 'США',
 'Италия',
 'Франциск',
 'Украины',
 'Сирии',
 'DW',
 'США',
 'Австрии',
 'Лубянке',
 'Украина',
 'Украину',
 'ВТО',
 'Deutsche Post',
 'Чечни',
 'Daimler',
 'Греции',
 'Украине',
 'Германии',
 'Восточной Европы',
 'Льва Марголина',
 'Бельгии',
 'Европарламенте',
 'В защиту прав избирателей',
 'Ганновере',
 'США',
 'Ирина Прохорова',
 'Тибета',
 'Украины',
 'Украину',
 'России',
 'Германии',
 'США',
 'Греция',
 'Триполи',
 'Facebook',
 'Госдумы',
 'Германии',
 'Москвой',
 'Чечни',
 'Россия',
 'Германии',
 'США',
 'Франция',
 'Владимир Дмитриев',
 'Саудовской 

In [233]:
validation['answer_ru'].values[0]

'Косово'

In [239]:
predictions

array([[0.04460124, 0.95539876],
       [0.04460124, 0.95539876],
       [0.29217422, 0.70782578],
       ...,
       [0.10518228, 0.89481772],
       [0.97246189, 0.02753811],
       [0.98286344, 0.01713656]])

In [272]:
validation['preds'] = list(map(lambda x: x[1], predictions))

In [258]:
answers = list(map(lambda x: x[1], predictions))

In [259]:
answers = pd.concat([pd.Series(answers), train[train['is_answered'] != True]['old_idx'].reset_index()], axis=1)

In [274]:
validation[['answer_ru', 'preds']].head(30)

Unnamed: 0,answer_ru,preds
108749,Косово,0.955399
108750,Косово,0.955399
108751,Косовом поле,0.707826
108752,Косово,0.955399
108753,Косово,0.955425
108754,ЕС,0.007646
108755,Косово,0.955399
108756,Косово,0.955425
108757,Евросоюза,0.003666
108758,Косово,0.955399


In [276]:
validation = train[train['is_answered'] != True]

In [267]:
answers.groupby(['old_idx']).max([0])

Unnamed: 0_level_0,0,index
old_idx,Unnamed: 1_level_1,Unnamed: 2_level_1
7577,0.955425,108765
7578,0.758530,108776
7579,0.938087,108785
7580,0.859016,108797
7581,0.869472,108803
...,...,...
14829,0.982722,205693
14830,0.751562,205707
14831,0.812709,205716
14832,0.947594,205734


In [245]:
list(answers)

['Косово',
 'Британию',
 'Греции',
 'Амир',
 'Россия',
 'Страсбурге',
 'Сергею Андрушко',
 'Россию',
 'России',
 'Александр Кудашефф',
 'Беккенбауэр',
 'Boeing',
 'Рихарда Зорге',
 'Путину',
 'ФРГ',
 'Афганистане',
 'Исламское государство',
 'Киевом',
 'Катара',
 'Центральной',
 'Венгрия',
 'Украины',
 'Украины',
 'Италии',
 'Гаити',
 'Тегеран',
 'Берлина',
 'Синода РПЦ',
 'Корбан',
 'Haribo',
 'Киева',
 'Таисию Осипову',
 'Феликс Кравачек',
 'Косово',
 'Нижней Саксонии',
 'Airbus',
 'Италия',
 'Константинопольской православной церкви',
 'Украины',
 'Сирии',
 'Германии',
 'Южной Корее',
 'Сергей Руденко',
 'Константина Котова',
 'Украина',
 'СНБО',
 'Boeing',
 'Гамбурге',
 'Instagram',
 'Mercedes Grand Prix Petronas',
 'Греции',
 'Брюсселем',
 'SIPRI',
 'Хауна',
 'Нацбанка',
 'ФРГ',
 'Астерикс',
 'РФ',
 'Берлине',
 'США',
 'Ирина Прохорова',
 'Европарламенте',
 'Крыму',
 'PGNiG',
 'Германию',
 'РОИ',
 'Россией',
 'Россия',
 'РФ',
 'Германии',
 'Михаил Захаров',
 'Федерального ведомства

In [212]:
answers = list(map(lambda x: x[1], predictions))
answers = pd.concat([pd.Series(answers), train[train['is_answered'] != True]['old_idx'].reset_index()], axis=1)
train.iloc[answers.groupby(['old_idx']).max([0])['index']]['answer_ru'].values

array(['Косово', 'Британию', 'Греции', ..., 'США', 'Советского Союза',
       'РИА Новости'], dtype=object)

In [None]:
dict(sorted(list(zip(catboost_classifier.get_feature_importance(), X_train.columns))))

In [154]:
validation.columns[196]

'stems_same_stems_ratio_answer_en'

In [188]:
save_predictions(catboost_classifier.predict_proba(validation.drop(columns=['label'])), filename='pleeeseeees_long')

Successfully saved! Filename:  pleeeseeees_long.jsonl


In [138]:
def save_predictions(predictions, filename="solution"):
    if '.jsonl' not in filename:
        filename += ".jsonl"
    
    answers = list(map(lambda x: x[1], predictions))
    answers = pd.concat([pd.Series(answers), train[train['is_answered'] != True]['old_idx'].reset_index()], axis=1)
    answers = answers.rename(columns={0: 'proba'})
    result = {}
    for proba, index, context_index in answers.values:
        if context_index in result:
            if result[context_index][0] < proba:
                result[context_index][0] = proba
                result[context_index][1] = int(index)
        else:
            result[context_index] = [proba, int(index)]
    result = [i[1] for i in result.values()]
    result = list(train.iloc[result]['answer_ru'])

    with open(filename, 'w') as fp:
          for i, o in enumerate(result):
              d = {"idx": i, "text": o}
              fp.write(json.dumps(d, ensure_ascii=False)+"\n")
    print("Successfully saved! Filename: ", filename)

In [207]:
answers

array(['Косово', 'Британию', 'Греции', ..., 'США', 'Советского Союза',
       'РИА Новости'], dtype=object)