In [16]:
import pandas as pd
import nltk
import string
import random
import os
import csv
import numpy as np
from nltk import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pymystem3 import Mystem
from xgboost import XGBRegressor


# Shared, module-level state used by the parsing helpers and by ParaQuestion.

# Read the stop-word list (one entry per line) inside a context manager so the
# file handle is closed deterministically instead of being left to the GC.
# NOTE(review): the file is assumed to be UTF-8 encoded Russian text; the
# encoding is pinned so the result does not depend on the platform default.
with open(os.path.join("resources", "russian"), "r", encoding="utf-8") as stop_words_file:
    stop_words = {line.strip() for line in stop_words_file}

mystem = Mystem()              # lemmatizer used by process_word
tokenizer = ToktokTokenizer()  # word tokenizer used by parse_text
tfidf = TfidfVectorizer()      # fitted later on the paragraph texts
indexes = {}                   # term -> TF-IDF column index, filled after the fit

In [17]:
class ParaQuestion:
    """A paragraph/question pair plus the paragraph sentence closest to the question.

    Attributes set by the constructor:
        para: lemmatized paragraph, a list of sentences, each a list of tokens.
        question: set of lemmatized tokens of the first sentence of the question.
        id: identifier passed in by the caller.
        closest_sent: lemmatized tokens of the sentence chosen by calc_closest().
        closest_sent_unst: the same sentence with the original (unlemmatized) tokens.
        answer: lemmatized tokens of the first sentence of the answer, or None.
        ans_vec: index (into the list returned by to_vectors()) of the span equal
            to the answer; None until to_vectors() finds such a span.
    """

    def __init__(self, para, question, id, answer):
        lemmatized_sents, raw_sents = parse_text(para)
        self.para = lemmatized_sents

        question_sents, _ = parse_text(question)
        self.question = set(question_sents[0])
        self.id = id

        best = self.calc_closest()
        self.closest_sent = self.para[best]
        self.closest_sent_unst = raw_sents[best]

        if answer is None:
            self.answer = None
        else:
            answer_sents, _ = parse_text(answer)
            self.answer = answer_sents[0]
        self.ans_vec = None

    def para_words(self):
        """Return the paragraph tokens joined by single spaces.

        When no answer is attached, only tokens present in the global
        `indexes` mapping are kept; otherwise every token is kept.
        """
        kept = []
        for sentence in self.para:
            for token in sentence:
                if self.answer is not None or token in indexes:
                    kept.append(token)
        return ' '.join(kept)

    def tfidf(self, word):
        """Return the paragraph TF-IDF weight of `word`, or 0 if it is not in `indexes`.

        Reads `self.para_tfidf`, which is set by precalc().
        """
        if word in indexes:
            return self.para_tfidf[0, indexes[word]]
        return 0

    def precalc(self):
        """Build prefix sums of TF-IDF weights over the closest sentence.

        `sums[k]` is the total weight of the first k tokens; `sums_m[k]` is the
        same total restricted to tokens that also occur in the question.
        """
        self.para_tfidf = tfidf.transform([self.para_words()])
        self.sums = [0]
        self.sums_m = [0]
        for token in self.closest_sent:
            weight = self.tfidf(token)
            prev_matched = self.sums_m[-1]
            # Only question tokens contribute to the "matched" running total.
            self.sums_m.append(prev_matched + weight if token in self.question else prev_matched)
            self.sums.append(self.sums[-1] + weight)

    def to_vectors(self):
        """Return one feature vector per contiguous span of the closest sentence.

        Spans are enumerated by increasing right end, then increasing left end.
        Each vector is
        [span_len, sent_len, left_len, right_len, span_tfidf, span_match_tfidf,
         sent_tfidf, left_tfidf, right_tfidf]; the last three values are taken
        from the question-matched prefix sums (`sums_m`).

        Side effect: `ans_vec` is set to the index of the last enumerated span
        whose tokens equal `answer`.
        """
        self.precalc()
        tokens = self.closest_sent
        sent_len = len(tokens)
        vectors = []
        for right in range(sent_len):
            sent_tfidf = self.sums_m[sent_len]
            matched_upto_right = self.sums_m[right + 1]
            right_tfidf = sent_tfidf - matched_upto_right
            for left in range(right + 1):
                span_len = right - left + 1
                vectors.append([
                    span_len,
                    sent_len,
                    left,
                    sent_len - left - span_len,
                    self.sums[right + 1] - self.sums[left],
                    matched_upto_right - self.sums_m[left],
                    sent_tfidf,
                    self.sums_m[left],
                    right_tfidf,
                ])

                if self.answer is not None and self.answer == tokens[left:right + 1]:
                    self.ans_vec = len(vectors) - 1

        return vectors

    def calc_closest(self):
        """Return the index of the sentence sharing the most distinct non-stop-word tokens with the question.

        Ties (including the case of no overlap at all) resolve to the last
        such sentence because the comparison is `>=`; returns 0 for an empty
        paragraph.
        """
        best_score = 0
        best_index = 0
        for position, sentence in enumerate(self.para):
            overlap = {
                token
                for token in sentence
                if token in self.question and token not in stop_words
            }
            if len(overlap) >= best_score:
                best_score = len(overlap)
                best_index = position
        return best_index


def parse_text(text):
    """Split `text` into sentences of tokens, with and without lemmatization.

    Returns a pair `(lemmatized, surface)`. Both are lists of sentences and
    each sentence is a list of tokens; `surface` holds the tokens as produced
    by the tokenizer, `lemmatized` holds the same tokens passed through
    `process_word(token, True)`.

    Tokens are dropped when `token in string.punctuation` holds. Because
    `string.punctuation` is a str, this is a substring test.
    """
    lemmatized = []
    surface = []
    for sentence in nltk.sent_tokenize(text, language="russian"):
        kept = [token for token in tokenizer.tokenize(sentence) if token not in string.punctuation]
        surface.append(kept)
        lemmatized.append([process_word(token, True) for token in kept])
    return lemmatized, surface


def process_word(word, lemma):
    """Normalize a single token.

    The token is lower-cased and one trailing "[" or "." is removed. When
    `lemma` is true the result is passed to `mystem.lemmatize` and the
    stripped pieces it returns are concatenated; otherwise the normalized
    token is returned as is.

    Args:
        word: token text; may be empty.
        lemma: whether to lemmatize the normalized token.

    Returns:
        The normalized (and optionally lemmatized) token as a str.
    """
    word = word.lower()
    # endswith() is safe on an empty string, unlike indexing word[-1].
    if word.endswith(("[", ".")):
        word = word[:-1]
    if not lemma:
        return word
    return "".join(part.strip() for part in mystem.lemmatize(word))


def read_csv(file_name, sep=','):
    """Load a delimited file and build one ParaQuestion per row.

    Columns are addressed by position: column 2 is the paragraph, column 3 the
    question, column 1 the id, and column 4 (when the row has more than four
    columns) the answer; otherwise the answer is None.

    Args:
        file_name: path handed to `pd.read_csv`.
        sep: field separator, "," by default.

    Returns:
        A list of ParaQuestion objects in row order.
    """
    frame = pd.read_csv(file_name, sep=sep)
    records = []
    for row in frame.values:
        answer = row[4] if len(row) > 4 else None
        records.append(ParaQuestion(row[2], row[3], row[1], answer))
    return records

In [18]:
test = read_csv("resources/dataset_281937_1.txt", '\t')
# Keep the parsed training set in `train`: the label-building cell iterates
# over it, and parsing it only once avoids a second lemmatization pass.
train = read_csv("resources/train_qa.csv")

# NOTE(review): for documents without an answer, para_words() keeps only words
# already present in `indexes`, and `indexes` is filled only after this fit —
# confirm that such documents are meant to contribute no vocabulary here.
tfidf.fit([doc.para_words() for doc in train] + [doc.para_words() for doc in test])

# vocabulary_ maps each term to its column index in the TF-IDF matrix, which is
# the mapping previously rebuilt by enumerating get_feature_names() (a method
# that newer scikit-learn releases no longer provide).
indexes.update(tfidf.vocabulary_)

In [161]:
# Flatten every training document into per-span feature vectors with 0/1 labels:
# the span recorded in doc.ans_vec is labelled 1, every other span 0.
# `cnt` counts documents for which no span matched the answer.
vectors = []
labels = []
cnt = 0
for doc in train:
    vecs = doc.to_vectors()
    doc_labels = [0] * len(vecs)
    if doc.ans_vec is None:
        cnt += 1
    else:
        doc_labels[doc.ans_vec] = 1
    vectors += vecs
    labels += doc_labels
print(cnt)

5922


In [None]:
# Persist the feature vectors, one tab-separated row per span with the label
# appended as the last column. No header row is written.
# newline='' is required for files handed to csv.writer; without it the writer's
# own line terminators can be translated again and produce blank rows.
with open('vectors.csv', mode='w', newline='') as vectors_file:
    writer = csv.writer(vectors_file, delimiter='\t')
    writer.writerows(vector + [label] for vector, label in zip(vectors, labels))

In [80]:
# Reload the dumped vectors and subsample them: every positive row (label 1) is
# kept, and each remaining row is kept with probability NEGATIVE_KEEP_PROB.
NEGATIVE_KEEP_PROB = 0.008
SAMPLING_SEED = 42  # fixed so the subsample is the same on every run

# The frame gets its own name so the imported `csv` module is not shadowed.
# header=None because vectors.csv is written without a header row; otherwise
# the first data row would be consumed as column names.
vectors_frame = pd.read_csv("vectors.csv", sep="\t", header=None)
sampler = random.Random(SAMPLING_SEED)

vectors = []
labels = []
for vector in vectors_frame.values:
    if vector[-1] == 1 or sampler.random() < NEGATIVE_KEEP_PROB:
        vectors.append(vector[:-1])
        labels.append(vector[-1])
print(len(vectors))

230411


In [140]:
# Fit a gradient-boosted model with a logistic objective on the sampled spans.
train_features = np.array(vectors)
train_targets = np.array(labels)
xgb_clf = XGBRegressor(
    objective='binary:logistic',
    n_estimators=300,
    max_depth=5,
    learning_rate=0.33,
)
xgb_clf.fit(train_features, train_targets)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.33, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [142]:
# Score every candidate span of each test document and write the best-scoring
# span (original, unlemmatized tokens) as "<id>\t<answer>" per line.
test = read_csv("resources/dataset_281937_1.txt", '\t')
# The answers are Russian text, so the output encoding is pinned rather than
# left to the platform default.
with open("result.txt", "w", encoding="utf-8") as result_file:
    for doc in test:
        vecs = doc.to_vectors()
        if vecs:
            pr = xgb_clf.predict(np.array(vecs))
            ind = int(np.argmax(pr))
            # Feature 2 is the number of tokens left of the span (its start
            # index); feature 0 is the span length.
            left = vecs[ind][2]
            right = vecs[ind][0] + left - 1
            answer_words = doc.closest_sent_unst[left:right + 1]
        else:
            # An empty closest sentence yields no candidate spans; write an
            # empty answer instead of failing in predict()/argmax().
            answer_words = []
        result_file.write(f"{doc.id}\t{' '.join(answer_words)}\n")