# Промежуточная аттестация. Применение Вопрос-ответного поиска

Для реализации задачи вопросно-ответного поиска в промышленной отрасли, будет использована БД с данными.

Для решения задания будет применен алгоритм для поиска документов содержащих что-либо связанной с вопросом. Это <a href="https://en.wikipedia.org/wiki/Okapi_BM25">Okapi BM25</a>.

In [1]:
!pip install rank_bm25 -q

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path, PurePath

import nltk
nltk.download(['stopwords', 'punkt'])

from nltk.corpus import stopwords
import re
import string
import torch

from rank_bm25 import BM25Okapi # Search engine

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Датасет найден на просторах интернета с различными данными (с парой вопрос-ответ)

In [15]:
from google.colab import drive

drive.mount('/content/drive/')
PATH_TO_DATA = '/content/drive/MyDrive'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [16]:
metadata_path = f'{PATH_TO_DATA}/metadata.csv'
metadata_df = pd.read_csv(metadata_path, low_memory=False)
metadata_df = metadata_df.dropna(subset=['abstract', 'title']).reset_index(drop=True)

Создадим класс реализующий движок поиска, использующий алгоритм Okapi BM25.

In [19]:
from rank_bm25 import BM25Okapi

english_stopwords = list(set(stopwords.words('english')))

class CovidSearchEngine:
    def __init__(self, corpus: pd.DataFrame):
        self.corpus = corpus
        self.columns = corpus.columns

        # Объединение
        raw_search_str = self.corpus.abstract.fillna('') + ' ' + self.corpus.title.fillna('')

        self.index = raw_search_str.apply(self.preprocess).to_frame()
        self.index.columns = ['terms']
        self.index.index = self.corpus.index
        self.bm25 = BM25Okapi(self.index.terms.tolist())

    def preprocess(self, text: str) -> list[str]:
        # предобработка текста
        return self.tokenize(self.remove_special_character(text.lower()))

    def remove_special_character(self, text: str) -> str:
        return text.translate(str.maketrans('', '', string.punctuation))

    def tokenize(self, text: str) -> list[str]:
        words = nltk.word_tokenize(text)
        return list(set([word for word in words
                         if len(word) > 1
                         and not word in english_stopwords
                         and not word.isnumeric()
                        ])
                   )


    def search(self, query: str, num=3) -> pd.DataFrame:
        """
        Метод поиска `num` наиболее подходящих корпусов.
        Параметр (опц.) `num` - количество возвращаемых корпусов
        """
        # получаем оценки схожести вопроса
        search_terms = self.preprocess(query)
        doc_scores = self.bm25.get_scores(search_terms)

        ind = np.argsort(doc_scores)[::-1][:num]

        # извлекаем наиболее подходящие контекст
        results = self.corpus.iloc[ind][self.columns]
        results['score'] = doc_scores[ind]
        results = results[results.score > 0]
        return results.reset_index()

In [20]:
cse = CovidSearchEngine(metadata_df)

In [21]:
!pip install transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Использую предобученную модель, которая была предобучена на модели <a href="https://rajpurkar.github.io/SQuAD-explorer/">SQuAD</a> (Stanford Question Answering Dataset).

In [22]:
import torch
from transformers import BertTokenizer
from transformers import BertForQuestionAnswering

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

BERT_SQUAD = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(BERT_SQUAD)
tokenizer = BertTokenizer.from_pretrained(BERT_SQUAD)

model = model.to(torch_device)
model.eval()

print()


(…)finetuned-squad/resolve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(…)squad/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)g-finetuned-squad/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)etuned-squad/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]




Переобученная BERT-QA модель получает на вход склеенные `input_ids` вопроса и контекста. Можно сказать, что получаем два индекса в контексте: начало и конец. Далее конвертируя токены получаем ответ.

In [23]:
def answer_question(question, context):
    encoded_dict = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,
        max_length=256,
        pad_to_max_length=True,
        return_tensors='pt'
    )

    input_ids = encoded_dict['input_ids'].to(torch_device)
    token_type_ids = encoded_dict['token_type_ids'].to(torch_device)

    output = model(input_ids, token_type_ids=token_type_ids)

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    start_index = torch.argmax(output.start_logits)
    end_index = torch.argmax(output.end_logits)

    answer = tokenizer.convert_tokens_to_string(all_tokens[start_index:end_index+1])
    answer = answer.replace('[CLS]', '').replace('[SEP]', ' ')
    return answer

В демонстрации будет передан вопрос. Затем выбран какое то количество корпусов, где будет искаться ответ на вопрос (по алгоритму Okapi BM25)

In [24]:
NUM_CONTEXT_FOR_EACH_QUESTION = 3


def get_all_context(query, num_results):
    # Находит для данного вопроса `num_results` наиболее подходящих корпусов

    papers_df = cse.search(query, num_results)
    return papers_df['abstract'].str.replace("Abstract", "").tolist()


def get_all_answers(question, all_contexts):
    # Получение ответов от всех корпусов для данного вопроса

    all_answers = []

    for context in all_contexts:
        all_answers.append(answer_question(question, context))
    return all_answers


def create_output_results(question,
                          all_contexts,
                          all_answers,
                          summary_answer='',
                          summary_context=''):
    # Функция формирующая словарь со всеми заданными вопросами и ответами с корпусами
    output = {}
    output['question'] = question
    results = []
    for c, a in zip(all_contexts, all_answers):

        span = {}
        span['context'] = c
        span['answer'] = a
        results.append(span)

    output['results'] = results

    return output


def get_results(question,
                summarize=False,
                num_results=NUM_CONTEXT_FOR_EACH_QUESTION,
                verbose=True):
    # Входная точка демонстрации

    all_contexts = get_all_context(question, num_results)

    all_answers = get_all_answers(question, all_contexts)

    return create_output_results(question,
                                 all_contexts,
                                 all_answers)

In [26]:
questions =  [

    "How does smoking affect patients?",
    "How does pregnancy affect patients?",
    "What is the fatality rate of 2019-nCoV?",
    "Can animals transmit 2019-nCoV?",
    "What telemedicine and cybercare methods are most effective?",
    "How is artificial intelligence being used in real time health delivery?",
    "What adjunctive or supportive methods can help patients?",
    "What diagnostic tests (tools) exist or are being developed to detect 2019-nCoV?"
]


In [27]:
all_answers = [get_results(q) for q in questions]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some 

In [28]:
for q in all_answers:
    print('-'*42)
    print(f'Вопрос: {q["question"]}')
    for i, a in enumerate(q['results'], 1):
        print(f'Ответ {i}: {a["answer"]}')
    print('-'*42)


------------------------------------------
Вопрос: How does smoking affect patients?
Ответ 1: smoking , vitamin d levels , and sunlight exposure are the most relevant
Ответ 2: counseling for smoking cessation
Ответ 3: no relationship between smoking habit , age , or sex and the presence of inclusion bodies
------------------------------------------
------------------------------------------
Вопрос: How does pregnancy affect patients?
Ответ 1: physiological changes and drug compliance during pregnancy can affect asthma control in varying degrees
Ответ 2: pregnancy loss , calving rates , and abnormalities in newborn calves
Ответ 3: intrauterine infections adversely affect pregnancies and / or neonates
------------------------------------------
------------------------------------------
Вопрос: What is the fatality rate of 2019-nCoV?
Ответ 1: 0 . 3 % to 0 . 6 %
Ответ 2: 2 . 2 %
Ответ 3:  what is the fatality rate of 2019 - ncov ?  
------------------------------------------
--------------

Видим ответы. Для улучшения предиктов можно увеличить количество сканируемых корпусов либо дообучить модель на данных по вопросам и ответам.

Функционал также доступен в библиотеке <a href="https://github.com/amaiya/ktrain/tree/master">`ktrain`</a> - обертка над TensorFlow.

In [29]:
!pip install ktrain -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m653.6/653.6 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.8/468.8 kB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l

In [30]:
metadata_df["raw_search_text"] = metadata_df.abstract.fillna('') + ' ' + metadata_df.title.fillna('')

In [31]:
import shutil
from ktrain.text import SimpleQA


INDEXDIR = '/tmp/myindex'
try:
    shutil.rmtree(INDEXDIR)
except FileNotFoundError:
    print('OK')
except OSError as e:
    print(e)
except Exception as e:
    raise e

OK


In [32]:
docs = metadata_df["raw_search_text"].tolist()

In [33]:
# аналогично классу CovidSearchEngine в данной ячейке создается движок и индексы для быстрого поиска нужных контекстов

SimpleQA.initialize_index(INDEXDIR)
SimpleQA.index_from_list(
    docs,
    INDEXDIR,
    commit_every=len(docs),
    multisegment=True,
    procs=4,
    breakup_docs=True
)

In [34]:


qa = SimpleQA(INDEXDIR)

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [35]:


answers = qa.ask('How long is the incubation period for the virus?')
qa.display_answers(answers[:3])

Unnamed: 0,Candidate Answer,Context,Confidence,Document Reference
0,is the delay from infection until onset of symptoms,"the incubation period is the delay from infection until onset of symptoms , and varies from person to person.",0.782764,16601
1,: 21 days,background : 21 days has been regarded as the appropriate quarantine period for holding individuals potentially exposed to ebola virus (ev) to reduce risk of contagion.,0.141524,15194
2,a short,the wells – riley equation for modelling airborne infection in indoor environments is incorporated into an seir epidemic model with a short incubation period to simulate the transmission dynamics of airborne infectious diseases in ventilated rooms.,0.046417,30864
