In [1]:
import multiprocessing

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import re

from gensim.models import FastText,KeyedVectors,fasttext

from sklearn import metrics

from scipy import spatial

import torch

from transformers import AutoTokenizer, AutoModel

'''import tensorflow as tf
from tensorflow.keras.layers import Reshape,BatchNormalization,MaxPooling2D,Lambda
from tensorflow.keras.layers import Dense,Activation,Reshape,Conv2D,LeakyReLU,concatenate,Flatten
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import RMSprop, Adam
'''
from sklearn.model_selection import train_test_split



## FAQ

### Get data

In [2]:
def get_faq(urls,question_data,answer_data,timeout=30):
    """Scrape question/answer pairs from a list of FAQ pages.

    Parameters
    ----------
    urls : list of str
        Pages to download; every page is parsed independently.
    question_data, answer_data : [tag_value, class_value]
        Tag name and ``class`` attribute value that identify the question
        and the answer elements on a page.
    timeout : float, optional
        Seconds to wait for each HTTP response (forwarded to ``requests.get``).

    Returns
    -------
    list of numpy.ndarray
        One array per URL, in the order of ``urls``. Each array has shape
        (n, 2), where [:,0] is the question text and [:,1] the answer text.

    Raises
    ------
    ValueError
        If a page yields a different number of questions and answers, since
        the two lists are paired by position.
    """
    question_tag,question_class=question_data
    answer_tag,answer_class=answer_data

    def _clean(text,drop_quotes=False):
        # Replacement order matters: double spaces first, then (questions
        # only) double quotes, then tabs and newlines.
        text=str(text).replace('  ','')
        if drop_quotes:
            text=text.replace('"','')
        return text.replace('\t','').replace('\n','')

    faq=[]
    for url in urls:
        # Without a timeout a stalled server blocks the notebook forever.
        response=requests.get(url,timeout=timeout)
        # Pass the already decoded text: re-encoding it to bytes makes
        # BeautifulSoup guess the charset again. Naming the parser keeps the
        # result independent of which optional parsers are installed.
        soup=BeautifulSoup(response.text,'html.parser')

        questions=[_clean(q.text,drop_quotes=True)
                   for q in soup.find_all(question_tag,class_=question_class)]
        answers=[_clean(ans.text)
                 for ans in soup.find_all(answer_tag,class_=answer_class)]

        if len(questions)!=len(answers):
            raise ValueError(
                'Page {} has {} questions but {} answers; check the tag/class selectors'
                .format(url,len(questions),len(answers)))

        temp_questions=np.array(questions).reshape((-1,1))
        temp_answers=np.array(answers).reshape((-1,1))
        faq.append(np.concatenate([temp_questions,temp_answers],axis=1))

    return faq

In [None]:
# Mosenergosbyt FAQ: one page per question category.
urls1=[
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_5749.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_5739.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_48100.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_48101.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_48102.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_48103.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_5745.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_5763.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_5754.php',
    'https://www.mosenergosbyt.ru/common/lobby/questions/category_5716.php',
]

# Mosoblgaz FAQ: a single page.
urls2=['https://mosoblgaz.ru/company/query_answer/']

# Each selector is [tag, class] for the question / answer elements of the site.
faq1=get_faq(urls1,
             question_data=['button','btn btn-link collapsed'],
             answer_data=['div','collapse'])
faq2=get_faq(urls2,
             question_data=['div','faq-list-item__title js-accordion-item-title'],
             answer_data=['div','faq-list-item__text-inner js-accordion-item-inner'])


In [None]:
# Flatten the per-page arrays of faq1 into one list of [question, answer]
# rows and persist them, so later cells can start from faq.csv.
df=[]
for document in faq1:
    df.extend(document)

df=pd.DataFrame(df,columns = ['Question','Answer'])
df.to_csv('faq.csv',index=False,encoding='utf-8')

In [3]:
# Reload the scraped FAQ from disk and preview its first five rows.
faq=pd.read_csv('faq.csv')
faq.head()

Unnamed: 0,Question,Answer
0,Как изменить ФИО собственника?,Для переоформления лицевого счета на нового вл...
1,Как исправить ошибку (или опечатку) в ФИОсобст...,Подать заявку на изменение данных можно в личн...
2,Как изменить телефон?,Изменить номер мобильного телефона можно самос...
3,Как изменить количество проживающих?,Изменить данные о количестве проживающих можно...
4,Как изменить количество комнат?,Изменить данные о количестве комнат можно толь...


### Question answering

In [143]:
class SearchEngine():
    """Semantic search over a FAQ using FastText and/or SBERT embeddings.

    The FAQ questions are embedded once at construction time; ``search_faq``
    embeds the incoming question and ranks FAQ rows by cosine similarity.
    """

    def __init__(self,faq,sbert_name=False,fasttext_path=False):
        #
        #  faq           - array shape (n,2), where [:,0] - question, [:,1] - answer
        #  sbert_name    - model name passed to from_pretrained; falsy -> SBERT is not loaded
        #  fasttext_path - path passed to FastTextKeyedVectors.load; falsy -> FastText is not loaded
        #
        self.faq=faq
        if sbert_name:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            self.sbert_model = AutoModel.from_pretrained(sbert_name)
            self.sbert_model.to(self.device)
            # Inference only: make sure dropout is disabled.
            self.sbert_model.eval()
            self.sbert_tokenizer = AutoTokenizer.from_pretrained(sbert_name)
            self.sbert_faq_embs=np.array([self.vectorize_sentence_sbert(sent) for sent in self.faq[:,0]])

        if fasttext_path:
            self.ft_model= fasttext.FastTextKeyedVectors.load(fasttext_path)
            self.ft_faq_embs=np.array([self.vectorize_sentence_ft(sent) for sent in self.faq[:,0]])

    def vectorize_sentence_ft(self,sentence):
        """Return the mean FastText vector of the whitespace-separated words.

        Raises ZeroDivisionError for a sentence without any words.
        """
        txt=sentence.split()
        val=0
        for word in txt:
            val+=self.ft_model[word]

        return val/len(txt)

    def vectorize_sentence_sbert(self,sentence,max_length=24):
        """Return the mean-pooled SBERT embedding of ``sentence`` as a numpy array.

        The input is truncated to ``max_length`` tokens.
        """
        encoded_input = self.sbert_tokenizer(sentence,
                                             padding=True,
                                             truncation=True,
                                             max_length=max_length,
                                             return_tensors='pt').to(self.device)

        # No gradients are needed for inference; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            model_output = self.sbert_model(**encoded_input)

        #Perform pooling. In this case, mean pooling
        sentence_embedding = self.mean_pooling(model_output, encoded_input['attention_mask'])
        # Drop the batch axis for a single sentence.
        sentence_embedding = sentence_embedding.squeeze(0)

        return sentence_embedding.cpu().numpy()

    @classmethod
    def mean_pooling(cls,model_output, attention_mask):
        """Average the token embeddings, weighting them by the attention mask."""
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so that an all-zero mask cannot cause a division by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def search_faq(self,question,eps,minimal_score,mode):
        """Return the FAQ rows most similar to ``question``.

        question      - query string
        eps           - a row is kept only while (best score - row score) < eps
        minimal_score - a row is kept only while row score > minimal_score
        mode          - 'fasttext', 'sbert' or 'both'; 'both' combines the two
                        similarities with a geometric mean

        Returns an object array of shape (k,3): question, answer, score,
        sorted by decreasing score. Raises ValueError for an unknown mode.
        """
        if mode=='fasttext':
            backends=[(self.vectorize_sentence_ft,self.ft_faq_embs)]
        elif mode=='sbert':
            backends=[(self.vectorize_sentence_sbert,self.sbert_faq_embs)]
        elif mode=='both':
            backends=[(self.vectorize_sentence_ft,self.ft_faq_embs),
                      (self.vectorize_sentence_sbert,self.sbert_faq_embs)]
        else:
            raise ValueError("mode must be 'fasttext', 'sbert' or 'both', got {!r}".format(mode))

        # Use the FAQ stored on the instance, not a notebook global.
        # Object dtype keeps the scores numeric after concatenation.
        faq=np.asarray(self.faq,dtype=object)
        if len(faq)==0:
            return np.empty((0,3),dtype=object)

        per_model=np.zeros((len(faq),len(backends)))
        for j,(vectorizer,faq_embs) in enumerate(backends):
            question_emb=vectorizer(question)
            for i in range(len(faq)):
                per_model[i,j]=1-spatial.distance.cosine(faq_embs[i],question_emb)

        if mode=='both':
            # Geometric mean of the similarities. Negative similarities are
            # clipped to 0 first: otherwise two negatives multiply into a
            # high score and a single negative produces NaN.
            clipped=np.clip(per_model,0,None)
            score=np.prod(clipped,axis=1)**(1/per_model.shape[1])
        else:
            score=per_model[:,0]

        # Sort on the numeric scores (NaN goes last); a stable sort keeps the
        # FAQ order for ties.
        sort_key=np.where(np.isnan(score),-np.inf,score)
        order=np.argsort(-sort_key,kind='stable')
        sorted_score=score[order]
        faq_and_score=np.concatenate(
            [faq[order],sorted_score.reshape(-1,1).astype(object)],axis=1)

        max_score=sorted_score[0]
        display_num=0
        for scr in sorted_score:
            if max_score-scr<eps and scr > minimal_score:
                display_num+=1
            else:
                break

        return faq_and_score[:display_num]

In [144]:
%%time
sbert_name="sberbank-ai/sbert_large_mt_nlu_ru"
# https://rusvectores.org/ru/models/
fasttext_path='214/model.model'
engine=SearchEngine(faq.values,sbert_name,fasttext_path)


Wall time: 17.1 s


In [110]:
# `vectorizer_obj` is not defined anywhere in this notebook; the SBERT
# vectorizer lives on the `engine` instance built from faq.values.
vectorizer=engine.vectorize_sentence_sbert
eps=0.2
minimal_score=0.6

# The engine already embedded every FAQ question with this vectorizer at
# construction time, so reuse those embeddings instead of recomputing them.
faq_embs=engine.sbert_faq_embs

In [132]:
%%time
question=faq['Question'][70]+' пук кряк пиво интерал Автоматический платеж'
#question=faq['Question'][70]+faq['Question'][71]
print('Question: ',question)
print('---------------------')


#search_faq(vectorizer,question,faq.values,faq_embs,eps,minimal_score)

Question:  Как отключить «Автоматический платеж»? пук кряк пиво интерал Автоматический платеж
---------------------
Wall time: 0 ns


In [162]:
%%time
noise=' пук кряк пиво интерал'
noise2=' а то я не умею'
question=faq['Question'][50] +noise2
#question='Как оформить льготу'
print('Question: ',question)
print('---------------------')

eps=0.4
minimal_score=0.5

mode='sbert'

output=engine.search_faq(question,eps,minimal_score,mode=mode)

cleaned_question=clean_question(vectorizer_obj.vectorize_sentence_ft,question,output[:,0],min_val)
print(output)
print('---------------------')
print('Cleaned question:',cleaned_question)
print('---------------------')


Question:  В какой срок выполняется заявка? а то я не умею
---------------------
[['В какой срок выполняется заявка?'
  'В соответствии с требованиями Федерального закона от 27.12.2018 № 522-ФЗ «О внесении изменений в отдельные законодательные акты Российской Федерации в связи с развитием систем учёта электрической энергии (мощности) в Российской Федерации» и постановления Правительстваот 29.06.2020 № 950 «О внесении изменений в некоторые акты Правительства Российской Федерации по вопросам совершенствования организации учёта электрической энергии» с 1 июля 2020 года обязанность по замене неисправных приборов учёта электроэнергии или счётчиков электроэнергии с истекшим межповерочным интервалом, в жилых помещениях многоквартирных жилых домов возложена на гарантирующего поставщика электроэнергии. В Москве и Московской области гарантирующим поставщиком электроэнергии является АО «Мосэнергосбыт». В индивидуальных жилых домах эта обязанность возложена на сетевые организации ПАО «Россети Моск

In [58]:
def clean_question(vectorizer,question,pred_questions,min_score):
    """Drop the words of ``question`` that match no word of the predictions.

    vectorizer     - callable mapping a word to its embedding vector
    question       - query string, split on whitespace
    pred_questions - iterable of matched FAQ questions (strings)
    min_score      - a query word is kept only if its best cosine similarity
                     to any word of ``pred_questions`` is not below this value
                     (the best similarity starts at 0)

    Returns the remaining words joined by single spaces.
    """
    tokens=question.split()
    token_embs=[vectorizer(token) for token in tokens]

    reference_words=[]
    for pred_question in pred_questions:
        reference_words.extend(pred_question.split())
    reference_embs=[vectorizer(word) for word in reference_words]

    kept=[]
    for token,token_emb in zip(tokens,token_embs):
        similarities=[1-spatial.distance.cosine(token_emb,ref_emb)
                      for ref_emb in reference_embs]
        # The leading 0 reproduces the initial best score of the search.
        best=max([0,*similarities])
        if best<min_score:
            continue
        kept.append(token)

    return " ".join(kept)

## Search engine

FastText document embeddings vs. question embeddings

A document = all questions and answers on one topic (one FAQ page)

In [None]:
# For every scraped FAQ page build:
#   questions[i]      - the list of its questions
#   documents[i]      - its questions and answers interleaved (q, a, q, a, ...)
#   questions2class   - question text -> index of the page it came from
questions=[]
documents=[]
class_num=len(faq1)
questions2class=dict()
questions_classes=[]

for i,document in enumerate(faq1):
    temp_questions=[]
    temp_documents=[]
    for line in document:
        question=line[0]
        temp_questions.append(question)
        questions2class[question]=i
        temp_documents.extend([line[0],line[1]])

    questions.append(temp_questions)
    documents.append(temp_documents)




In [None]:
# Embed every document (one 300-dimensional row per document) and every
# question (kept grouped per document).
# NOTE(review): `sentence_embedding` and `ft_model` are not defined anywhere
# in the visible notebook - this cell fails on a fresh kernel. Presumably
# sentence_embedding(model, list_of_strings) returns one 300-d vector; confirm
# and define both before this cell.
documents_embs=np.zeros((len(documents),300))
for i,document in enumerate(documents):
    documents_embs[i]=sentence_embedding(ft_model,document)

questions_embs=[]
for i,question_document in enumerate(questions):
    temp_questions_embs=[]
    for question in question_document:
        # A single question is wrapped in a list to match the document call above.
        temp_questions_embs.append(sentence_embedding(ft_model,[question]))
    questions_embs.append(temp_questions_embs)
    

In [None]:
# Build (document, question) training pairs for the siamese network:
#   label 1 - the question belongs to the document
#   label 0 - the question is paired with a random other document, or the
#             document is paired with a random question of another document
# The lists are created once, before the loop: creating them inside the loop
# kept only the pairs of the last document.
pairs=[]
labels=[]
documents_number=len(documents_embs)

for i,document in enumerate(documents_embs):
    own_questions=questions_embs[i]
    # Candidates for negative sampling; empty when there is a single document
    # (a retry-until-different loop would never terminate in that case).
    other_documents=[k for k in range(documents_number) if k!=i]
    other_documents_with_questions=[k for k in other_documents if len(questions_embs[k])>0]

    #  true pairs
    for question in own_questions:
        pairs.append([document,question])
        labels.append(1)

    # false document: every own question gets one random foreign document
    if other_documents:
        for question in own_questions:
            rnd_indx=other_documents[np.random.randint(0,len(other_documents))]
            pairs.append([documents_embs[rnd_indx],question])
            labels.append(0)

    # false questions: the document gets as many random foreign questions
    if other_documents_with_questions:
        for _ in range(len(own_questions)):
            rnd_indx=other_documents_with_questions[
                np.random.randint(0,len(other_documents_with_questions))]
            rnd_indx_question=np.random.randint(0,len(questions_embs[rnd_indx]))
            pairs.append([document,questions_embs[rnd_indx][rnd_indx_question]])
            labels.append(0)
    

In [None]:
# Convert the pair/label lists to float32 arrays and split them with the
# default train_test_split settings.
pairs=np.asarray(pairs,dtype='float32')
labels=np.asarray(labels,dtype='float32')
x_train,x_test,y_train,y_test=train_test_split(pairs,labels)


In [None]:
def euclidean_distance(vects):
    """Row-wise Euclidean distance between the two tensors in ``vects``.

    The squared distance is floored at K.epsilon() before the square root.
    The result keeps a trailing axis of size 1.
    """
    first, second = vects
    squared_diff = K.square(first - second)
    squared_distance = K.sum(squared_diff, axis=1, keepdims=True)
    return K.sqrt(K.maximum(squared_distance, K.epsilon()))


def eucl_dist_output_shape(shapes):
    """Output shape of euclidean_distance: (first dim of the first input shape, 1)."""
    first_shape, _second_shape = shapes
    batch_dim = first_shape[0]
    return (batch_dim, 1)

def contrastive_loss_with_margin(margin):
    """Build a contrastive loss function with the given ``margin``.

    For y_true == 0 the returned loss penalises y_pred**2; for y_true == 1 it
    penalises max(margin - y_pred, 0)**2. The mean over all elements is returned.
    """
    def contrastive_loss(y_true, y_pred):
        shortfall = K.maximum(margin - y_pred, 0)
        negative_term = (1 - y_true) * K.square(y_pred)
        positive_term = y_true * K.square(shortfall)
        return K.mean(negative_term + positive_term)

    return contrastive_loss

In [None]:
def get_model():
    """Build the shared base network of the siamese model.

    The input of shape ``emb_shape`` (a global) is reshaped to (15,20,1),
    flattened again and passed through Dense(256) and Dense(128), each
    followed by LeakyReLU(0.2).
    """
    #    Disabled convolutional front end:
    #    Conv2D(32,(3,3))
    #    BatchNormalization()
    return Sequential([
        Reshape((15,20,1),input_shape=emb_shape),
        Flatten(),
        Dense(256),
        LeakyReLU(0.2),
        Dense(128),
        LeakyReLU(0.2),
    ])

In [None]:
# Siamese network: both inputs go through the same base network (shared
# weights); the distance between the two outputs feeds a sigmoid unit.
emb_shape=(300,)
base_network = get_model()

input_a = Input(shape=emb_shape)
vect_output_a = base_network(input_a)

input_b = Input(shape=emb_shape)
vect_output_b = base_network(input_b)

# Distance between the two encoded inputs, shape (batch, 1).
x = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([vect_output_a, vect_output_b])
output= Dense(1,activation='sigmoid')(x)

# specify the inputs and output of the model
model = Model([input_a, input_b], output)

In [None]:
# Train the siamese model on the (document, question) pairs.
optim = RMSprop(learning_rate=0.015)
#optim = Adam(  learning_rate=0.015)
model.compile(optimizer=optim, loss=contrastive_loss_with_margin(margin=1))

# Column 0 of a pair feeds the first input, column 1 the second.
train_inputs = [x_train[:,0], x_train[:,1]]
history = model.fit(train_inputs,
                    y_train,
                    batch_size=1,
                    epochs=30,
                    validation_split=0.2)

In [None]:
# Print the layer-by-layer summary of the siamese model.
model.summary()

In [None]:
def compute_accuracy(y_true, y_pred):
    '''Share of predictions that match y_true after thresholding at 0.5.

    y_pred is flattened and values above 0.5 count as the positive class;
    the result is rounded to 4 decimals.
    '''
    predicted_positive = y_pred.ravel() > 0.5
    matches = predicted_positive == y_true
    return round(np.mean(matches), 4)

In [None]:
# Evaluate the trained siamese model on the held-out pairs.
test_inputs = [x_test[:,0], x_test[:,1]]
loss = round(model.evaluate(x=test_inputs, y=y_test),4)

y_pred_train = model.predict([x_train[:,0],x_train[:,1]])
y_pred_test = model.predict(test_inputs)

train_accuracy = compute_accuracy(y_train, y_pred_train)
test_accuracy = compute_accuracy(y_test, y_pred_test)

print("Loss = {}, Train Accuracy = {} Test Accuracy = {}".format(loss, train_accuracy, test_accuracy))

## Paragraph ranker

In [None]:
from deeppavlov import build_model, configs

## Question Answering

[Reference](https://medium.com/swlh/fine-tuning-bert-for-text-classification-and-question-answering-using-tensorflow-framework-4d09daeb3330)

[просто ссылка с хорошей инфой]( https://lilianweng.github.io/lil-log/2020/10/29/open-domain-question-answering.html )

In [None]:
def preprocess(context,questions,answer):
    """Tokenize ``context`` and ``questions`` and concatenate the encodings.

    Uses the global ``tokenizer``. ``answer`` is accepted but not used.

    Returns (input_ids, token_type_ids, attention_mask):
      input_ids      - context ids followed by question ids
      token_type_ids - 0 for every context id, then the questions' attention mask
      attention_mask - 1 for every entry of input_ids
    """
    context_encoding = tokenizer(context,padding=True)
    questions_encoding = tokenizer(questions,padding=True)

    context_ids = context_encoding['input_ids']
    input_ids = context_ids + questions_encoding['input_ids']
    context_segment = [0] * len(context_ids)
    token_type_ids = context_segment + questions_encoding['attention_mask']
    attention_mask = [1] * len(input_ids)

    return input_ids,token_type_ids,attention_mask

In [None]:
# Join column 1 of the FAQ into one long context string and tokenize it.
# NOTE(review): get_faq returns a list of arrays, so `faq1[:,1]` raises
# TypeError unless faq1 was replaced by a single 2-D array in a cell that is
# no longer in the notebook - confirm. Column 1 holds the answers in get_faq's
# output, so the name `questions` looks misleading - confirm the intent.
questions=list(faq1[:,1].reshape((1,-1))[0])
context=" ".join(questions)
input_ids,token_type_ids,attention_mask=preprocess(context,questions,0)

In [None]:
# Number of whitespace-separated words in the joined context.
len(context.split())

In [None]:
# Word count of the first entry without newlines and slashes.
# The variable is not called `string`: that name would shadow the stdlib
# `string` module that ValidationCallback.normalize_text relies on.
first_question=questions[0].replace('\n','')
len(first_question.replace('/','').split())

In [None]:
class Sample:
    """One SQuAD-style example: a question, its context and (optionally) the answer.

    ``preprocess`` tokenizes the texts with the global ``tokenizer`` and builds
    the model inputs padded to the global ``max_seq_length``. Samples that
    cannot be used are marked with ``skip = True``.
    """

    def __init__(self, question, context, start_char_idx=None, answer_text=None, all_answers=None):
        self.question = question
        self.context = context
        # Character offset of the answer inside the context (None without an answer).
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False
        # Token span of the answer; stays -1 when no answer is given.
        self.start_token_idx = -1
        self.end_token_idx = -1

    def preprocess(self):
        """Tokenize the sample and set input_word_ids / input_type_ids / input_mask.

        Sets ``skip`` and returns early when the answer does not fit into the
        context, when no token overlaps the answer, or when the encoded pair is
        longer than ``max_seq_length``.
        """
        # clean context and question
        context = " ".join(str(self.context).split())
        question = " ".join(str(self.question).split())
        # tokenize context and question
        tokenized_context = tokenizer.encode(context)
        tokenized_question = tokenizer.encode(question)

        # if this is validation or training sample, preprocess answer
        if self.answer_text is not None:
            answer = " ".join(str(self.answer_text).split())
            # end_char_idx is exclusive, so an answer that ends exactly at the
            # end of the context (end_char_idx == len(context)) is still valid
            end_char_idx = self.start_char_idx + len(answer)
            if end_char_idx > len(context):
                self.skip = True
                return
            # mark all the character indexes in context that are also in answer
            is_char_in_ans = [0] * len(context)
            for idx in range(self.start_char_idx, end_char_idx):
                is_char_in_ans[idx] = 1
            # find all the tokens that are in the answers
            ans_token_idx = [
                idx
                for idx, (start, end) in enumerate(tokenized_context.offsets)
                if sum(is_char_in_ans[start:end]) > 0
            ]
            if len(ans_token_idx) == 0:
                self.skip = True
                return
            # get start and end token indexes
            self.start_token_idx = ans_token_idx[0]
            self.end_token_idx = ans_token_idx[-1]

        # create inputs as usual; the first id of the question encoding is dropped
        question_ids = tokenized_question.ids[1:]
        input_ids = tokenized_context.ids + question_ids
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(question_ids)
        attention_mask = [1] * len(input_ids)
        padding_length = max_seq_length - len(input_ids)
        # add padding if necessary
        if padding_length > 0:
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:
            # the pair does not fit into the model input
            self.skip = True
            return
        self.input_word_ids = input_ids
        self.input_type_ids = token_type_ids
        self.input_mask = attention_mask
        self.context_token_to_char = tokenized_context.offsets

In [None]:
def create_squad_examples(raw_data):
    """Turn a SQuAD-format dict into a list of preprocessed Sample objects.

    Every question of every paragraph becomes one Sample. When the question
    has an "answers" entry, the first answer supplies the answer text and its
    start offset, and the texts of all answers are passed along as well.
    ``preprocess()`` is called on each Sample before it is collected.
    """
    examples = []
    paragraphs = (
        paragraph
        for article in raw_data["data"]
        for paragraph in article["paragraphs"]
    )
    for paragraph in paragraphs:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            question = qa["question"]
            if "answers" not in qa:
                example = Sample(question, context)
            else:
                answers = qa["answers"]
                answer_text = answers[0]["text"]
                all_answers = [entry["text"] for entry in answers]
                start_char_idx = answers[0]["answer_start"]
                example = Sample(question, context, start_char_idx, answer_text, all_answers)
            example.preprocess()
            examples.append(example)
    return examples


def create_inputs_targets(squad_examples):
    """Stack the fields of all non-skipped samples into numpy arrays.

    Returns (x, y):
      x - [input_word_ids, input_mask, input_type_ids]
      y - [start_token_idx, end_token_idx]
    Samples whose ``skip`` flag is set are left out.
    """
    field_names = (
        "input_word_ids",
        "input_type_ids",
        "input_mask",
        "start_token_idx",
        "end_token_idx",
    )
    usable = [example for example in squad_examples if example.skip == False]
    arrays = {
        name: np.array([getattr(example, name) for example in usable])
        for name in field_names
    }
    x = [arrays["input_word_ids"], arrays["input_mask"], arrays["input_type_ids"]]
    y = [arrays["start_token_idx"], arrays["end_token_idx"]]
    return x, y

In [None]:
class ValidationCallback(keras.callbacks.Callback):
    """Keras callback that prints the exact-match score after every epoch.

    x_eval, y_eval - model inputs and targets built by create_inputs_targets
    eval_examples  - the Sample objects x_eval was built from; when omitted,
                     the global ``eval_squad_examples`` is used
    """

    def normalize_text(self, text):
        """Lower-case, strip punctuation, English articles and extra whitespace."""
        # Imported locally so the method does not depend on a notebook global
        # named `string`.
        import string

        # convert to lower case
        text = text.lower()
        # remove punctuation
        punctuation = set(string.punctuation)
        text = "".join(ch for ch in text if ch not in punctuation)
        # remove articles
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        text = re.sub(regex, " ", text)
        # remove redundant whitespaces
        text = " ".join(text.split())
        return text

    def __init__(self, x_eval, y_eval, eval_examples=None):
        # Let the Keras base class set up its own attributes.
        super().__init__()
        self.x_eval = x_eval
        self.y_eval = y_eval
        self.eval_examples = eval_examples

    def on_epoch_end(self, epoch, logs=None):
        # get the offsets of the first and last tokens of predicted answers
        pred_start, pred_end = self.model.predict(self.x_eval)
        count = 0
        examples = self.eval_examples if self.eval_examples is not None else eval_squad_examples
        # x_eval only contains the non-skipped samples, so filter the same way
        eval_examples_no_skip = [_ for _ in examples if _.skip == False]
        # for every pair of offsets
        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
            # take the required Sample object with the ground-truth answers in it
            squad_eg = eval_examples_no_skip[idx]
            # use offsets to get back the span of text corresponding to
            # our predicted first and last tokens
            offsets = squad_eg.context_token_to_char
            start = np.argmax(start)
            end = np.argmax(end)
            # a predicted start beyond the context tokens cannot be mapped to text
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]
            normalized_pred_ans = self.normalize_text(pred_ans)
            # clean the real answers
            normalized_true_ans = [self.normalize_text(_) for _ in squad_eg.all_answers]
            # check if the predicted answer is in an array of the ground-truth answers
            if normalized_pred_ans in normalized_true_ans:
                count += 1
        total = len(self.y_eval[0])
        # avoid a ZeroDivisionError on an empty evaluation set
        acc = count / total if total else 0.0
        print(f"\nepoch={epoch + 1}, exact match score={acc:.2f}")

In [None]:
# Fine-tune BERT on SQuAD v1.1 for extractive question answering.
# NOTE(review): this cell needs tensorflow (tf, keras, layers),
# tensorflow_hub (hub) and tokenizers (BertWordPieceTokenizer); none of them
# is imported in the visible notebook. It also rebinds x_train, y_train,
# model and loss, which the siamese section above uses.
import json

train_path = keras.utils.get_file("train.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json")
eval_path = keras.utils.get_file("eval.json", "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json")

# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(train_path, encoding="utf-8") as f:
    raw_train_data = json.load(f)

with open(eval_path, encoding="utf-8") as f:
    raw_eval_data = json.load(f)

max_seq_length = 384

# BERT encoder with its three standard inputs.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids')
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_mask')
input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids')
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])

# Tokenizer built from the vocabulary shipped with the BERT layer; the casing
# follows the model's own setting instead of a hard-coded True.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=bool(do_lower_case))

train_squad_examples = create_squad_examples(raw_train_data)
x_train, y_train = create_inputs_targets(train_squad_examples)
print(f"{len(train_squad_examples)} training points created.")
eval_squad_examples = create_squad_examples(raw_eval_data)
x_eval, y_eval = create_inputs_targets(eval_squad_examples)
print(f"{len(eval_squad_examples)} evaluation points created.")

# Two heads over the token sequence: one distribution for the answer start,
# one for the answer end.
start_logits = layers.Dense(1, name="start_logit", use_bias=False)(sequence_output)
start_logits = layers.Flatten()(start_logits)
end_logits = layers.Dense(1, name="end_logit", use_bias=False)(sequence_output)
end_logits = layers.Flatten()(end_logits)
start_probs = layers.Activation(keras.activations.softmax)(start_logits)
end_probs = layers.Activation(keras.activations.softmax)(end_logits)
model = keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=[start_probs, end_probs])
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# `learning_rate` replaces the deprecated `lr` argument and matches the
# optimizer call used for the siamese model.
optimizer = keras.optimizers.Adam(learning_rate=1e-5, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
model.compile(optimizer=optimizer, loss=[loss, loss])
model.summary()
model.fit(x_train, y_train, epochs=2, batch_size=8, callbacks=[ValidationCallback(x_eval, y_eval)])
model.save_weights("./weights.h5")

In [None]:
# Hand-written SQuAD-format sample: one paragraph about Project Apollo with
# eight questions and no "answers" entries, used below to try the model.
data = {"data":
    [
        {"title": "Project Apollo",
         "paragraphs": [
             {
                 "context": "The Apollo program, also known as Project Apollo, was the third United States human "
                            "spaceflight program carried out by the National Aeronautics and Space Administration ("
                            "NASA), which accomplished landing the first humans on the Moon from 1969 to 1972. First "
                            "conceived during Dwight D. Eisenhower's administration as a three-man spacecraft to "
                            "follow the one-man Project Mercury which put the first Americans in space, Apollo was "
                            "later dedicated to President John F. Kennedy's national goal of landing a man on the "
                            "Moon and returning him safely to the Earth by the end of the 1960s, which he proposed in "
                            "a May 25, 1961, address to Congress. Project Mercury was followed by the two-man Project "
                            "Gemini. The first manned flight of Apollo was in 1968. Apollo ran from 1961 to 1972, "
                            "and was supported by the two man Gemini program which ran concurrently with it from 1962 "
                            "to 1966. Gemini missions developed some of the space travel techniques that were "
                            "necessary for the success of the Apollo missions. Apollo used Saturn family rockets as "
                            "launch vehicles. Apollo/Saturn vehicles were also used for an Apollo Applications "
                            "Program, which consisted of Skylab, a space station that supported three manned missions "
                            "in 1973-74, and the Apollo-Soyuz Test Project, a joint Earth orbit mission with the "
                            "Soviet Union in 1975.",
                 "qas": [
                     {"question": "What project put the first Americans into space?",
                      "id": "Q1"
                      },
                     {"question": "What program was created to carry out these projects and missions?",
                      "id": "Q2"
                      },
                     {"question": "What year did the first manned Apollo flight occur?",
                      "id": "Q3"
                      },
                     {"question": "What President is credited with the original notion of putting Americans in space?",
                      "id": "Q4"
                      },
                     {"question": "Who did the U.S. collaborate with on an Earth orbit mission in 1975?",
                      "id": "Q5"
                      },
                     {"question": "How long did Project Apollo run?",
                      "id": "Q6"
                      },
                     {"question": "What program helped develop space travel techniques that Project Apollo used?",
                      "id": "Q7"
                      },
                     {"question": "What space station supported three manned missions in 1973-1974?",
                      "id": "Q8"
                      }
                 ]}]}]}

# Run the fine-tuned model on the hand-written sample and print its answers.
test_samples = create_squad_examples(data)
x_test, _ = create_inputs_targets(test_samples)
pred_start, pred_end = model.predict(x_test)

# create_inputs_targets drops skipped samples, so the predictions line up
# with the non-skipped samples only, not with test_samples itself.
answered_samples = [sample for sample in test_samples if sample.skip == False]

for test_sample, start, end in zip(answered_samples, pred_start, pred_end):
    offsets = test_sample.context_token_to_char
    start = np.argmax(start)
    end = np.argmax(end)
    # a predicted start beyond the context tokens cannot be mapped to text
    if start >= len(offsets):
        continue
    pred_char_start = offsets[start][0]
    if end < len(offsets):
        pred_ans = test_sample.context[pred_char_start:offsets[end][1]]
    else:
        pred_ans = test_sample.context[pred_char_start:]
    print("Q: " + test_sample.question)
    print("A: " + pred_ans)