In [2]:
import pandas as pd
import numpy as np
!pip install transformers
import keras
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import re
import string



In [3]:
data = pd.read_csv('../input/datacsv/data.csv')
data.head()

Unnamed: 0,text,question,answer,all_answers,start_id,end_id
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened,It was formally established in 1475,Formally established in 1475,151,179
1,"The Vatican Apostolic Library (), more commonl...",what is the library for,research,he Vatican Library is a research library,454,494
2,"The Vatican Apostolic Library (), more commonl...",for what subjects,"history, and law",Vatican Library is a research library for hist...,457,511
3,"The Vatican Apostolic Library (), more commonl...",and,"philosophy, science and theology",Vatican Library is a research library for hist...,457,545
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014,a project,"March 2014, the Vatican Library began an initi...",769,879


In [39]:
class Sample:
  def __init__(self, context, question, start_char_idx= None, answer_text = None, all_answers = None):
    self.skip = False
    self.context = context
    self.question = question
    self.answer_text = answer_text
    self.start_char_idx = start_char_idx
    self.start_token_idx = -1
    self.end_token_idx = -1
    self.all_answers = all_answers

  def pre_process(self):
    
    context = " ".join(str(self.context).split())
    
    question = " ".join(str(self.question).split())
    tokenized_context = tokenizer.encode(context)
    tokenized_question = tokenizer.encode(question)

    if self.answer_text is not None:
        answer = " ".join(str(self.answer_text).split())

        end_char_id = self.start_char_idx + len(self.answer_text)
        if end_char_id >= len(context):
            self.skip = True
            return
      
        is_ans_in_context = [0]*len(context)

        for i in range(self.start_char_idx, end_char_id):
            is_ans_in_context[i] = 1
      
        ans_token_id = []

        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_ans_in_context[start:end])>0:
                  ans_token_id.append(idx)
      
        if len(ans_token_id)==0:
            self.skip = True
            return
      
        self.start_token_idx = ans_token_id[0]
        self.end_token_idx = ans_token_id[-1]

    input_ids = tokenized_context.ids + tokenized_question.ids[:]
    token_type_ids = [0]*len(tokenized_context.ids) + [1]*len(tokenized_question.ids)
    attention_mask = [0]*len(input_ids)

    padding_len = max_len - len(input_ids)

    if padding_len >0:
        input_ids = input_ids + ([0]*padding_len)
        attention_mask = attention_mask + ([0]*padding_len)
        token_type_ids = token_type_ids + ([0]*padding_len)
    
    elif padding_len<0:
        self.skip = True
        return
    
    self.context_token_to_char = tokenized_context.offsets
    self.input_word_ids = input_ids
    self.input_type_ids = token_type_ids
    self.input_mask = attention_mask


In [5]:
max_len = 400

def get_examples(data):
  examples = []
#   print(len(data['question']))
  for i in range(len(data['question'])):
    if data['answer'][i] is not None:
      ex = Sample(data['text'][i], data['question'][i], data['start_id'][i], data['answer'][i],[data['answer'][i], data['all_answers'][i]] )
    else:
      ex = Sample(data['text'][i], data['question'][i])
    ex.pre_process()
    examples.append(ex)
#     if i==50:
#         break
    
  return examples
  
def create_dataset(examples):
  dataset_dict = {
        "input_word_ids": [],
        "input_type_ids": [],
        "input_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
  for ex in examples:
    if ex.skip==False:
      # print(ex.input_type_ids)
      for key in dataset_dict:
        dataset_dict[key].append(getattr(ex, key))
  for key in dataset_dict:
    dataset_dict[key] = np.array(dataset_dict[key])
    print(key, dataset_dict[key].shape)

#   data_pd = pd.DataFrame.from_dict(dataset_dict)
#   data_pd.to_pickle("/content/drive/MyDrive/NLP/final_dataset.pkl")

  train_x = np.array([dataset_dict["input_word_ids"],
         dataset_dict["input_mask"],
         dataset_dict["input_type_ids"]])
  train_y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]

  return train_x, train_y



In [None]:
def generator(data, batch_size):
    num_of_samples = len(x_train[0])
    num_of_batches = num_of_samples//batch_size
    
    counter = 0
    while 1:
        examples = get_examples(data.iloc[batch_size*counter:batch_size*(counter+1), :])
        X_batch, Y_batch_start, Y_batch_end = create_dataset(examples)
#         Y_batch_end = y_train_end[:][batch_size*counter : batch_size*(counter+1)][:]
        counter +=1
        yield [X_batch[0], X_batch[1], X_batch[2]], (Y_batch_start, Y_batch_end)
        
        if counter>=num_of_batches:
            counter=0
        

In [6]:
class ValidationCallback(keras.callbacks.Callback): 
    def normalize_text(self, text): 
        text = text.lower() 
        text = "".join(ch for ch in text if ch not in set(string.punctuation))

        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        text = re.sub(regex, " ", text)
        text = " ".join(text.split())
        return text
    
    def __init__(self, x_eval, y_eval): 
        self.x_eval = x_eval 
        self.y_eval = y_eval

    def on_epoch_end(self, epoch, logs = None): 
        pred_start, pred_end = self.model.predict(self.x_eval) 
        count = 0 
        eval_examples_no_skip = [_ for _ in examples if _.skip==False]

        for idx, (start, end) in enumerate(zip(pred_start, pred_end)):
          squad_eg = eval_examples_no_skip[idx]

          offsets = squad_eg.context_token_to_char
          start = np.argmax(start)
          end = np.argmin(end)

          if start >= len(offsets):
            continue

          pred_char_start = offsets[start][0]
          if end<len(offsets):
            pred_char_end = offsets[end][1]
            pred_char_ans = squad_eg.context[pred_char_start:pred_char_end]

          else:
            pred_char_ans = squad_eg.context[pred_char_start:]

          normalized_pred_ans = self.normalize_text(pred_char_ans)
          normalized_true_ans = [self.normalize_text(_) for _ in squad_eg.all_answers]

          if normalized_pred_ans in normalized_true_ans:
            count +=1

        acc = count/len(self.y_eval[0])
        print(f"\nepoch={epoch + 1}, exact match score={acc:.2f}") 

In [7]:
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tokenizers import BertWordPieceTokenizer

input_word_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
input_mask = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_mask')
input_type_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_type_ids')

bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable = True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, input_type_ids])

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertWordPieceTokenizer(vocab = vocab_file, lowercase = True )



In [18]:
examples = get_examples(data)

In [16]:
len(examples)

108647

In [6]:
# x_train, y_train = create_dataset(examples[:10])
# x_eval, y_eval = create_dataset(examples[10:20])

NameError: name 'examples' is not defined

In [19]:
x_train, y_train = create_dataset(examples[:100000])
x_eval, y_eval = create_dataset(examples[100000:])

input_word_ids (76924, 400)
input_type_ids (76924, 400)
input_mask (76924, 400)
start_token_idx (76924,)
end_token_idx (76924,)
input_word_ids (6666, 400)
input_type_ids (6666, 400)
input_mask (6666, 400)
start_token_idx (6666,)
end_token_idx (6666,)


In [8]:
start_logits = layers.Dense(1, name = 'start_logit', use_bias = False)(sequence_output)
start_logits = layers.Flatten()(start_logits)

end_logits = layers.Dense(1, name = 'end_logit', use_bias = False)(sequence_output)
end_logits = layers.Flatten()(end_logits)

start_prob = layers.Activation(keras.activations.softmax)(start_logits)

end_prob = layers.Activation(keras.activations.softmax)(end_logits)

model = keras.Model(inputs = [input_word_ids, input_mask, input_type_ids], outputs = [start_prob, end_prob])
loss = keras.losses.SparseCategoricalCrossentropy(from_logits = False)
optimizer = keras.optimizers.Adam(learning_rate = 1e-5, beta_1 = 0.9, beta_2=0.98, epsilon=1e-9)
model.compile(optimizer=optimizer, loss=[loss, loss])
model.summary()
print("length of dataset: ", len(x_train[0]))


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 400)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 400)]        0                                            
__________________________________________________________________________________________________
input_type_ids (InputLayer)     [(None, 400)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

NameError: name 'x_train' is not defined

In [None]:
y_train_start[10]

In [20]:
# model.fit(generator(source_text,target_text,batch_size), epochs=5, steps_per_epoch = source_text.shape[0]/batch_size, validation_data = generator(source_text_val,target_text_val,batch_size), validation_steps = source_text_val.shape[0]/batch_size)
batch_size = 8
# print(generator(x_train, y_train_start, y_train_end, batch_size))
# x_eval, y_eval = generator(x_eval, y_eval, batch_size)
# , validation_data = generator(data[:20], batch_size), validation_steps = 5
# model.fit(generator(data, batch_size), epochs=2, steps_per_epoch = len(data['question'])/batch_size )
model.fit([x_train[0], x_train[1], x_train[2]], [y_train[0], y_train[1]], epochs = 5, batch_size = batch_size, validation_data = ([x_eval[0], x_eval[1], x_eval[2]], [y_eval[0], y_eval[1]]))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f962ec6dc10>

In [1]:
model.save("../output/bert_model.h5")



NameError: name 'model' is not defined

In [None]:
model.load_weights("../output/bert_model.h5")

In [13]:
!mkdir /output/kaggle/working/bert

mkdir: cannot create directory ‘/output/kaggle/working/bert’: No such file or directory


In [10]:
!cd /kaggle/bert 

In [11]:
model.save("/kaggle/bert/bert.h5")

In [45]:
ex = Sample(data['text'][30], data['question'][30])

In [46]:
def test_preprocess(example):
    dataset_dict = {
        "input_word_ids": [],
        "input_type_ids": [],
        "input_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    if example.skip==False:
      # print(ex.input_type_ids)
      for key in dataset_dict:
        dataset_dict[key].append(getattr(example, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])
    print(key, dataset_dict[key].shape)

#   data_pd = pd.DataFrame.from_dict(dataset_dict)
#   data_pd.to_pickle("/content/drive/MyDrive/NLP/final_dataset.pkl")

    train_x = np.array([dataset_dict["input_word_ids"],
         dataset_dict["input_mask"],
         dataset_dict["input_type_ids"]])
    train_y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]

    return train_x, train_y

ex.pre_process()
x_test, y_test = test_preprocess(ex)
Y = model.predict([x_test[0], x_test[1], x_test[2]])

end_token_idx (1,)


In [47]:
context = data['text'][30]
print(data['question'][30])
print(np.argmax(Y[0]), np.argmax(Y[1]))
print(data['start_id'][1], data['end_id'][1])
print(data['text'][30].split()[np.argmax(Y[0]) : np.argmax(Y[1])])
print(data['answer'][30])

What was she willing to give up
153 371
454 494
['If', 'she', 'could', 'rule', 'this', 'gun-man,', 'as', 'Venters', 'had', 'called', 'him,', 'if', 'she', 'could', 'even', 'keep', 'him', 'from', 'shedding', 'blood,', 'what', 'strategy', 'to', 'play', 'his', 'flame', 'and', 'his', 'presence', 'against', 'the', 'game', 'of', 'oppression', 'her', 'churchmen', 'were', 'waging', 'against', 'her?', 'Never', 'would', 'she', 'forget', 'the', 'effect', 'on', 'Tull', 'and', 'his', 'men', 'when', 'Venters', 'shouted', "Lassiter's", 'name.', 'If', 'she', 'could', 'not', 'wholly', 'control', 'Lassiter,', 'then', 'what', 'she', 'could', 'do', 'might', 'put', 'off', 'the', 'fatal', 'day.', 'One', 'of', 'her', 'safe', 'racers', 'was', 'a', 'dark', 'bay,', 'and', 'she', 'called', 'him', 'Bells', 'because', 'of', 'the', 'way', 'he', 'struck', 'his', 'iron', 'shoes', 'on', 'the', 'stones.', 'When', 'Jerd', 'led', 'out', 'this', 'slender,', 'beautifully', 'built', 'horse', 'Lassiter', 'suddenly', 'became',

In [49]:
print(Y[1].shape)

(1, 400)


In [50]:
import sklearn
sklearn.save("bert_model")

from IPython.display import FileLink
FileLink(‘bert_model.pth’)

SyntaxError: invalid character in identifier (<ipython-input-50-39d1a4bf68c1>, line 5)