# BERT (from HuggingFace Transformers) for Text Extraction


## Setup


In [280]:
!pip install transformers



In [281]:
!pip install tokenizers




In [282]:
#Imports

import os
import re
import json
import string
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT


## Set-up BERT tokenizer


In [283]:
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)


## Neccessary Functions

In [284]:
class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find end character index of answer in context
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx >= len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets


In [285]:
def create_squad_examples(raw_data):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


In [286]:
def create_inputs_targets(squad_examples):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if item.skip == False:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y


# Load Data

In [287]:
ft_data = pd.read_csv('/content/faq.csv')

In [288]:
ft_data.head()

Unnamed: 0,Question,Unnamed: 1,Answer,Changed answer,answer_start,Context_index
0,Are session pre-class quizzes graded?,Grading,No. Pre-class quiz is to just check your under...,pre class quizzes will not be graded,322,0
1,When is the deadline for the submission of ses...,Deadlines,5 PM on the day of the following lecture.,before five pm on the day of the following lec...,97,1
2,Will there be any extension allowed for the qu...,Deadlines,Only for exceptional cases.,Extensions will be allowed only in exceptional...,149,1
3,How many hours will I need to dedicate to succ...,General,About 15 hours per week.,about fifteen hours a week,26,2
4,Who will grade my exercise?,Grading,The exercises are auto-graded once you click t...,they are auto graded by the edstem platform,456,0


In [289]:
f = open('/content/Custom_Context.txt', 'r')
content = f.read()

In [290]:
content

"['Your final grade will be determined based on the weighted average of the exercises, participation, homework and a final project. The faculty reserves the flexibility to change the grade thresholds for the final grade. Exercises and the post class quizzes comprise of a total of thirty five percent of the final grade. The pre class quizzes are only for your understanding and will not be graded. The exercises need to be submitted before set deadlines and they are auto graded by the edstem platform. You need to be careful before changing some part of your code if you approach a question differently as the auto grader does not accept alternative solutions please follow the instructions. Ten percent of your grade is participation. Attendance is graded and has a small percent in the final grade. Labs are not graded. Check the Course Information page for more.']\n\n['All the assignments have set deadlines. The exercise and post class quizzes have to be submitted before five pm on the day of

In [291]:
split_text = content.splitlines()

In [292]:
context_0 = split_text[0][2:-2]
context_1 = split_text[2][2:-2]
context_2 = split_text[4][2:-2]
context_3 = split_text[6][2:-2]
context_4 = split_text[8][2:-2]


In [293]:
ft_data['answer_end'] =0
for i in range(len(ft_data)):
  ft_data['answer_end'][i] = len(ft_data['Changed answer'][i]) + ft_data['answer_start'][i]  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [294]:
ft_data['context'] = ''
for i in range(len(ft_data)):
  if ft_data['Context_index'][i] == 0:
    ft_data['context'][i] = context_0

  elif ft_data['Context_index'][i] == 1:
    ft_data['context'][i] = context_1

  elif ft_data['Context_index'][i] == 2:
    ft_data['context'][i] = context_2

  elif ft_data['Context_index'][i] == 3:
    ft_data['context'][i] = context_3

  elif ft_data['Context_index'][i] == 4:
    ft_data['context'][i] = context_4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice fr

In [295]:
ft_data.head()

Unnamed: 0,Question,Unnamed: 1,Answer,Changed answer,answer_start,Context_index,answer_end,context
0,Are session pre-class quizzes graded?,Grading,No. Pre-class quiz is to just check your under...,pre class quizzes will not be graded,322,0,358,Your final grade will be determined based on t...
1,When is the deadline for the submission of ses...,Deadlines,5 PM on the day of the following lecture.,before five pm on the day of the following lec...,97,1,147,All the assignments have set deadlines. The ex...
2,Will there be any extension allowed for the qu...,Deadlines,Only for exceptional cases.,Extensions will be allowed only in exceptional...,149,1,201,All the assignments have set deadlines. The ex...
3,How many hours will I need to dedicate to succ...,General,About 15 hours per week.,about fifteen hours a week,26,2,52,We expect you to work for about fifteen hours ...
4,Who will grade my exercise?,Grading,The exercises are auto-graded once you click t...,they are auto graded by the edstem platform,456,0,499,Your final grade will be determined based on t...


In [296]:
ft_data['answer_end'] =0
for i in range(len(ft_data)):
  ft_data['answer_end'][i] = len(ft_data['Changed answer'][i]) + ft_data['answer_start'][i]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [297]:
ft_data.columns = ['question', 'context_title', 'answer', 'answer_text', 'answer_start', 'context_id', 'answer_end', 'context']
ft_data.head()


Unnamed: 0,question,context_title,answer,answer_text,answer_start,context_id,answer_end,context
0,Are session pre-class quizzes graded?,Grading,No. Pre-class quiz is to just check your under...,pre class quizzes will not be graded,322,0,358,Your final grade will be determined based on t...
1,When is the deadline for the submission of ses...,Deadlines,5 PM on the day of the following lecture.,before five pm on the day of the following lec...,97,1,147,All the assignments have set deadlines. The ex...
2,Will there be any extension allowed for the qu...,Deadlines,Only for exceptional cases.,Extensions will be allowed only in exceptional...,149,1,201,All the assignments have set deadlines. The ex...
3,How many hours will I need to dedicate to succ...,General,About 15 hours per week.,about fifteen hours a week,26,2,52,We expect you to work for about fifteen hours ...
4,Who will grade my exercise?,Grading,The exercises are auto-graded once you click t...,they are auto graded by the edstem platform,456,0,499,Your final grade will be determined based on t...


In [298]:
squad_examples_ft = []

all_answers = ft_data['answer_text']

for i in range(len(ft_data)): 
  context = ft_data['context'][i]
  question = ft_data['question'][i]
  answer_text = ft_data['answer_text'][i]
  start_char_idx = ft_data['answer_start'][i]

  squad_eg = SquadExample(question, context, start_char_idx, answer_text, all_answers)
  squad_eg.preprocess()
  squad_examples_ft.append(squad_eg)

In [299]:
ft_examples = squad_examples_ft
x_train_ft, y_train_ft = create_inputs_targets(ft_examples)
print(f"{len(ft_examples)} training points created.")

33 training points created.


# BERT Model


In [300]:
def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("bert-base-uncased")

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

This code should preferably be run on Google Colab TPU runtime.
With Colab TPUs, each epoch will take 5-6 minutes.


In [301]:
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()





INFO:tensorflow:Initializing the TPU system: grpc://10.88.114.122:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.88.114.122:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.










Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_15 (InputLayer)           [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_4 (TFBertModel)   TFBaseModelOutputWit 109482240   input_13[0][0]                   
                                                                 input_15[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


# Training and Fine Tuning


In [302]:
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.










Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 384)]        0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, 384)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_5 (TFBertModel)   TFBaseModelOutputWit 109482240   input_16[0][0]                   
                                                                 input_18[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [303]:
model.fit(
    x_train_ft,
    y_train_ft,
    epochs=600,  # For demonstration, 3 epochs are recommended
    batch_size=64
)

Epoch 2/600
Epoch 3/600
Epoch 4/600
Epoch 5/600
Epoch 6/600
Epoch 7/600
Epoch 8/600
Epoch 9/600
Epoch 10/600
Epoch 11/600
Epoch 12/600
Epoch 13/600
Epoch 14/600
Epoch 15/600
Epoch 16/600
Epoch 17/600
Epoch 18/600
Epoch 19/600
Epoch 20/600
Epoch 21/600
Epoch 22/600
Epoch 23/600
Epoch 24/600
Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600
Epoch 29/600
Epoch 30/600
Epoch 31/600
Epoch 32/600
Epoch 33/600
Epoch 34/600
Epoch 35/600
Epoch 36/600
Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600
Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600
Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78/600
Epoch 7

<tensorflow.python.keras.callbacks.History at 0x7f3569dfa350>

# Prediction

In [304]:
test_data = pd.read_csv("/content/Copy of test - test.csv")

In [305]:
test_data.head()

Unnamed: 0,Questions,Context_id
0,Will the pre-class session be recorded?,4
1,What is the deadline for quiz submission?,1
2,What is the deadline for exercise submission?,1
3,How many hours do I need to complete this course?,2
4,Who will grade the exercise?,0


In [306]:

test_data['context'] = ''
for i in range(len(test_data)):
  if test_data['Context_id'][i] == 0:
    test_data['context'][i] = context_0

  elif test_data['Context_id'][i] == 1:
    test_data['context'][i] = context_1

  elif test_data['Context_id'][i] == 2:
    test_data['context'][i] = context_2

  elif test_data['Context_id'][i] == 3:
    test_data['context'][i] = context_3

  elif test_data['Context_id'][i] == 4:
    test_data['context'][i] = context_4

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http

In [307]:
test_data.drop('Context_id', axis = 1, inplace = True)

In [308]:
#To preprocess the test questions
def preprocess_test(context, question):
  tokenized_context = tokenizer.encode(context)
  tokenized_question = tokenizer.encode(question)
  input_ids = tokenized_context.ids + tokenized_question.ids[1:]
  token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_question.ids[1:])
  attention_mask = [1] * len(input_ids)
  padding_length = max_len - len(input_ids)
  if padding_length > 0:
    input_ids = input_ids + ([0] * padding_length)
    attention_mask = attention_mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)
  elif padding_length < 0:
    None;
  input_ids = np.array(input_ids)
  token_type_ids = np.array(token_type_ids)
  attention_mask = np.array(attention_mask)
  x_test = [input_ids.reshape(1, 384), token_type_ids.reshape(1, 384), attention_mask.reshape(1, 384)]
  return x_test



#Tokens to translate
import nltk
nltk.download('punkt')

context_list = [quest for quest in test_data.context]
from nltk.tokenize import word_tokenize
tokenized_context = [word_tokenize(i) for i in context_list]


#To process Predictions
def process_output(prediction, i):
  start_idx = prediction[0][0].argmax()
  end_idx = prediction[1][0].argmax()
  answer = tokenized_context[i][start_idx:end_idx]
  return ' '.join(answer)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [309]:
for i in range(len(test_data)):
  x_test = preprocess_test(test_data.context[i], test_data.Questions[i])
  y_test = model.predict(x_test)
  answer = process_output(y_test, i)
  print('Question: ', test_data.Questions[i])
  print('Answer: ', answer)
  print('------------------------------------------------')
  print('                                                 ')
  

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 384) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 384) dtype=int64>]










Question:  Will the pre-class session be recorded?
Answer:  sessions will be recorded
------------------------------------------------
                                                 
Question:  What is the deadline for quiz submission?
Answer:  on the day of the following lecture . Extensions
------------------------------------------------
                                                 
Question:  What is the deadline for exercise submission?
Answer:  on the day of the following lecture . Extensions
------------------------------------------------
                                                 
Question:  How many hours do I need to complete this course?
Answer:  fifteen hours a week
------------------------------------------------
                                                 
Question:  Who will grade the exercise?
Answer:  the edstem platform . You need to be
------------------------------------------------
                                                 
Question:  Why i

In [310]:
#model.save_weights('bert_weights_final.h5')