In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
!pip install transformers 
!pip install --upgrade tensorflow

Requirement already up-to-date: tensorflow in /usr/local/lib/python3.6/dist-packages (2.2.0rc2)


Import all required modules
 


In [0]:
import json 
import dask 
import dask.bag as db
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers  import TFBertModel, BertTokenizer, BertConfig
# from transformers import DistilBertForQuestionAnswering, DistilBertConfig
from sklearn.model_selection import train_test_split
from dask import delayed 
import tensorflow as tf 
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.activations import softmax
import os

Set the path where train data is located 


In [0]:
train_path = '/content/drive/My Drive/SQUAD/train-v2.0.json'

Create the tokenizer


In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Create some fuctions, which will be helpful for data pre-processing


In [0]:
def find_sublist(lst, sub_lst):
    """
    This function looks for the sub-list in a list and returns the starting and
    ending indexes. If the sub-list was not found, 0 and 1 are returned.
    """
    for i in range(len(lst)):
        if lst[i:i+len(sub_lst)] == sub_lst:
            return i, i+len(sub_lst)
    return 0, 1

In [0]:
def encode_tokens(tokens, tokenizer, max_seq_length):
    """
    This fuction converts tokens into token ids depending on the tokenizer. In 
    addition, it adds zero-padding according to max-seq-length. If there are 
    500 token-ids in a sequance, this function will add 12 zeros into the end.
    """
    encoded_tokens = tokenizer.encode(tokens)
    padding = [0] * (max_seq_length - len(tokens) - 2)
    encoded_tokens += padding
    return encoded_tokens

In [0]:
def divide_sentences(context_sentences, tokenizer, empty_spaces):
    """
    In pretrained BERT model, the maximum sequance length is 512. If length of 
    an article is more than 512, then we have to divide this article somehow. If 
    we just divide the article, we may lose precious information from previous 
    words in the sentence. That is why we make a 'sentence-wise' division, which 
    means, that we divide articles by sentences, ton the words.

    This function returns the list of sentences.
    """
    sentences = []
    s = empty_spaces
    
    span = []
    for sentence in context_sentences:
        tokenized_sentence = tokenizer.tokenize(sentence)
        if len(span) + len(tokenized_sentence) < s:
            span += tokenized_sentence
        else:
            sentences.append(span)
            span = []
            s = empty_spaces
            
    sentences.append(span)

    return sentences

In [0]:
def one_hot(array, dim=512):
    """
    Converts a list to one-hot representation
    """
    np_ = np.array(array)
    new_array = np.zeros(shape=(np_.shape[0], dim))
    new_array[np.arange(np_.shape[0]), np_] = 1
    return np.array(new_array, dtype=np.int32)

In [0]:
class Articles:
    """
    Class for preprocessing SQUAD 2.0 dataset 
    """
    def __init__(self, train_path, batch_size=128):
        self.train_path = train_path
        self.batch_size = batch_size
        self.answers = None
        self.segment_ids = None
        self.token_ids = None
        self.start = None
        self.stop = None
        self.mask = None
        self.data = None


    def download_data(self):
        """
        This method reads the file, given as train_path and creates Pandas DataFrame
        for a better representation of data. 
        """
        questions = []
        start = []
        end = []
        answers = []
        contexts = []
        is_impossible = []

        with open(self.train_path, 'r') as f:
            text = f.read()
            data = json.loads(text)

            n = len(data["data"])

            for i in tqdm(range(n), position=0):
                for j in range(len(data["data"][i]["paragraphs"])):
                    for k in range(len(data["data"][i]["paragraphs"][j]["qas"])):
                        for l in range(len(data["data"][i]["paragraphs"][j]["qas"][k]["answers"])):

                            q = data["data"][i]["paragraphs"][j]["qas"][k]["question"]
                            s = data["data"][i]["paragraphs"][j]["qas"][k]["answers"][l]["answer_start"]
                            a = data["data"][i]["paragraphs"][j]["qas"][k]["answers"][l]["text"]
                            e = s + len(a)
                            c = data["data"][i]["paragraphs"][j]["context"]
                            is_imp = data["data"][i]["paragraphs"][j]["qas"][k]["is_impossible"]

                            questions.append(q)
                            start.append(s)
                            end.append(e)
                            answers.append(a)
                            contexts.append(c)
                            is_impossible.append(is_imp)

        self.data = pd.DataFrame({"context": contexts, "question": questions, 
                                  "start": start, "end": end, "answer": answers, 
                                  "is_impossible": is_impossible})
        

    def process(self, max_seq_length, cls_token="[CLS]", sep_token="[SEP]", fraction=100):
        """
        Converts data in DatFrame into the representations compatible with Bert
        """
        encoded_sep_token = tokenizer.encode(sep_token)[1]
        encoded_tokens_matrix = []
        mask_matrix = []
        segment_ids_matrix = []
        start = []
        stop = []
        for i in tqdm(range(int(self.data.shape[0]/ fraction))):
            context = self.data["context"][i]
            context_sentences = context.split(".")
            question = self.data["question"][i]
            answer = self.data["answer"][i]

            tokenized_question = tokenizer.tokenize(question)
            empty_spaces = max_seq_length - (len(tokenized_question) + 3)
            divided_sentences = divide_sentences(context_sentences, tokenizer, empty_spaces)

            for el in divided_sentences:
                tokens = []
                tokens += tokenized_question
                tokens.append(sep_token)
                tokens += el
            
                tokenized_answer = tokenizer.tokenize(answer)
                changed_start, changed_end = find_sublist(tokens, tokenized_answer)

                encoded_tokens = encode_tokens(tokens, tokenizer, 512)
                mask = [1 if el != 0 else 0 for el in encoded_tokens]
                segment_ids = [0 if j < encoded_tokens.index(encoded_sep_token) or encoded_tokens[j] == 0 else 1 for j in range(len(encoded_tokens))]
                
                encoded_tokens_matrix.append(encoded_tokens)
                mask_matrix.append(mask)
                segment_ids_matrix.append(segment_ids)
                start.append(changed_start)
                stop.append(changed_end)


        self.token_ids = np.array(encoded_tokens_matrix, dtype=np.int32)
        self.mask = np.array(mask_matrix, dtype=np.int32)
        self.segment_ids = np.array(segment_ids_matrix, dtype=np.int32)
        self.start = one_hot(start)
        self.stop = one_hot(stop)

In [11]:
a = Articles(train_path, batch_size=16)
a.download_data()
a.process(512, fraction=1)

100%|██████████| 442/442 [00:00<00:00, 2243.84it/s]
100%|██████████| 86821/86821 [05:50<00:00, 247.38it/s]


In [0]:
def create_model():
    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=1024,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    
    bert = TFBertModel.from_pretrained('bert-base-uncased')
    
    dense = Dense(2)


    input_ids = Input(shape=(512,), dtype=tf.int32)
    mask = Input(shape=(512,), dtype=tf.int32)
    segment_ids = Input(shape=(512,), dtype=tf.int32)

    last_hidden_states, pooled_output = bert(input_ids, attention_mask=mask, token_type_ids=segment_ids)

    med = dense(last_hidden_states)

    start_logits, end_logits = tf.split(med, 2, axis=-1)
    start = tf.squeeze(start_logits, axis=-1)
    end = tf.squeeze(end_logits, axis=-1)

    start = softmax(start)
    end = softmax(end)

    model = Model(inputs=[input_ids, mask, segment_ids], outputs=[start, end])
    
    return model



In [13]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.experimental.TPUStrategy(resolver)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


INFO:tensorflow:Initializing the TPU system: grpc://10.27.102.90:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.27.102.90:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [14]:
optimizer = tf.optimizers.Adam(learning_rate=0.00001, decay=1e-5)
with strategy.scope():
    model = create_model()
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit([a.token_ids, a.mask, a.segment_ids], [a.start, a.stop], epochs = 10)

Epoch 1/10
















Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
 443/2717 [===>..........................] - ETA: 7:14 - loss: 0.7434 - tf_op_layer_Softmax_1_loss: 0.3049 - tf_op_layer_Softmax_accuracy: 0.8394 - tf_op_layer_Softmax_1_accuracy: 0.8940 - tf_op_layer_Softmax_loss: 0.4385

KeyboardInterrupt: ignored