# Setup

In [4]:
!pip install -q tf-models-official==2.3.0
!pip install transformers

[K     |████████████████████████████████| 849kB 17.2MB/s 
[K     |████████████████████████████████| 174kB 23.0MB/s 
[K     |████████████████████████████████| 102kB 13.8MB/s 
[K     |████████████████████████████████| 37.6MB 85kB/s 
[K     |████████████████████████████████| 1.2MB 49.8MB/s 
[K     |████████████████████████████████| 358kB 56.5MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 18.2MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 53.2MB/s 
Collecting sacremoses
[?25l  Downloading

In [5]:
import os
import json

import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.model_selection import train_test_split

This directory contains the configuration, vocabulary, and a pre-trained checkpoint used in this tutorial:

# Dataset

In [6]:
# import sys
# !git clone "https://github.com/giuliofortini/NLP_SQuAD_Project"
# sys.path.append("NLP_SQuAD_Project/")

from google.colab import files
try:
  with open('training_set.json') as f:
    json_data = json.load(f)
except:
  from google.colab import drive
  drive.mount('/content/drive')
  with open('/content/drive/My Drive/SQUAD/training_set.json') as f:
    json_data = json.load(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
data = pd.json_normalize(json_data['data'])
data

Unnamed: 0,title,paragraphs
0,University_of_Notre_Dame,"[{'context': 'Architecturally, the school has ..."
1,Beyoncé,[{'context': 'Beyoncé Giselle Knowles-Carter (...
2,Montana,[{'context': 'Montana i/mɒnˈtænə/ is a state i...
3,Genocide,"[{'context': 'The phrase ""in whole or in part""..."
4,Antibiotics,[{'context': 'The emergence of resistance of b...
...,...,...
437,Police,[{'context': 'A police force is a constituted ...
438,"Punjab,_Pakistan","[{'context': 'Punjab (Urdu, Punjabi: پنجاب, pa..."
439,Infection,[{'context': ' Among the vast varieties of mic...
440,Hunting,[{'context': 'Hunting is the practice of killi...


In [8]:
train, test = train_test_split(data, test_size=0.2)

In [9]:
def preprocess_df(df):
  temp = []
  title_dict = {}
  contexts = []

  for i, row in df.iterrows():
    for context in row['paragraphs']:
      contexts.append(context['context'])
      for qa in context['qas']:
        question_id = qa['id']
        question = qa['question']
        for answer in qa['answers']:
          answer_text = answer['text']
          answer_start = answer['answer_start']
          answer_end = answer_start+len(answer_text)
          temp.append([question_id, question, answer_text, answer_start, answer_end, i, len(contexts)-1])


  context_dict = dict(enumerate(contexts))

  df = pd.DataFrame(temp, columns=['question_id', 'question_text', 'answer_text', 'answer_start', 'answer_end', 'title_id', 'context_id'])
  
  return df, context_dict

train_df, train_context_dict = preprocess_df(train)
test_df, test_context_dict = preprocess_df(test)

In [10]:
train_df

Unnamed: 0,question_id,question_text,answer_text,answer_start,answer_end,title_id,context_id
0,570c2257ec8fbc190045bc62,What type of land is Antarctica considered to be?,desert,160,166,153,0
1,570c2257ec8fbc190045bc63,What is Antarctica's annual precipitation alon...,200 mm (8 in),202,215,153,0
2,570c2257ec8fbc190045bc64,What is Antarctica's average temperature durin...,−63 °C (−81 °F),392,407,153,0
3,570c2257ec8fbc190045bc65,Approximately how many people live throughout ...,"anywhere from 1,000 to 5,000",453,481,153,0
4,570c2257ec8fbc190045bc66,What are some animals native to Antarctica?,"mites, nematodes, penguins, seals and tardigrades",697,746,153,0
...,...,...,...,...,...,...,...
69866,5706b6aa2eaba6190074ac60,In what city is the Asian and Pacific Centre f...,New Delhi,183,192,107,14967
69867,5706b6aa2eaba6190074ac61,What major city is home to most regional UN of...,New Delhi,194,203,107,14967
69868,5706b6aa2eaba6190074ac62,What is one regional UN office located in New ...,UNDP,260,264,107,14967
69869,5706b6aa2eaba6190074ac63,The regional office of the World Bank in India...,New Delhi,194,203,107,14967


In [11]:
import random
def print_squad_sample(train_data, context_dict, line_length=120, separator_length=150):
  sample = train_data.sample(frac=1).head(1)
  context = context_dict[sample['context_id'].item()]
  print('='*separator_length)
  print('CONTEXT: ')
  print('='*separator_length)
  lines = [''.join(context[idx:idx+line_length]) for idx in range(0, len(context), line_length)]
  for l in lines:
      print(l)
  #print(context)
  print('='*separator_length)
  questions = train_data[train_data['context_id'] == sample['context_id'].item()]
  print('QUESTION:', ' '*(3*separator_length//4), 'ANSWER:')
  for idx, row in questions.iterrows():
    question = row.question_text
    answer = row.answer_text
    print(question, ' '*(3*separator_length//4-len(question)+9), answer)

print_squad_sample(train_df, train_context_dict)

CONTEXT: 
About that time the "executionist movement" (Polish: "egzekucja praw"--"execution of the laws") began to take form. Its 
members would seek to curb the power of the magnates at the Sejm and to strengthen the power of king and country. In 156
2 at the Sejm in Piotrków they would force the magnates to return many leased crown lands to the king, and the king to c
reate a standing army (wojsko kwarciane). One of the most famous members of this movement was Jan Zamoyski. After his de
ath in 1605, the movement lost its political force.
QUESTION:                                                                                                                  ANSWER:
What was the movement called also known as execution of laws?                                                              "executionist movement
What were the intentions of executionists movement?                                                                        seek to curb the power of the magnates at the Sejm and

# Encoding

#Tokenizer

In [12]:
from transformers import BertTokenizer, BertModel

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print('[[CLS], [SEP]] = {}'.format(tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


[[CLS], [SEP]] = [101, 102]


In [13]:
# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(train_df['question_text'][0])

# Print out the tokens.
print(tokenized_text)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
for tup in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>_6}'.format(tup[0], tup[1]))

['what', 'type', 'of', 'land', 'is', 'antarctica', 'considered', 'to', 'be', '?']
what          2,054
type          2,828
of            1,997
land          2,455
is            2,003
antarctica   12,615
considered    2,641
to            2,000
be            2,022
?             1,029


In [36]:
def from_df_to_model_dict(df, context_dict):

  # initialize structures
  input_ids = []
  input_mask = []
  input_type_ids = []
  start_indexes = []
  end_indexes = []

  cut_counter = 0
  max_iter = len(df)
  for i, row in df.iterrows():

    # print progress
    if i % 1000 == 0 and i >0:
      print(f"\r{i}/{max_iter}", end="")

    # encode question and context
    encoded_question = tokenizer.encode(row['question_text'])
    encoded_context = tokenizer.encode(context_dict[row['context_id']])[1:]
    
    # concatenate input data 
    encoded_input = encoded_question + encoded_context

    if len(encoded_input) > 512:
      cut_counter += 1
      encoded_input = encoded_input[:512]
                                       
    # create mask of ones
    ones_mask = tf.ones_like(encoded_input)

    # add padding and convert to tensor
    encoded_input = tf.keras.preprocessing.sequence.pad_sequences([encoded_input], maxlen=512, padding='pre')
    encoded_input = tf.squeeze(tf.convert_to_tensor(encoded_input))

    # create input_type_ids
    type_ids = tf.concat([tf.zeros_like(encoded_question, dtype=tf.int32), 
                          tf.ones_like(encoded_context, dtype=tf.int32)], 
                         axis=-1)

    type_ids = tf.keras.preprocessing.sequence.pad_sequences([type_ids], maxlen=512, padding='pre')
    type_ids = tf.squeeze(tf.convert_to_tensor(type_ids))

    # create mask of zeros
    zeros_mask = tf.zeros(tf.shape(encoded_input)[-1]-tf.shape(ones_mask)[-1], dtype=tf.int32)
    mask = tf.concat([zeros_mask, ones_mask], axis=-1)

    # append elements to lists
    input_ids.append(encoded_input)
    input_mask.append(mask)
    input_type_ids.append(type_ids)

    tokenized_answer = tokenizer.encode(row['answer_text'])[1: -1]
    start_idx = list(encoded_input).index(tokenized_answer[0])
    end_idx = start_idx + len(tokenized_answer)
    
    start_indexes.append(start_idx)
    end_indexes.append(end_idx)

  print("cut_counter: ", cut_counter)

  # save input data as dictionary
  inputs = {
    'input_ids': tf.convert_to_tensor(input_ids),
    'input_mask': tf.convert_to_tensor(input_mask),
    'input_type_ids': tf.convert_to_tensor(input_type_ids)
    }

  return inputs, start_indexes, end_indexes

In [37]:
train_dict, train_starts, train_ends = from_df_to_model_dict(train_df, train_context_dict)
test_dict, test_starts, test_ends = from_df_to_model_dict(test_df, test_context_dict)

for i in range(10):
  print(train_starts[i], train_ends[i])

29000/30000cut_counter:  27
Too long sequences ids:  [10968, 10969, 10970, 10971, 10972, 17342, 17343, 17344, 17345, 18916, 18917, 18918, 18919, 18920, 19957, 19958, 19959, 19960, 19961, 19962, 19963, 19964, 27803, 27804, 27805, 27806, 27807]


In [91]:
# longest answer
max_len = 0
for answer in train_df["answer_text"]:
  length = len(tokenizer.encode(answer))
  max_len = max(length, max_len)

In [92]:
max_len

70

In [51]:
id = 10969
context = train_context_dict[train_df["context_id"][id]]
question = train_df["question_text"][id]
print("id: ", id)
print("Question: ", question)
print("Context: ", context)

print("\nTokenized question len: ", len(tokenizer.encode(question)))
print("Tokenized context len: ", len(tokenizer.encode(context)))
print("Total length: ", len(tokenizer.encode(context) + tokenizer.encode(question)))

id:  10969
Question:  What was the title of Aerosmith's 1987 comeback album?
Context:  Established acts benefited from the new commercial climate, with Whitesnake's self-titled album (1987) selling over 17 million copies, outperforming anything in Coverdale's or Deep Purple's catalogue before or since. It featured the rock anthem "Here I Go Again '87" as one of 4 UK top 20 singles. The follow-up Slip of the Tongue (1989) went platinum, but according to critics Steve Erlwine and Greg Prato, "it was a considerable disappointment after the across-the-board success of Whitesnake". Aerosmith's comeback album Permanent Vacation (1987) would begin a decade long revival of their popularity. Crazy Nights (1987) by Kiss was the band's highest charting release in the US since 1979 and the highest of their career in the UK. Mötley Crüe with Girls, Girls, Girls (1987) continued their commercial success and Def Leppard with Hysteria (1987) hit their commercial peak, the latter producing seven hit si

In [95]:
longest_id = [10968, 10969, 10970, 10971, 10972, 17342, 17343, 17344, 17345, 18916, 18917, 18918, 18919, 18920, 19957, 19958, 19959, 19960, 19961, 19962, 19963, 19964, 27803, 27804, 27805, 27806, 27807]

lengths = [len(tokenizer.tokenize(train_context_dict[train_df["context_id"][id]])) for id in longest_id]
print(max(lengths))

718


In [101]:
cristo = list(range(718))

# ensure to preserve all sub-sequences of length limi-stride. 
def split_long_sequence(my_sequence, limit, stride):
  rest = my_sequence
  split = []
  while len(rest) > limit:
    left_hand = rest[:limit]
    rest = rest[stride:]
    split.append(left_hand)
  split.append(rest)
  return split 

limit = 512
stride = 200
for s in split_long_sequence(cristo, limit=limit, stride=stride):
  print(s)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [30]:
print(train_dict['input_ids'][0])
print(train_dict['input_mask'][0])
print(train_dict['input_type_ids'][0])

tf.Tensor(
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0    

In [34]:
text = "porca la madonna quella zozzona"
tokenizer.encode(text) + tokenizer.encode(text)

[101,
 18499,
 3540,
 2474,
 11284,
 10861,
 4571,
 1062,
 18153,
 11597,
 2050,
 102,
 101,
 18499,
 3540,
 2474,
 11284,
 10861,
 4571,
 1062,
 18153,
 11597,
 2050,
 102]

In [None]:
# Set up epochs and steps
epochs = 3
batch_size = 32

train_data_size = len(train_dict['input_ids'])
test_data_size = len(test_dict['input_ids'])
train_steps_per_epoch = int(train_data_size / batch_size)
test_steps_per_epoch = int(test_data_size / batch_size)

# creates an optimizer with learning rate schedule
# optimizer = nlp.optimization.create_optimizer(
#     2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [None]:
from transformers import TFBertModel
model = TFBertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
print(train_dict['input_type_ids'].shape)
print(train_dict['input_ids'].shape)
print(train_dict['input_mask'].shape)

print(test_dict['input_type_ids'].shape)
print(test_dict['input_ids'].shape)
print(test_dict['input_mask'].shape)

(10000, 512)
(10000, 512)
(10000, 512)
(10000, 512)
(10000, 512)
(10000, 512)


In [None]:
def train_model(train_dict, steps, batch_size=32):
  for i in range(0, steps):
    input = {'input_ids' : train_dict['input_ids'][i:i+batch_size],
            'input_mask' : train_dict['input_mask'][i:i+batch_size],
            'input_type_ids' : train_dict['input_type_ids'][i:i+batch_size]
            }
    print('Batch: ' + str(i+1) + '/' + str(steps))
    model(input, training = True)

train_model(train_dict, train_steps_per_epoch)

Batch: 1/312
Batch: 2/312
Batch: 3/312
Batch: 4/312
Batch: 5/312
Batch: 6/312
Batch: 7/312
Batch: 8/312
Batch: 9/312
Batch: 10/312
Batch: 11/312
Batch: 12/312
Batch: 13/312
Batch: 14/312
Batch: 15/312
Batch: 16/312
Batch: 17/312
Batch: 18/312
Batch: 19/312
Batch: 20/312
Batch: 21/312
Batch: 22/312
Batch: 23/312
Batch: 24/312
Batch: 25/312
Batch: 26/312
Batch: 27/312
Batch: 28/312
Batch: 29/312
Batch: 30/312
Batch: 31/312
Batch: 32/312
Batch: 33/312
Batch: 34/312
Batch: 35/312
Batch: 36/312
Batch: 37/312
Batch: 38/312
Batch: 39/312
Batch: 40/312
Batch: 41/312
Batch: 42/312
Batch: 43/312
Batch: 44/312
Batch: 45/312
Batch: 46/312
Batch: 47/312
Batch: 48/312
Batch: 49/312
Batch: 50/312
Batch: 51/312
Batch: 52/312
Batch: 53/312
Batch: 54/312
Batch: 55/312
Batch: 56/312
Batch: 57/312
Batch: 58/312
Batch: 59/312
Batch: 60/312
Batch: 61/312
Batch: 62/312
Batch: 63/312
Batch: 64/312
Batch: 65/312
Batch: 66/312
Batch: 67/312
Batch: 68/312
Batch: 69/312
Batch: 70/312
Batch: 71/312
Batch: 72/312
B

In [None]:
def test_model(test_dict, steps, batch_size=32):
  output = []

  for i in range(0, steps):
    input = {'input_ids' : test_dict['input_ids'][i:i+batch_size],
            'input_mask' : test_dict['input_mask'][i:i+batch_size],
            'input_type_ids' : test_dict['input_type_ids'][i:i+batch_size]
            }
    print('Batch: ' + str(i+1) + '/' + str(steps))
    output.append(model(input))
  
  return output

test_answers = test_model(test_dict, test_steps_per_epoch)

Batch: 1/312
Batch: 2/312
Batch: 3/312
Batch: 4/312
Batch: 5/312
Batch: 6/312
Batch: 7/312
Batch: 8/312
Batch: 9/312
Batch: 10/312
Batch: 11/312
Batch: 12/312
Batch: 13/312
Batch: 14/312
Batch: 15/312
Batch: 16/312
Batch: 17/312
Batch: 18/312
Batch: 19/312
Batch: 20/312
Batch: 21/312
Batch: 22/312
Batch: 23/312
Batch: 24/312
Batch: 25/312
Batch: 26/312
Batch: 27/312
Batch: 28/312
Batch: 29/312
Batch: 30/312
Batch: 31/312


In [None]:
print(test_answers[0].last_hidden_state)

tf.Tensor(
[[[-0.4465253   0.48536447  0.06580014 ... -0.46521625  0.41014314
   -0.70420337]
  [-0.6818489  -0.2513821   0.5476869  ... -0.3495175   0.79528636
   -0.9575961 ]
  [-0.33148208  0.00827573  0.59227216 ... -0.5239812   0.08607539
   -1.0944345 ]
  ...
  [-1.0840538   0.35074574  0.7235429  ...  0.43597403  0.7232471
   -0.30706415]
  [-0.08872701 -1.2204639   1.1156192  ... -0.00800674 -0.06154624
   -1.140662  ]
  [ 0.55041736  0.35058394  0.09902531 ... -0.11558585 -0.48601955
   -0.3842366 ]]

 [[-0.44596645  0.5361833   0.12666115 ... -0.4712901   0.39068177
   -0.7523664 ]
  [-0.51936555 -0.13520418  0.48739904 ... -0.37786257  0.79161364
   -0.98738134]
  [-0.2422797   0.05043562  0.59685683 ... -0.52779245  0.10194599
   -1.0443174 ]
  ...
  [ 0.19997582  0.23308192 -0.20614423 ...  0.37549567 -0.14534032
    0.47882754]
  [-0.29459804  0.00491316 -0.3172179  ... -0.82158023  0.09964563
   -0.937211  ]
  [ 0.55980796  0.37440133  0.11636388 ... -0.08858828 -0.49803

In [None]:
idx = test_answers[0].last_hidden_state[0]
idx

<tf.Tensor: shape=(512, 768), dtype=float32, numpy=
array([[-0.4465253 ,  0.48536447,  0.06580014, ..., -0.46521625,
         0.41014314, -0.70420337],
       [-0.6818489 , -0.2513821 ,  0.5476869 , ..., -0.3495175 ,
         0.79528636, -0.9575961 ],
       [-0.33148208,  0.00827573,  0.59227216, ..., -0.5239812 ,
         0.08607539, -1.0944345 ],
       ...,
       [-1.0840538 ,  0.35074574,  0.7235429 , ...,  0.43597403,
         0.7232471 , -0.30706415],
       [-0.08872701, -1.2204639 ,  1.1156192 , ..., -0.00800674,
        -0.06154624, -1.140662  ],
       [ 0.55041736,  0.35058394,  0.09902531, ..., -0.11558585,
        -0.48601955, -0.3842366 ]], dtype=float32)>

In [None]:
tokenizer.decode(np.argmax(idx, axis=1))

'[unused200] [unused617] [unused303] [unused303] [unused303] [unused303] [unused268] [unused268] [unused268] [unused303] [unused268] [unused268] [unused268] [unused303] [unused268] [unused268] [unused268] [unused268] [unused303] [unused303] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused303] [unused268] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused303] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [unused268] [un

In [None]:
int_indexes = []
for id in idx:
  int_indexes.append([id[-2], id[-1]])

In [None]:
idx[0]

<tf.Tensor: shape=(768,), dtype=float32, numpy=
array([-4.46525306e-01,  4.85364467e-01,  6.58001378e-02,  4.71078724e-01,
       -4.78263587e-01, -1.06993869e-01,  1.12684917e+00, -3.90133917e-01,
        1.38233513e-01,  1.15629986e-01, -1.96294591e-01, -6.29772484e-01,
       -2.84878433e-01,  5.34518957e-01,  5.14786899e-01,  4.44629073e-01,
        1.03844553e-01,  6.91295564e-01, -2.27408230e-01,  9.72810313e-02,
        4.71048623e-01, -7.89988190e-02,  7.51327097e-01,  1.58075243e-01,
       -6.11185990e-02,  1.17476657e-02, -2.52782345e-01, -1.48210025e+00,
       -7.44268894e-01, -1.95111185e-01, -5.63248217e-01,  7.19994545e-01,
       -1.90814175e-02, -5.12300432e-01,  4.11216706e-01, -2.07561985e-01,
       -4.91175473e-01, -2.02412933e-01,  8.61362278e-01, -1.97490662e-01,
       -1.84415698e-01, -4.09926832e-01,  5.78411400e-01, -8.86907354e-02,
       -3.24657738e-01, -1.69748947e-01, -3.83694077e+00,  1.88484922e-01,
       -4.50488269e-01, -9.69166040e-01, -1.16077825

In [None]:
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(idx[0]))

'[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [unused0] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [unused0] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [unused0] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [unused0] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [unused0] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [UNK] [PAD] [PAD] [PAD] [PAD] [PAD] [unused0] [PAD] [PAD] [PA