# SciBERT Implementation

- SciBERT: https://github.com/allenai/scibert
- Notebook: https://github.com/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb

In [1]:
import torch
from transformers import *

In [2]:
# Load the tokenizer and the encoder from the same SciBERT checkpoint so that
# the vocabulary ids produced by the tokenizer match the model's embeddings.
SCIBERT_CHECKPOINT = 'allenai/scibert_scivocab_uncased'
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)
model = AutoModel.from_pretrained(SCIBERT_CHECKPOINT)

In [3]:
# This notebook only runs forward passes, so switch autograd off globally.
torch.set_grad_enabled(mode=False)

<torch.autograd.grad_mode.set_grad_enabled at 0x233b04b4908>

In [4]:
# Step 1: split the raw text into tokens from the tokenizer's vocabulary.
tokens = tokenizer.tokenize("Complementary Dual-Contact Switch Using Soft and Hard Contact Materials for Achieving Low Contact Resistance and High Reliability Simultaneously")
print("Tokens: {}".format(tokens))

# Step 2: the model takes integer ids, not strings, so map each token to its id.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Step 3: add the special tokens the model expects around the sequence
# (this is why the tensor printed below is two ids longer than the list above).
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# Step 4: convert to a PyTorch tensor; the extra list level adds the batch dimension.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Step 5: run the model on our input.
# Index the result positionally instead of tuple-unpacking it: recent
# transformers releases return a dict-like ModelOutput by default, and
# unpacking that yields its keys (strings) rather than tensors. Integer
# indexing works for both the legacy tuple and ModelOutput.
model_output = model(tokens_pt)
outputs, pooled = model_output[0], model_output[1]
print("Token wise output: {}, Pooled output: {}".format(outputs.shape, pooled.shape))

Tokens: ['complementary', 'dual', '-', 'contact', 'switch', 'using', 'soft', 'and', 'hard', 'contact', 'materials', 'for', 'achieving', 'low', 'contact', 'resistance', 'and', 'high', 'reliability', 'simultaneously']
Tokens id: [8487, 4793, 579, 3585, 6216, 487, 1720, 137, 2723, 3585, 2518, 168, 9153, 629, 3585, 2661, 137, 597, 4817, 5364]
Tokens PyTorch: tensor([[ 102, 8487, 4793,  579, 3585, 6216,  487, 1720,  137, 2723, 3585, 2518,
          168, 9153,  629, 3585, 2661,  137,  597, 4817, 5364,  103]])
Token wise output: torch.Size([1, 22, 768]), Pooled output: torch.Size([1, 768])


In [5]:
# Display the pooled output tensor computed in the previous cell.
pooled

tensor([[-0.0416, -0.1480, -0.6753,  0.8675,  0.2088,  0.9992,  0.2641, -0.9997,
          0.2043,  0.7729,  0.1298,  0.5405,  0.7846,  0.7665, -0.4069, -0.0907,
         -0.5304, -0.4133,  0.7156, -0.9106, -0.1457,  0.5916, -0.5221,  0.6518,
          0.1237,  0.4443, -0.9847, -0.1341, -0.2130,  0.4250,  0.4813, -0.9459,
          0.4612, -0.3520, -0.3998,  0.0332,  0.9989,  0.2931,  0.7156,  0.0493,
         -0.5442,  0.2990,  0.4944, -0.4461,  0.5150, -0.0930, -0.4712,  0.3879,
         -0.6986,  0.9627,  0.0752, -0.3971, -0.1057,  0.5300,  0.9873,  0.0485,
         -0.2699,  0.3184, -0.6361,  0.5775,  0.0217, -0.9774,  0.4412,  0.5084,
          0.5062, -0.4235, -0.2550, -0.3205,  0.9495,  0.2289,  0.9662,  0.3846,
          0.1071, -0.7250,  0.9554,  0.0102, -0.0994, -0.3838,  0.0567,  0.1527,
          0.2297,  0.1733,  0.0352, -0.9719, -0.5161, -0.4337, -0.0113, -0.3069,
          0.9672, -0.7773, -0.4541,  0.0539, -0.4561,  0.0442, -0.4799,  0.2286,
         -0.5568,  0.1368,  

In [6]:
# The manual pipeline used earlier, i.e.
#   tokens = tokenizer.tokenize(text)
#   tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
#   tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)
#   tokens_pt = torch.tensor([tokens_ids])
# can be factored into a single call as follows.
sentence = "Complementary Dual-Contact Switch Using Soft and Hard Contact Materials for Achieving Low Contact Resistance and High Reliability Simultaneously"
tokens_pt2 = tokenizer.encode_plus(sentence, return_tensors="pt")

# Show every tensor the tokenizer produced.
for key, value in tokens_pt2.items():
    print("{}:\n\t{}".format(key, value))

# Index the result positionally instead of tuple-unpacking it: recent
# transformers releases return a dict-like ModelOutput by default, and
# unpacking that yields its keys (strings) rather than tensors.
model_output2 = model(**tokens_pt2)
outputs2, pooled2 = model_output2[0], model_output2[1]

# Sum absolute differences so positive and negative deviations cannot cancel
# out and hide a mismatch with the manually built input.
print("Difference with previous code: ({}, {})".format((outputs2 - outputs).abs().sum(), (pooled2 - pooled).abs().sum()))

input_ids:
	tensor([[ 102, 8487, 4793,  579, 3585, 6216,  487, 1720,  137, 2723, 3585, 2518,
          168, 9153,  629, 3585, 2661,  137,  597, 4817, 5364,  103]])
token_type_ids:
	tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
	tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Difference with previous code: (0.0, 0.0)
