In [1]:
import warnings
# Silence every warning category for the rest of the session. This runs before
# the torch / transformers imports below, presumably so that their import-time
# warnings are hidden as well.
# NOTE(review): a blanket "ignore" also hides deprecation notices.
warnings.filterwarnings("ignore")

import torch
from transformers import BertTokenizer, BertModel
from txt2graph import text_to_graph

# Single checkpoint name shared by the tokenizer and the encoder so that the
# vocabulary and the weights always come from the same pretrained model.
MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

In [2]:

# Three example sentences of different lengths, so the batch needs padding
text = [
    "BERT is great for natural language processing!",
    "Hi my name is Danik.",
    "Bob",
]

# Tokenize the whole batch in one call; padding=True pads every sequence to
# the longest one, and the returned attention_mask separates real tokens
# from padding positions
inputs = tokenizer(text, padding=True, return_tensors="pt")
print(f'{inputs["input_ids"].shape=}')

# Inference only: run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Per-token hidden states, shape: [batch_size, seq_len, hidden_dim]
last_hidden_state = outputs.last_hidden_state
print(f'{last_hidden_state.shape=}')

inputs["input_ids"].shape=torch.Size([3, 10])
last_hidden_state.shape=torch.Size([3, 10, 768])


In [3]:
# Show the tokenizer output (input_ids, token_type_ids, attention_mask);
# in the recorded output the padded positions carry 0 in attention_mask
inputs

{'input_ids': tensor([[  101, 14324,  2003,  2307,  2005,  3019,  2653,  6364,   999,   102],
        [  101,  7632,  2026,  2171,  2003, 19522,  2243,  1012,   102,     0],
        [  101,  3960,   102,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])}

In [4]:
# Convert the batch of token embeddings into graphs, one per sentence. The
# attention mask is handed over alongside the embeddings; in the recorded
# output the graphs have 10, 9 and 3 nodes, which matches the number of
# unmasked tokens in each row.
b = text_to_graph(
    last_hidden_state,
    attn_mask=inputs["attention_mask"],
)
b

[Data(x=[10, 768], edge_index=[2, 18]),
 Data(x=[9, 768], edge_index=[2, 16]),
 Data(x=[3, 768], edge_index=[2, 4])]

In [5]:
# Print the connectivity of each per-sentence graph; in the recorded output
# every token is linked to its immediate neighbour in both directions
for graph in b:
    print(graph.edge_index)

tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9],
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8]])
tensor([[0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8],
        [1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7]])
tensor([[0, 1, 1, 2],
        [1, 2, 0, 1]])


In [6]:
# Same conversion for a single sequence: row 2 is the shortest sentence
# ("Bob"), and indexing it drops the leading batch dimension of both the
# embeddings and the attention mask
c = text_to_graph(
    last_hidden_state[2],
    attn_mask=inputs["attention_mask"][2],
)

In [7]:
# Edge list of the first graph returned by the single-sequence call, shown
# for comparison with the last graph of the batched call
c[0].edge_index

tensor([[0, 1, 1, 2],
        [1, 2, 0, 1]])