In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import GPTNeoForSequenceClassification, GPTNeoForCausalLM, GPT2Tokenizer
from model import GPTNeoForSequenceClassificationBinary

model = GPTNeoForSequenceClassificationBinary.from_pretrained("/notebooks/results/checkpoint-3295")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")


In [3]:
model.score

Linear(in_features=2048, out_features=1, bias=True)

In [4]:
from dataset import get_taxonomy_dataset, get_taxonomy_dataset_binary

In [5]:
dataset = get_taxonomy_dataset_binary('/notebooks/taxonomy.csv', entire_dataset=True)

Using custom data configuration default-7aa8b1b23cb12395
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-7aa8b1b23cb12395/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/29754 [00:00<?, ?ex/s]

In [6]:
model = model.eval()
model = model.cuda()

In [9]:
from torch.utils.data import DataLoader
from transformers.data.data_collator import DataCollatorWithPadding
from tqdm.notebook import tqdm
import torch 

results = []

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(
    dataset["test"], batch_size=64,  collate_fn=data_collator
)
with torch.no_grad():
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = batch.to('cuda')
        output = model(batch['input_ids'])
        predicted_labels = output.logits > 0
        results.extend(predicted_labels.cpu().detach().numpy().tolist())

  0%|          | 0/465 [00:00<?, ?it/s]

In [11]:
results = [r[0] for r in results]

In [17]:
import pandas as pd
df = pd.read_csv('/notebooks/taxonomy.csv')
df['predicted_labels'] = results

df.to_csv('/notebooks/results.csv', index=False)

In [15]:
df

Unnamed: 0.1,Unnamed: 0,parent,child,group,flag,predicted_labels
0,0,space,mathematical_space,0,True,True
1,1,mathematical_space,manifold,0,True,True
2,2,mathematical_space,metric_space,0,True,True
3,3,metric_space,Euclidean_space,0,True,False
4,4,metric_space,Hilbert_space,0,True,False
...,...,...,...,...,...,...
29749,29749,bridgehead,strike_zone,514,False,False
29750,29750,Waldenses,Karaites,493,False,False
29751,29751,walk_through,appear,726,False,False
29752,29752,apple_aphid,pale_chrysanthemum_aphid,139,False,False


In [182]:
text = 'I am doing the taxonomy research. I think inauspiciousness is a subtopic of'
encode = tokenizer.encode(text, return_tensors='pt')
output = model.generate(encode, max_new_tokens=5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [183]:
tokenizer.decode(output[0])

'I am doing the taxonomy research. I think inauspiciousness is a subtopic of subtopic<|endoftext|>'

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]

tensor(3420)

In [38]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer.pad_token = tokenizer.eos_token

In [45]:
from torch.utils.data import DataLoader
from dataset import DataCollatorWithPaddingAndMasking


data_collator = DataCollatorWithPaddingAndMasking(tokenizer)
train_dataloader = DataLoader(
    dataset["train"], batch_size=64,  collate_fn=data_collator
)

for step, batch in enumerate(train_dataloader):
    print(batch["label_mask"].shape, batch['input_ids'].shape)
    if step > 5:
        break

torch.Size([64, 25]) torch.Size([64, 25])
torch.Size([64, 25]) torch.Size([64, 25])
torch.Size([64, 26]) torch.Size([64, 26])
torch.Size([64, 23]) torch.Size([64, 23])
torch.Size([64, 23]) torch.Size([64, 23])
torch.Size([64, 24]) torch.Size([64, 24])
torch.Size([64, 25]) torch.Size([64, 25])


In [32]:
from torch.nn import functional
import torch
a = torch.tensor([[1, 2, 3, 4, 5, 6]])
b = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])

torch.cat((a, b), pad)

RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 6 but got size 10 for tensor number 1 in the list.

In [43]:
input_text = "sample input text to generate"
encoding = tokenizer.encode(input_text, return_tensors="pt")
output=model(encoding)
torch.allclose(output.logits, full_logits[:, :encoding.shape[1], :], atol=1e-3)

True

In [None]:
def soft_prompt_test(parent_n,child_n):
  text_index = tokenizer.encode('I am doing the Taxonomy research. I think {child} is a subtopic of {parent}'.format(child=child_n, parent=parent_n),add_prefix_space=True)
  v_remove=tokenizer.encode(parent_n+" " +child_n,add_prefix_space=True)
  text_index_input=[i for i  in text_index if i not in set(v_remove)]
  vector = model.transformer.wte.weight[text_index_input,:]
  output=model(inputs_embeds=vector)
  res_v=torch.argmax(output.logits,dim=1)
  return res_v,text_index_input,text_index