In [1]:
import gc
from tqdm.auto import tqdm

from pathlib import Path

import torch
from torch.utils.data import DataLoader
from datasets import load_from_disk
from transformers import OPTForCausalLM, AutoTokenizer, DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Tokenized dataset saved on disk; later cells read its 'test' split.
datasets = load_from_disk(r'data/galactica-125m/tokenized_applications.json')
# NOTE(review): absolute, user-specific path -- consider a configurable base directory.
checkpoint = r'/home/cedric.dietzi/projects/galactica/test-trainer/output_dir/facebook/galactica-125m/applications/Jun16_20-19-30_instance-1/checkpoint-14336'

# Only the base model (no LM head) is kept, since the notebook reads
# `last_hidden_state` from it. The loader reports `score.weight` as unused;
# presumably the checkpoint was fine-tuned with a classification head -- confirm.
#
# Device placement is done once, explicitly, with `.to(device)`. The previous
# combination of `device_map="auto"` followed by `.to(device)` placed the model
# twice, and moving a model that was dispatched across several devices is not
# supported.
model = OPTForCausalLM.from_pretrained(checkpoint).base_model
model.to(device)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of the model checkpoint at /home/cedric.dietzi/projects/galactica/test-trainer/output_dir/facebook/galactica-125m/applications/Jun16_20-19-30_instance-1/checkpoint-14336 were not used when initializing OPTForCausalLM: ['score.weight']
- This IS expected if you are initializing OPTForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing OPTForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
from torch.utils.tensorboard import SummaryWriter

# Output locations: one TSV row of embedding values per example, and one TSV
# row holding the matching label.
logging_dir = Path('test-trainer/embds')
tensors_file = Path(logging_dir, 'tensors.tsv')
metadata_file = Path(logging_dir, 'metadata.tsv')

# Make sure the directory exists before any file in it is opened.
logging_dir.mkdir(parents=True, exist_ok=True)

tb_writer = SummaryWriter(logging_dir)

# Drop the columns that are not passed to the model or written as metadata.
testset = datasets['test']
testset = testset.remove_columns(['_labels', 'id', 'title', 'text', 'token_type_ids'])
testset.set_format("torch")

batch_size = 1
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(testset, shuffle=False, batch_size=batch_size, collate_fn=data_collator)

# len(dataloader) also counts a trailing partial batch, unlike
# len(testset) // batch_size.
num_steps = len(dataloader)
progress_bar = tqdm(dataloader, total=num_steps)

# Free cached memory before the inference loop.
gc.collect()
torch.cuda.empty_cache()

model.eval()

# `with` closes both files even if the loop raises; `torch.no_grad()` avoids
# building autograd graphs during pure inference.
with open(tensors_file, 'w') as tensors_f, \
        open(metadata_file, 'w') as metadata_f, \
        torch.no_grad():
    for batch in progress_bar:
        # Labels are only written out as metadata, so they stay on the CPU.
        labels = batch['labels']
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}

        hidden = model(**inputs)['last_hidden_state']

        # Use the hidden state of the last *real* token of each sequence.
        # With batch_size == 1 there is no padding and this equals
        # hidden[:, -1, :]; with larger batches the attention mask is used to
        # skip padding positions, whichever side the tokenizer pads on.
        mask = inputs.get('attention_mask')
        if mask is None:
            embds = hidden[:, -1, :]
        else:
            last_idx = mask.shape[1] - 1 - mask.flip(dims=[1]).argmax(dim=1)
            rows = torch.arange(hidden.shape[0], device=hidden.device)
            embds = hidden[rows, last_idx]

        for embd in embds.tolist():
            tensors_f.write('\t'.join(str(e) for e in embd) + '\n')

        for label in labels:
            metadata_f.write(str(int(label)) + '\n')

  0%|          | 0/12005 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 12005/12005 [04:41<00:00, 38.16it/s]