In [1]:
from datasets import load_dataset,load_from_disk
# Raw string (r'...') keeps the backslashes of the Windows path literal;
# in a normal string a sequence such as \t would be read as a tab character.
# Loads the dataset that was previously saved to disk at this location.
dataset = load_from_disk(r'D:\system\桌面\lcm-code\tokenizers_lcm\dataset')

In [2]:
from transformers import GPT2TokenizerFast
# Load the tokenizer from a local tokenizer JSON file (raw string so the
# Windows backslashes are not treated as escape sequences).
tokenizer=GPT2TokenizerFast.from_pretrained(r'D:\system\桌面\lcm-code\tokenizers_lcm\tokenizer_gpt100.json')

# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
from transformers import DataCollatorWithPadding

# Padding collator built on `tokenizer`; passed as `collate_fn` to the
# DataLoaders created further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [4]:
def tokenize(element):
    """Tokenize the ``isosmiles`` entry of a batch of examples.

    Args:
        element: Mapping with an ``"isosmiles"`` entry holding the text(s)
            to encode.

    Returns:
        Whatever the global ``tokenizer`` returns for those texts; no
        truncation or padding arguments are passed.
    """
    smiles = element["isosmiles"]
    return tokenizer(smiles)


# Tokenize every split in batches and drop the original columns, so only the
# tokenizer's outputs remain.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# Number of token ids in each training example.
length = list(map(len, tokenized_dataset['train']['input_ids']))

In [5]:
def _shorter_than_500(example):
    """Return True when the example has fewer than 500 token ids."""
    return len(example['input_ids']) < 500


# Keep only the examples whose token sequence is shorter than 500 ids.
tokenized_dataset = tokenized_dataset.filter(_shorter_than_500)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 213530
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 26692
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 26690
    })
})

In [6]:
from torch.nn import CrossEntropyLoss
import torch

def keytoken_weighted_loss(inputs, logits,attention_mask):
    """Mean next-token cross-entropy over the non-padded target positions.

    Args:
        inputs: Token ids, indexed as (batch, sequence).
        logits: Model scores, indexed as (batch, sequence, vocab).
        attention_mask: Same layout as ``inputs``; non-zero marks real tokens,
            zero marks padding.

    Returns:
        Scalar tensor: the summed per-token loss of the unmasked targets
        divided by the number of unmasked targets (0 when every target is
        masked).
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:]
    shift_logits = logits[..., :-1, :]

    # Per-token loss, flattened over batch and sequence.
    loss_fct = CrossEntropyLoss(reduction='none')
    token_loss = loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )

    # The mask is shifted like the labels: it says which *targets* are real.
    target_mask = attention_mask[:, 1:].reshape(-1).to(token_loss.dtype)
    masked_loss = token_loss * target_mask

    # Normalise by the number of real targets. A plain .mean() would also
    # divide by the padded positions, making the loss depend on how much
    # padding the batch happens to contain.
    return masked_loss.sum() / target_mask.sum().clamp(min=1)

In [7]:
from torch.utils.data.dataloader import DataLoader

# Have the dataset hand out torch tensors.
tokenized_dataset.set_format("torch")

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=20, collate_fn=data_collator)
train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_dataset["val"], **loader_kwargs)
test_dataloader = DataLoader(tokenized_dataset["test"], **loader_kwargs)

# Pull a single training batch to inspect the padded shape.
batch = next(iter(train_dataloader))
batch['input_ids'].shape

torch.Size([20, 66])

In [8]:
def evaluate_improve(dataloader):
    """Evaluate the global ``model`` on ``dataloader``.

    Each batch is scored with ``keytoken_weighted_loss``; the reported loss is
    the unweighted mean of those per-batch losses.

    Args:
        dataloader: Iterable of batches providing ``'input_ids'`` and
            ``'attention_mask'``.

    Returns:
        Tuple ``(loss, perplexity)`` of Python floats, where
        ``perplexity = exp(loss)``.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in dataloader:
            logits = model(batch['input_ids']).logits
            batch_losses.append(
                keytoken_weighted_loss(batch['input_ids'], logits, batch['attention_mask'])
            )

    loss = torch.stack(batch_losses).mean()
    # torch.exp does not raise on overflow; it saturates to inf, so no
    # OverflowError handling is needed here.
    perplexity = torch.exp(loss)
    torch.cuda.empty_cache()
    return loss.item(), perplexity.item()

In [60]:
def precise(dataloader):
    """Next-token accuracy of the global ``model`` over ``dataloader``.

    For every batch the prediction at position ``i`` is compared with the
    token at position ``i + 1``; positions whose target is masked out by the
    attention mask are ignored.

    Args:
        dataloader: Iterable of batches providing ``"input_ids"`` and
            ``'attention_mask'``.

    Returns:
        Tensor holding ``correct predictions / counted targets``.
    """
    model.eval()
    corrects = 0
    totals = 0
    with torch.no_grad():
        # Iterate the loader that was passed in, not a notebook-level global.
        for batch in dataloader:
            input_ids = batch["input_ids"]
            # Mask shifted like the targets: True where the target is a real token.
            keep = batch['attention_mask'][:, 1:].bool()

            predicted = model(input_ids).logits[..., :-1, :].argmax(dim=-1)
            targets = input_ids[..., 1:]

            corrects += (predicted[keep] == targets[keep]).sum()
            totals += int(keep.sum())
    return corrects / totals

In [10]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# GPT-2 configuration with the vocabulary size and BOS/EOS ids taken from
# this notebook's tokenizer.
# NOTE(review): in the visible cells `config` is only referenced by the
# commented-out from-scratch initialisation below.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Alternative kept for reference: build a fresh model from `config` on the GPU.
# model = GPT2LMHeadModel(config).cuda()

# Active path: load the model from the 'chem_gpt100' checkpoint.
model = GPT2LMHeadModel.from_pretrained('chem_gpt100')

In [17]:
from accelerate import Accelerator
# Mixed precision must be requested through the constructor; a bare
# `Accelerator.mixed_precision == 'fp16'` expression only compares and
# configures nothing.
accelerator = Accelerator(mixed_precision='fp16')

# Hand the model and the three dataloaders to Accelerate so they are
# prepared for its device.
model, train_dataloader, eval_dataloader, test_dataloader = accelerator.prepare(
    model, train_dataloader, eval_dataloader,test_dataloader
)

In [53]:
# Tokenize a single SMILES string for a manual spot check of the model.
# NOTE(review): the name `input` shadows the Python built-in; it is read by
# the following cell, so rename both together.
input=tokenizer('C1=CN(C(=O)NC1=O)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OC3C(C(C(C(O3)C(=O)O)O)O)O)O)O')

In [54]:
# Put the ids on the device Accelerate uses for the model rather than
# hard-coding CUDA, so the cell also runs where no GPU is available.
input_id=torch.tensor(input['input_ids']).to(accelerator.device)
input_id

tensor([22,  9, 18, 90,  2, 22, 69, 33,  3, 79,  9, 18, 33,  3, 22, 10, 22,  2,
        22,  2, 22,  2, 33, 10,  3, 81, 69, 33, 74, 33,  3, 33, 34, 69, 33, 74,
        33,  3, 71, 11, 22,  2, 22,  2, 22,  2, 22,  2, 33, 11,  3, 22, 69, 33,
         3, 33,  3, 33,  3, 33,  3, 33,  3, 33,  3, 33], device='cuda:0')

In [55]:
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    # Most likely next token at every position of the example.
    predict=model(input_id).logits.argmax(dim=-1)

In [56]:
# Count how often the prediction at position i equals the actual token at
# position i + 1 (the last prediction has no target and is dropped).
(predict[:-1]==input_id[1:]).sum()

tensor(60, device='cuda:0')

In [62]:
# Next-token accuracy on the loader built from the "val" split.
precise(eval_dataloader)

tensor(0.8515, device='cuda:0')