In [None]:
from datasets import load_dataset,load_from_disk
# The path is a raw string (r'...') so the backslashes stay literal; otherwise the
# "\t" in "\tokenizers_lcm" would be read as a tab character.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset = load_from_disk(r'D:\system\桌面\lcm-code\tokenizers_lcm\dataset')

In [None]:
from transformers import GPT2TokenizerFast
# Load the GPT-2 fast tokenizer from a local tokenizer JSON file (raw string keeps
# the Windows backslashes literal).
tokenizer=GPT2TokenizerFast.from_pretrained(r'D:\system\桌面\lcm-code\tokenizers_lcm\tokenizer_gpt100.json')

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers import DataCollatorWithPadding

# Padding collator built on the tokenizer; passed as collate_fn to the DataLoaders.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def tokenize(element):
    """Tokenize the ``isosmiles`` entry of a batch with the module-level tokenizer.

    Args:
        element: Mapping with an ``"isosmiles"`` entry (a batch handed over by
            ``dataset.map``).

    Returns:
        The tokenizer's output for those strings, unchanged.
    """
    smiles_batch = element["isosmiles"]
    return tokenizer(smiles_batch)


# Tokenize in batches and drop the original columns so that only the tokenizer
# outputs remain in the dataset.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# Number of tokens in every training example.
length = [len(token_ids) for token_ids in tokenized_dataset['train']['input_ids']]

In [None]:
# Keep only examples with fewer than 500 tokens, then show the filtered dataset
# as the cell output.
tokenized_dataset=tokenized_dataset.filter(lambda example:len(example['input_ids'])<500)
tokenized_dataset

In [None]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits,attention_mask):
    """Next-token cross-entropy averaged over real (unmasked) target tokens.

    Args:
        inputs: Token ids; the last dimension is the sequence.
        logits: Model outputs with a trailing ``(seq_len, vocab_size)`` layout.
        attention_mask: Mask with the same layout as ``inputs``; non-zero marks
            real tokens, zero marks padding.

    Returns:
        Scalar tensor: the sum of per-token losses at unmasked target positions
        divided by the number of such positions (0 if there are none).
    """
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:]
    shift_logits = logits[..., :-1, :]
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduction='none')
    per_token_loss = loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1)
    )
    # The mask is shifted the same way as the labels so it lines up with the targets.
    target_mask = attention_mask[..., 1:].reshape(-1).to(per_token_loss.dtype)
    # Normalise by the number of real targets. A plain .mean() would also count the
    # zeroed padding positions and make the loss depend on how much padding a batch has.
    # clamp(min=1) avoids 0/0 when a batch has no unmasked target at all.
    loss = (per_token_loss * target_mask).sum() / target_mask.sum().clamp(min=1)

    return loss

In [None]:
weight_decay = 0.1

def get_grouped_params(
    model,
    no_decay=("bias", "LayerNorm.weight", "ln_1.weight", "ln_2.weight", "ln_f.weight"),
):
    """Split a model's parameters into weight-decay and no-weight-decay groups.

    Args:
        model: Object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
        no_decay: Name fragments; a parameter whose name contains any of them is
            put in the group without weight decay. The default covers biases and
            layer-norm weights, including the ``ln_1`` / ``ln_2`` / ``ln_f`` names
            used by GPT-2 modules, which ``"LayerNorm.weight"`` alone does not match.

    Returns:
        Two optimizer parameter groups: the first uses the module-level
        ``weight_decay``, the second uses ``0.0``.
    """
    params_with_wd, params_without_wd = [], []
    for name, param in model.named_parameters():
        if any(fragment in name for fragment in no_decay):
            params_without_wd.append(param)
        else:
            params_with_wd.append(param)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [None]:
from torch.utils.data.dataloader import DataLoader

# Switch the dataset's output format to torch before building the loaders.
tokenized_dataset.set_format("torch")

# Only the training loader shuffles; all three pad batches via data_collator.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=20,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset["val"],
    batch_size=20,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_dataset["test"],
    batch_size=20,
    collate_fn=data_collator,
)


In [None]:
def evaluate_improve(dataloader):
    """Compute the mean batch loss and its perplexity over a dataloader.

    Puts the module-level ``model`` in eval mode, runs every batch without
    gradients, and scores it with ``keytoken_weighted_loss``. The caller is
    responsible for switching the model back to train mode afterwards.

    Args:
        dataloader: Iterable of batches with ``'input_ids'`` and
            ``'attention_mask'`` entries.

    Returns:
        ``(loss, perplexity)`` as Python floats, where ``loss`` is the unweighted
        mean of the per-batch losses and ``perplexity`` is ``exp(loss)``
        (``inf`` if that overflows).
    """
    model.eval()
    batch_losses = []
    for batch in dataloader:
        with torch.no_grad():
            logits = model(batch['input_ids'], attention_mask=batch['attention_mask']).logits
            batch_loss = keytoken_weighted_loss(batch['input_ids'], logits, batch['attention_mask'])

        batch_losses.append(batch_loss.unsqueeze(dim=0))
    loss = torch.mean(torch.cat(batch_losses, dim=0))
    # torch.exp saturates to inf on overflow instead of raising, so no
    # OverflowError guard is needed (and the result is always a tensor with .item()).
    perplexity = torch.exp(loss)
    torch.cuda.empty_cache()
    return loss.item(), perplexity.item()

In [None]:
def precise(dataloader):
    """Next-token prediction accuracy of the module-level ``model`` on a dataloader.

    For every batch the logits are shifted by one position, the arg-max
    prediction is compared with the next input token, and only positions whose
    (shifted) attention mask is set are counted.

    Args:
        dataloader: Iterable of batches with ``"input_ids"`` and
            ``'attention_mask'`` entries. This is the data that is scored.

    Returns:
        ``correct / total`` over all counted positions (a 0-dim tensor once at
        least one batch has been processed).
    """
    model.eval()
    corrects=0
    totals=0
    # Iterate the dataloader that was passed in, not the global eval_dataloader.
    for batch in dataloader:
        with torch.no_grad():
            # Drop the last position: it has no next token to be compared with.
            outputs = model(batch["input_ids"],attention_mask=batch['attention_mask']).logits[..., :-1, :]

            # Targets are the inputs shifted left by one; the mask is shifted the same way.
            target_mask = batch['attention_mask'][:,1:].bool()
            labels=batch["input_ids"][..., 1:][target_mask].flatten()

            predict=outputs.argmax(dim=2)[target_mask].flatten()

            correct=(labels==predict).sum()
            total=len(labels)
        corrects+=correct
        totals+=total
    torch.cuda.empty_cache()
    return corrects/totals

In [None]:
# precise(eval_dataloader)

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# config = AutoConfig.from_pretrained(
#     "gpt2",
#     vocab_size=len(tokenizer),
#     bos_token_id=tokenizer.bos_token_id,
#     eos_token_id=tokenizer.eos_token_id,
# )
# model = GPT2LMHeadModel(config)

In [None]:
# Load the GPT-2 LM-head model from 'chem_gpt100'.
# NOTE(review): the training loop's output_dir uses the same name, so a run starts
# from and saves into this checkpoint — confirm that is intended.
model = GPT2LMHeadModel.from_pretrained('chem_gpt100')

In [None]:
from torch.optim import AdamW

# AdamW over the two parameter groups (with / without weight decay), learning rate 5e-5.
optimizer = AdamW(get_grouped_params(model), lr=5e-5)

In [None]:
from accelerate import Accelerator
# Mixed precision has to be requested through the constructor; a bare
# `Accelerator.mixed_precision == 'fp16'` comparison configures nothing.
# fp16 is only requested when a CUDA device is available, otherwise full precision is used.
accelerator = Accelerator(
    mixed_precision="fp16" if torch.cuda.is_available() else "no"
)

# Let Accelerate wrap the model, optimizer and dataloaders; the wrapped objects
# replace the originals under the same names.
model, optimizer, train_dataloader, eval_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader,test_dataloader
)

In [None]:
# Loss and perplexity on the validation loader before the training loop runs.
evaluate_improve(eval_dataloader)

In [None]:
# Release cached, unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
from transformers import get_scheduler

num_train_epochs = 30
# The schedule is budgeted with one scheduler step per dataloader batch.
# NOTE(review): the training loop calls lr_scheduler.step() only once every
# `gradient_accumulation_steps` batches, so far fewer scheduler steps are taken
# than num_training_steps assumes — confirm the warmup/decay lengths are intended.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with 1000 warmup steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name,create_repo

model_name = "chem_gpt100"
# Look up the full Hub repository name for model_name and show it as the cell output.
# NOTE(review): presumably needs Hub credentials; repo creation below is disabled.
repo_name = get_full_repo_name(model_name)
repo_name
# create_repo(repo_name)

In [None]:
# Directory the training loop saves the model to when the eval loss improves.
output_dir = "chem_gpt100"
# repo = Repository(output_dir, clone_from=repo_name)

In [None]:
from tqdm.notebook import tqdm

# Samples per batch, used only for the "samples" figure in the progress log.
# It must match the DataLoader batch_size (20); the previous value of 25 over-reported it.
samples_per_step = 20
# Number of batches whose gradients are accumulated before one optimizer step.
gradient_accumulation_steps = 100
# NOTE(review): not used below — evaluation currently runs once per epoch.
eval_steps = 200

model.train()
completed_steps = 1
# Eval-loss threshold: a checkpoint is written only when the epoch's eval loss
# drops below this value; it is then lowered to that loss.
min_eval=0.160
train_losses = []
eval_losses = []
perplexcitys=[]


for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1),total=len(train_dataloader)
    ):
        logits = model(batch["input_ids"],attention_mask=batch['attention_mask']).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, batch['attention_mask'])

        # True on the batch that completes an accumulation window.
        is_update_step = step % gradient_accumulation_steps == 0
        if is_update_step:
            # Log and record the unscaled loss of this batch.
            accelerator.print(
                {
                    "samples": step * samples_per_step,
                    "iter_steps": completed_steps,
                    "loss/train": loss.item(),
                }
            )
            train_losses.append(loss.item())

        # Scale so the accumulated gradient is the average over the window.
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if is_update_step:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
            torch.cuda.empty_cache()

    # End of epoch: evaluate, record the metrics, and go back to train mode
    # (evaluate_improve leaves the model in eval mode).
    torch.cuda.empty_cache()
    eval_loss, perplexity = evaluate_improve(eval_dataloader)
    accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
    model.train()
    eval_losses.append(eval_loss)
    perplexcitys.append(perplexity)

    torch.cuda.empty_cache()
    # Save only when this epoch beats the best eval loss seen so far.
    if eval_losses[-1]<min_eval:
        # accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir)
        # model.save_pretrained('chem_gpt')
        min_eval=eval_losses[-1]


            # if accelerator.is_main_process:
            #     tokenizer.save_pretrained(output_dir)
            #     repo.push_to_hub(
            #         commit_message=f"Training in progress step {step}", blocking=False
            #     )

In [None]:
import matplotlib.ticker as ticker
from matplotlib import pyplot as plt
# Apply the style sheet first. 'seaborn-darkgrid' was renamed to
# 'seaborn-v0_8-darkgrid' in newer Matplotlib releases, so use whichever exists.
style_name = 'seaborn-v0_8-darkgrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-darkgrid'
plt.style.use(style_name)  # set the chart style
# Set the font after the style sheet, since applying a style can overwrite
# rcParams such as the font family.
plt.rcParams['font.family'] = 'Times New Roman'
# Plotting the losses
plt.figure(figsize=(10, 6))

plt.plot(range(len(train_losses)), train_losses, label='Train Loss', linewidth=2)
# One eval loss is recorded per epoch. Place each one at the x position where that
# epoch's train-loss samples end, derived from the recorded lengths instead of
# assuming exactly 30 epochs with 100 train-loss points each.
points_per_epoch = len(train_losses) / max(len(eval_losses), 1)
eval_positions = [(i + 1) * points_per_epoch for i in range(len(eval_losses))]
plt.plot(eval_positions, eval_losses, label='Eval Loss', linewidth=2)

plt.xlabel('Iterations', fontsize=14)  # x-axis label and font size
plt.ylabel('Loss', fontsize=14)  # y-axis label and font size
plt.title('Train Losses', fontsize=16)  # title and font size
plt.legend(fontsize=12)  # legend font size
plt.grid(True)
plt.gca().yaxis.set_major_formatter(ticker.FormatStrFormatter('%.4f'))  # y-axis tick format
plt.show()

In [None]:
# Write the list to a text file: one eval loss per line, each followed by a comma.
with open('eval.txt', 'w') as f:
    f.writelines(str(item) + ',' + '\n' for item in eval_losses)

In [None]:
# Display the recorded per-epoch eval losses as the cell output.
eval_losses

In [None]:
# Release cached, unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# Save the model in its current (final) state.
# NOTE(review): 'chem_gpt100' is also the directory the training loop saves to when
# the eval loss improves, so this overwrites that checkpoint — confirm that is intended.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained('chem_gpt100')