In [1]:
from datasets import load_dataset,load_from_disk
# Load the saved dataset from disk and keep only the two columns used below:
# 'smi' (the text that gets tokenized) and 'logkOH•' (the regression target).
dataset_dict = load_from_disk('dataset_aug').select_columns(['smi', 'logkOH•'])


In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config, Trainer, TrainingArguments,GPT2DoubleHeadsModel,GPT2PreTrainedModel,GPT2ForSequenceClassification
from datasets import load_dataset

# Build the tokenizer from the project's tokenizer file.
tokenizer = GPT2Tokenizer.from_pretrained(
    r"D:\system\桌面\lcm-code\tokenizers_lcm\tokenizer_gpt_special.json"
)
# Reuse the end-of-text token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token
# Display the padding token as the cell output.
tokenizer.pad_token

'<|endoftext|>'

In [3]:
from torch import nn
from torch.nn.utils import prune

class GPT2LinearOutput(nn.Module):
    """GPT-2 backbone followed by a linear head that emits one scalar per token position.

    The backbone is constructed from a configuration only (``GPT2Model(config)``),
    so its weights are freshly initialised; nothing is loaded from a checkpoint
    except the configuration values themselves.

    Args:
        n_block: Number of transformer layers (written to ``config.num_hidden_layers``).
        drop: Probability assigned to every dropout module found in the backbone.
        purn: Fraction of weights removed by L1 unstructured pruning in every
            ``nn.Linear`` module found in the backbone.
    """

    def __init__(self,n_block,drop,purn):
        super().__init__()
        # Raw string: the Windows path contains backslashes that must not be
        # read as escape sequences (the runtime value is unchanged).
        self.config=GPT2Config.from_pretrained(r'D:\system\桌面\lcm-code\pre_training\chem_gpt100')
        # `tokenizer` is the notebook-level tokenizer defined in an earlier cell.
        self.config.pad_token_id=tokenizer.pad_token_id
        self.config.num_attention_heads=4
        self.config.num_hidden_layers=n_block
        self.config.hidden_size=512
        # Architecture comes from the config; the weights are randomly initialised.
        self.gpt2 = GPT2Model(self.config)
        # Regression head: maps each hidden state to a single value.
        self.linear = nn.Linear(self.config.hidden_size, 1)
        for name, module in self.gpt2.named_modules():
            if isinstance(module, (nn.Dropout, nn.Dropout2d, nn.Dropout3d)):
                # Override the dropout probability taken from the loaded config.
                module.p = drop
            elif isinstance(module, torch.nn.Linear):
                # NOTE(review): Hugging Face GPT-2 blocks appear to implement their
                # projections with transformers' Conv1D rather than nn.Linear; if so,
                # this branch matches nothing and `purn` has no effect - confirm.
                prune.l1_unstructured(module, name='weight', amount=purn)

    def forward(self, input_ids, attention_mask):
        """Return the linear head applied to every position's hidden state.

        The output keeps the sequence axis (one value per token position);
        callers select the position they need, e.g. the last real token.
        """
        outputs = self.gpt2(input_ids=input_ids,attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state
        # Applied to all positions, not only the final one.
        linear_output = self.linear(last_hidden_states)
        return linear_output


In [4]:
# Tokenize the 'smi' column of every split, processing rows in batches.
tokenized_dataset = dataset_dict.map(
    lambda rows: tokenizer(rows['smi']),
    batched=True,
)
def returnlength(example):
    """Return ``{'length': i}`` where ``i`` is the index of the last token.

    Despite the key name, the stored value is ``len(input_ids) - 1``, i.e. the
    zero-based position of the final entry of ``example['input_ids']``.
    """
    last_index = len(example['input_ids']) - 1
    return {'length': last_index}

# Add the last-token index to every row, then keep only the columns the
# training code reads.
tokenized_dataset = (
    tokenized_dataset
    .map(returnlength)
    .select_columns(['input_ids', 'attention_mask', 'logkOH•', 'length'])
)
# Display the resulting splits as the cell output.
tokenized_dataset


Map:   0%|          | 0/2122 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/2122 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'logkOH•', 'length'],
        num_rows: 2122
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'logkOH•', 'length'],
        num_rows: 138
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'logkOH•', 'length'],
        num_rows: 138
    })
})

In [8]:
import torch
# from torch.utils.data import DataLoader, Dataset
# model=GPT2LinearOutput()


In [9]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader, Dataset
# Have the tokenized splits return PyTorch tensors when rows are read.
tokenized_dataset.set_format("torch")
# Collator that pads every batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



In [10]:
from torch.optim import AdamW
import torch.nn as nn
from transformers import get_scheduler

# Mean-squared-error objective for the regression target.
criterion = nn.MSELoss()

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=64,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_dataset["valid"],
    batch_size=69,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    tokenized_dataset["test"],
    batch_size=69,
    collate_fn=data_collator,
)

# Total number of optimizer steps, used to size the learning-rate schedule.
num_train_epochs = 200
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

In [11]:
import pickle

# Read the pickled study object from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open('example-study.pkl', 'rb') as study_file:
    study = pickle.load(study_file)

# Report the best hyper-parameter configuration and the best objective value
# recorded in the study.
print("最优参数配置:", study.best_params, study.best_value)


最优参数配置: {'num_blocks': 8, 'dropout': 0.2031915206133889, 'purns': 0.13211929362573221, 'lr': 0.00030663468933292816} 0.4131312966346741


In [12]:
from accelerate import Accelerator

accelerator = Accelerator()

# Hyper-parameters appear to be rounded from the best study configuration
# printed above (8 blocks, dropout ~0.20, pruning ~0.13).
model = GPT2LinearOutput(n_block=8, drop=0.2, purn=0.13)
# model.load_state_dict(torch.load('best_val0.model'))

# Let Accelerate place the loaders and the model on the active device.
(
    train_dataloader,
    eval_dataloader,
    test_dataloader,
    model,
) = accelerator.prepare(train_dataloader, eval_dataloader, test_dataloader, model)

optimizer = AdamW(model.parameters(), lr=0.0003)

# Cosine schedule whose warm-up spans the first quarter of all training steps.
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=num_training_steps // 4,
    num_training_steps=num_training_steps,
)

In [13]:
import torch
def caculate(loader,model):
    """Return the square root of the sample-weighted mean ``criterion`` loss.

    With the notebook's ``nn.MSELoss`` criterion (mean reduction) this is the
    RMSE over every sample in ``loader``. Each batch loss is weighted by its
    batch size so a shorter final batch does not skew the result.

    Args:
        loader: Iterable of batches holding 'input_ids', 'attention_mask',
            'length' (index of the last real token) and 'logkOH•' (targets).
        model: Module returning one value per token position; it is switched
            to eval mode and left in that mode.

    Returns:
        A zero-dimensional NumPy array holding the metric.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    weighted_loss_sum = None
    n_samples = 0
    for batch in loader:
        with torch.no_grad():
            labels = batch['logkOH•']
            input_ids = batch['input_ids'].to(torch.int64)
            # squeeze(-1) drops only the trailing size-1 output axis, so a batch
            # holding a single sample keeps its batch dimension.
            per_position = model(input_ids, batch['attention_mask']).squeeze(-1)
            # Pick, for each row, the prediction at its last real token.
            preds = per_position[torch.arange(0, len(input_ids)), batch['length']]
            batch_size = len(labels)
            # criterion returns a per-batch mean; re-weight it by the batch size.
            batch_loss = criterion(preds, labels) * batch_size
            if weighted_loss_sum is None:
                weighted_loss_sum = batch_loss
            else:
                weighted_loss_sum = weighted_loss_sum + batch_loss
            n_samples += batch_size
    torch.cuda.empty_cache()
    if n_samples == 0:
        raise ValueError("caculate() received a loader that yielded no samples")
    return torch.sqrt(weighted_loss_sum / n_samples).cpu().numpy()

In [14]:
import numpy as np
from sklearn.metrics import r2_score
def caculater2(loader,model):
    """Return ``r2_score`` between targets and predictions over ``loader``.

    Args:
        loader: Iterable of batches holding 'input_ids', 'attention_mask',
            'length' (index of the last real token) and 'logkOH•' (targets).
        model: Module returning one value per token position; it is switched
            to eval mode and left in that mode.

    Returns:
        Whatever ``r2_score(y_true, y_pre)`` returns for the collected values.
    """
    y_true=[]
    y_pre=[]
    model.eval()
    for batch in loader:
        with torch.no_grad():
            labels = batch['logkOH•']
            input_ids = batch['input_ids'].to(torch.int64)
            # squeeze(-1) drops only the trailing size-1 output axis, so a batch
            # holding a single sample keeps its batch dimension.
            per_position = model(input_ids, batch['attention_mask']).squeeze(-1)
            # Pick, for each row, the prediction at its last real token.
            preds = per_position[torch.arange(0, len(input_ids)), batch['length']]
            y_true.extend(labels.cpu().numpy())
            y_pre.extend(preds.cpu().numpy())
    return r2_score(y_true,y_pre)

In [None]:

from torch.cuda.amp import autocast
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
import optuna

plt.style.use('ggplot')
plt.rcParams['font.family'] = 'Times New Roman'

# Per-epoch metric history, consumed by the plotting code after the loop.
train_losses=[]
eval_losses=[]
eval_r2s=[]
train_r2s=[]

completed_steps = 1

# Up to three weight snapshots with the lowest validation loss, kept in
# parallel lists (best_losses[i] belongs to best_models[i]).
best_losses=[]
best_models=[]

best_val = float("inf")
scaler = torch.cuda.amp.GradScaler()
for epoch in tqdm(range(num_train_epochs),total=num_train_epochs):
    model.train()
    for step,batch in enumerate(train_dataloader):
        labels = batch['logkOH•']
        input_ids = batch['input_ids'].to(torch.int64)

        # Only the forward pass and the loss run under autocast; the backward
        # pass and the optimizer step are performed outside of it.
        with autocast():
            # squeeze(-1) drops only the trailing size-1 output axis, so a
            # batch holding a single sample keeps its batch dimension.
            per_position = model(input_ids, batch['attention_mask']).squeeze(-1)
            # Pick, for each row, the prediction at its last real token.
            logits = per_position[torch.arange(0, len(input_ids)), batch['length']]
            loss = criterion(logits, labels)

        # loss=loss/gradient_accumulation_steps

        scaler.scale(loss).backward()

        # accelerator.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1

    # Release cached GPU memory once per epoch rather than after every step.
    torch.cuda.empty_cache()


    eval_loss=caculate(eval_dataloader,model)
    train_loss=caculate(train_dataloader,model)


    eval_losses.append(eval_loss)
    train_losses.append(train_loss)


    eval_r2=caculater2(eval_dataloader,model)
    train_r2=caculater2(train_dataloader,model)
    eval_r2s.append(eval_r2)
    train_r2s.append(train_r2)

    tqdm.write(f'train_loss{train_loss} eval_loss:{eval_loss} trainr2 {train_r2} evalr2 {eval_r2}')

    torch.cuda.empty_cache()


    if len(best_models) < 3 or eval_loss < max(best_losses):
        # state_dict() hands back references to the live parameters, so clone
        # every tensor; otherwise each stored "snapshot" would keep changing
        # as training continues and all of them would end up identical.
        snapshot = {key: value.detach().clone() for key, value in model.state_dict().items()}
        best_models.append(snapshot)
        best_losses.append(eval_loss)

    # If more than 3 snapshots are stored, drop the one with the highest
    # validation loss.
    if len(best_models) > 3:
        max_loss_idx = best_losses.index(max(best_losses))
        del best_models[max_loss_idx]
        del best_losses[max_loss_idx]

    # Persist the weights with the lowest validation loss seen so far.
    if eval_loss < best_val:
        torch.save(model.state_dict(), 'best_test.model')
        best_val = eval_loss

# Plot only the epochs that were actually logged, so the figure can still be
# drawn when training stopped before num_train_epochs was reached.
n_logged = min(len(train_r2s), len(eval_r2s), len(train_losses), len(eval_losses))
epochs = range(n_logged)

fig, ax1 = plt.subplots(figsize=(10, 6))
# Left axis: R2 for the training and validation splits.
ax1.plot(epochs, train_r2s[:n_logged], label='Train r2', marker='o', color='blue', linestyle='--')
ax1.plot(epochs, eval_r2s[:n_logged], label='Eval r2', marker='x',color='blue', linestyle='--')
ax1.fill_between(epochs, train_r2s[:n_logged], eval_r2s[:n_logged], color='lightblue', alpha=0.1, zorder=2)
ax1.set_xlabel('Epochs', fontsize=10, fontweight='bold')
ax1.set_ylabel('R2', fontsize=10, fontweight='bold')
ax1.set_title('Train and Eval Accuracy Over Epochs', fontsize=16, fontweight='bold')
ax1.legend(loc='upper left', fontsize=10)

# Right axis (shared x): the loss metric for the same two splits.
ax2 = ax1.twinx()
ax2.plot(epochs, train_losses[:n_logged], label='Train Loss', color='red', linestyle='--', marker='o')
ax2.plot(epochs, eval_losses[:n_logged], label='Eval Loss', color='red', linestyle='--', marker='x')
ax2.fill_between(epochs, train_losses[:n_logged], eval_losses[:n_logged], alpha=0.1,color='lightgreen')
ax2.set_ylabel('Loss', fontsize=10, fontweight='bold')
ax2.legend(loc='lower right', fontsize=10)

plt.grid(True)
plt.tight_layout()
plt.gcf().set_dpi(300)
plt.show()

  0%|          | 0/200 [00:00<?, ?it/s]

train_loss1.2702207565307617 eval_loss:1.2813682556152344 trainr2 -3.9139388631010945 evalr2 -3.188614633000592


In [None]:
# Restore the checkpoint written by the training loop, which saves the weights
# with the lowest validation loss to 'best_test.model'.
model.load_state_dict(torch.load('best_test.model'))
model=accelerator.prepare(model)
# R2 on the held-out test split, shown as the cell output.
caculater2(test_dataloader,model)

In [None]:
# Write the second stored snapshot from best_models to disk.
torch.save(obj=best_models[1], f='1.model')