In [1]:
%env CUDA_VISIBLE_DEVICES=1,2

env: CUDA_VISIBLE_DEVICES=1,2


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import Dataset

import torch
from torch import nn

import pandas as pd

from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split

In [3]:
from sklearn.preprocessing import MinMaxScaler

In [4]:
class T5ForRegression(T5ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        self.regression_head = nn.Linear(config.d_model, 1)

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, labels=None):
        # Provide default decoder_input_ids if not supplied
        if decoder_input_ids is None:
            decoder_input_ids = torch.ones((input_ids.shape[0], 1), dtype=torch.long, device=input_ids.device) * self.config.pad_token_id

        # Pass inputs through T5 model
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids
        )
        
        # Use the encoder's last hidden state
        sequence_output = outputs.encoder_last_hidden_state[:, 0, :]
        
        # Apply the regression head
        logits = self.regression_head(sequence_output)

        # Convert labels to float
        if labels is not None:
            labels = labels.float()
        
        # Calculate the loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits.view(-1), labels.view(-1))

        return {"loss": loss, "logits": logits}

In [16]:
tokenizer = T5Tokenizer.from_pretrained("QizhiPei/biot5-base-mol2text", model_max_length=512)
model = T5ForRegression.from_pretrained('QizhiPei/biot5-base-mol2text')

Some weights of T5ForRegression were not initialized from the model checkpoint at QizhiPei/biot5-base-mol2text and are newly initialized: ['regression_head.bias', 'regression_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
df=pd.read_csv("absorption_p1.csv")

scaler = MinMaxScaler()
df["scaled_output"] = scaler.fit_transform(df[["output"]])

train, test = train_test_split(df.head(100)[["input", "scaled_output"]])

train_dataset = Dataset.from_pandas(train)
eval_dataset = Dataset.from_pandas(test)

def preprocess_function(examples):
    inputs = examples["input"]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True)
    model_inputs["labels"] = examples["scaled_output"]
    return model_inputs

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [21]:
# Define training arguments with reduced learning rate and gradient clipping
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=1e-9,  # Reduced learning rate
    logging_dir="./logs",
    logging_steps=10,
    max_grad_norm=1.0  # Gradient clipping
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

# Train the model
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
10,0.0


TrainOutput(global_step=15, training_loss=0.0, metrics={'train_runtime': 19.3728, 'train_samples_per_second': 11.614, 'train_steps_per_second': 0.774, 'total_flos': 155634264345600.0, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);

In [None]:
text = train.input[0]
inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True).to(device)
outputs = model(**inputs)
predicted_number = outputs["logits"].item()
print(f"Predicted number: {predicted_number}")