In [1]:
# Check for GPU
import torch
# Query CUDA once and reuse the answer for both report lines.
cuda_ready = torch.cuda.is_available()
gpu_label = torch.cuda.get_device_name(0) if cuda_ready else 'None'
print(f"CUDA available: {cuda_ready}")
print(f"GPU: {gpu_label}")

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [2]:
from transformers import T5Tokenizer

# Load the SentencePiece tokenizer that matches the 't5-large' checkpoint.
tokenizer = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path='t5-large'
)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
from transformers import T5ForConditionalGeneration

# Load the pretrained encoder-decoder weights for the 't5-large' checkpoint.
model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path='t5-large'
)

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [4]:
def convert_output_to_text(item):
  """Replace ``item['output']`` with its ``str()`` form, in place.

  The same ``item`` object is returned so the function can be used inside
  a comprehension. Non-string outputs (e.g. a list) become their Python
  repr, so a list of triples turns into text such as ``"['a', 'b']"``.
  """
  rendered = str(item['output'])
  item['output'] = rendered
  return item

In [5]:
import torch
import json
from torch.utils.data import Dataset

class DirectTriples(Dataset):
  """Map-style dataset of (input text, target text) pairs for seq2seq training.

  The JSON file is read as a sequence of records, each providing an
  ``'input'`` and an ``'output'`` entry. Every record is passed through
  ``convert_output_to_text`` so that ``'output'`` is a string before it is
  tokenized.
  """

  def __init__(self,json_file_path,tokenizer, max_length=512):
    """Load the records and remember the tokenizer settings.

    Args:
      json_file_path: Path of the JSON file holding the records.
      tokenizer: Callable used to tokenize both the input and output text.
      max_length: Truncation limit (in tokens) applied to both sides.
    """
    # The context manager closes the handle deterministically; the explicit
    # UTF-8 encoding keeps loading independent of the platform locale.
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
      records = json.load(json_file)
    self.data = [convert_output_to_text(item) for item in records]
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of records."""
    return len(self.data)

  def _encode(self, text):
    """Tokenize one string with the settings shared by inputs and targets."""
    return self.tokenizer(
        text=text,
        max_length=self.max_length,
        add_special_tokens=True,
        padding=True,
        return_tensors=None,
        truncation=True
    )

  def __getitem__(self, index):
    """Return the tokenized example at ``index``.

    Returns:
      dict with ``input_ids`` and ``attention_mask`` built from the input
      text and ``labels`` built from the output text, all ``torch.long``
      tensors.
    """
    item = self.data[index]

    ip = self._encode(item['input'])
    op = self._encode(item['output'])

    return {
        "input_ids": torch.tensor(ip["input_ids"], dtype=torch.long),
        "attention_mask": torch.tensor(ip["attention_mask"], dtype=torch.long),
        "labels": torch.tensor(op["input_ids"], dtype=torch.long)
    }

In [6]:
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import Adam

from transformers import DataCollatorForSeq2Seq

# Batches are assembled by the seq2seq collator, which takes care of
# per-batch padding of the variable-length examples.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

train_dataset = DirectTriples(tokenizer=tokenizer, json_file_path='Master_dataset.json')
test_dataset = DirectTriples(tokenizer=tokenizer, json_file_path='Test_dataset.json')

# Settings shared by both loaders; only the shuffling differs.
loader_options = dict(batch_size=4, collate_fn=data_collator)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

In [7]:
from torch.optim import AdamW
from transformers import get_scheduler

# Number of passes over `train_loader`; keep in sync with the epoch count
# used by the training loop.
NUM_EPOCHS = 3
NUM_WARMUP_STEPS = 500

# The scheduler is stepped once per batch, so its horizon must equal the
# total number of optimizer steps. A fixed 10000 overshoots the real count
# (3 epochs x 1737 batches = 5211), which stops the linear decay roughly
# half-way instead of letting it reach zero at the end of training.
num_training_steps = NUM_EPOCHS * len(train_loader)

optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=NUM_WARMUP_STEPS,
    num_training_steps=num_training_steps,
)

In [8]:
from torch.nn import CrossEntropyLoss

# Stand-alone token-level loss.
# NOTE(review): the training loop below reads `outputs.loss` from the model
# and never calls this object; it is kept for manual evaluation only.
# DataCollatorForSeq2Seq pads labels with -100 by default, so that is the
# value to ignore. Ignoring the tokenizer's pad id instead would fail on the
# -100 entries and would silently skip real targets whose id equals the pad id.
loss_fn = CrossEntropyLoss(ignore_index=-100)

In [9]:
# Training loop
from torch.utils.data import DataLoader
from tqdm import tqdm

# Train on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hand cached-but-unused CUDA memory back before training starts.
torch.cuda.empty_cache()

# Switch the model to training mode, then run the epochs.
model.train()
for epoch in range(3):  # Adjust epochs as needed
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Drop gradients left over from the previous step.
        optimizer.zero_grad()

        # Move only the tensors the forward pass consumes onto the device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        # Forward pass; the model computes the loss from `labels` itself.
        outputs = model(**model_inputs)
        loss = outputs.loss
        loss.backward()

        # One parameter update per batch, then advance the LR schedule.
        optimizer.step()
        lr_scheduler.step()

        # Refresh the progress bar with the epoch and the latest batch loss.
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 0: 100%|██████████| 1737/1737 [07:41<00:00,  3.76it/s, loss=0.202]
Epoch 1: 100%|██████████| 1737/1737 [07:39<00:00,  3.78it/s, loss=0.112]
Epoch 2: 100%|██████████| 1737/1737 [07:41<00:00,  3.77it/s, loss=0.0652]


In [13]:
# Switch to evaluation mode (disables dropout) before generating.
model.eval()

input_text = 'generate_triples:MotorSport Vision is located in the city of Fawkham.'
expected_output = 'MotorSport Vision__sep__locationCity__sep__Fawkham'

# A single sentence needs no padding: padding it to 512 tokens only makes the
# encoder process hundreds of masked positions. Truncation is kept so that
# over-long inputs still respect the limit used during training.
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=512
).to(device)

# Beam search over the decoder; `max_length` here bounds the generated text.
outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True
)

# NOTE: training targets were produced with str() in convert_output_to_text,
# so the decoded text can be the repr of a list of triples rather than a
# single triple string like `expected_output`.
generated_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Expected output: " + expected_output)
print("Generated output: " + generated_output)

Expected output: MotorSport Vision__sep__locationCity__sep__Fawkham
Generated output: ['MotorSport Vision__sep__locationCity__sep__Fawkham', 'MotorSport Vision__sep__location__sep__Fawkham']
