In [None]:
import json
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Step 1: Load your JSON file
with open('extended_acl_to_databricks_dataset.json', 'r') as f:
    data = json.load(f)

# Example of what your JSON file structure might look like:
# [
#     {"acl_query": "SELECT * FROM users WHERE id = 1", "databricks_query": "SELECT * FROM users WHERE id = 1"},
#     {"acl_query": "SELECT name FROM users", "databricks_query": "SELECT name FROM users"}
# ]

# Step 2: Convert to a dataset
dataset = Dataset.from_list(data)

# Step 3: Load T5 tokenizer and model
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Step 4: Tokenization function
def tokenize_function(examples):
    # Tokenize the input (acl_query) as the encoder input
    model_inputs = tokenizer(examples['input'], padding='max_length', truncation=True, max_length=512)

    # Tokenize the target (databricks_query) as the decoder input (labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], padding='max_length', truncation=True, max_length=512)

    # Set the target labels for the decoder
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Step 5: Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Check the tokenized dataset format (first entry)
print(tokenized_datasets[0])

# Step 6: Fine-tuning setup (Trainer API)
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",         # Directory to save model checkpoints
    num_train_epochs=3,             # Number of epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,   # Batch size for evaluation
    warmup_steps=500,               # Number of warmup steps
    weight_decay=0.01,              # Weight decay
    logging_dir='./logs',           # Directory for storing logs
    logging_steps=200,              # Frequency of logging
    evaluation_strategy="epoch",    # Evaluate every epoch
    save_strategy="epoch",          # Save model every epoch
    load_best_model_at_end=True     # Load the best model at the end
)

# Define the Trainer
trainer = Trainer(
    model=model,                        # The T5 model
    args=training_args,                  # Training arguments
    train_dataset=tokenized_datasets,    # The tokenized training dataset
    eval_dataset=tokenized_datasets,     # The tokenized validation dataset (you can split your dataset)
    tokenizer=tokenizer                  # The tokenizer used for tokenization
)

# Step 7: Start training
trainer.train()


Map:   0%|          | 0/515 [00:00<?, ? examples/s]

  trainer = Trainer(


{'input': 'AVG Year FROM employees', 'output': 'SELECT AVG(Year) FROM employees;', 'input_ids': [71, 17217, 2929, 21680, 1652, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


{'acl_query': 'AVG Year FROM employees', 'databricks_query': 'SELECT AVG(Year) FROM employees;'}


Map:   0%|          | 0/515 [00:00<?, ? examples/s]

In [15]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",         # Directory to save model checkpoints
    num_train_epochs=3,             # Number of epochs to train
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,   # Batch size for evaluation
    warmup_steps=500,               # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,              # Strength of weight decay
    logging_dir='./logs',           # Directory for storing logs
    logging_steps=200,              # Frequency of logging
    evaluation_strategy="epoch",    # Evaluate every epoch
    save_strategy="epoch",          # Save model every epoch
    load_best_model_at_end=True     # Load the best model when finished training
)

trainer = Trainer(
    model=model,                        # The T5 model
    args=training_args,                  # Training arguments
    train_dataset=tokenized_datasets,    # The tokenized training dataset
    eval_dataset=tokenized_datasets,     # The tokenized validation dataset
    tokenizer=tokenizer                  # The tokenizer used for tokenization
)

# Start training
trainer.train()


  trainer = Trainer(


ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [11]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory for checkpoints
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # warmup steps for learning rate scheduler
    weight_decay=0.01,               # weight decay strength
    logging_dir='./logs',            # directory for logs
    logging_steps=10,                # log every 10 steps
    eval_strategy="epoch",     # evaluate after every epoch
)

In [12]:
trainer = Trainer(
    model=model,                         # pre-trained model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset            # evaluation dataset
)


In [13]:
# Start the training
trainer.train()


ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds