In [2]:
%pip install datasets sentencepiece accelerate transformers
%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset
import pandas as pd

# Read the prompt / CREATE TABLE pairs from the local CSV into a DataFrame.
csv_path = './nlp_table_prompts_dataset_2000.csv'
dataset = pd.read_csv(csv_path)
# Alternative data location noted by the author: /kaggle/input/sql-dataset

# Print the first rows as a quick sanity check of the loaded columns.
preview = dataset.head()
print(preview)


                                              prompt  \
0  create a deadline_log table with, deadline_log...   
1  create a deadline_log table with, deadline_log...   
2  create a deadline_log table with, deadline_log...   
3  create a deadline_log table with, deadline_log...   
4  create a admin_projection table with, project_...   

                                  create_table_query  
0  CREATE TABLE deadline_log (\n  deadline_log_id...  
1  CREATE TABLE deadline_log (\n  deadline_log_id...  
2  CREATE TABLE deadline_log (\n  deadline_log_id...  
3  CREATE TABLE deadline_log (\n  deadline_log_id...  
4  CREATE TABLE admin_projection (\n  project_id ...  


In [4]:
from transformers import T5Tokenizer
from datasets import Dataset

# Convert the pandas DataFrame into a Hugging Face Dataset.
# NOTE: this rebinds `dataset` from a DataFrame to a Dataset, so the cell is
# not idempotent: re-running it without first re-running the CSV-loading cell
# passes a Dataset (not a DataFrame) to `Dataset.from_pandas`.
dataset = Dataset.from_pandas(dataset)

# Initialize the T5 tokenizer from the "t5-large" checkpoint (the same
# checkpoint name the model is loaded from later in the notebook).
tokenizer = T5Tokenizer.from_pretrained("t5-large")

# Tokenize the input and output text
def preprocess(example):
    """Tokenize prompts and target SQL and attach loss-ready labels.

    Args:
        example: A mapping with 'prompt' and 'create_table_query' entries.
            With ``Dataset.map(..., batched=True)`` each entry is a list of
            strings; a single example (plain strings) is handled as well.

    Returns:
        The tokenizer output for the prompts (``input_ids``,
        ``attention_mask``) with an added ``labels`` entry holding the target
        token ids, where every padding position is set to -100.
    """
    # Tokenize input (Description) and target (SQL Query)
    inputs = tokenizer(example['prompt'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(example['create_table_query'], padding="max_length", truncation=True, max_length=128)

    # Targets are padded to a fixed length; without masking, the loss would
    # also be computed on the padding positions. -100 is the index ignored by
    # the cross-entropy loss used by the model.
    pad_id = tokenizer.pad_token_id
    target_ids = targets["input_ids"]
    if target_ids and isinstance(target_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        labels = [
            [(token if token != pad_id else -100) for token in sequence]
            for sequence in target_ids
        ]
    else:
        # Single-example call: one flat id sequence.
        labels = [(token if token != pad_id else -100) for token in target_ids]

    inputs["labels"] = labels
    return inputs

# Apply tokenization to your dataset
tokenized_dataset = dataset.map(preprocess, batched=True)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1993.59 examples/s]


In [5]:
# Split the dataset into train and test (80/20 split).
# Shuffle once with a fixed seed and slice that single permutation: the first
# 80% of rows become the training set, the remaining 20% the test set. This
# yields the same split as shuffling twice with the same seed, without the
# redundant second shuffle or materialising the index lists.
shuffled_dataset = tokenized_dataset.shuffle(seed=42)
split_index = int(len(shuffled_dataset) * 0.8)
train_dataset = shuffled_dataset.select(range(split_index))
test_dataset = shuffled_dataset.select(range(split_index, len(shuffled_dataset)))

# Check dataset sizes
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


Train dataset size: 1600
Test dataset size: 400


In [7]:
# !pip install --upgrade transformers
import os
import shutil

%pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.0.3
Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import TrainerCallback, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import T5Tokenizer, T5ForConditionalGeneration

class DeleteCheckpointCallback(TrainerCallback):
    """Trainer callback that keeps only the most recent checkpoint on disk.

    After each save, every ``checkpoint-<step>`` directory under
    ``output_dir`` other than the one for the current global step is removed.
    """

    def __init__(self, output_dir):
        """Remember the directory in which the trainer writes checkpoints.

        Args:
            output_dir: Path of the training output directory.
        """
        self.output_dir = output_dir

    def on_save(self, args, state, control, model=None, tokenizer=None, **kwargs):
        """Delete stale checkpoints once the current one has been written.

        ``on_save`` is invoked after the checkpoint for ``state.global_step``
        has been saved, so that directory must be preserved; only the other
        ``checkpoint-<step>`` directories are removed.

        Returns:
            The unmodified ``control`` object.
        """
        prefix = "checkpoint-"
        current_name = f"{prefix}{state.global_step}"

        # Nothing to clean up if the output directory does not exist.
        if not os.path.isdir(self.output_dir):
            return control

        for entry in os.listdir(self.output_dir):
            # Only touch directories named exactly "checkpoint-<digits>",
            # and never the checkpoint that was just saved.
            if entry == current_name or not entry.startswith(prefix):
                continue
            if not entry[len(prefix):].isdigit():
                continue

            stale_dir = os.path.join(self.output_dir, entry)
            if os.path.isdir(stale_dir):
                print(f"Deleting previous checkpoint at {stale_dir}")
                shutil.rmtree(stale_dir)

        return control

# Load the pretrained T5 model that will be fine-tuned
model = T5ForConditionalGeneration.from_pretrained("t5-large")

# Define training arguments: evaluate once per epoch, checkpoint every 500
# optimizer steps, log every 100 steps, and disable external experiment
# reporting.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_sql_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=3e-4,
    report_to="none"
)

# delete_checkpoint_callback = DeleteCheckpointCallback(output_dir=training_args.output_dir)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on the trainer constructor;
    # `processing_class=` is its replacement and takes the same tokenizer.
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

# Start the training process
trainer.train()


  trainer = Seq2SeqTrainer(


: 

In [None]:
def predict_description(desc, max_new_tokens=128):
    """Generate a CREATE TABLE statement for a natural-language description.

    Args:
        desc: The table description prompt (a string).
        max_new_tokens: Upper bound on the number of generated tokens. The
            library's default generation length is short enough to cut off
            multi-column statements, so a larger explicit limit is used.

    Returns:
        The decoded model output with special tokens removed.
    """
    inputs = tokenizer(desc, return_tensors="pt", truncation=True)
    # The model may live on an accelerator after training; the input tensors
    # must be on the same device as the model weights.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(predict_description("create a customs table with id as number, name as string, agentId as reference to agent, price as number"))


In [None]:
# model.save_pretrained("./t5_sql_model")
# tokenizer.save_pretrained("./t5_sql_model")