<a href="https://colab.research.google.com/github/BhaveshKhaple/NLQ-2-SQL/blob/main/NLQ_2_SQL_using_T5_transformer_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U transformers datasets pandas scikit-learn nltk sqlparse


Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pandas
  Downloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.1-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.54.0-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pandas-2.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K 

In [2]:
from google.colab import files

print("📤 Please upload train_data.json, val_data.json, and test_data.json")
uploaded = files.upload()


📤 Please upload train_data.json, val_data.json, and test_data.json


Saving test_data.json to test_data.json
Saving train_data.json to train_data.json
Saving val_data.json to val_data.json


In [3]:
import json
from datasets import Dataset, DatasetDict

with open("/content/train_data.json") as f:
    train_data = json.load(f)

with open("/content/val_data.json") as f:
    val_data = json.load(f)

with open("/content/test_data.json") as f:
    test_data = json.load(f)

print(f"✅ Loaded {len(train_data)} training examples")
print(f"✅ Loaded {len(val_data)} validation examples")
print(f"✅ Loaded {len(test_data)} test examples")

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})


✅ Loaded 699 training examples
✅ Loaded 150 validation examples
✅ Loaded 150 test examples


In [4]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128

def preprocess_function(examples):
    inputs = examples["input_text"]
    targets = examples["target_text"]

    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=MAX_TARGET_LENGTH, padding="max_length", truncation=True)["input_ids"]
    labels = [[token if token != tokenizer.pad_token_id else -100 for token in label] for label in labels]
    model_inputs["labels"] = labels
    return model_inputs

remove_cols = ["input_text", "target_text", "category"]
if "original_nlq" in dataset_dict["train"].column_names:
    remove_cols.append("original_nlq")

tokenized_datasets = dataset_dict.map(
    preprocess_function,
    batched=True,
    remove_columns=remove_cols
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/699 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [5]:
import torch
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

EPOCHS = 5
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

args = TrainingArguments(
    output_dir="./t5_clinical_model",
    do_eval=True,
    logging_dir="./logs",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    prediction_loss_only=False
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    if isinstance(predictions[0][0], (list, torch.Tensor)):
        predictions = [p[0] for p in predictions]
    if isinstance(labels[0][0], (list, torch.Tensor)):
        labels = [l[0] for l in labels]

    labels = [[tokenizer.pad_token_id if token == -100 else token for token in seq] for seq in labels]

    def clean(seq): return [int(tok) for tok in seq if 0 <= tok < tokenizer.vocab_size]
    predictions = [clean(seq) for seq in predictions]
    labels = [clean(seq) for seq in labels]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    smooth = SmoothingFunction().method1
    bleu = sum([sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth)
                for pred, ref in zip(decoded_preds, decoded_labels)]) / len(decoded_preds)
    exact = sum([int(pred.strip() == ref.strip()) for pred, ref in zip(decoded_preds, decoded_labels)]) / len(decoded_preds)
    return {"bleu": bleu, "exact_match": exact}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    compute_metrics=compute_metrics
)

trainer.train()


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msanbar1234567890[0m ([33msanbar1234567890-mit[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


TrainOutput(global_step=440, training_loss=1.97532958984375, metrics={'train_runtime': 191.921, 'train_samples_per_second': 18.211, 'train_steps_per_second': 2.293, 'total_flos': 473019596144640.0, 'train_loss': 1.97532958984375, 'epoch': 5.0})

In [11]:
model.save_pretrained("./t5_clinical_model/final_model")
tokenizer.save_pretrained("./t5_clinical_model/final_model")


('./t5_clinical_model/final_model/tokenizer_config.json',
 './t5_clinical_model/final_model/special_tokens_map.json',
 './t5_clinical_model/final_model/spiece.model',
 './t5_clinical_model/final_model/added_tokens.json')

In [12]:
!zip -r t5_clinical_model.zip ./t5_clinical_model/final_model


  adding: t5_clinical_model/final_model/ (stored 0%)
  adding: t5_clinical_model/final_model/model.safetensors (deflated 10%)
  adding: t5_clinical_model/final_model/generation_config.json (deflated 29%)
  adding: t5_clinical_model/final_model/config.json (deflated 63%)
  adding: t5_clinical_model/final_model/added_tokens.json (deflated 83%)
  adding: t5_clinical_model/final_model/spiece.model (deflated 48%)
  adding: t5_clinical_model/final_model/special_tokens_map.json (deflated 85%)
  adding: t5_clinical_model/final_model/tokenizer_config.json (deflated 94%)


In [13]:
from google.colab import files
files.download("t5_clinical_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>