# SQLCodeT5-ColNameTypeAware model

In [None]:
# Mount Google Drive inside the Colab VM; the model output path used below lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folder used as the Trainer's output_dir and as the destination for the saved model and tokenizer.
PATH_TO_TRAINED_MODEL = '/content/drive/MyDrive/SQLCodeT5-ColNameTypeAware'

In [None]:
!pip install -q datasets rouge_score transformers==4.28.0

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/474.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m95.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m125.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.3 MB/s[0m eta

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Hub id of the pretrained checkpoint that is fine-tuned in this notebook.
T5_MODEL = 'Salesforce/codet5-small'

# Tokenizer and model weights are both loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(T5_MODEL)
model = T5ForConditionalGeneration.from_pretrained(T5_MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

# Training data is the concatenation of WikiSQL's 'train' and 'validation' splits.
# NOTE(review): the 'test' split loaded here is later passed as the Trainer's eval_dataset while
# load_best_model_at_end=True is set, so checkpoint selection sees the test data. Consider keeping
# 'validation' separate for model selection and evaluating on 'test' only once at the end.
train_data = load_dataset('wikisql', split='train+validation')
test_data = load_dataset('wikisql', split='test')

Downloading builder script:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

Downloading and preparing dataset wikisql/default to /root/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d...


Downloading data:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/15878 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8421 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/56355 [00:00<?, ? examples/s]

Dataset wikisql downloaded and prepared to /root/.cache/huggingface/datasets/wikisql/default/0.1.0/7037bfe6a42b1ca2b6ac3ccacba5253b1825d31379e9cc626fc79a620977252d. Subsequent calls will reuse this data.




In [None]:
def get_table_from_row(row):
  """Render the table schema of one example as ``Table(col: type, ...)``.

  Args:
    row: Mapping whose ``row['table']`` holds the parallel sequences
      ``'header'`` (column names) and ``'types'`` (column types).

  Returns:
    A string listing every column name followed by its type. Columns are
    paired with ``zip``, so surplus entries of the longer sequence are dropped.
  """
  column_names = row['table']['header']
  column_types = row['table']['types']

  described_columns = []
  for name, dtype in zip(column_names, column_types):
    described_columns.append(f"{name}: {dtype}")

  return "Table(" + ", ".join(described_columns) + ")"

def format_dataset(example):
  """Build the text-to-text training pair for one WikiSQL example.

  Args:
    example: Mapping with a ``'question'`` string, a ``'table'`` entry accepted by
      ``get_table_from_row`` and ``example['sql']['human_readable']``.

  Returns:
    A dict with ``'input'`` (the prompt containing the question and the table
    schema) and ``'target'`` (the human-readable SQL string).
  """
  prompt_template = "translate to SQL the following natural language query: '{}', where the table is '{}'"
  question = example['question']
  table_description = get_table_from_row(example)
  return {
      'input': prompt_template.format(question, table_description),
      'target': example['sql']['human_readable'],
  }

In [None]:
# Replace the raw WikiSQL columns with the 'input'/'target' pair produced by format_dataset.
# Not idempotent: after this runs, the original columns format_dataset reads are gone.
train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)

Map:   0%|          | 0/64776 [00:00<?, ? examples/s]

In [None]:
# Same reformatting for the test split: only the 'input' and 'target' columns remain afterwards.
test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)

Map:   0%|          | 0/15878 [00:00<?, ? examples/s]

In [None]:
# tokenize the examples
def convert_to_features(example_batch):
    """Tokenize a batch of 'input'/'target' strings into fixed-length model features.

    Args:
        example_batch: Batch mapping with the lists ``example_batch['input']`` and
            ``example_batch['target']`` (as produced by ``format_dataset``).

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` for the encoder, ``labels``
        for the loss and ``decoder_attention_mask`` for the target side. Every
        sequence is padded or truncated to 64 tokens.
    """
    # NOTE(review): inputs longer than 64 tokens are cut off; the prompt text plus the table
    # schema may exceed that for wide tables — confirm 64 is large enough for this dataset.
    max_length = 64

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`; truncation is
    # requested explicitly instead of relying on the implicit fallback (which emits a warning).
    input_encodings = tokenizer(
        example_batch['input'], padding='max_length', truncation=True, max_length=max_length
    )
    target_encodings = tokenizer(
        example_batch['target'], padding='max_length', truncation=True, max_length=max_length
    )

    # Padding positions in the labels are set to -100 so the cross-entropy loss ignores them;
    # compute_metrics maps -100 back to the pad token before decoding.
    pad_token_id = tokenizer.pad_token_id
    labels = [
        [token_id if token_id != pad_token_id else -100 for token_id in sequence]
        for sequence in target_encodings['input_ids']
    ]

    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
        'decoder_attention_mask': target_encodings['attention_mask'],
    }

    return encodings

In [None]:
# Tokenize both splits in batches; the text columns are dropped so only model features remain.
train_data = train_data.map(convert_to_features, batched=True, remove_columns=train_data.column_names)
test_data = test_data.map(convert_to_features, batched=True, remove_columns=test_data.column_names)

# Feature columns exposed to the Trainer (the keys returned by convert_to_features).
columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']

# Have the datasets return these columns as PyTorch tensors.
train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)

Map:   0%|          | 0/64776 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Map:   0%|          | 0/15878 [00:00<?, ? examples/s]

In [None]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import get_linear_schedule_with_warmup

In [None]:
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [None]:
# Training configuration: evaluate and checkpoint once per epoch, keep at most three
# checkpoints in the output directory and reload the best one when training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir=PATH_TO_TRAINED_MODEL,
    per_device_train_batch_size=128,
    num_train_epochs=15,
    per_device_eval_batch_size=128,
    predict_with_generate=True,
    # Without this, generation during evaluation uses the model's default max_length
    # (20 tokens in the library defaults unless the checkpoint config overrides it), which
    # cuts off longer SQL queries and depresses recall. 64 matches the label length used
    # when tokenizing the targets.
    generation_max_length=64,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True
)

In [None]:
from datasets import load_metric
rouge = load_metric("rouge")

def compute_metrics(pred):
  """Compute ROUGE-2 precision, recall and F-measure for generated sequences.

  Args:
    pred: Prediction object exposing ``predictions`` (generated token ids) and
      ``label_ids`` (reference token ids, where -100 marks ignored positions).

  Returns:
    Dict with ``rouge2_precision``, ``rouge2_recall`` and ``rouge2_fmeasure``,
    each taken from the ``mid`` aggregate and rounded to four decimals.
  """
  import numpy as np

  pad_token_id = tokenizer.pad_token_id

  pred_ids = pred.predictions
  # Some model outputs arrive as a tuple whose first element holds the token ids.
  if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]

  # -100 is not a valid token id, so map it to the pad token before decoding.
  # np.where builds new arrays, leaving pred.label_ids / pred.predictions unmodified.
  pred_ids = np.where(pred_ids != -100, pred_ids, pad_token_id)
  labels_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_token_id)

  # Special tokens (padding, end-of-sequence, ...) are dropped from the decoded text.
  pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

  rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

  return {
      "rouge2_precision": round(rouge_output.precision, 4),
      "rouge2_recall": round(rouge_output.recall, 4),
      "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
  }


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [None]:
# instantiate trainer
# NOTE(review): eval_dataset is the test split, so the per-epoch metrics (and the best-checkpoint
# selection enabled in training_args) are computed on test data rather than a validation set.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

In [None]:
# Baseline: evaluate the pretrained checkpoint on eval_dataset before any fine-tuning.
trainer.evaluate()

{'eval_loss': 2.553088426589966,
 'eval_rouge2_precision': 0.004,
 'eval_rouge2_recall': 0.0011,
 'eval_rouge2_fmeasure': 0.0016,
 'eval_runtime': 94.0708,
 'eval_samples_per_second': 168.788,
 'eval_steps_per_second': 1.329}

In [None]:
# Fine-tune with the settings in training_args (evaluation and checkpointing happen every epoch).
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,0.183,0.09361,0.8659,0.8085,0.8311
2,0.1029,0.07766,0.878,0.8224,0.8445
3,0.0859,0.07038,0.8847,0.8305,0.852
4,0.0761,0.067029,0.8907,0.8362,0.8578
5,0.0682,0.063596,0.8955,0.8386,0.8614
6,0.0622,0.061364,0.8996,0.841,0.8647
7,0.0581,0.05983,0.901,0.8429,0.8663
8,0.0545,0.058833,0.9011,0.844,0.8669
9,0.0514,0.057584,0.9019,0.845,0.8678
10,0.0489,0.057303,0.9042,0.8461,0.8695


TrainOutput(global_step=7605, training_loss=0.0672542034678362, metrics={'train_runtime': 9245.2676, 'train_samples_per_second': 105.096, 'train_steps_per_second': 0.823, 'total_flos': 1.643793849778176e+16, 'train_loss': 0.0672542034678362, 'epoch': 15.0})

In [None]:
# Write the model currently held by the trainer to the Drive folder.
# With load_best_model_at_end=True this should be the best checkpoint — TODO confirm.
trainer.save_model(PATH_TO_TRAINED_MODEL)

In [None]:
# Save the tokenizer files into the same directory as the model.
tokenizer.save_pretrained(PATH_TO_TRAINED_MODEL)

('/content/drive/MyDrive/SQLCodeT5-ColNameTypeAware/tokenizer_config.json',
 '/content/drive/MyDrive/SQLCodeT5-ColNameTypeAware/special_tokens_map.json',
 '/content/drive/MyDrive/SQLCodeT5-ColNameTypeAware/vocab.json',
 '/content/drive/MyDrive/SQLCodeT5-ColNameTypeAware/merges.txt',
 '/content/drive/MyDrive/SQLCodeT5-ColNameTypeAware/added_tokens.json',
 '/content/drive/MyDrive/SQLCodeT5-ColNameTypeAware/tokenizer.json')