# SQLCodeT5-ColNameAware model

In [None]:
# Mount Google Drive at /content/drive so that paths under it are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q datasets rouge_score transformers==4.28.0

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

# Identifier of the pretrained checkpoint that both the tokenizer and the
# model are loaded from.
T5_MODEL = 'Salesforce/codet5-small'

# Tokenizer and model come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(T5_MODEL)
model = T5ForConditionalGeneration.from_pretrained(T5_MODEL)

In [None]:
from datasets import load_dataset

# The training set is the 'train' and 'validation' splits of 'wikisql'
# concatenated; the 'test' split is loaded separately.
train_data = load_dataset('wikisql', split='train+validation')
test_data = load_dataset('wikisql', split='test')



In [None]:
def get_table_from_row(row):
  """Render the column names of a row's table as ``Table('c1', 'c2', ...)``.

  Only ``row['table']['header']`` is read; no other table field is required.

  Args:
    row: mapping where ``row['table']['header']`` is an iterable of column
      names.

  Returns:
    A string of the form ``Table('col1', 'col2')``; ``Table()`` when the
    header is empty.
  """
  header = row['table']['header']
  quoted_columns = ", ".join(f"'{name}'" for name in header)
  return f"Table({quoted_columns})"

def format_dataset(example):
  """Build the text-to-text training pair for a single example.

  Args:
    example: mapping providing ``'question'``, the fields read by
      ``get_table_from_row`` and ``example['sql']['human_readable']``.

  Returns:
    A dict with ``'input'`` (the prompt containing the question and the
    rendered table) and ``'target'`` (the human-readable SQL string).
  """
  prompt_template = "translate to SQL the following natural language query: '{}', where the table is '{}'"
  question = example['question']
  table_description = get_table_from_row(example)
  prompt = prompt_template.format(question, table_description)
  return {'input': prompt, 'target': example['sql']['human_readable']}

In [None]:
# Replace the original columns of the training set with the 'input'/'target'
# pair produced by format_dataset.
train_data = train_data.map(format_dataset, remove_columns=train_data.column_names)



In [None]:
# Same formatting for the test set: keep only the 'input'/'target' pair.
test_data = test_data.map(format_dataset, remove_columns=test_data.column_names)



In [None]:
# tokenize the examples
def convert_to_features(example_batch, max_length=64, tok=None):
    """Tokenize a batch of ``input``/``target`` strings into model features.

    Args:
        example_batch: mapping with ``'input'`` and ``'target'`` lists of
            strings.
        max_length: length every sequence is padded or truncated to. Defaults
            to 64, the value that was previously hard-coded.
        tok: tokenizer to use. Defaults to the notebook-level ``tokenizer``.

    Returns:
        A dict with ``input_ids``, ``attention_mask``, ``labels`` and
        ``decoder_attention_mask``, each a list with one entry per example.
    """
    tok = tokenizer if tok is None else tok

    # Ask for padding and truncation explicitly instead of relying on the
    # deprecated `pad_to_max_length` flag and an implicit truncation fallback.
    # NOTE(review): prompts listing many column names may exceed `max_length`
    # tokens and lose their tail - confirm that 64 is enough for the inputs.
    input_encodings = tok.batch_encode_plus(
        example_batch['input'], padding='max_length', truncation=True, max_length=max_length)
    target_encodings = tok.batch_encode_plus(
        example_batch['target'], padding='max_length', truncation=True, max_length=max_length)

    # Padded label positions are set to -100 so the loss skips them; without
    # this the model is also trained to predict padding tokens.
    labels = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(target_encodings['input_ids'], target_encodings['attention_mask'])
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
        'decoder_attention_mask': target_encodings['attention_mask'],
    }

In [None]:
# Tokenize both sets batch-wise; the text columns are dropped so that only the
# fields returned by convert_to_features remain.
train_data = train_data.map(
    convert_to_features,
    batched=True,
    remove_columns=train_data.column_names,
)
test_data = test_data.map(
    convert_to_features,
    batched=True,
    remove_columns=test_data.column_names,
)

# Fields that are handed out as torch tensors.
columns = [
    'input_ids',
    'attention_mask',
    'labels',
    'decoder_attention_mask',
]

train_data.set_format(type='torch', columns=columns)
test_data.set_format(type='torch', columns=columns)



In [None]:
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import get_linear_schedule_with_warmup

# Directory under the mounted Drive that serves as the trainer's output_dir and
# as the destination for the saved model and tokenizer.
PATH_TO_TRAINED_MODEL = '/content/drive/MyDrive/model0306_1'

In [None]:
!pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Training configuration: evaluation and checkpointing both happen once per
# epoch, at most three checkpoints are kept, and the best checkpoint is
# reloaded when training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir=PATH_TO_TRAINED_MODEL,
    per_device_train_batch_size=128,
    num_train_epochs=15,
    per_device_eval_batch_size=128,
    predict_with_generate=True,
    # Let evaluation generate up to the same length the targets are tokenized
    # to (64). Without this, generation uses the model's default length limit
    # (presumably 20 tokens unless the checkpoint config overrides it - TODO
    # confirm), which can cut predicted SQL short and depress ROUGE recall.
    generation_max_length=64,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=500,
    save_strategy="epoch",
    overwrite_output_dir=True,
    save_total_limit=3,
    load_best_model_at_end=True
)

In [None]:
from datasets import load_metric
# ROUGE scorer used by compute_metrics.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets`
# releases in favour of the `evaluate` package - confirm against the installed
# version.
rouge = load_metric("rouge")

def compute_metrics(pred):
  """Compute ROUGE-2 precision, recall and F-measure for a prediction batch.

  Uses the notebook-level ``tokenizer`` to decode token ids and the
  notebook-level ``rouge`` metric to score the decoded strings.

  Args:
    pred: object exposing ``predictions`` and ``label_ids`` as integer arrays
      of token ids.

  Returns:
    A dict with ``rouge2_precision``, ``rouge2_recall`` and
    ``rouge2_fmeasure``, each rounded to four decimals.
  """
  # Work on copies so the arrays owned by the caller are left untouched.
  labels_ids = pred.label_ids.copy()
  pred_ids = pred.predictions.copy()

  # -100 is not a real token id; map it to the pad id so decoding works and
  # skip_special_tokens can strip it.
  labels_ids[labels_ids == -100] = tokenizer.pad_token_id
  pred_ids[pred_ids == -100] = tokenizer.pad_token_id

  pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
  label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

  # `rouge` must stay a global here: assigning to it anywhere inside this
  # function would make it a local and break the lookup below.
  rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

  return {
      "rouge2_precision": round(rouge_output.precision, 4),
      "rouge2_recall": round(rouge_output.recall, 4),
      "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
  }


In [None]:
# instantiate trainer
# NOTE(review): eval_dataset is the 'test' split while training_args sets
# load_best_model_at_end=True, so the checkpoint that is kept is selected on
# test data. Consider evaluating on the 'validation' split (currently merged
# into the training data) and reserving the test split for a final report.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

NameError: ignored

In [None]:
# Evaluate before fine-tuning starts to record a baseline loss and ROUGE-2 on
# the trainer's eval dataset.
trainer.evaluate()

{'eval_loss': 2.5842812061309814,
 'eval_rouge2_precision': 0.0068,
 'eval_rouge2_recall': 0.0016,
 'eval_rouge2_fmeasure': 0.0025,
 'eval_runtime': 95.0103,
 'eval_samples_per_second': 167.119,
 'eval_steps_per_second': 1.316}

In [None]:
# Fine-tune the model with the settings held in training_args.
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,0.1767,0.086866,0.8765,0.8179,0.8412
2,0.0968,0.071974,0.8896,0.8307,0.8544
3,0.0803,0.065402,0.8956,0.839,0.8617
4,0.0703,0.061048,0.8998,0.8431,0.8658
5,0.0629,0.058336,0.9042,0.8459,0.8694
6,0.0575,0.056869,0.9075,0.8477,0.8719
7,0.0534,0.054682,0.9082,0.8493,0.8731
8,0.0502,0.054154,0.9092,0.8507,0.8744
9,0.047,0.053312,0.9097,0.8513,0.8748
10,0.0448,0.052743,0.9108,0.8521,0.8758


TrainOutput(global_step=7605, training_loss=0.06255210240681966, metrics={'train_runtime': 9786.951, 'train_samples_per_second': 99.279, 'train_steps_per_second': 0.777, 'total_flos': 1.643793849778176e+16, 'train_loss': 0.06255210240681966, 'epoch': 15.0})

In [None]:
# Write the trainer's model to PATH_TO_TRAINED_MODEL.
trainer.save_model(PATH_TO_TRAINED_MODEL)

In [None]:
# Save the tokenizer files into the same directory as the model.
tokenizer.save_pretrained(PATH_TO_TRAINED_MODEL)

('/content/drive/MyDrive/model0306_1/tokenizer_config.json',
 '/content/drive/MyDrive/model0306_1/special_tokens_map.json',
 '/content/drive/MyDrive/model0306_1/vocab.json',
 '/content/drive/MyDrive/model0306_1/merges.txt',
 '/content/drive/MyDrive/model0306_1/added_tokens.json',
 '/content/drive/MyDrive/model0306_1/tokenizer.json')