In [1]:
!pip install sentencepiece



In [2]:
!git clone https://github.com/ML-KULeuven/deepstochlog-lm.git
%cd deepstochlog-lm/
!cp -r data/lms_task2 /content/
!ls /content/lms_task2
%cd /content/

Cloning into 'deepstochlog-lm'...
remote: Enumerating objects: 141, done.[K
remote: Counting objects: 100% (141/141), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 141 (delta 37), reused 121 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (141/141), 3.47 MiB | 7.54 MiB/s, done.
Resolving deltas: 100% (37/37), done.
/content/deepstochlog-lm
 test_column_cf_task2.json   train_column_cf_task2.json
 test_column_task2.json      train_column_task2.json
 test_desc_task2.json	     train_desc_task2.json
'test_group by_task2.json'  'train_group by_task2.json'
 test_having_task2.json      train_having_task2.json
 test_limit_task2.json	     train_limit_task2.json
 test_op_task2.json	     train_op_task2.json
'test_order by_task2.json'  'train_order by_task2.json'
 test_ss_task2.json	     train_ss_task2.json
 test_table_task2.json	     train_table_task2.json
 test_type_task2.json	     train_type_task2.json
 test_where_task2.json	     train_where_task2.js

In [14]:
import os
from google.colab import files
import shutil
import json
import numpy as np
import pandas as pd
import torch
import gc
import math
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration, GPT2Tokenizer, GPT2LMHeadModel
from rich.table import Column, Table
from rich import box
from rich.console import Console
from torch import cuda
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
gc.collect()
torch.cuda.empty_cache()
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
class SQLDataset(Dataset):

  def __init__(
          self, dataframe, tokenizer, source_len, target_len, source_text, target_text, padding
  ):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.target_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]
    self.padding = padding

  def __len__(self):
    return len(self.target_text)

  def __getitem__(self, index):
    source_text = str(self.source_text[index])
    target_text = str(self.target_text[index])

    source_text = " ".join(source_text.split())
    target_text = " ".join(target_text.split())

    if self.padding:
      source = self.tokenizer.batch_encode_plus(
        [source_text],
        max_length=self.source_len,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
      )
      target = self.tokenizer.batch_encode_plus(
        [target_text],
        max_length=self.target_len,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
      )
    else:
      source = self.tokenizer.batch_encode_plus(
        [source_text],
        return_tensors="pt",
      )
      target = self.tokenizer.batch_encode_plus(
        [target_text],
        return_tensors="pt",
      )

    source_ids = source["input_ids"].squeeze()
    source_mask = source["attention_mask"].squeeze()
    target_ids = target["input_ids"].squeeze()
    target_mask = target["attention_mask"].squeeze()

    return {
      "source_ids": source_ids.to(dtype=torch.long),
      "source_mask": source_mask.to(dtype=torch.long),
      "target_ids": target_ids.to(dtype=torch.long),
      "target_ids_y": target_ids.to(dtype=torch.long),
    }

In [5]:
def train(epoch, fold, tokenizer, model, device, loader, optimizer):

    model.train()
    for _, data in enumerate(loader, 0):
      y = data["target_ids"].to(device, dtype=torch.long)
      y_ids = y[:, :-1].contiguous()
      lm_labels = y[:, 1:].clone().detach()
      lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
      ids = data["source_ids"].to(device, dtype=torch.long)
      mask = data["source_mask"].to(device, dtype=torch.long)
      # print(lm_labels)

      outputs = model(
        input_ids=ids,
        attention_mask=mask,
        labels=y_ids,
      )
      loss = outputs[0]

      # if _ % 10 == 0:
      #   training_logger.add_row(str(epoch), str(fold), str(_), str(loss))
      #   console.print(training_logger)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

In [6]:
def validate(epoch, fold, tokenizer, model, device, loader, target_len, save_pred):

  model.eval()
  predictions = []
  actuals = []
  sample_no = 0
  acc = 0
  with torch.no_grad():
      for _, data in enumerate(loader, 0):
          y = data['target_ids'].to(device, dtype = torch.long)
          ids = data['source_ids'].to(device, dtype = torch.long)
          mask = data['source_mask'].to(device, dtype = torch.long)

          generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask,
              max_length=target_len,
              )
          preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
          target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
          sample_no = len(target)
          for i in range(len(preds)):
            if preds[i] == target[i]:
              acc += 1

          predictions.extend(preds)
          actuals.extend(target)

  if save_pred:
    final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
    #replace with you output dir
    prediction_dir = "output/prediction.csv"
    final_df.to_csv(prediction_dir)

  return acc

In [7]:
def Trainer(
    train_dataset=None, val_dataset=None, test_dataset=None, model_params=None, model_number = -1, output_dir=None, save_pred=False
  ):

    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True

    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)
    padding = True

    training_set = SQLDataset(
      train_dataset,
      tokenizer,
      model_params["MAX_SOURCE_TEXT_LENGTH"],
      model_params["MAX_TARGET_TEXT_LENGTH"],
      "source",
      "target",
      padding,
    )

    train_params = {
      "batch_size": model_params["TRAIN_BATCH_SIZE"],
      "shuffle": True,
      "num_workers": 0,
    }

    training_loader = DataLoader(training_set, **train_params)

    if val_dataset is not None:
      val_set = SQLDataset(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        "source",
        "target",
        padding,
      )

      val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
      }

      val_loader = DataLoader(val_set, **val_params)

    if test_dataset is not None:
      testing_set = SQLDataset(
          test_dataset,
          tokenizer,
          model_params["MAX_SOURCE_TEXT_LENGTH"],
          model_params["MAX_TARGET_TEXT_LENGTH"],
          "source",
          "target",
          padding,
      )

      test_params = {
        "batch_size": model_params["TEST_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
      }

      testing_loader = DataLoader(testing_set, **test_params)


    val_acc_all = []

    for epoch in range(model_params["TRAIN_EPOCHS"]):

      optimizer = torch.optim.Adam(params=model.parameters(), lr=model_params["LEARNING_RATE"])

      train(epoch, model_number, tokenizer, model, device, training_loader, optimizer)

      if val_dataset is not None:
        val_acc = validate(epoch, model_number, tokenizer, model, device, val_loader, model_params["MAX_TARGET_TEXT_LENGTH"], save_pred)
        val_acc = val_acc / len(val_dataset)
        val_acc_all.append(val_acc)
        print(epoch, model_number, val_acc)

    if test_dataset is not None:
      test_acc = validate(epoch, model_number, tokenizer, model, device, testing_loader, model_params["MAX_TARGET_TEXT_LENGTH"], save_pred)
      print(test_acc / len(test_dataset))


    if output_dir is not None:
      console.log(f"[Saving Model]...\n")

      path = os.path.join(output_dir, str(model_params["TRAIN_BATCH_SIZE"]) + "_" + str(model_params["TRAIN_EPOCHS"]) + "_" + str(model_params["LEARNING_RATE"]))
      model.save_pretrained(path)
      tokenizer.save_pretrained(path)

    return val_acc_all

In [8]:
def cross_validation(k=5, model_params = None, df_data=None):

  val_size = math.ceil(model_params["TRAIN_NO"]/k)
  full_train = df_data[:model_params["TRAIN_NO"]]

  val_all = []
  for i in range(k):
    if (i+1)*val_size <= model_params["TRAIN_NO"]:
      val_dataset = full_train[i*val_size:(i+1)*val_size]
    else:
      val_dataset = full_train[i*val_size:]
    train_dataset = full_train.drop(val_dataset.index).reset_index(drop=True)
    val_dataset = val_dataset.reset_index(drop=True)

    val = Trainer(
                            train_dataset = train_dataset,
                            val_dataset = val_dataset,
                            model_params=model_params,
                            model_number = i,
                        )
    val_all.append(val)

  avg_val = []
  for i in range(model_params["TRAIN_EPOCHS"]):
    total_val = 0
    for array in val_all:
        total_val += array[i]
    avg_val.append(total_val / k)

  print("Model avg")
  print(avg_val)

In [9]:
def full_train(model_params=None, df_data=None, task=None, save_model=False, save_pred=False):
  train_dataset = df_data[:model_params["TRAIN_NO"]]
  test_dataset = df_data.drop(train_dataset.index).reset_index(drop=True)
  if save_model:
    output_dir = "lms/"+task
  else:
    output_dir = None
  Trainer(
            train_dataset = train_dataset,
            test_dataset = test_dataset,
            model_params=model_params,
            output_dir= output_dir,
            save_pred = save_pred,
  )

In [10]:
def load_data(task="table"):

  dataset = []

  # replace "root" with you data root
  if task == "t5_baseline":
    train_dir = "train_"+task+".json"
    test_dir = "test_"+task+".json"
    root = ""
  else:
    train_dir = "train_"+task+"_task2.json"
    test_dir = "test_"+task+"_task2.json"
    root = "lms_task2/"

  with open(root + train_dir, "r") as file:
      train_data = json.load(file)
  train_len = len(train_data)
  with open(root + test_dir, "r") as file:
      test_data = json.load(file)
  data = train_data + test_data

  tokenizer = T5Tokenizer.from_pretrained("t5-small")
  source_len = []
  target_len = []
  for sample in data:
      prompt = sample["prompt"]
      target = sample["target"]
      source_len.append(tokenizer(prompt, return_tensors="pt")["input_ids"].size()[1])
      target_len.append(tokenizer(target, return_tensors="pt")["input_ids"].size()[1])
      dataset.append([prompt, target])
  max_source_len = max(source_len)
  max_target_len = max(target_len)
  if max_source_len > 512:
    max_source_len = 512
  print(max_source_len)
  print(max_target_len)

  df_data = pd.DataFrame(dataset, columns = ['source', 'target'])
  print(len(train_data))
  print(len(test_data))

  return max_source_len, max_target_len, df_data, train_len

In [26]:
console = Console(record=True)

training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Fold", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

# "column_cf" for T5-small + CFG baseline, "t5_baseline" for vanilla T5-small baseline
# tasks = ["table", "column", "column_cf", "type", "where", "group by", "order by", "ss", "having", "limit", "op", "desc", "t5_baseline"]
task = "desc"
# put lms_task2 folder in git repo "/data" to session storage or your google drive
source_len, target_len, df_data, train_len = load_data(task)

batch_size = 32
epoch = 10
learning_rate = 5e-4

for batch in [batch_size]:
  for lr in [learning_rate]:
    print("this is for")
    print(batch, lr)
    model_params = {
      "MODEL": "t5-small",
      "TRAIN_BATCH_SIZE": batch,
      "VALID_BATCH_SIZE": 8,
      "TEST_BATCH_SIZE": 8,
      "TRAIN_EPOCHS": epoch,
      "VAL_EPOCHS": 1,
      "LEARNING_RATE": lr,
      "MAX_SOURCE_TEXT_LENGTH": source_len,
      "MAX_TARGET_TEXT_LENGTH": target_len,
      "SEED": 23,
      "TRAIN_NO": train_len,
    }
    cross_validation(k=5, model_params=model_params, df_data=df_data)

# fill in chosen epoch, batch_size, lr from cross_validation
chosen_model_params = {
    "MODEL": "t5-small",
    "TRAIN_BATCH_SIZE": batch,
    "VALID_BATCH_SIZE": 8,
    "TEST_BATCH_SIZE": 8,
    "TRAIN_EPOCHS": epoch,
    "VAL_EPOCHS": 1,
    "LEARNING_RATE": lr,
    "MAX_SOURCE_TEXT_LENGTH": source_len,
    "MAX_TARGET_TEXT_LENGTH": target_len,
    "SEED": 23,
    "TRAIN_NO": train_len,
}

print("batch: ", batch);
print("lr: ", lr);
print("epoch: ", epoch);

full_train(model_params=chosen_model_params, df_data=df_data, task=task, save_model=True, save_pred=False)

# Zip the folder
shutil.make_archive(task, 'zip', "lms/" + task)

# Download the zip
files.download(task + ".zip")
print("Download complete");

43
2
417
38
this is for
32 0.0005
0 0 0.5357142857142857
1 0 0.47619047619047616
2 0 0.7976190476190477
3 0 0.8452380952380952
4 0 0.9047619047619048
5 0 0.9047619047619048
6 0 0.9047619047619048
7 0 0.9047619047619048
8 0 0.9166666666666666
9 0 0.9166666666666666
0 1 0.38095238095238093
1 1 0.6904761904761905
2 1 0.7619047619047619
3 1 0.9523809523809523
4 1 0.9166666666666666
5 1 0.9642857142857143
6 1 0.9761904761904762
7 1 0.9523809523809523
8 1 0.9642857142857143
9 1 0.9642857142857143
0 2 0.5357142857142857
1 2 0.6428571428571429
2 2 0.8571428571428571
3 2 0.8571428571428571
4 2 0.9166666666666666
5 2 0.9166666666666666
6 2 0.9166666666666666
7 2 0.9404761904761905
8 2 0.9404761904761905
9 2 0.9285714285714286
0 3 0.5476190476190477
1 3 0.6309523809523809
2 3 0.6071428571428571
3 3 0.8452380952380952
4 3 0.9047619047619048
5 3 0.8928571428571429
6 3 0.9285714285714286
7 3 0.9166666666666666
8 3 0.9642857142857143
9 3 0.9523809523809523
0 4 0.4691358024691358
1 4 0.925925925925925

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download complete
