# Setup

In [None]:
!pip install -q datasets
!pip install -q evaluate
!pip install -q sentencepiece
!pip install -q transformers

In [2]:
import copy
import datasets
import evaluate
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import warnings

from dataclasses import dataclass
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import T5ForConditionalGeneration, AutoTokenizer, T5Config
from transformers import Trainer, TrainingArguments

In [3]:
tqdm.pandas()
warnings.filterwarnings("ignore", category=DeprecationWarning)

if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
DEVICE

device(type='cuda')

In [4]:
MODEL_NAME = "t5-base"
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Prepare Dataset

In [None]:
df_train = datasets.load_dataset(
    "code_x_glue_tc_text_to_code",
    split="train")
df_valid = datasets.load_dataset(
    "code_x_glue_tc_text_to_code",
    split="validation")
df_test = datasets.load_dataset(
    "code_x_glue_tc_text_to_code",
    split="test")

df_train = pd.DataFrame({
    'id': df_train['id'],
    'nl': df_train['nl'],
    'code': df_train['code']
})
df_valid = pd.DataFrame({
    'id': df_valid['id'],
    'nl': df_valid['nl'],
    'code': df_valid['code']
})
df_test = pd.DataFrame({
    'id': df_test['id'],
    'nl': df_test['nl'],
    'code': df_test['code']
})

In [6]:
df_train.head()

Unnamed: 0,id,nl,code
0,0,check if details are parsed . concode_field_se...,boolean function ( ) { return isParsed ; }
1,1,answer the library file defining the library c...,File function ( ) { return libraryFile ; }
2,2,this method deletes index files of the @linkpl...,"void function ( Directory arg0 , Collection < ..."
3,3,"do n't use this . no , really , do n't use thi...","byte [ ] function ( Class < ? > arg0 , Configu..."
4,4,force the eventbus from ambarieventpublisher t...,void function ( Binder arg0 ) { EventBus loc0 ...


# Train

In [7]:
tokenizer_code = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer_nl = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
class Code2TextDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        code = "code: " + self.df['code'].iloc[i]
        nl = self.df['nl'].iloc[i]

        code_tokenized = tokenizer_code(
            code,
            padding="max_length",
            max_length=64,
            truncation=True)
        code_input_ids = code_tokenized.input_ids
        code_attention = code_tokenized.attention_mask

        nl_tokenized = tokenizer_nl(
            nl,
            padding="max_length",
            max_length=64,
            truncation=True)
        nl_input_ids = nl_tokenized.input_ids

        return {
            "input_ids": code_input_ids,
            "labels": nl_input_ids,
            "attention_mask": code_attention
        }

In [9]:
# TODO: Implement this and add as parameter in Trainer
def compute_metrics(eval_pred):
    print(eval_pred)
    logits, labels = eval_pred
    return 1

In [10]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model.cuda()

training_args = TrainingArguments(
    output_dir="code2text",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
)

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=Code2TextDataset(df_train),
    eval_dataset=Code2TextDataset(df_valid),
)
trainer.train()

***** Running training *****
  Num examples = 100000
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 2346
  Number of trainable parameters = 222903552


Epoch,Training Loss,Validation Loss
1,2.0004,1.683103
2,1.6003,1.626118
3,1.561,1.618219


Saving model checkpoint to code2text/checkpoint-500
Configuration saved in code2text/checkpoint-500/config.json
Model weights saved in code2text/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128
Saving model checkpoint to code2text/checkpoint-1000
Configuration saved in code2text/checkpoint-1000/config.json
Model weights saved in code2text/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to code2text/checkpoint-1500
Configuration saved in code2text/checkpoint-1500/config.json
Model weights saved in code2text/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128
Saving model checkpoint to code2text/checkpoint-2000
Configuration saved in code2text/checkpoint-2000/config.json
Model weights saved in code2text/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 128


Training completed. Do not forget to share your model on hugg

TrainOutput(global_step=2346, training_loss=1.6863474606044864, metrics={'train_runtime': 2295.6865, 'train_samples_per_second': 130.68, 'train_steps_per_second': 1.022, 'total_flos': 2.2835920896e+16, 'train_loss': 1.6863474606044864, 'epoch': 3.0})

# Generate

In [14]:
text = \
    "code: boolean function ( ) { return isNew ; }" \
  + ""

input_ids = tokenizer_code(text, return_tensors="pt").input_ids.cuda()
outputs = model.generate(input_ids)

tokenizer_nl.decode(outputs[0], skip_special_tokens=True)

'returns true if the ssl is new. concode_field_s'