# T5 Fine_Tuning

in this notebook we will fine-tune different models on the datasets we already processed.

## NOTEBOOK SETTINGS

We recommend using the "High-RAM" setting for this notebook.
You can change this in the Colab menu: `Runtime > Change runtime type`


We start by setting the environment connecting colab to the Google Cloud Storage (GCS) bucket and setting everything up for the TPU processor. (This colab uses TPU and high ram settings)

In [1]:
from google.colab import auth
# Authenticate the Colab user before any Google Cloud command is issued.
auth.authenticate_user()
#@title ## Set Your GCS credential
# Both values come from the Colab form; `bucket_name` is reused throughout the
# notebook to build every gs:// path, so it must be filled in before running on.
project_id = '' #@param {type:"string"}
bucket_name = '' #@param {type:"string"}

# Point the gcloud CLI at the project entered above.
!gcloud config set project {project_id}

# Install the pinned library versions this notebook was written against
# (gin-config is the only unpinned package).
!pip3 install --upgrade pip
!pip3 install t5==0.9.0
!pip3 install tensorflow==2.6.0
!pip3 install keras==2.6.0
!pip3 install gin-config

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Base directory of the Google Cloud Storage bucket used by this notebook.
BASE_DIR = "gs://" + bucket_name

# An empty `bucket_name` leaves only the scheme; fail early in that case.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
ON_CLOUD = True


if ON_CLOUD:
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # RuntimeError (rather than a bare BaseException) keeps the failure
    # catchable by ordinary `except Exception` handlers, and `from err`
    # preserves the resolver's original message in the traceback.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook relies on the TF1-style API (tf.logging, etc.).
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

# When running on the cloud runtime: stop the TensorFlow logger from
# propagating records to its parent handlers and set the root Python logger
# to INFO (presumably to avoid duplicated log lines in Colab — confirm).
if ON_CLOUD:
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily switch the TensorFlow logging verbosity to `level`.

  Args:
    level: verbosity value handed to `tf.logging.set_verbosity`.

  Yields:
    Nothing; the body of the `with` statement runs at the requested level.

  The verbosity that was active on entry is restored on exit, including when
  the body raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without the finally clause an exception inside the `with` body would
    # leave the temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

We specify the paths and the sizes of all our datasets to later build our tasks.

In [2]:
## tasks large dataset
nq_tsv_path_code_code_large = {
    "train":      'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/new_large/code-to-code/train.tsv',
    "validation": 'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/new_large/code-to-code/val.tsv'
}

!gsutil cp {nq_tsv_path_code_code_large["train"]} ./train.tsv
!gsutil cp {nq_tsv_path_code_code_large["validation"]} ./val.tsv

data_train = len([line for line in open('./train.tsv', 'r')])
data_val = len([line for line in open('./val.tsv', 'r')])

num_nq_examples_code_code_large = dict(train=data_train, validation=data_val)

nq_tsv_path_code_comment_large = {
    "train":      'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/new_large/code-to-comment/train.tsv',
    "validation": 'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/new_large/code-to-comment/val.tsv'
}

!gsutil cp {nq_tsv_path_code_comment_large["train"]} ./train.tsv
!gsutil cp {nq_tsv_path_code_comment_large["validation"]} ./val.tsv

data_train = len([line for line in open('./train.tsv', 'r')])
data_val = len([line for line in open('./val.tsv', 'r')])

num_nq_examples_code_comment_large = dict(train=data_train, validation=data_val)

nq_tsv_path_codeANDcomment_code_large = {
    "train":      'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/new_large/code&comment-to-code/train.tsv',
    "validation": 'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/new_large/code&comment-to-code/val.tsv'
}

!gsutil cp {nq_tsv_path_codeANDcomment_code_large["train"]} ./train.tsv
!gsutil cp {nq_tsv_path_codeANDcomment_code_large["validation"]} ./val.tsv

data_train = len([line for line in open('./train.tsv', 'r')])
data_val = len([line for line in open('./val.tsv', 'r')])

num_nq_examples_codeANDcomment_code_large = dict(train=data_train, validation=data_val)

## tasks small dataset
nq_tsv_path_code_code_small = {
    "train":      'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/Tufano_etal_ICSE21/code-to-code/train.tsv',
    "validation": 'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/Tufano_etal_ICSE21/code-to-code/val.tsv'
}

!gsutil cp {nq_tsv_path_code_code_small["train"]} ./train.tsv
!gsutil cp {nq_tsv_path_code_code_small["validation"]} ./val.tsv

data_train = len([line for line in open('./train.tsv', 'r')])
data_val = len([line for line in open('./val.tsv', 'r')])

num_nq_examples_code_code_small = dict(train=data_train, validation=data_val)

nq_tsv_path_codeANDcomment_code_small = {
    "train":      'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/Tufano_etal_ICSE21/code&comment-to-code/train.tsv',
    "validation": 'gs://' + bucket_name + '/automating_code_review/dataset/fine-tuning/Tufano_etal_ICSE21/code&comment-to-code/val.tsv'
}

!gsutil cp {nq_tsv_path_codeANDcomment_code_small["train"]} ./train.tsv
!gsutil cp {nq_tsv_path_codeANDcomment_code_small["validation"]} ./val.tsv

data_train = len([line for line in open('./train.tsv', 'r')])
data_val = len([line for line in open('./val.tsv', 'r')])

num_nq_examples_codeANDcomment_code_small = dict(train=data_train, validation=data_val)

!rm ./train.tsv
!rm ./val.tsv

We specify the model and vocab paths of the previously trained SentencePiece tokenizer stored in the GCS bucket.

In [None]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary

# Short aliases for the t5 registry classes.
TfdsTask = t5.data.TfdsTask
TaskRegistry = t5.data.TaskRegistry

# SentencePiece artifacts stored in the GCS bucket.
vocab_path = 'gs://' + bucket_name + '/automating_code_review/tokenizer/TokenizerModel.vocab'
vocab_model_path = 'gs://' + bucket_name + '/automating_code_review/tokenizer/TokenizerModel.model'


def get_default_vocabulary():
  """Build a new SentencePieceVocabulary from `vocab_model_path`.

  The second positional argument (100) is forwarded as-is; presumably it is
  the number of extra sentinel ids — confirm against the seqio signature.
  """
  vocabulary = SentencePieceVocabulary(vocab_model_path, 100)
  return vocabulary


# Each feature gets its own vocabulary instance (one call per feature).
_inputs_feature = Feature(
    vocabulary=get_default_vocabulary(), add_eos=True, required=False)
_targets_feature = Feature(
    vocabulary=get_default_vocabulary(), add_eos=True)

# Output features shared by every task registered in this notebook.
DEFAULT_OUTPUT_FEATURES = dict(inputs=_inputs_feature, targets=_targets_feature)

# Setting up all the tasks

We will set the following tasks
- code-to-code (new large dataset)
- code-to-code (Tufano etal. dataset)
- code&comment-to-code (new large dataset)
- code&comment-to-code (Tufano etal. dataset)
- code-to-comment (new large dataset)

We will later choose which task (or mixture of tasks) to fine-tune on.


## TASK : CODE to CODE on new large dataset
- task name = `code_to_code_new_large`
- task prefix = `code2code: `

In [None]:
def nq_dataset_code_code_large(split, shuffle_files=True):
  """Read the code-to-code (new large) TSV for `split` as a tf.data dataset.

  Args:
    split: key into `nq_tsv_path_code_code_large` ("train" or "validation").
    shuffle_files: accepted but ignored; each split is a single file.

  Returns:
    A dataset of dicts with the keys "input" and "output", one per TSV line.
  """
  del shuffle_files  # only one file per split, nothing to shuffle

  raw_lines = tf.data.TextLineDataset(nq_tsv_path_code_code_large[split])

  # Two tab-separated string columns, quotes are not treated specially.
  # NOTE(review): in tf.io.decode_csv the record_defaults presumably also act
  # as fill values for empty fields (the literal "string") — confirm intended.
  split_columns = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)
  columns = raw_lines.map(
      split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  def as_example(*fields):
    return dict(zip(["input", "output"], fields))

  return columns.map(as_example)

# Sanity check: pull two raw examples from each split through
# tfds.as_numpy and print them.
print("A few raw validation examples...")
for ex in tfds.as_numpy(nq_dataset_code_code_large("validation").take(2)):
  print(ex)
print("A few raw training examples...")
for ex in tfds.as_numpy(nq_dataset_code_code_large("train").take(2)):
  print(ex)

def code_code_preprocessing(ds):
  """Turn raw {"input", "output"} examples into {"inputs", "targets"} examples.

  The task prefix 'code2code: ' is prepended to every input; the output
  column is passed through as the target.

  Args:
    ds: dataset whose elements are dicts with "input" and "output" entries.

  Returns:
    The result of `ds.map` applied with the conversion below.
  """
  def to_inputs_and_targets(ex):
    prefixed_source = 'code2code: ' + ex['input']
    return {
        'inputs': tf.strings.join([prefixed_source], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  converted = ds.map(to_inputs_and_targets,
                     num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return converted
  
# Drop any previous registration under this name before adding it again
# (presumably so the cell can be re-run without a duplicate-name error).
t5.data.TaskRegistry.remove('code_to_code_new_large')
# Register the task: raw TSV reader + prefixing preprocessor, the shared
# output features, accuracy as metric, and the line counts computed earlier.
t5.data.TaskRegistry.add(
    "code_to_code_new_large",
    dataset_fn=nq_dataset_code_code_large,
    splits=["train", "validation"],
    text_preprocessor=[code_code_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_code_code_large
)

# Sanity check: build the train split through the registered task and print
# three preprocessed examples.
nq_task = t5.data.TaskRegistry.get("code_to_code_new_large")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(3)):
  print(ex)

A few raw validation examples...
{'input': b'public static int positionToDragCursor(int swtPositionConstant) { switch (swtPositionConstant) { case SWT.LEFT: return LEFT; case SWT.RIGHT: return RIGHT; case SWT.TOP: return TOP; case SWT.BOTTOM: return BOTTOM; case SWT.CENTER: return CENTER; } return INVALID; }', 'output': b'public static int positionToDragCursor(int swtPositionConstant) { switch (swtPositionConstant) { case SWT.LEFT: return LEFT; case SWT.RIGHT: return RIGHT; case SWT.TOP: return TOP; case SWT.BOTTOM: return BOTTOM; case SWT.CENTER: return CENTER; default: return INVALID; } }'}
{'input': b'public static boolean shouldCaptureIncrementalChanges(FileSystem fs,Path rootDir) throws StandardException{ boolean shouldRegister = false; try { boolean enabled = incrementalBackupEnabled(); if (enabled) { RecoverableZooKeeper zooKeeper = ZkUtils.getRecoverableZooKeeper(); String spliceBackupPath = BackupUtils.getBackupPath(); if (zooKeeper.exists(spliceBackupPath, false)==null){ retu

## TASK : CODE to COMMENT on new large dataset
- task name = `code_to_comment_new_large`
- task prefix = `code2comment: `

In [None]:
def nq_dataset_code_comment_large(split, shuffle_files=False):
  """Read the code-to-comment (new large) TSV for `split` as a tf.data dataset.

  Args:
    split: key into `nq_tsv_path_code_comment_large` ("train" or "validation").
    shuffle_files: accepted but ignored; each split is a single file.

  Returns:
    A dataset of dicts with the keys "input" and "output", one per TSV line.
  """
  del shuffle_files  # only one file per split, nothing to shuffle

  raw_lines = tf.data.TextLineDataset(nq_tsv_path_code_comment_large[split])

  # Two tab-separated string columns, quotes are not treated specially.
  # NOTE(review): in tf.io.decode_csv the record_defaults presumably also act
  # as fill values for empty fields (the literal "string") — confirm intended.
  split_columns = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)
  columns = raw_lines.map(
      split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  def as_example(*fields):
    return dict(zip(["input", "output"], fields))

  return columns.map(as_example)

# print("A few raw validation examples...")
# for ex in tfds.as_numpy(nq_dataset_code_comment_large("validation").take(2)):
#   print(ex)
# print("A few raw training examples...")
# for ex in tfds.as_numpy(nq_dataset_code_comment_large("train").take(2)):
#   print(ex)

def code_comment_preprocessing(ds):
  """Turn raw {"input", "output"} examples into {"inputs", "targets"} examples.

  The task prefix 'code2comment: ' is prepended to every input; the output
  column is passed through as the target.

  Args:
    ds: dataset whose elements are dicts with "input" and "output" entries.

  Returns:
    The result of `ds.map` applied with the conversion below.
  """
  def to_inputs_and_targets(ex):
    prefixed_source = 'code2comment: ' + ex['input']
    return {
        'inputs': tf.strings.join([prefixed_source], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  converted = ds.map(to_inputs_and_targets,
                     num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return converted

# Create a new training task.
# Drop any previous registration under this name before adding it again
# (presumably so the cell can be re-run without a duplicate-name error).
t5.data.TaskRegistry.remove('code_to_comment_new_large')
# Register the task: raw TSV reader + prefixing preprocessor, the shared
# output features, accuracy as metric, and the line counts computed earlier.
t5.data.TaskRegistry.add(
    "code_to_comment_new_large",
    dataset_fn=nq_dataset_code_comment_large,
    splits=["train", "validation"],
    text_preprocessor=[code_comment_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_code_comment_large
)

# Build the train split through the registered task; `nq_task` and `ds` are
# rebound by every task cell, so they always refer to the last one executed.
nq_task = t5.data.TaskRegistry.get("code_to_comment_new_large")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

## TASK : CODE and COMMENT to CODE on new large dataset
- task name = `code_comment_to_code_new_large`
- task prefix = `code&comment2code: `

In [None]:
############### THIRD TASK : CODE&COMMENT2CODE ###############

def nq_dataset_codeANDcomment_code_large(split, shuffle_files=False):
  """Read the code&comment-to-code (new large) TSV for `split` as a dataset.

  Args:
    split: key into `nq_tsv_path_codeANDcomment_code_large`
      ("train" or "validation").
    shuffle_files: accepted but ignored; each split is a single file.

  Returns:
    A dataset of dicts with the keys "input" and "output", one per TSV line.
  """
  del shuffle_files  # only one file per split, nothing to shuffle

  raw_lines = tf.data.TextLineDataset(nq_tsv_path_codeANDcomment_code_large[split])

  # Two tab-separated string columns, quotes are not treated specially.
  # NOTE(review): in tf.io.decode_csv the record_defaults presumably also act
  # as fill values for empty fields (the literal "string") — confirm intended.
  split_columns = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)
  columns = raw_lines.map(
      split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  def as_example(*fields):
    return dict(zip(["input", "output"], fields))

  return columns.map(as_example)

# print("A few raw validation examples...")
# for ex in tfds.as_numpy(nq_dataset_codeANDcomment_code_large("validation").take(2)):
#   print(ex)
# print("A few raw training examples...")
# for ex in tfds.as_numpy(nq_dataset_codeANDcomment_code_large("train").take(2)):
#   print(ex)

def codeANDcomment_code_preprocessing(ds):
  """Turn raw {"input", "output"} examples into {"inputs", "targets"} examples.

  The task prefix 'code&comment2code: ' is prepended to every input; the
  output column is passed through as the target.

  Args:
    ds: dataset whose elements are dicts with "input" and "output" entries.

  Returns:
    The result of `ds.map` applied with the conversion below.
  """
  def to_inputs_and_targets(ex):
    prefixed_source = 'code&comment2code: ' + ex['input']
    return {
        'inputs': tf.strings.join([prefixed_source], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  converted = ds.map(to_inputs_and_targets,
                     num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return converted

# Create a new training task.
# Drop any previous registration under this name before adding it again
# (presumably so the cell can be re-run without a duplicate-name error).
t5.data.TaskRegistry.remove('code_comment_to_code_new_large')
# Register the task: raw TSV reader + prefixing preprocessor, the shared
# output features, accuracy as metric, and the line counts computed earlier.
t5.data.TaskRegistry.add(
    "code_comment_to_code_new_large",
    dataset_fn=nq_dataset_codeANDcomment_code_large,
    splits=["train", "validation"],
    text_preprocessor=[codeANDcomment_code_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_codeANDcomment_code_large
)

# Build the train split through the registered task; `nq_task` and `ds` are
# rebound by every task cell, so they always refer to the last one executed.
nq_task = t5.data.TaskRegistry.get("code_comment_to_code_new_large")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

## TASK : CODE to CODE on Tufano etal. ICSE21 dataset
- task name = `code_to_code_Tufano_etal_ICSE21`
- task prefix = `code2code: `

In [None]:
def nq_dataset_code_code_small(split, shuffle_files=False):
  """Read the code-to-code (Tufano et al.) TSV for `split` as a dataset.

  Args:
    split: key into `nq_tsv_path_code_code_small` ("train" or "validation").
    shuffle_files: accepted but ignored; each split is a single file.

  Returns:
    A dataset of dicts with the keys "input" and "output", one per TSV line.
  """
  del shuffle_files  # only one file per split, nothing to shuffle

  raw_lines = tf.data.TextLineDataset(nq_tsv_path_code_code_small[split])

  # Two tab-separated string columns, quotes are not treated specially.
  # NOTE(review): in tf.io.decode_csv the record_defaults presumably also act
  # as fill values for empty fields (the literal "string") — confirm intended.
  split_columns = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)
  columns = raw_lines.map(
      split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  def as_example(*fields):
    return dict(zip(["input", "output"], fields))

  return columns.map(as_example)

# print("A few raw validation examples...")
# for ex in tfds.as_numpy(nq_dataset_code_code_small("validation").take(2)):
#   print(ex)
# print("A few raw training examples...")
# for ex in tfds.as_numpy(nq_dataset_code_code_small("train").take(2)):
#   print(ex)

def code_code_preprocessing(ds):
  """Turn raw {"input", "output"} examples into {"inputs", "targets"} examples.

  The task prefix 'code2code: ' is prepended to every input; the output
  column is passed through as the target. This re-defines the function of
  the same name from the code-to-code (new large) cell with the same logic.

  Args:
    ds: dataset whose elements are dicts with "input" and "output" entries.

  Returns:
    The result of `ds.map` applied with the conversion below.
  """
  def to_inputs_and_targets(ex):
    prefixed_source = 'code2code: ' + ex['input']
    return {
        'inputs': tf.strings.join([prefixed_source], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  converted = ds.map(to_inputs_and_targets,
                     num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return converted

# Create a new training task.
# Drop any previous registration under this name before adding it again
# (presumably so the cell can be re-run without a duplicate-name error).
t5.data.TaskRegistry.remove('code_to_code_Tufano_etal_ICSE21')
# Register the task: raw TSV reader + prefixing preprocessor, the shared
# output features and accuracy as metric.
t5.data.TaskRegistry.add(
    "code_to_code_Tufano_etal_ICSE21",
    dataset_fn=nq_dataset_code_code_small,
    splits=["train", "validation"],
    text_preprocessor=[code_code_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    # Sizes of the code-to-code split of the small dataset; the counts of the
    # code&comment-to-code split belong to a different task.
    num_input_examples=num_nq_examples_code_code_small
)

# Build the train split through the registered task; `nq_task` and `ds` are
# rebound by every task cell, so they always refer to the last one executed.
nq_task = t5.data.TaskRegistry.get("code_to_code_Tufano_etal_ICSE21")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

## TASK : CODE and COMMENT to CODE on Tufano etal. ICSE21 dataset
- task name = `code_comment_to_code_Tufano_etal_ICSE21`
- task prefix = `code&comment2code: `

In [None]:
def nq_dataset_codeANDcomment_code_small(split, shuffle_files=False):
  """Read the code&comment-to-code (Tufano et al.) TSV for `split` as a dataset.

  Args:
    split: key into `nq_tsv_path_codeANDcomment_code_small`
      ("train" or "validation").
    shuffle_files: accepted but ignored; each split is a single file.

  Returns:
    A dataset of dicts with the keys "input" and "output", one per TSV line.
  """
  del shuffle_files  # only one file per split, nothing to shuffle

  raw_lines = tf.data.TextLineDataset(nq_tsv_path_codeANDcomment_code_small[split])

  # Two tab-separated string columns, quotes are not treated specially.
  # NOTE(review): in tf.io.decode_csv the record_defaults presumably also act
  # as fill values for empty fields (the literal "string") — confirm intended.
  split_columns = functools.partial(
      tf.io.decode_csv,
      record_defaults=["string","string"],
      field_delim="\t",
      use_quote_delim=False)
  columns = raw_lines.map(
      split_columns, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  def as_example(*fields):
    return dict(zip(["input", "output"], fields))

  return columns.map(as_example)

# print("A few raw validation examples...")
# for ex in tfds.as_numpy(nq_dataset_codeANDcomment_code_small("validation").take(2)):
#   print(ex)
# print("A few raw training examples...")
# for ex in tfds.as_numpy(nq_dataset_codeANDcomment_code_small("train").take(2)):
#   print(ex)

def marked_code_preprocessing(ds):
  """Turn raw {"input", "output"} examples into {"inputs", "targets"} examples.

  The task prefix 'code&comment2code: ' is prepended to every input; the
  output column is passed through as the target.

  Args:
    ds: dataset whose elements are dicts with "input" and "output" entries.

  Returns:
    The result of `ds.map` applied with the conversion below.
  """
  def to_inputs_and_targets(ex):
    prefixed_source = 'code&comment2code: ' + ex['input']
    return {
        'inputs': tf.strings.join([prefixed_source], separator=' '),
        'targets': tf.strings.join([ex['output']], separator=' '),
    }

  converted = ds.map(to_inputs_and_targets,
                     num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return converted

# Create a new training task.
# Drop any previous registration under this name before adding it again
# (presumably so the cell can be re-run without a duplicate-name error).
t5.data.TaskRegistry.remove('code_comment_to_code_Tufano_etal_ICSE21')
# Register the task: raw TSV reader + prefixing preprocessor, the shared
# output features, accuracy as metric, and the line counts computed earlier.
t5.data.TaskRegistry.add(
    "code_comment_to_code_Tufano_etal_ICSE21",
    dataset_fn=nq_dataset_codeANDcomment_code_small,
    splits=["train", "validation"],
    text_preprocessor=[marked_code_preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
    num_input_examples=num_nq_examples_codeANDcomment_code_small
)

# Build the train split through the registered task; `nq_task` and `ds` are
# rebound by every task cell, so they always refer to the last one executed.
nq_task = t5.data.TaskRegistry.get("code_comment_to_code_Tufano_etal_ICSE21")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
# print("A few preprocessed training examples...")
# for ex in tfds.as_numpy(ds.take(3)):
#   print(ex)

# Setting up fine tuning tasks

In [None]:
def _rate_num_input_examples(task):
  if "train" in task.splits:
    return float(task.num_input_examples("train"))
  elif "validation" in task.splits:
    return float(task.num_input_examples("validation"))
  else:
    raise ValueError("Task %s does not have a train or validation split." % (task.name))

In [3]:
# Register one single-task mixture per task. Every mixture carries the same
# name as the task it wraps and is rated by the task's number of examples.
# Any earlier registration is removed first, in the same order as the adds.
for _mixture_name in (
    "code_to_code_new_large",
    "code_to_comment_new_large",
    "code_comment_to_code_new_large",
    "code_to_code_Tufano_etal_ICSE21",
    "code_comment_to_code_Tufano_etal_ICSE21",
):
  t5.data.MixtureRegistry.remove(_mixture_name)
  t5.data.MixtureRegistry.add(
      _mixture_name,
      [_mixture_name],
      default_rate=_rate_num_input_examples
  )

Here we need to specify:
- whether we want to fine-tune a pre-trained model or not (and the path of the pre-trained model if needed)
- the dataset we want to use between the new larger dataset and the one by Tufano etal. (ICSE21)
- the downstream task

In [None]:
# our T5 selected architecture
MODEL_SIZE = "small"

#@title Select fine-tuning with or without pre-training
fine_tuning = "fine-tuning_without_pre-training/" #@param ["fine-tuning_with_pre-training/", "fine-tuning_without_pre-training/"]

# Directory that must contain the pre-trained models, the operative_config.gin
# file and the checkpoint file. It is only consumed when fine-tuning WITH
# pre-training, so it is defined unconditionally: guarding it with the
# "without pre-training" option left it undefined exactly when it was needed.
PRETRAINED_DIR= 'gs://' + bucket_name + '/automating_code_review/model_dumps/pre-training/'

#@title Select small or large dataset
dataset = "new_large" #@param ["Tufano_etal_ICSE21", "new_large"]

#@title Select the task
if dataset == 'Tufano_etal_ICSE21':
  task_small = "code-to-code/" #@param ["code-to-code/","code&comment-to-code/"]
  task = task_small
else:
  task_large = "code-to-comment/" #@param ["code-to-code/","code-to-comment/","code&comment-to-code/"]
  task = task_large

# Map the folder-style task selection onto the registered task/mixture name.
if task == "code-to-code/":
  task_to_train = "code_to_code_" + dataset
elif task == "code-to-comment/":
  task_to_train = "code_to_comment_" + dataset
elif task == "code&comment-to-code/":
  task_to_train = "code_comment_to_code_" + dataset
else:
  # Fail here with a clear message instead of a NameError in a later cell.
  raise ValueError("Unknown task selection: %r" % (task,))

############ output path ############
MODEL_DIR = 'gs://' + bucket_name + '/automating_code_review/model_dumps/' + fine_tuning + dataset + '/' + task

# Per model size: (model_parallelism, train_batch_size, keep_checkpoint_max).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 128, 200),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]


We set the selected learning rate scheduler

In [5]:
from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular 

from mesh_tensorflow.transformer.learning_rate_schedules import truncated_rsqrt
 
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# Parameters of the polynomial-decay schedule.
starter_learning_rate = 0.05
end_learning_rate = 0.001
decay_steps = 10000

learning_rate_fn = PolynomialDecay(
    starter_learning_rate, decay_steps, end_learning_rate, power=0.5)

#@title Select a learning rate scheduler
learning_rate_scheduler_picker = "constant" #@param ["slanted", "isr", "polynomial", "constant"]

# picker value -> (schedule handed to the model, gin file to download).
# "constant" uses the plain float 0.001 instead of a schedule function.
_SCHEDULER_CHOICES = {
    "slanted": (
        slanted_triangular,
        'gs://' + bucket_name + '/automating_code_review/utils/operative_config_slanted.gin'),
    "isr": (
        truncated_rsqrt,
        'gs://' + bucket_name + '/automating_code_review/utils/operative_config_isr.gin'),
    "polynomial": (
        learning_rate_fn,
        'gs://' + bucket_name + '/automating_code_review/utils/operative_config_polynomial.gin'),
    "constant": (
        0.001,
        'gs://' + bucket_name + '/automating_code_review/utils/operative_config_constant.gin'),
}

# An unknown picker value leaves both names unset, as before.
if learning_rate_scheduler_picker in _SCHEDULER_CHOICES:
  selected_learning_rate_scheduler, PATH_GIN_FILE = (
      _SCHEDULER_CHOICES[learning_rate_scheduler_picker])

#@title Select a learning rate scheduler
number_of_steps = 500 #@param {type:"integer"}

# Make sure the output directory exists before the model is created.
tf.io.gfile.makedirs(MODEL_DIR)

# Build the Mesh-TensorFlow T5 model wrapper from the settings chosen above:
# TPU address/topology from the setup cell, parallelism and batch size from
# the MODEL_SIZE table, and the selected learning-rate schedule.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    learning_rate_schedule = selected_learning_rate_scheduler,
    sequence_length={"inputs": 512, "targets": 512},
    save_checkpoints_steps=10000,
    # The per-size checkpoint limit only applies when ON_CLOUD is set.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)

!gsutil cp {PATH_GIN_FILE}  ./config.gin

if learning_rate_scheduler_picker == "slanted":
  gin_lines = [line for line in open("./config.gin")]
  f = open("./config.gin", "w+")
  for i in range(len(gin_lines)):
    if i == 196 and fine_tuning == "fine-tuning_without_pre-training/":
      line = "slanted_triangular.start_step = 0\n"
      f.write(line)
      continue
    if i == 197:
      line = "slanted_triangular.total_train_steps = " + str(number_of_steps) + '\n'
      f.write(line)
      continue
    f.write(gin_lines[i])
  f.close()

# Start Training

In [6]:
import gin

# Both modes parse the local gin file inside an unlocked gin config and then
# run for `number_of_steps`; only the entry point differs.
with gin.unlock_config():
  gin.parse_config_file("./config.gin")
  if fine_tuning != "fine-tuning_without_pre-training/":
    # PRETRAINED: continue from the checkpoints found in PRETRAINED_DIR.
    model.finetune(
        mixture_or_task_name=task_to_train,
        pretrained_model_dir=PRETRAINED_DIR,
        finetune_steps=number_of_steps
    )
  else:
    # NON PRETRAINED: train the selected task from scratch.
    TRAIN_STEPS = number_of_steps
    model.train(task_to_train, steps=number_of_steps)

# Evaluation

Evaluate the model checkpoint(s) on the validation set

In [7]:
# Evaluate the selected task on its validation split.
# Use a larger batch size for evaluation, which requires less memory.
model.batch_size = 1024
model.eval(
    mixture_or_task_name=task_to_train,
    # -1 will evaluate the last checkpoint, you can also provide 
    # a list of checkpoints with the following format : [10000, 20000, 30000]
    checkpoint_steps=-1,
    split="validation"
    )

# Confidence Score

Using the `model.score()` function we evaluate the model confidence about the generated predictions (given the input).

NOTE: To generate the predictions follow the instructions in our  [replication package](https://github.com/CodeReviewAutomation/code_review_automation)

In [8]:
import math

# upload your input file (source.txt) and your prediction file (predictions.txt)

inputs_file = './source.txt'
pred_file = 'predictions.txt'
score_file = 'score'
# Score every (input, prediction) pair with the model.
# NOTE(review): confirm that 'best' is an accepted `checkpoint_steps` value
# for the installed t5 version.
model.score(inputs=inputs_file,
            targets=pred_file,
            scores_file=score_file,
            checkpoint_steps='best',
            vocabulary=get_default_vocabulary())

# Convert each score to a confidence with exp(); the first whitespace-separated
# field of every non-blank line is parsed as the score. (Passing the bound
# method `line.split` to float() raised a TypeError on the first line.)
# NOTE(review): the scores are read from './score.score'; confirm this matches
# the file name the installed t5 / mesh-tensorflow version derives from
# `scores_file`.
with open('./score.score', 'r') as score_lines:
  confidence_score = [math.exp(float(line.split()[0]))
                      for line in score_lines if line.strip()]

# One confidence value per line, in the same order as the predictions.
with open('./confidence_score.txt', 'w+') as f:
  for value in confidence_score:
    f.write(str(value) + '\n')