### Creation of the environment

In [1]:
%tensorflow_version 2.x
!pip3 install --upgrade pip
!pip install -qU t5


import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

#Set the base dir(Google cloud bucket)
BASE_DIR = "gs://bucket_block_completion" 

if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
ON_CLOUD = True

!pip install tensorflow-gcs-config

if ON_CLOUD:
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError:
    raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  yield
  tf.logging.set_verbosity(og_level)

Collecting pip
  Downloading pip-21.3.1-py3-none-any.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-21.3.1
     |████████████████████████████████| 153 kB 5.2 MB/s            
     |████████████████████████████████| 1.2 MB 60.7 MB/s            
     |████████████████████████████████| 3.1 MB 86.8 MB/s            
     |████████████████████████████████| 4.0 MB 66.0 MB/s            
     |████████████████████████████████| 4.9 MB 69.8 MB/s            
     |████████████████████████████████| 366 kB 92.1 MB/s            
     |████████████████████████████████| 90 kB 11.6 MB/s            
     |████████████████████████████████| 286 kB 104.8 MB/s            
     |████████████████████████████████| 59 kB 6.3 MB/s             
     |███████████████████████████████

Instructions for updating:
non-resource variables are not supported in the long term


### Loading of tsv files
With this script you can load each tsv file for finetuning.
Please be sure that the path to all tsv files are correct

In [2]:

nq_tsv_path = {
    "train":      'gs://bucket_block_completion/dataset_08_07/train.tsv',
    # we don't use them
    "validation": 'gs://bucket_block_completion/dataset_08_07/test.tsv',
}

num_nq_examples = dict(train=1197310, validation=15783)

### Preprocess of the dataset
In this step we preprocess the dataset.  
You have to change the path to vocab files (*vocab_model_path* and *vocab_path*)
We're going to preprocess all the tsv file so that T5 can use them for evaluation.

In [3]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary


# # Set the path of sentencepiece model and vocab files
# # Must be the same used for the pre-trained phase
vocab_model_path = 'gs://bucket_block_completion/code.model'
vocab_path = 'gs://bucket_block_completion/code.vocab'


TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask


def get_default_vocabulary():
  return SentencePieceVocabulary(vocab_model_path, 100)

DEFAULT_OUTPUT_FEATURES = {
    "inputs": Feature(
        vocabulary=get_default_vocabulary(), add_eos=True, required=False),

    "targets": Feature(
        vocabulary=get_default_vocabulary(), add_eos=True)
}

In [4]:
prefix=''

def nq_dataset_task(split, shuffle_files=True):
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.

  ds = tf.data.TextLineDataset(nq_tsv_path[split])
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["string","string"],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  
  ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex)))
  return ds

print("A few raw train examples...")
for ex in tfds.as_numpy(nq_dataset_task("train").take(5)):
    print(ex)


def preprocessing(ds):
  
  def to_inputs_and_targets(ex):
        inputs = tf.strings.join([prefix + ex['input']], separator=' ')
        class_label = tf.strings.join([ex['output']], separator=' ')
        return {'inputs': inputs, 'targets': class_label}
    
  return ds.map(to_inputs_and_targets, 
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    
      
t5.data.TaskRegistry.remove('finetuning')
t5.data.TaskRegistry.add(
    "finetuning",
    dataset_fn=nq_dataset_task,
    splits=["validation"],
    text_preprocessor=preprocessing,
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy]
)

A few raw train examples...
{'input': b'public static void saveBitmapToDevice(Bitmap btmp, String filePath, String imageName) { File file = new File (filePath, imageName); if (file.exists ()) file.delete (); try <extra_id_0> catch (Exception e) { e.printStackTrace(); } }', 'output': b'{ FileOutputStream out = new FileOutputStream(file); btmp.compress(Bitmap.CompressFormat.JPEG, 90, out); out.flush(); out.close(); }'}
{'input': b'@Override public Dampening updateGroupDampening(String tenantId, Dampening groupDampening) throws Exception { if (isEmpty(tenantId)) { throw new IllegalArgumentException("TenantId must be not null"); } if (isEmpty(groupDampening)) { throw new IllegalArgumentException("DampeningId and TriggerId must be not null"); } try { deferNotifications(); checkTenantId(tenantId, groupDampening); String groupId = groupDampening.getTriggerId(); Trigger groupTrigger = getTrigger(tenantId, groupId); if (!groupTrigger.isGroup()) { throw new IllegalArgumentException( "Trigger [" 

<t5.data.dataset_providers.FunctionTask at 0x7f6d13e87850>

In [5]:
nq_task = t5.data.TaskRegistry.get("finetuning")
ds = nq_task.get_dataset(split="validation", sequence_length={"inputs": 512, "targets": 512})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

A few preprocessed training examples...
{'inputs_pretokenized': b'public void handleWindowUpdate(int streamId, int deltaWindowSize) throws IOException { if (streamId == 0) { if (deltaWindowSize == 0) <extra_id_0> synchronized (flowControlLock) { boolean exhausted = sendWindowSize <= FLOW_CONTROL_MIN_WINDOW; sendWindowSize += deltaWindowSize; if (exhausted) { notifyFlowControlAllowed(); } if (sendWindowSize > Integer.MAX_VALUE) { sendGoAway(ERROR_FLOW_CONTROL_ERROR); } } } else { if (deltaWindowSize == 0) { UndertowLogger.REQUEST_IO_LOGGER.debug("Invalid flow-control window increment of 0 received with WINDOW_UPDATE frame for stream " + streamId); sendRstStream(streamId, ERROR_PROTOCOL_ERROR); return; } StreamHolder holder = currentStreams.get(streamId); Http2StreamSinkChannel stream = holder != null ? holder.sinkChannel : null; if (stream == null) { if(isIdle(streamId)) { sendGoAway(ERROR_PROTOCOL_ERROR); } } else { stream.updateFlowControlWindow(deltaWindowSize); } } }', 'inputs': arr

CHOOSE THE DATASET YOU WANT TO PROCESS

In [14]:
dataset="0807"

checkpoints=dict()
checkpoints["0302"]=220400
checkpoints["0403"]=240800
checkpoints["0605"]=424400
checkpoints["0807"]=648800

checkpoint=checkpoints[dataset]

from t5 import models

MODEL_SIZE = "small" 

# Set the folder where the checkpoints and all the others information will be writed
MODEL_DIR = 'gs://bucket_block_completion/finetuning/{}/model/'.format(dataset)

# Specify the pre-trained dir which must contain the pre-trained models, the operative_config.gin file and the checkpoint file as well
PRETRAINED_DIR='gs://bucket_block_completion/pretrained_model'


model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 5000),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)

model = models.mtf_model.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    save_checkpoints_steps=5000,
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)



In [15]:
input_folder=MODEL_DIR
output_folder="gs://bucket_block_completion/score/{}".format(dataset)

import os
input_file='gs://bucket_block_completion/input_testset/finetuning_inputs'
# you can keep the same output file; by default when predicting it added the number of steps so the file is not overwritten
output_file=os.path.join(output_folder, "scores.txt")
print(output_file)

prediction_file='gs://bucket_block_completion/input_testset/{}/predictions.txt-{}'.format(dataset, checkpoint)
print(prediction_file)
from t5.seqio import SentencePieceVocabulary

# # Set the path of sentencepiece model and vocab files
# # Must be the same used for the pre-trained phase
vocab_model_path = 'gs://bucket_block_completion/code.model'
vocab_path = 'gs://bucket_block_completion/code.vocab'

def get_default_vocabulary():
  return SentencePieceVocabulary(vocab_model_path, 100)

vocabulary_predict=get_default_vocabulary()

model.batch_size = 512

# we use the prediction file as target file (since we want to see how much the prediction is likely)
model.score(inputs=input_file, 
            targets=prediction_file,
            scores_file=output_file,
            checkpoint_steps=checkpoint, 
            vocabulary=vocabulary_predict)


INFO:root:system_path_file_exists:gs://bucket_block_completion/finetuning/0807/model/operative_config.gin
ERROR:root:Path not found: gs://bucket_block_completion/finetuning/0807/model/operative_config.gin


gs://bucket_block_completion/score/0807/scores.txt
gs://bucket_block_completion/input_testset/0807/predictions.txt-648800
INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket_block_completion/finetuning/0807/model/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.119.67.250:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.119.67.250:8470']}), '_task_type': 'worker'

([-66.68849,
  -116.237946,
  -3.291596,
  -1.367021,
  -34.412277,
  -28.218254,
  -5.9873776,
  -0.21279621,
  -5.080937,
  -14.203014,
  -47.10598,
  -80.2726,
  -6.6083417,
  -5.957514,
  -7.9226475,
  -6.627978,
  -27.62106,
  -27.445118,
  -12.051581,
  -23.496326,
  -4.339297,
  -3.4166286,
  -4.7476835,
  -1.0621758,
  -11.729537,
  -2.9205616,
  -11.157913,
  -10.371377,
  -1.4320638,
  -16.302792,
  -10.689233,
  -38.580605,
  -53.21902,
  -9.986889,
  -5.1466475,
  -5.8738422,
  -4.1288977,
  -6.370301,
  -3.7715533,
  -5.5834107,
  -3.0878901,
  -14.899129,
  -7.725478,
  -7.344103,
  -3.9239037,
  -55.04731,
  -70.3792,
  -7.43471,
  -43.741547,
  -2.4197836,
  -25.353817,
  -3.8581145,
  -108.4177,
  -34.715904,
  -24.883698,
  -65.443085,
  -43.9293,
  -5.2445164,
  -44.793205,
  -1.6436619,
  -1.5947523,
  -51.443005,
  -0.8395388,
  -8.192999,
  -36.66394,
  -2.1960726,
  -2.0814664,
  -1.6628842,
  -7.1528325,
  -8.834174,
  -12.160839,
  -4.7226615,
  -6.051177,
  -7