# Helper functions

# Pip install requirements

In [None]:
!pip install transformers
!pip install datasets

# Imports

In [2]:
import pip
import tensorflow as tf
import transformers
from datasets import Dataset

import os
import pathlib
from google.colab import drive

import pandas as pd
import math

# Mounting Drive

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


# Global variables

In [4]:
raw_data_path = '/content/drive/MyDrive/Colab Notebooks/tensorflow'
model_to_preload = 't5-small'
sentence_prefix = 'translate Dansish to English: '
BATCH_SIZE = 64
EPOCHS = 20

# Tokenizer

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_to_preload)

Collator

In [6]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer,padding=True, return_tensors='tf')

# Tokenizer function

In [7]:
def tokenize(sentence):
  #print(len(sentence['context']))
  #print(len(sentence['target']))
  
  (context, target) = (tokenizer(sentence['context'], padding=True), tokenizer(sentence['target'], padding=True))  
  
  #print(len(context['input_ids']))
  #print(len(target['input_ids']))
  
  tokenized = {'input_ids': context['input_ids'], 'attention_mask': context['attention_mask'], 'decoder_input_ids': target['input_ids'], 'decoder_attention_mask': target['attention_mask']}
  return tokenized

In [None]:
print(tokenize({'context': {'Hello', 'what is up'}, 'target': {'hello what is up', 'heelo'}}))

# Load data

In [8]:
l1 = pathlib.Path(raw_data_path + '/TED2020.da-en.txt')
l2 = pathlib.Path(raw_data_path + '/TED2020.da-en-en.txt')

l1_split = l1.read_text().splitlines()
l2_split = l2.read_text().splitlines()

#Prefix l1 for t5 model finetuning
l1_split_prefix = []
[l1_split_prefix.append(sentence_prefix + sentence) for sentence in l1_split]


combined = {
    'context': l1_split,
    'target': l2_split
    }





In [None]:

print(l1_split_prefix[0])

print(len(combined['context']))


72113


In [9]:
df = pd.DataFrame.from_dict(combined)
#should prob shuffle dataframe before split
length = len(df)
train_df = df.iloc[:math.floor(length*0.8)]
val_df = df.iloc[(math.floor(length*0.8))+1:]

In [None]:
print(length)
print(len(train_df))
print(len(val_df))

# Data preprocessing

In [10]:
hug_ds_train = Dataset.from_pandas(train_df, split='train')
hug_ds_val = Dataset.from_pandas(val_df, split='val')

In [None]:
print(hug_ds_train)
print(hug_ds_val)

In [11]:
hug_ds_train_tokenized = hug_ds_train.map(tokenize, batched=True, batch_size=BATCH_SIZE)

hug_ds_val_tokenized = hug_ds_val.map(tokenize, batched=True, batch_size=BATCH_SIZE)

  0%|          | 0/902 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (706 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/226 [00:00<?, ?ba/s]

In [None]:
print(hug_ds_train_tokenized)
print(hug_ds_val_tokenized)

In [12]:
columns = ['context', 'target']
hug_ds_train_tokenized = hug_ds_train_tokenized.remove_columns(columns)
hug_ds_val_tokenized = hug_ds_val_tokenized.remove_columns(columns)


In [None]:
print(hug_ds_train_tokenized)
print(hug_ds_val_tokenized)

In [13]:
features = {'columns': ['input_ids', 'attention_mask', 'decoder_input_ids'], 'label_cols': ['decoder_input_ids', 'decoder_attention_mask']}

tf_ds_train = hug_ds_train_tokenized.to_tf_dataset(BATCH_SIZE, columns = features['columns'], label_cols=features['label_cols'], prefetch=True, collate_fn=collator)
tf_ds_val = hug_ds_val_tokenized.to_tf_dataset(BATCH_SIZE, columns = features['columns'], label_cols=features['label_cols'], prefetch=True, collate_fn=collator)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [21]:
print(tf_ds_train)
print(tf_ds_val)

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'decoder_input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'label_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, {'decoder_input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'label_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)})>
<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'decoder_input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'label_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, {'decoder_input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'label_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)})>


# Distribution (becuase stuff)

In [41]:
strategy = tf.distribute.OneDeviceStrategy(device = '/TPU:0')

# Model instansiating

In [None]:
#with strategy.scope():
model = transformers.TFAutoModel.from_pretrained(model_to_preload)

# Loss function

In [21]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, name='loss')

# Accuracy function

In [22]:
metrics = [tf.keras.losses.SparseCategoricalCrossentropy(name = 'Accuracy')]

# Precision, recall, f_1 score function/s

# Learning rate

In [23]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, warmup_steps=1e4):
    super().__init__()

    self.warmup_steps = tf.cast(warmup_steps, tf.float32)
    
  def __call__(self, step):
    step = tf.cast(step, tf.float32)
    m = tf.maximum(self.warmup_steps, step)
    m = tf.cast(m, tf.float32)
    lr = tf.math.rsqrt(m)
    
    return lr 

# Optimizer

In [24]:
learning_rate = CustomSchedule()
optimizer = tf.optimizers.Adam(learning_rate)

# Compiling and training

In [25]:
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)


In [None]:
model.fit(tf_ds_train, epochs=EPOCHS, validation_data=tf_ds_val)

# Testing model

In [None]:
gradsList = []
loss_fn = tf.keras.metrics.Mean(name='loss')
for epoch in range(EPOCHS):
  for (_, batch) in enumerate(tf_ds_train):

    input_ids = batch[0]['input_ids']
    
    input_masks = batch[0]['attention_mask']

    labels = batch[1]['decoder_input_ids']

    label_mask = batch[1]['decoder_attention_mask']

    with tf.GradientTape() as tape:
      
      outputs = model(input_ids = input_ids, attention_mask = input_masks, decoder_input_ids = labels, decoder_attention_mask = label_mask)
      
      loss = outputs[0]

      print(type(model.compiled_metrics))
      print(type(model.compiled.loss))
      

      logits = outputs[1]

      #loss_fn()
      #loss = model.compute_loss(loss)
      loss = tf.reduce_mean(loss)
      
      #print(loss)

      # grads = tape.gradient(loss, model.trainable_variables)

      # gradsList.append(grads)

      # optimizer.apply_gradients(zip(grads, model.trainable_variables))

      #https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb#scrollTo=2xcGqd9qDXOF
      # loss_tracker.update_state(loss)        
      # compiled_metrics.update_state(y, logits)
      # metrics = {m.name: m.result() for m in self.metrics}
      # metrics.update({'lr': lr})

      # template = ("Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, "
      #         "Test Accuracy: {}")
      # print(template.format(epoch + 1, loss,
      #                       accuracy.result() * 100, loss.result(),
      #                       accuracy.result() * 100))


    break
  break
