In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 15.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 59.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. The token it stores is presumably what
# the Hub pushes later in this notebook authenticate with.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
import numpy as np
import re
import os

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM
from transformers import create_optimizer
from tensorflow.keras.callbacks import ModelCheckpoint
#from transformers import DataCollatorForSeq2Seq

In [None]:
# Project root, built from the directory the kernel was started in.
# NOTE(review): presumably a Google Drive mount below the Colab working
# directory; no mount step is visible in this notebook - confirm.
cwd = os.getcwd() + "/drive/MyDrive/NEWSsummarization"

In [None]:
# Locations of the CSV splits and of the checkpoint folder, all below `cwd`.
train_path_file1 = f"{cwd}/Dataset/train1.csv"
train_path_file2 = f"{cwd}/Dataset/train2.csv"
val_path = f"{cwd}/Dataset/dev.csv"
checkpoint_path = f"{cwd}/modelCheckpoints/"
# The training set is split over two files that are read together.
train_path = [train_path_file1, train_path_file2]


In [None]:
# Sentinel passed to `prefetch` so tf.data picks the buffer size itself.
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
# Hyper-parameters shared by the input pipeline and the optimizer:
#   BATCH_SIZE  - number of examples per batch
#   text_len    - max tokens kept for (and padded to for) the article text
#   summary_len - max tokens kept for (and padded to for) the summary
BATCH_SIZE, text_len, summary_len = 12, 1024, 150

# Initial learning rate handed to `create_optimizer`.
# NOTE(review): 0.05 is far above the 1e-4..3e-4 range the Hugging Face T5
# docs suggest for AdamW - confirm this value is intentional.
l_rate = 0.05

In [None]:
# Name of the pretrained checkpoint; used for both the tokenizer and the model.
model_checkpoint = "t5-small"
# The tokenizer warns that t5-small's default limit is 512 tokens. The
# preprocessing functions pass `max_length` explicitly, so truncation follows
# `text_len` / `summary_len` rather than that default.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Load the CSV splits, one row per dataset element.
#
# `make_csv_dataset` always yields batched columns, and the preprocessing
# functions join every string of the tensor they receive into ONE example.
# Reading with batch_size=BATCH_SIZE therefore fused BATCH_SIZE different
# articles (and their summaries) into a single sample, which `padded_batch`
# then batched a second time. With batch_size=1 each sample is exactly one
# article/summary pair and the only real batching is `padded_batch(BATCH_SIZE)`,
# which also matches the steps-per-epoch computed as
# `no_train_samples // BATCH_SIZE`.
train_ds = tf.data.experimental.make_csv_dataset(
    train_path,
    batch_size=1,
    num_epochs=1,
    select_columns=['text', 'summary'],
)
# The variable keeps its original spelling because later cells reference it.
valiadation_ds = tf.data.experimental.make_csv_dataset(
    val_path,
    batch_size=1,
    num_epochs=1,
    select_columns=['text', 'summary'],
)

# preprocess

In [None]:
def clean_text(mess):
  """Normalize raw text before tokenization.

  Steps: lower-case, replace HTML `<br /><br />` breaks with a space, replace
  every non-alphanumeric, non-whitespace character with a space, then collapse
  each run of whitespace (spaces, tabs, newlines) into a single space.

  The previous implementation replaced two spaces with one in a single pass,
  which left runs of three or more spaces only partially collapsed.

  Args:
    mess: string tensor to clean. Assumes a scalar eager tensor, because the
      result is read back with `.numpy().decode(...)` - TODO confirm.

  Returns:
    The cleaned text as a Python `str`.
  """
  non_alphanumeric = r"[^a-zA-Z0-9\s]"
  mess = tf.strings.lower(mess)
  mess = tf.strings.regex_replace(mess, "<br /><br />", r" ")
  mess = tf.strings.regex_replace(mess, non_alphanumeric, r" ")  # strip punctuation/symbols
  mess = tf.strings.regex_replace(mess, r"\s+", r" ")            # collapse all whitespace runs
  return mess.numpy().decode('UTF-8')
  
def pre_process_text(string):
  """Turn article text into encoder inputs.

  All elements of `string` are joined with single spaces into one string,
  cleaned with `clean_text`, and tokenized with truncation at `text_len`
  tokens.

  NOTE(review): because of the join, a tensor holding several articles becomes
  a single sample - confirm the caller passes one article per call.

  Returns:
    A pair `(input_ids, attention_mask)` taken from the tokenizer output.
  """
  joined = tf.strings.join(string, separator=' ')
  encoded = tokenizer(clean_text(joined), max_length=text_len, truncation=True)
  return encoded['input_ids'], encoded['attention_mask']

def pre_process_summary(string):
  """Turn summary text into label token ids.

  All elements of `string` are joined with single spaces into one string,
  cleaned with `clean_text`, and tokenized with truncation at `summary_len`
  tokens.

  NOTE(review): because of the join, a tensor holding several summaries
  becomes a single label sequence - confirm the caller passes one per call.

  Returns:
    A NumPy array with the tokenizer's `input_ids` for the summary.
  """
  joined = tf.strings.join(string, separator=' ')
  encoded = tokenizer(clean_text(joined), max_length=summary_len, truncation=True)
  label_ids = encoded['input_ids']
  return np.array(label_ids)

def apply_function(data_dict):
  """Map one dataset element to `(input_ids, attention_mask, label_ids)`.

  The preprocessing helpers are plain Python (they call the tokenizer), so
  they are wrapped in `tf.py_function`; every output is declared as int32.

  Args:
    data_dict: dataset element providing the "text" and "summary" columns.
  """
  encoder_outputs = tf.py_function(
      pre_process_text,
      inp=[data_dict["text"]],
      Tout=(tf.int32, tf.int32),
  )
  text_ids, text_attention_mask = encoder_outputs
  summary_id = tf.py_function(
      pre_process_summary,
      inp=[data_dict["summary"]],
      Tout=tf.int32,
  )
  return text_ids, text_attention_mask, summary_id

In [None]:
def create_dict(input_ids,attention_mask,labels_id):
  """Pack the three tensors into a dict keyed the way the model is fed.

  Returns:
    A dict with the keys "input_ids", "attention_mask" and "labels", each
    value passed through `tf.convert_to_tensor`.
  """
  named_values = (
      ("input_ids", input_ids),
      ("attention_mask", attention_mask),
      ("labels", labels_id),
  )
  return {key: tf.convert_to_tensor(value) for key, value in named_values}
   

In [None]:
# Training input pipeline: tokenize -> pad and batch -> pack into a dict -> prefetch.
# Inputs and attention masks are padded with 0 up to `text_len`; labels are
# padded with -100 up to `summary_len`.
train_data = (
    train_ds
    .map(apply_function)
    .padded_batch(
        BATCH_SIZE,
        padded_shapes=([text_len], [text_len], [summary_len]),
        padding_values=(0, 0, -100),
    )
    .map(create_dict)
    .prefetch(AUTOTUNE)
)

In [None]:
# Validation input pipeline; same stages and padding as the training pipeline.
# `valiadation_ds` is spelled the way the loading cell defines it.
validation_data = (
    valiadation_ds
    .map(apply_function)
    .padded_batch(
        BATCH_SIZE,
        padded_shapes=([text_len], [text_len], [summary_len]),
        padding_values=(0, 0, -100),
    )
    .map(create_dict)
    .prefetch(AUTOTUNE)
)

# model

In [None]:
# Load the pretrained seq2seq model for the same checkpoint as the tokenizer
# (the log below reports a TFT5ForConditionalGeneration initialized from t5-small).
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# NOTE(review): this runs after `model` has already been loaded. Presumably it
# is meant to reset Keras global state - confirm it is needed here and whether
# it belongs before the model is created.
tf.keras.backend.clear_session()

In [None]:
# NOTE(review): hard-coded; presumably the total number of rows in the
# training CSV files - verify against the data.
no_train_samples = 994_950
# Whole batches per epoch (any remainder is dropped by the floor division).
size_train_dataset = no_train_samples // BATCH_SIZE

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Keras callback that pushes the model and tokenizer to the Hugging Face Hub.
# `output_dir` is the local working directory; the log below shows it is a
# clone of the `news-sum-dev-ai5` Hub repository.
callback = PushToHubCallback(
    output_dir="news-sum-dev-ai5", tokenizer=tokenizer )

/content/news-sum-dev-ai5 is already a clone of https://huggingface.co/devansh71/news-sum-dev-ai5. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
num_train_epochs = 4
model_name = model_checkpoint
# Total number of optimizer steps the learning-rate schedule is built for.
num_train_steps = size_train_dataset * num_train_epochs

# Optimizer with weight decay plus its learning-rate schedule: starts at
# `l_rate`, no warm-up, spans `num_train_steps` steps.
optimizer, schedule = create_optimizer(
    init_lr=l_rate,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

In [None]:
# No `loss` is passed: per the log below, the model's internal loss computation
# is used.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
# NOTE(review): the policy is set only after `model` was created and compiled.
# Keras layers normally pick up the dtype policy when they are constructed, so
# this call likely does not change how `model` trains - confirm, and move it
# before the model is loaded if mixed precision is really intended.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# Fine-tune for `num_train_epochs` epochs, evaluating on `validation_data`.
# The Hub callback is attached; the log below shows commits being pushed upstream.
model.fit(train_data, validation_data=validation_data, epochs=num_train_epochs, callbacks=[callback],verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file tf_model.h5:   0%|          | 3.34k/357M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/devansh71/news-sum-dev-ai5
   8ea3b63..36dae8b  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/devansh71/news-sum-dev-ai5
   8ea3b63..36dae8b  main -> main



<keras.callbacks.History at 0x7effec03e190>

In [None]:
# Push the final model to the Hub under 'ai5_sum_model'.
# NOTE(review): this is a different repository name from the callback's
# "news-sum-dev-ai5", and only the model is pushed by this call - confirm both
# are intended.
model.push_to_hub('ai5_sum_model',commit_message='adding final model')