In [32]:
! pip install transformers



In [33]:

import tensorflow as tf
import pandas as pd
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration
import tensorflow_datasets as tfds
import time
import logging
logging.basicConfig(level=logging.ERROR)

In [34]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Hugging face T5-small

In [35]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5ForConditionalGeneration.from_pretrained('t5-small')

task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))
    
pad_token_id = tokenizer.pad_token_id

In [49]:
BATCH_SIZE = 64

SHUFFEL_SIZE = 1024

learning_rate = 3e-5

In [50]:

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08, clipnorm=1.0)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')


val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

model.summary()

Model: "tf_t5for_conditional_generation_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  16449536  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  18881280  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  25176064  
Total params: 60,506,880
Trainable params: 60,506,880
Non-trainable params: 0
_________________________________________________________________


## Data preperation



In [62]:
data=pd.read_csv('/content/gdrive/My Drive/Bert/data_without_ent.csv')

In [63]:
len_data = len(data)
len_test = int(len_data * 0.1)
len_train = len_data - len_test

In [64]:
test_data = data.iloc[:len_test]
train_data = data.iloc[len_test:]

In [65]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,text,summary
0,0,"tripoli , libya rebels in tripoli furiously hu...",_START_ is sara sidner sees another world in ...
1,1,a german tourist was in critical condition af...,_START_ this is the fourth shark attack in ma...
2,2,if that car parked in harvard yard is a rocki...,_START_ harvard bans all romantic relationshi...
3,3,south korea launched an investigation tuesday...,_START_ u.s. and south korea teams are search...
4,4,space shuttle discovery launched just before ...,_START_ space shuttle discovery launches just...


In [66]:
train_tfds = tf.data.Dataset.from_tensor_slices((train_data.text.values, train_data.summary.values))
test_tfds = tf.data.Dataset.from_tensor_slices((test_data.text.values, test_data.summary.values))

In [67]:
def normalize_text(text):
    
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text,"'(.*)'", r"\1")
    return text.numpy().decode('UTF-8')

def tokenize_articles(text):
    text = normalize_text(text)
    ids = tokenizer.encode_plus((model.config.prefix + text), return_tensors="tf", max_length=350, truncation='longest_first') 

    return tf.squeeze(ids['input_ids']), tf.squeeze(ids['attention_mask'])
        
def tokenize_highlights(text):
    text = normalize_text(text)
    ids = tokenizer.encode(text, return_tensors="tf", max_length=50)
    return tf.squeeze(ids)


def map_func(x, y):
    article_ids, attention_mask = tf.py_function(tokenize_articles, inp=[x], Tout=(tf.int32, tf.int32))
    highlights_ids = tf.py_function(tokenize_highlights, inp=[y], Tout=tf.int32)

    return article_ids, attention_mask, highlights_ids

In [68]:
x,y = next(iter(train_tfds))

mapped_data = map_func(x,y)



In [69]:
train_ds = train_tfds.map(map_func)\
    .shuffle(SHUFFEL_SIZE)\
    .padded_batch(BATCH_SIZE, padded_shapes=([350],[350],[50]))\
    .prefetch(tf.data.experimental.AUTOTUNE)

test_ds = test_tfds.map(map_func)\
    .shuffle(SHUFFEL_SIZE)\
    .padded_batch(BATCH_SIZE, padded_shapes=([350],[350],[50]))\
    .prefetch(tf.data.experimental.AUTOTUNE)

## Train function

#### https://github.com/huggingface/transformers/blob/master/examples/summarization/bart/finetune.py

#### https://github.com/yahah100/text_summarization/blob/ffaf0047dcbd54f8daf439bc45b67c808913770f/german_text_summary/t5_tf_german_text_summary.ipynb

In [70]:

@tf.function
def train_step(input_ids, input_mask, y):
    
    y_ids = y[:, :-1]
    lm_labels = tf.identity(y[:, 1:])
    lm_labels = tf.where(tf.equal(y[:, 1:],pad_token_id), -100, lm_labels)

    with tf.GradientTape() as tape:
     

        predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, lm_labels=lm_labels, training=True)
        loss = loss_object(y[:, 1:], predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    

    train_accuracy(y[:, 1:], predictions)

### validation function

In [71]:

@tf.function
def val_step(input_ids, input_mask, y):

   
    y_ids = y[:, :-1]
    lm_labels = tf.identity(y[:, 1:])
    lm_labels = tf.where(tf.equal(y[:, 1:],pad_token_id), -100, lm_labels)
    
    predictions, _, _ = model(input_ids, attention_mask=input_mask, decoder_input_ids=y_ids, lm_labels=lm_labels, training=False)
    v_loss = loss_object(y[:, 1:], predictions)

  
    val_accuracy(y[:, 1:], predictions)

In [72]:


EPOCHS = 5
log_interval = 200
for epoch in range(EPOCHS):
    # reset metrics

    train_accuracy.reset_states()

    val_accuracy.reset_states()
    
    val_batches = iter(train_ds)
    
    start_time = time.time()
    for i, (input_ids, input_mask, y) in enumerate(train_ds):
        # training
        train_step(input_ids, input_mask, y)
        
        # validation
        if i % log_interval == 0:
            x_val, x_mask_val, y_val = next(val_batches)
            val_step(x_val, x_mask_val, y_val)
            elapsed = time.time() - start_time
            print('| epoch {:3d} | [{:5d}/{:5d}] | '
                  'ms/batch {:5.2f} | '
                  'train acc {:5.2f} | val acc {:5.2f} |'.format(
                    epoch, i, int(len_train/BATCH_SIZE),
                    elapsed * 1000 / log_interval,
                    train_accuracy.result() * 100, val_accuracy.result() * 100))
            start_time = time.time()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


| epoch   0 | [    0/   56] | ms/batch 56.05 | train acc 40.82 | val acc 53.32 |
| epoch   1 | [    0/   56] | ms/batch 15.17 | train acc 53.06 | val acc 53.57 |
| epoch   2 | [    0/   56] | ms/batch 15.22 | train acc 58.16 | val acc 53.83 |
| epoch   3 | [    0/   56] | ms/batch 14.99 | train acc 67.86 | val acc 54.59 |
| epoch   4 | [    0/   56] | ms/batch 14.69 | train acc 57.65 | val acc 69.13 |


In [116]:
from tqdm import tqdm
predictions = []
reference=[]

for i, (input_ids, input_mask, y) in (enumerate(test_ds)):
    start_time = time.time()
    summaries = model.generate(input_ids=input_ids,max_length=45 ,attention_mask=input_mask)

    pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
    real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    
    predictions.append(pred)
    reference.append(real)

In [92]:
for i in range(0,7):
    print("------")
    print('pred_sent :',predictions[i] )

    print('ref_sent :' , reference[i] )
    print("------")

------
pred_sent : santa barbara county sheriff is department evacuated up to 6,000 people from popular white rock campground spokesman says the flames were 5 contained the forest service expects 700 personnel in place
ref_sent : _start_ the white fire covers 1 000 acres and is 5 contained up to 6 000 people were evacuated from a popular camping area the wildfire is moving away from homes and structures authorities say _end_
------
------
pred_sent : argentine president cristina fernandez de kirchner told to take a month off work after doctors diagnosed her with a subdural hematoma diagnosis mean fern
ref_sent : _start_ cristina fernandez de kirchner has a blood clot on the surface of her brain a month of rest is recommended spokesman the president suffered cranial trauma in august _
------
------
pred_sent : soyuz capsule carrying south korea is first astronaut landed in northern kazakhstan on saturday 260 miles 418 kilometers off its mark russian space officials say
ref_sent : _start

### Rogue score

In [96]:
!pip install sumeval

Collecting sumeval
[?25l  Downloading https://files.pythonhosted.org/packages/e6/87/bfc0f9397b9421305863edfdd2dbea637e47204976cb5473535c856338f4/sumeval-0.2.2.tar.gz (80kB)
[K     |████                            | 10kB 18.8MB/s eta 0:00:01[K     |████████▏                       | 20kB 4.7MB/s eta 0:00:01[K     |████████████▏                   | 30kB 5.8MB/s eta 0:00:01[K     |████████████████▎               | 40kB 6.5MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 5.2MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 5.7MB/s eta 0:00:01[K     |████████████████████████████▌   | 71kB 6.3MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.3MB/s 
Collecting sacrebleu>=1.3.2
[?25l  Downloading https://files.pythonhosted.org/packages/66/5b/cf661da8e9b0229f5d98c2961b072a5728fd11a0758957f8c0fd36081c06/sacrebleu-1.4.12-py3-none-any.whl (54kB)
[K     |████████████████████████████████| 61kB 6.9MB/s 
[?25hCollecting portalocker
  Downloa

In [100]:
from sumeval.metrics.rouge import RougeCalculator
import numpy as np

In [104]:
rouge = RougeCalculator(stopwords=True, lang="en")
Rouge_1 = []
Rouge_2 = []
Rouge_l = []

for i in range(len(predictions)):

    rouge_1 = rouge.rouge_n(
    summary=predictions[i],
    references=reference[i],
    n=1)
    
    Rouge_1.append(rouge_1)
 
 
 ## rogue_2

    rouge_2 = rouge.rouge_n(
     summary=predictions[i],
     references=reference[i],
    n=2)
    Rouge_2.append(rouge_2)
 ## rogue_l

    rouge_l = rouge.rouge_l(
     summary=predictions[i],
     references=reference[i])
    Rouge_l.append(rouge_l)

print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}".format(np.round(sum(Rouge_1)/len(Rouge_1),3),
 np.round(sum(Rouge_2)/len(Rouge_2),3),
 np.round(sum(Rouge_l)/len(Rouge_l),3)
))

ROUGE-1: 0.395, ROUGE-2: 0.169, ROUGE-L: 0.352


In [None]:
%cd /content/gdrive/My\ Drive/

In [116]:
model.save_weights('t5model.h5')