In [1]:
import os
import pandas as pd
import numpy as np
import re
import os
import random
import string

In [2]:
from sklearn.model_selection import train_test_split
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers.keras_callbacks import KerasMetricCallback
from tensorflow.keras.callbacks import TensorBoard
import tensorflow as tf

In [3]:
!pip install transformers
!pip install datasets
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [45]:
import transformers
import datasets
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

In [5]:
import nltk

In [6]:
# Record the library versions used for this run.
for _lib in (transformers, tf):
    print(_lib.__version__)

4.16.2
2.6.2


In [7]:
# Load the article/summary pairs; the file is decoded as ISO-8859-1 (Latin-1).
# NOTE(review): the "Â£" that shows up in printed article text further down hints that
# the file may really be UTF-8 - confirm the encoding before relying on non-ASCII text.
csv_path = "../input/bbc-articles/BBCarticles_csv.csv"
df = pd.read_csv(csv_path, encoding="ISO-8859-1")
df.head()

Unnamed: 0,Summary,Text
0,TimeWarner said fourth quarter sales rose 2% t...,Ad sales boost Time Warner profit\n\nQuarterly...
1,The dollar has hit its highest level against t...,Dollar gains on Greenspan speech\n\nThe dollar...
2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim\n\nThe owner...
3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits\n\nBritish A...
4,Pernod has reduced the debt it took on to fund...,Pernod takeover talk lifts Domecq\n\nShares in...


In [8]:
# Drop rows with missing values; reset_index() keeps the old row labels as an "index" column.
df = df.dropna().reset_index()
# Flatten newlines into spaces in both text columns.
for _col in ("Text", "Summary"):
    df[_col] = df[_col].apply(lambda cell: cell.replace('\n', ' '))
df.head()

Unnamed: 0,index,Summary,Text
0,0,TimeWarner said fourth quarter sales rose 2% t...,Ad sales boost Time Warner profit Quarterly p...
1,1,The dollar has hit its highest level against t...,Dollar gains on Greenspan speech The dollar h...
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners ...
3,3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits British Air...
4,4,Pernod has reduced the debt it took on to fund...,Pernod takeover talk lifts Domecq Shares in U...


In [9]:
# From here on, work with only the first 30 rows.
df = truncated_df = df.head(30)
print(truncated_df.shape)

(30, 3)


In [10]:
# Hold out 10% of the rows; the fixed random_state keeps the split repeatable.
train, test = train_test_split(df, random_state=42, test_size=0.1)
print(*(len(part) for part in (train, test)))

27 3


In [11]:
# Peek at the first rows of the training split.
train.head()

Unnamed: 0,index,Summary,Text
17,17,India's rupee has hit a five-year high after S...,India's rupee hits five-year high India's rup...
8,8,"In 2003, crop production totalled 11.49 millio...",Ethiopia's crop production up 24% Ethiopia pr...
9,9,A US government claim accusing the country's b...,Court rejects $280bn tobacco case A US govern...
28,28,The Vestey Group said it had owned the land si...,UK firm faces Venezuelan land row Venezuelan ...
24,24,The court ruling is a blow to efforts to get d...,Yukos loses US bankruptcy battle A judge has ...


In [12]:
# Checkpoint identifier used below to load both the tokenizer and the model.
pretrained_model_name = "facebook/bart-large-cnn"

In [13]:
# Load the ROUGE metric; metric_fn calls metric.compute() on decoded summaries.
metric = load_metric("rouge")
metric

Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each predictions
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_agregator: Return aggregates if this is set to True
Retu

In [14]:
def show_random_example(df):
    """Print one randomly chosen article together with its reference summary.

    Columns are read by position: column 2 is printed as the text and
    column 1 as the gold summary.

    Args:
        df: DataFrame with at least three columns and at least one row.

    Raises:
        IndexError: if ``df`` has no rows.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        raise IndexError("cannot show an example from an empty DataFrame")
    # randrange excludes its upper bound. randint(0, n_rows) could return n_rows,
    # which is one past the last valid row position and makes iloc raise.
    rand = random.randrange(n_rows)
    print("\nExample number: ", rand)
    sample_text = df.iloc[rand, 2]
    gold_summary = df.iloc[rand, 1]
    print("\nText: ", sample_text)
    print("\nGold Summary: ", gold_summary)

In [15]:
# Print one random article and its gold summary from the working frame.
show_random_example(df)


Example number:  13

Text:  Telegraph newspapers axe 90 jobs  The Daily and Sunday Telegraph newspapers are axing 90 journalist jobs - 17% of their editorial staff.  The Telegraph Group says the cuts are needed to fund an Â£150m investment in new printing facilities. Journalists at the firm met on Friday afternoon to discuss how to react to the surprise announcement. The cuts come against a background of fierce competition for readers and sluggish advertising revenues amid competition from online advertising. The National Union of Journalists has called on the management to recall the notice of redundancy by midday on Monday or face a strike ballot.  Pearson's Financial Times said last week it was offering voluntary redundancy to about 30 reporters.  The National Union of Journalists said it stood strongly behind the journalists and did not rule out a strike. "Managers have torn up agreed procedures and kicked staff in the teeth by sacking people to pay for printing facilities," said 

# Preprocessing the dataset

In [69]:
# Load the tokenizer for the same checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

In [17]:
# Sanity check: tokenize a short sentence and inspect the resulting ids and attention mask.
tokenizer("This is a test sentence")

{'input_ids': [0, 713, 16, 10, 1296, 3645, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [18]:
# Token limits that preprocess_function passes to the tokenizer (together with truncation=True).
# Limit for the article text:
max_input_length = 1024
# Limit for the reference summary:
max_target_length = 512

In [19]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: mapping with "Text" and "Summary" entries, each holding one
            value per row of the batch.

    Returns:
        The tokenizer output for the articles, with an added "labels" entry
        containing the input ids of the tokenized summaries.
    """
    documents = list(examples["Text"])
    encoded = tokenizer(documents, truncation=True, max_length=max_input_length)

    # Summaries are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_targets = tokenizer(
            examples["Summary"], truncation=True, max_length=max_target_length
        )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

In [20]:
# Wrap the two pandas splits as Datasets and group them under named splits.
tds = Dataset.from_pandas(train)
vds = Dataset.from_pandas(test)
ds = DatasetDict({"train": tds, "validation": vds})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__'],
        num_rows: 27
    })
    validation: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__'],
        num_rows: 3
    })
})


In [21]:
# Tokenize both splits; with batched=True preprocess_function receives batches of rows.
tokenized_datasets = ds.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
# Confirm that input_ids, attention_mask and labels were added to each split.
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 27
    })
    validation: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3
    })
})


In [23]:
# Load the pretrained checkpoint as a TensorFlow seq2seq model.
model = TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name)

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

2022-04-21 05:53:22.984190: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-21 05:53:22.985372: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-21 05:53:22.986169: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-21 05:53:22.987417: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [24]:
# Training hyperparameters.
# batch_size is used for the training tf.data pipeline.
batch_size = 2
# learning_rate and weight_decay configure the AdamWeightDecay optimizer.
learning_rate = 2e-5
weight_decay = 0.01
# Number of passes over the training set given to model.fit().
epochs = 1

# NOTE(review): model_name is not referenced by any of the visible cells - confirm it is still needed.
model_name = "bart-fine-tuned"

In [25]:
# Batch collator handed to to_tf_dataset(); return_tensors="tf" asks it for TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [26]:
# Inspect the tokenized training split before converting it to a tf.data pipeline.
tokenized_datasets["train"]

Dataset({
    features: ['index', 'Summary', 'Text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 27
})

In [27]:
# Only these columns are fed to the model; the other columns are left out of the pipelines.
_model_columns = ("input_ids", "attention_mask", "labels")

# Training pipeline: shuffled, using the configured batch size.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=list(_model_columns),
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# Validation pipeline: fixed order, batches of 8.
validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=list(_model_columns),
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator,
)

In [28]:
# Optimizer built from the hyperparameters defined above.
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)

In [29]:
# No loss is passed, so the model's internal loss computation is used (see the message compile() prints).
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [30]:
def metric_fn(eval_predictions):
    """Compute ROUGE F-measures and the mean generated length for one evaluation pass.

    Args:
        eval_predictions: a ``(predictions, labels)`` pair of token-id sequences.
            Negative label ids are treated as masked positions.

    Returns:
        dict mapping each key returned by ``metric.compute`` to its mid
        F-measure scaled by 100, plus ``"gen_len"``: the mean number of
        non-pad tokens per prediction.
    """
    predictions, labels = eval_predictions
    pad_id = tokenizer.pad_token_id

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Swap masked (negative) label ids for the pad id so they can be decoded.
    # np.where builds new arrays, so the caller's label arrays are not modified.
    unmasked_labels = [np.where(np.asarray(row) < 0, pad_id, row) for row in labels]
    decoded_labels = tokenizer.batch_decode(unmasked_labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_predictions = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_predictions
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]
    result = metric.compute(
        predictions=decoded_predictions, references=decoded_labels, use_stemmer=True
    )
    # Keep only the mid F-measure of each score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    # Add mean generated length (pad tokens are not counted)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return result

In [31]:
# TensorBoard logs are written under ./summarization_model_save/logs.
tensorboard_callback = TensorBoard(log_dir="./summarization_model_save/logs")

# Scores the validation set with metric_fn, using generated summaries as predictions.
metric_callback = KerasMetricCallback(
    metric_fn,
    eval_dataset=validation_dataset,
    predict_with_generate=True,
)

# The metric callback is listed ahead of the TensorBoard callback.
callbacks = [
    metric_callback,
    tensorboard_callback,
]

2022-04-21 05:54:15.702486: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-04-21 05:54:15.702660: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-04-21 05:54:15.704691: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1614] Profiler found 1 GPUs
2022-04-21 05:54:16.000719: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-04-21 05:54:16.000900: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed


In [32]:
# Reset Keras' global state.
# NOTE(review): this runs after the model has been loaded and compiled - confirm that ordering is intended.
tf.keras.backend.clear_session()

In [33]:
# Create a TF1-compat session configured with GPU allow_growth enabled.
# NOTE(review): `sess` is not referenced by any of the visible cells that follow - confirm
# whether this session has any effect on the Keras training below.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.compat.v1.Session(config=config)

2022-04-21 05:54:26.305342: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-21 05:54:26.306805: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-21 05:54:26.307562: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-21 05:54:26.308388: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-21 05:54:26.309219: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [34]:
# Fine-tune for the configured number of epochs with both callbacks attached.
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=callbacks,
    verbose=1,
)

2022-04-21 05:54:29.524522: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


 1/13 [=>............................] - ETA: 10:01 - loss: 0.7259

2022-04-21 05:55:21.222412: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-04-21 05:55:21.222499: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.


 2/13 [===>..........................] - ETA: 26s - loss: 0.6282  

2022-04-21 05:55:22.132313: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-04-21 05:55:22.140858: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1748] CUPTI activity buffer flushed
2022-04-21 05:55:22.372655: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 9144 callback api events and 9140 activity events. 
2022-04-21 05:55:22.612631: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.




<keras.callbacks.History at 0x7f9d7874d610>

In [44]:
# NOTE(review): `example` is not used by the visible cells that follow; the summary below
# is generated from the hand-written `sample_text`, not from a held-out row.
example = test.iloc[0]
# Free-form passage used as the input for the generation demo.
sample_text= "Tanjiro Kamado is a kind-hearted and intelligent boy who lives with his family in the mountains. He became his family's breadwinner after his father's death, making trips to the nearby village to sell charcoal. Everything changed when he came home one day to discover that his family was attacked and slaughtered by a demon. Tanjiro and his sister Nezuko were the sole survivors of the incident, with Nezuko being transformed into a demon, but still surprisingly showing signs of human emotion and thought. After an encounter with Giyū Tomioka, a demon slayer, Tanjiro is recruited by Giyū and sent to his retired master Sakonji Urokodaki for training to also become a demon slayer, beginning his quest to help his sister turn into human again and avenge the death of his family. After two years of strenuous training, Tanjiro takes part in a formidable exam and is one of the few survivors to pass, officially making him a member of the Demon Slayer Corps. He begins his work of hunting down and slaying demons alongside Nezuko, who has been hypnotized to bring no harm to humans and who occasionally helps him in battle. One of Tanjiro's assignments brings him to Asakusa where he encounters Muzan Kibutsuji, the progenitor of all demons and the one who murdered his family. He also meets Tamayo, a demon who is free of Muzan's control. Tamayo allies with Tanjiro and begins to develop a cure for Nezuko, though it will require Tanjiro to supply her with blood from the Twelve Kizuki, the most powerful demons under Muzan's command."

In [79]:
# Encode the sample text, truncating it to the same input limit used for training.
# max_length has no effect unless truncation=True is also passed, and the previous
# value of 2048 was larger than the 1024-token limit this notebook trains with.
inputs = tokenizer(
    [sample_text],
    max_length=max_input_length,
    truncation=True,
    return_tensors="tf",
)
# Beam search with 4 beams, capped at 128 generated tokens.
summary_ids = model.generate(
    inputs["input_ids"], num_beams=4, max_length=128, early_stopping=True
)
# Decode each generated sequence back to text, dropping special tokens.
summary = [
    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    for g in summary_ids
]

In [80]:
# Show the generated summary (a one-element list, since a single text was encoded).
print(summary)

["Tanjiro Kamado is a kind-hearted and intelligent boy who lives with his family in the mountains.He became his family's breadwinner after his father's death, making trips to the nearby village to sell charcoal.After an encounter with Giyū Tomioka, a demon slayer, Tanjiro begins his work of hunting down and slaying demons alongside Nezuko, who has been hypnotized to bring no harm to humans and who occasionally helps him in battle.After two years of strenuous training, TanJiro takes part in a formidable exam and is one of the few survivors to pass"]
