In [None]:
! pip install datasets transformers rouge-score nltk



In [None]:
import os
import pandas as pd
import numpy as np
import re
import random
import string
import torch

In [None]:
import transformers
from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict
import nltk
from sklearn.model_selection import train_test_split

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [None]:
# Record the installed transformers version so the run can be reproduced.
print(transformers.__version__)

4.18.0


In [None]:
# Colab-only: mount Google Drive at /content/drive. The mounted drive is the
# destination used later in this notebook when the fine-tuned model is saved.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the BBC articles CSV; the file is decoded as ISO-8859-1 (Latin-1).
# The frame is expected to carry a "Summary" and a "Text" column, which the
# rest of the notebook reads by name.
df = pd.read_csv("/content/sample_data/BBCarticles_csv.csv", encoding="ISO-8859-1")
df.head()

Unnamed: 0,Summary,Text
0,TimeWarner said fourth quarter sales rose 2% t...,Ad sales boost Time Warner profit\n\nQuarterly...
1,The dollar has hit its highest level against t...,Dollar gains on Greenspan speech\n\nThe dollar...
2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim\n\nThe owner...
3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits\n\nBritish A...
4,Pernod has reduced the debt it took on to fund...,Pernod takeover talk lifts Domecq\n\nShares in...


In [None]:
# Drop incomplete rows and expose the row labels as an "index" column.
# Selecting the two text columns first makes this cell safe to re-run: a bare
# `reset_index()` on an already-processed frame would add a "level_0" column,
# shift the column positions that `show_random_example` relies on, and raise
# on a third run. The resulting layout is always [index, Summary, Text].
df = df[["Summary", "Text"]].dropna().reset_index()

# Flatten embedded newlines so each article / summary is a single line of text.
df["Text"] = df["Text"].str.replace("\n", " ", regex=False)
df["Summary"] = df["Summary"].str.replace("\n", " ", regex=False)
df.head()

Unnamed: 0,index,Summary,Text
0,0,TimeWarner said fourth quarter sales rose 2% t...,Ad sales boost Time Warner profit Quarterly p...
1,1,The dollar has hit its highest level against t...,Dollar gains on Greenspan speech The dollar h...
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners ...
3,3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits British Air...
4,4,Pernod has reduced the debt it took on to fund...,Pernod takeover talk lifts Domecq Shares in U...


In [None]:
# Optional debugging aid: uncomment the next two lines to work on a 30-row subset.
# truncated_df = df.head(30)
# df = truncated_df
# Show (rows, columns) of the frame that will be split below.
print(df.shape)

(2225, 3)


In [None]:
# Hold out 10% of the rows as the evaluation split; the fixed random_state
# keeps the split reproducible across runs.
train, test = train_test_split(df, test_size=0.1, random_state=42)
# Sanity check: sizes of the two splits.
print(len(train), len(test))

2002 223


In [None]:
# Preview the training split (row labels are shuffled by the split).
train.head()

Unnamed: 0,index,Summary,Text
817,817,"French actress Audrey Tautou, star of hit film...",Tautou 'to star in Da Vinci film' French actr...
944,944,"Earlier Mr Kennedy, whose party opposes the ID...",Clarke to press on with ID cards New Home Sec...
999,999,"The Archbishop of Canterbury said: ""I am pleas...",Royal couple watch nation's mood Prince Charl...
1044,1044,At a news conference following talks with Mr B...,Blair returns from peace mission Prime Minist...
1001,1001,"Earlier this week, Mr Howard said his party's ...",Howard attacks cost of asylum Michael Howard ...


In [None]:
def show_random_example(df):
    """Print one randomly chosen article together with its reference summary.

    Columns are read by position: column 1 is the summary and column 2 is the
    article text (the [index, Summary, Text] layout used in this notebook).

    Args:
        df: DataFrame with at least three columns in the layout above.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    n_rows = df.shape[0]
    if n_rows == 0:
        raise ValueError("Cannot show an example from an empty DataFrame")
    # randrange excludes the upper bound; randint(0, n_rows) could return
    # n_rows itself and make the positional lookup below raise IndexError.
    rand = random.randrange(n_rows)
    print("\nExample number: ", rand)
    sample_text = df.iloc[rand, 2]
    gold_summary = df.iloc[rand, 1]
    print("\nText: ", sample_text)
    print("\nGold Summary: ", gold_summary)

In [None]:
# Spot-check one random article/summary pair from the full frame.
show_random_example(df)


Example number:  1603

Text:  Thomas out of Six Nations  Wales captain Gareth Thomas has been ruled out of the rest of the Six Nations with a broken thumb.  The full-back will have surgery on Monday after fracturing his thumb in the 24-18 win over France on Saturday. But Welsh legend Phil Bennett insisted Wales can cope without Thomas as they chase a first Grand Slam in 27 years. Bennett told BBC Sport: "Such is the spirit in the camp, they'll put Kevin Morgan at 15, Rhys Williams at wing and just carry on." Thomas will miss the match against Scotland on 13 March, and what promises to be a huge encounter against the Irish six days later. Bennett added: "It's a setback. He's a great captain, he leads from the front and the boys love him." Thomas was replaced at half-time by Williams as his side turned around a 15-6 deficit in Paris.  "With Gareth missing I would think Michael Owen will be our captain," said Wales coach Mike Ruddock. "He did a great job in the second half in France. He 

In [None]:
# Pretrained checkpoint to fine-tune; the same identifier is used to load both
# the tokenizer and the seq2seq model.
model_checkpoint = "ainize/bart-base-cnn"

In [None]:
# ROUGE metric object; `compute_metrics` calls `metric.compute(...)` on it.
# NOTE(review): `datasets.load_metric` is reportedly deprecated in newer
# `datasets` releases in favour of the `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("rouge")
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_aggregator: Return aggregates if this is set to True
Retu

In [None]:
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Smoke test: by default the tokenizer wraps the text in special tokens
# (the ids at the start and end of `input_ids`).
tokenizer("Test sentence")

{'input_ids': [0, 34603, 3645, 2], 'attention_mask': [1, 1, 1, 1]}

In [None]:
# Token limits applied by `preprocess_function`: longer articles / summaries
# are truncated to these lengths.
max_input_length = 512
max_target_length = 256

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles and attach tokenized summaries as labels.

    Args:
        examples: batch mapping with a "Text" sequence (articles) and a
            "Summary" sequence (reference summaries).

    Returns:
        The tokenizer output for the articles, extended with a "labels" entry
        holding the `input_ids` of the tokenized summaries.
    """
    # Source side: articles, truncated to the input budget.
    encoded = tokenizer(
        list(examples["Text"]),
        truncation=True,
        max_length=max_input_length,
    )

    # Target side: summaries are tokenized in target-tokenizer mode.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            examples["Summary"],
            truncation=True,
            max_length=max_target_length,
        )

    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:

# Convert both pandas splits to `datasets.Dataset` objects and bundle them
# under the "train" / "validation" keys expected by the trainer cells.
tds, vds = (Dataset.from_pandas(frame) for frame in (train, test))
ds = DatasetDict({"train": tds, "validation": vds})

print(ds)

DatasetDict({
    train: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__'],
        num_rows: 2002
    })
    validation: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__'],
        num_rows: 223
    })
})


In [None]:
# Tokenize both splits in batches; `preprocess_function` contributes the
# `input_ids`, `attention_mask` and `labels` columns.
tokenized_datasets = ds.map(preprocess_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Confirm the tokenized columns were added to both splits.
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2002
    })
    validation: Dataset({
        features: ['index', 'Summary', 'Text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 223
    })
})


In [None]:
# Load the pretrained seq2seq model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Training hyperparameters consumed by the Seq2SeqTrainingArguments cell.
batch_size = 8  # per-device training batch size (evaluation uses half of it)
# NOTE(review): 1e-7 is far smaller than the learning rates commonly used for
# fine-tuning — confirm this is intentional.
learning_rate = 1e-7
weight_decay = 0.01
epochs = 25

# NOTE(review): `model_name` is not referenced by any other visible cell.
model_name = "bart-fine-tuned"


In [None]:
# Batch collator for seq2seq training; handed to the Seq2SeqTrainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Inspect the tokenized training split (feature names and row count).
tokenized_datasets["train"]

Dataset({
    features: ['index', 'Summary', 'Text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2002
})

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (in percent) and mean generated length.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is an array
            of generated token ids, or a tuple whose first element is that
            array; ``labels`` is an array of reference token ids. Positions
            equal to -100 in either array are treated as padding.

    Returns:
        dict mapping each ROUGE key returned by ``metric.compute`` to its mid
        F-measure * 100, plus ``"gen_len"`` (mean number of non-pad tokens per
        prediction); every value is rounded to 4 decimals.

    Note:
        ``nltk.sent_tokenize`` is used for sentence splitting.
        NOTE(review): it presumably needs the NLTK "punkt" data to be
        downloaded beforehand — confirm in the target environment.
    """
    predictions, labels = eval_pred
    # Some models hand back (generated ids, extra outputs); keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # -100 marks ignored/padded positions and cannot be decoded, so swap it for
    # the real pad id in both predictions and labels.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (counted on the cleaned ids so that -100
    # padding is not mistaken for generated tokens).
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Training configuration for the Seq2SeqTrainer.
# NOTE(review): `predict_with_generate` is not set here, and the recorded run
# reports only losses per epoch. Generation-based metrics such as ROUGE would
# presumably need it enabled — confirm before wiring in `compute_metrics`.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # checkpoints are written here
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,  # evaluation uses half the train batch size
    weight_decay=weight_decay,
    save_total_limit=3,  # older checkpoints are deleted beyond this count
    num_train_epochs=epochs
)

In [None]:
# Assemble the trainer from the model, arguments, tokenized splits, tokenizer
# and collator defined in earlier cells.
# NOTE(review): the `compute_metrics` function defined in this notebook is not
# passed here, so ROUGE is never computed during evaluation — confirm whether
# that omission is intentional.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, Text, Summary, index. If __index_level_0__, Text, Summary, index are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2002
  Num Epochs = 25
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6275


Epoch,Training Loss,Validation Loss
1,No log,0.931255
2,1.007400,0.820772
3,1.007400,0.761224
4,0.814900,0.726777
5,0.814900,0.703696
6,0.746300,0.686473
7,0.746300,0.67274
8,0.715700,0.661689
9,0.715700,0.652602
10,0.693500,0.64503


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, Text, Summary, index. If __index_level_0__, Text, Summary, index are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 223
  Batch size = 4
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [results/checkpoint-4000] due to args.save_total_limit
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: __index_level_0__, Text, Summary, index. If

TrainOutput(global_step=6275, training_loss=0.7112288711365475, metrics={'train_runtime': 6019.4522, 'train_samples_per_second': 8.315, 'train_steps_per_second': 1.042, 'total_flos': 1.521659138807808e+16, 'train_loss': 0.7112288711365475, 'epoch': 25.0})

In [None]:
# Hand-written sample passage used to try out the fine-tuned summarizer.
input_text = "The story opens on a winter night in Moscow, Russia. A card game is being played until four in the morning at the house of Narumov, a Horse Guard. The protagonist Hermann, an officer in the Army Engineers and the son of a German, feverishly watches people gamble, though he has never played, calculating that the risks are too great. Tomsky tells the story of how his grandmother, Countess Anna Fedotovna, incurred a debt while playing the card game faro in Paris fifty years earlier. When her husband refused to pay, she learned a secret to winning at faro from the mysterious and notorious Count of St. Germain. Tomsky says she only ever told one man about it, Chaplitsky, but ends his story without saying how things turned out for the man. The point of view switches to Liza, a young ward of the 87-year-old Countess. As the subject of the Countess's abuse, Liza lives a miserable life and longs for a man to rescue her. She begins a flirtation with an Engineers officer who stands outside her window and looks up while she embroiders. It is revealed that the man is Hermann, who has grown obsessed with learning the Countess's secret."

In [None]:
# Encode the passage the same way the training inputs were encoded: the
# tokenizer's default special tokens are kept (training used the defaults, so
# stripping them here would feed the model inputs unlike those it was tuned
# on), and the shared `max_input_length` budget replaces a duplicated literal.
input_ids = tokenizer.encode(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=max_input_length,
    verbose=True,
)

In [None]:
# Run inference on whatever device the model currently lives on, instead of
# hard-coding "cuda:0": this also works on CPU-only runtimes and guarantees the
# inputs are moved to the same device as the model weights.
device = model.device

In [None]:
# Beam-search generation (5 beams, no repeated bigrams, at most 256 tokens).
summary_ids = model.generate(
    input_ids.to(device),
    max_length=256,
    num_beams=5,
    no_repeat_ngram_size=2,
)


In [None]:
# Decode every generated sequence back to text, dropping special tokens.
summary = tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

In [None]:
# Show the generated summary (a list with one string per generated sequence).
print(summary)

['The protagonist Hermann, an officer in the Army Engineers and the son of a German, feverishly watches people gamble, though he has never played, calculating that the risks are too great.Tomsky tells the story of how his grandmother, Countess Anna Fedotovna, incurred a debt while playing the card game faro in Paris fifty years earlier.']


In [None]:
# Destination on the mounted Google Drive for the serialized model.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
PATH = '/content/drive/MyDrive/LY PROJECT PY FILES/model-checkpoints/torch-bart-base-cnn-1.pth'

In [None]:
# Serialize the whole model object to PATH.
# NOTE(review): saving the full object (rather than a state_dict or
# `save_pretrained`) pickles class references, so loading presumably requires
# compatible library versions — confirm this is acceptable.
torch.save(model, PATH )

In [None]:
# Reload the saved model onto the CPU; this rebinds `model`.
# NOTE(review): `torch.load` unpickles the file, which can execute arbitrary
# code — only load checkpoints from a trusted source.
model = torch.load(PATH, map_location=torch.device('cpu'))