In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import warnings
warnings.filterwarnings("ignore") 

In [2]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from lightning.pytorch import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

In [3]:
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModel
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, BartForConditionalGeneration
from transformers import DataCollatorForSeq2Seq, AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import create_optimizer, AdamWeightDecay
from transformers import pipeline
import datasets
from datasets import Dataset, DatasetDict
#from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from rouge import Rouge

In [4]:
from torch import cuda

# Prefer the GPU when one is available, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [5]:
# Load the news summary CSV, decoding the bytes as latin-1, and preview the first rows.
# NOTE(review): sample text printed later in this notebook contains mojibake
# (e.g. "pokã©mon"), so the file may not be purely latin-1 — confirm the source encoding.
df = pd.read_csv("news_summary.csv", encoding="latin-1")
df.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [6]:
# Per-column summary of the raw frame (count / unique / top / freq)
df.describe()

Unnamed: 0,author,date,headlines,read_more,text,ctext
count,4514,4514,4514,4514,4514,4396
unique,45,240,4514,4461,4514,4341
top,Chhavi Tyagi,"19 Jul 2017,Wednesday",More than half of India's languages may die in...,http://indiatoday.intoday.in/story/assembly-el...,At least 400 languages or more than half langu...,AAJ TAK LIVE TV WITH LIVE ELECTION RESULTS I c...
freq,559,76,1,13,1,13


In [7]:
# Column dtypes and non-null counts of the raw frame
df.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 4514 entries, 0 to 4513

Data columns (total 6 columns):

 #   Column     Non-Null Count  Dtype 

---  ------     --------------  ----- 

 0   author     4514 non-null   object

 1   date       4514 non-null   object

 2   headlines  4514 non-null   object

 3   read_more  4514 non-null   object

 4   text       4514 non-null   object

 5   ctext      4396 non-null   object

dtypes: object(6)

memory usage: 211.7+ KB


In [8]:
# Keep only the three text columns used to build the summarization dataset
kept_columns = ['headlines', 'text', 'ctext']
df = df[kept_columns]
df.head()

Unnamed: 0,headlines,text,ctext
0,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [9]:
# Discard every row that has a missing value in any of the remaining columns
df = df.dropna(axis=0, how='any')
df.describe()

Unnamed: 0,headlines,text,ctext
count,4396,4396,4396
unique,4396,4396,4341
top,More than half of India's languages may die in...,At least 400 languages or more than half langu...,AAJ TAK LIVE TV WITH LIVE ELECTION RESULTS I c...
freq,1,1,13


In [10]:
# Lowercase every text column so the model sees uniformly cased input
for column in ('ctext', 'text', 'headlines'):
    df[column] = df[column].apply(str.lower)
df.head()

Unnamed: 0,headlines,text,ctext
0,daman & diu revokes mandatory rakshabandhan in...,the administration of union territory daman an...,the daman and diu administration on wednesday ...
1,malaika slams user who trolled her for 'divorc...,malaika arora slammed an instagram user who tr...,"from her special numbers to tv?appearances, bo..."
2,'virgin' now corrected to 'unmarried' in igims...,the indira gandhi institute of medical science...,the indira gandhi institute of medical science...
3,aaj aapne pakad liya: let man dujana before be...,lashkar-e-taiba's kashmir commander abu dujana...,lashkar-e-taiba's kashmir commander abu dujana...
4,hotel staff to get training to spot signs of s...,hotels in maharashtra will train their staff t...,hotels in mumbai and other indian cities are t...


In [11]:
# Whitespace-delimited word count of each text column, stored as `<column>_length`
for column in ('headlines', 'text', 'ctext'):
    df[f'{column}_length'] = [len(words.split()) for words in df[column]]
df.head()

Unnamed: 0,headlines,text,ctext,headlines_length,text_length,ctext_length
0,daman & diu revokes mandatory rakshabandhan in...,the administration of union territory daman an...,the daman and diu administration on wednesday ...,9,60,364
1,malaika slams user who trolled her for 'divorc...,malaika arora slammed an instagram user who tr...,"from her special numbers to tv?appearances, bo...",10,60,396
2,'virgin' now corrected to 'unmarried' in igims...,the indira gandhi institute of medical science...,the indira gandhi institute of medical science...,8,60,335
3,aaj aapne pakad liya: let man dujana before be...,lashkar-e-taiba's kashmir commander abu dujana...,lashkar-e-taiba's kashmir commander abu dujana...,10,60,404
4,hotel staff to get training to spot signs of s...,hotels in maharashtra will train their staff t...,hotels in mumbai and other indian cities are t...,11,60,526


In [12]:
# Keep only rows whose full article is at least as long (in words) as its summary
article_not_shorter = df['ctext_length'] >= df['text_length']
df = df[article_not_shorter]
df.describe()

Unnamed: 0,headlines_length,text_length,ctext_length
count,4274.0,4274.0,4274.0
mean,9.300889,58.299719,351.740056
std,1.407168,2.314246,358.884472
min,4.0,44.0,50.0
25%,8.0,57.0,193.0
50%,9.0,59.0,288.0
75%,10.0,60.0,416.0
max,14.0,62.0,12202.0


In [14]:
# Remove the helper columns that were only needed for the length filter.
# 'diff' is not created by any cell in this notebook, so errors="ignore" keeps the
# cell working on a fresh kernel while still dropping it if it exists in the session.
df = df.drop(
    columns=['headlines_length', 'text_length', 'ctext_length', 'diff'],
    errors="ignore",
)
df.head()

Unnamed: 0,headlines,text,ctext
0,daman & diu revokes mandatory rakshabandhan in...,the administration of union territory daman an...,the daman and diu administration on wednesday ...
1,malaika slams user who trolled her for 'divorc...,malaika arora slammed an instagram user who tr...,"from her special numbers to tv?appearances, bo..."
2,'virgin' now corrected to 'unmarried' in igims...,the indira gandhi institute of medical science...,the indira gandhi institute of medical science...
3,aaj aapne pakad liya: let man dujana before be...,lashkar-e-taiba's kashmir commander abu dujana...,lashkar-e-taiba's kashmir commander abu dujana...
4,hotel staff to get training to spot signs of s...,hotels in maharashtra will train their staff t...,hotels in mumbai and other indian cities are t...


In [15]:
# Model input: headline followed by the full article text
df['news'] = df['headlines'] + '. ' + df['ctext']
# The short `text` column is the target, so call it `summary`; the raw pieces are no longer needed
df = (
    df
    .rename(columns={"text": "summary"})
    .drop(columns=['headlines', 'ctext'])
)
df.head()

Unnamed: 0,summary,news
0,the administration of union territory daman an...,daman & diu revokes mandatory rakshabandhan in...
1,malaika arora slammed an instagram user who tr...,malaika slams user who trolled her for 'divorc...
2,the indira gandhi institute of medical science...,'virgin' now corrected to 'unmarried' in igims...
3,lashkar-e-taiba's kashmir commander abu dujana...,aaj aapne pakad liya: let man dujana before be...
4,hotels in maharashtra will train their staff t...,hotel staff to get training to spot signs of s...


In [16]:
# Making the dataset: prepend the task prefix to every article
prefix = 'summarize: '
df['news'] = [prefix + article for article in df['news']]
df.head()

Unnamed: 0,summary,news
0,the administration of union territory daman an...,summarize: daman & diu revokes mandatory raksh...
1,malaika arora slammed an instagram user who tr...,summarize: malaika slams user who trolled her ...
2,the indira gandhi institute of medical science...,summarize: 'virgin' now corrected to 'unmarrie...
3,lashkar-e-taiba's kashmir commander abu dujana...,summarize: aaj aapne pakad liya: let man dujan...
4,hotels in maharashtra will train their staff t...,summarize: hotel staff to get training to spot...


In [17]:
# Converting the pandas dataset to huggingface dataset
# Fixed seed so the train/val/test membership (and therefore every reported
# metric) is identical on each Restart & Run All.
SPLIT_SEED = 42
# first split the train and test set
train_df, test_df = train_test_split(
    df, test_size=0.01, shuffle=True, random_state=SPLIT_SEED
)
print("train and val shape:", train_df.shape, "test shape:",test_df.shape)
# save for every model inference
global_train_df = train_df
global_test_df = test_df
train_df = datasets.Dataset.from_pandas(train_df)
# from_pandas keeps the DataFrame index as an extra column; it is not a feature
train_df = train_df.remove_columns(["__index_level_0__"])
# split train into train and val
train_df = train_df.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)
train_df["train"][0]

train and val shape: (4231, 2) test shape: (43, 2)


{'summary': "lenovo phab 2 pro, the first phone with google's augmented reality camera, has been launched in india at ?29,999. through its depth-sensing google tango camera, the phone can map out physical spaces, track its own position in a room, and follow objects. powered by a snapdragon 652 processor, the 6.4-inch phone has 4 gb ram and 4,050 mah battery.",
 'news': "summarize: 1st google tango camera phone launches in india at ?29,999. how crazy are you about the pokã©mon go? if not much, what about lenovo phones? nothing still? what about a lenovo phone priced cheaper in india than in the us?lenovo's first smartphone touted to have google tango -- lenovo phab2 pro -- has been launched in india and available for rs 29,999 exclusively on flipkart, discounted from its us price of $499 (roughly rs 33,500) where it was first announced in november 2016. the phone was expected to sell at rs 40,000 given the usual price difference between two markets.also read:  xiaomi mi max v lenovo pha

In [18]:
# Fitting into dataset dict: the "test" half of the second split serves as validation
train_val_test_dataset = DatasetDict(
    train=train_df["train"],
    val=train_df["test"],
)

print(type(train_val_test_dataset))
train_val_test_dataset

<class 'datasets.dataset_dict.DatasetDict'>


DatasetDict({
    train: Dataset({
        features: ['summary', 'news'],
        num_rows: 3384
    })
    val: Dataset({
        features: ['summary', 'news'],
        num_rows: 847
    })
})

In [20]:
def prepare_dataset(data):
    """Tokenize a batch of articles together with their reference summaries.

    Uses the notebook-level ``tokenizer``. Articles (``data["news"]``) are
    truncated to 512 tokens and summaries (``data["summary"]``) to 128 tokens.

    Returns the tokenized articles with the summary token ids stored under
    the ``"labels"`` key.
    """
    encoded = tokenizer(data["news"], max_length=512, truncation=True)
    target_ids = tokenizer(
        text_target=data["summary"], max_length=128, truncation=True
    )["input_ids"]
    encoded["labels"] = target_ids
    return encoded

Now create a batch of examples using [DataCollatorForSeq2Seq](https://huggingface.co/docs/transformers/v4.35.0/en/main_classes/data_collator#transformers.DataCollatorForSeq2Seq). It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [24]:
def compute_metrics(eval_pred):
    """Compute averaged ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer.

    Returns:
        The dictionary produced by ``Rouge().get_scores(..., avg=True)``;
        pairs with an empty prediction or reference are skipped.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the token ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded.
    # Labels carry it for ignored positions; generated predictions can be padded
    # with it as well when batches are gathered, so mask both before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return Rouge().get_scores(decoded_preds, decoded_labels, avg=True, ignore_empty=True)

In [32]:
# Free up memory: release the cached GPU memory that PyTorch is holding
torch.cuda.empty_cache()

In [33]:
# tokenize the data
model_name = "google/pegasus-large"
# use_fast=False requests the slow (non-fast) tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Apply prepare_dataset batch-wise to both the train and the val split
tokenized_data = train_val_test_dataset.map(function=prepare_dataset, batched=True)

Map:   0%|          | 0/3384 [00:00<?, ? examples/s]

Map:   0%|          | 0/847 [00:00<?, ? examples/s]

In [34]:
# Padding: dynamically pad inputs and labels to the longest example of each batch.
# The collator's `model` argument expects the model instance, not its name; a string
# cannot be used by the collator, so the argument is left out here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [35]:
# model: load the pretrained checkpoint, then move it to the selected device
PEGASUSmodel = AutoModelForSeq2SeqLM.from_pretrained(model_name)
PEGASUSmodel = PEGASUSmodel.to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# set up hyper-parameters
training_args = Seq2SeqTrainingArguments(
    # checkpoints: where they are written and how many are kept
    output_dir="pegasus-news",
    save_total_limit=2,
    # optimisation
    num_train_epochs=3,
    learning_rate=1e-6,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    fp16=False, # FP16 not supported for PEGASUS: https://huggingface.co/docs/transformers/model_doc/pegasus#checkpoints
    # evaluation: once per epoch, scoring generated summaries
    evaluation_strategy="epoch",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    # no external experiment tracker
    report_to="none",
)

In [37]:
# setup trainer: wire together model, data, collation and the ROUGE metric
trainer = Seq2SeqTrainer(
    model=PEGASUSmodel,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
    compute_metrics=compute_metrics,
)

In [38]:
# Run fine-tuning; with evaluation_strategy="epoch" the val split is scored after every epoch
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge-1,Rouge-2,Rouge-l
1,No log,1.708059,"{'r': 0.4226024767202879, 'p': 0.34496959357843615, 'f': 0.3602114751016854}","{'r': 0.19066724740497631, 'p': 0.15275740366063076, 'f': 0.15643009303399508}","{'r': 0.37836219852997405, 'p': 0.30833161329969466, 'f': 0.32208903782335413}"
2,2.156300,1.604015,"{'r': 0.42870738318419865, 'p': 0.34358610140933893, 'f': 0.3612603719250026}","{'r': 0.19552963369178558, 'p': 0.15225476322754022, 'f': 0.157552839681331}","{'r': 0.3838586158422521, 'p': 0.30691966907819435, 'f': 0.3229502551530144}"
3,1.894400,1.57855,"{'r': 0.43470647375090926, 'p': 0.34330113743430485, 'f': 0.36287411491793264}","{'r': 0.20103970657002507, 'p': 0.15353668699374684, 'f': 0.16002379390579677}","{'r': 0.3898257036159917, 'p': 0.307220347956584, 'f': 0.32498640804700546}"


TrainOutput(global_step=1269, training_loss=1.9827846469984363, metrics={'train_runtime': 7747.9952, 'train_samples_per_second': 1.31, 'train_steps_per_second': 0.164, 'total_flos': 1.4608522581835776e+16, 'train_loss': 1.9827846469984363, 'epoch': 3.0})

In [39]:
# save the model: write the fine-tuned weights and the tokenizer files to the same folder
model_path = "pegasus-news"
trainer.save_model(output_dir=model_path)
tokenizer.save_pretrained(save_directory=model_path)

('pegasus-news/tokenizer_config.json',
 'pegasus-news/special_tokens_map.json',
 'pegasus-news/spiece.model',
 'pegasus-news/added_tokens.json')

In [40]:
# Reload the fine-tuned model and its tokenizer from the saved folder
finetuned_dir = "pegasus-news"
model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_dir)
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)

In [41]:
# Build the summarization pipeline once; creating it inside the loop repeats the
# same setup work for every article.
summarizer = pipeline("summarization", model = model,tokenizer = tokenizer, max_length = 100)

# Summarize the first few held-out articles (use range(len(test_df)) for the full test set).
# min() guards against a test split with fewer than 5 rows.
for i in range(min(5, len(test_df))):
    article = test_df['news'].iloc[i]
    print("original_news: ", article)
    # truncation=True cuts over-long articles down to the tokenizer's maximum input length
    summary = summarizer(article, truncation=True)
    print(summary[0])
    print()

original_news:  summarize: taapsee pannu opts out of event organised by fairness cream. taapsee pannu, who was catapulted to the big league after her film pink became a runaway hit, has joined the likes of kangana ranaut and ranbir kapoor by taking a stand against fairness creams. the actor was supposed to be a part of an event in jaipur that was to be held next month, but she pulled out when she learnt that it was being organised by a fairness cream.also read: taapsee pannu opens up on being eve-teased, being told her backless dress was the problem also read: is taapsee pannu's nepotism remark directed at varun dhawan? taapsee said, "i agree it was a last moment call but when i got to know that i will have to pose with the fairness brand during the event i decided to take my name out of that event. i have actually lost a few films because of being fair so i will definitely not propagate fairness in any way."

{'summary_text': "the actor was supposed to be a part of an event in jaipur 