In [1]:
import pandas as pd
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from rouge import Rouge

In [2]:
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        article = self.data.iloc[index]["text"]
        summary = self.data.iloc[index]["headlines"]
        input_ids = self.tokenizer.encode(article, max_length=self.max_input_length, truncation=True, padding="max_length")
        output_ids = self.tokenizer.encode(summary, max_length=self.max_output_length, truncation=True, padding="max_length")
        return {"input_ids": input_ids, "attention_mask": [int(token_id != 0) for token_id in input_ids], "decoder_input_ids": output_ids[:-1], "decoder_attention_mask": [1] * (len(output_ids) - 1), "labels": output_ids[1:]}

In [3]:
data = pd.read_csv('/Users/HP/Documents/STUDIES/PYTHONCODES/DATASETS/newsMORE/news_summary.csv', encoding='latin-1')
data_more = pd.read_csv('/Users/HP/Documents/STUDIES/PYTHONCODES/DATASETS/newsMORE/news_summary_more.csv', encoding='latin-1')
train_df = pd.concat([data, data_more], axis=0).reset_index(drop=True)
train_df

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...
...,...,...,...,...,...,...
102910,,,CRPF jawan axed to death by Maoists in Chhatti...,,A CRPF jawan was on Tuesday axed to death with...,
102911,,,First song from Sonakshi Sinha's 'Noor' titled...,,"'Uff Yeh', the first song from the Sonakshi Si...",
102912,,,'The Matrix' film to get a reboot: Reports,,"According to reports, a new version of the 199...",
102913,,,Snoop Dogg aims gun at clown dressed as Trump ...,,A new music video shows rapper Snoop Dogg aimi...,


In [4]:
train_df = train_df[['headlines', 'text']]

In [5]:
print(train_df['text'][0])

The Administration of Union Territory Daman and Diu has revoked its order that made it compulsory for women to tie rakhis to their male colleagues on the occasion of Rakshabandhan on August 7. The administration was forced to withdraw the decision within 24 hours of issuing the circular after it received flak from employees and was slammed on social media.


In [6]:
from sklearn.model_selection import train_test_split

# Split the data into training and temporary data
temp_df, test_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Split the temporary data into training and validation data
train_df, val_df = train_test_split(temp_df, test_size=0.25, random_state=42)

# The sizes of the datasets
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")


Train set size: 61749
Validation set size: 20583
Test set size: 20583


In [7]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
train_dataset = SummarizationDataset(train_df, tokenizer)
val_dataset = SummarizationDataset(val_df, tokenizer)

In [9]:
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    decoder_input_ids = [item["decoder_input_ids"] for item in batch]
    decoder_attention_mask = [item["decoder_attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]
    max_input_length = max(len(ids) for ids in input_ids)
    max_output_length = max(len(ids) for ids in decoder_input_ids)
    input_ids = [ids + [0] * (max_input_length - len(ids)) for ids in input_ids]
    attention_mask = [mask + [0] * (max_input_length - len(mask)) for mask in attention_mask]
    decoder_input_ids = [ids + [0] * (max_output_length - len(ids)) for ids in decoder_input_ids]
    decoder_attention_mask = [mask + [0] * (max_output_length - len(mask)) for mask in decoder_attention_mask]
    labels = [ids + [-100] * (max_output_length - len(ids)) for ids in labels]
    return {"input_ids": torch.tensor(input_ids), "attention_mask": torch.tensor(attention_mask), "decoder_input_ids": torch.tensor(decoder_input_ids), "decoder_attention_mask": torch.tensor(decoder_attention_mask), "labels": torch.tensor(labels)}

In [10]:
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2,collate_fn=collate_fn)

In [11]:
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=10000)



In [12]:
len(train_loader),len(val_loader)

(30875, 10292)

In [13]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()
for epoch in range(10):
    train_loss = 0
    for step,batch in tqdm(enumerate(train_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if step%300==0 and step>0:
            print("Step-{},Train Loss-{}".format(step,loss.item()))
            break#intentionally breaking the training after 100 steps since it's going to take long to train,feel free to comment and train more
        train_loss += loss.item()
    train_loss /= len(train_loader)
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for step,batch in tqdm(enumerate(val_loader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            val_loss += loss.item()
            if step%200==0 and step>0:
                print("Step-{},Val Loss-{}".format(step,loss.item()))
                break #intentionally breaking the training after 100 steps since it's going to take long to validate,feel free to comment and validate more
        val_loss /= len(val_loader)
    model.train()
    break # when you train more then uncomment this, too !
    print(f"Epoch {epoch+1} train loss: {train_loss:.4f} val loss: {val_loss:.4f}")

300it [1:16:08, 15.23s/it]


Step-300,Train Loss-10.326537132263184


200it [08:59,  2.70s/it]

Step-200,Val Loss-10.163877487182617





In [14]:
model.save_pretrained("fine_tuned_pegasus")
tokenizer.save_pretrained("fine_tuned_pegasus")

('fine_tuned_pegasus/tokenizer_config.json',
 'fine_tuned_pegasus/special_tokens_map.json',
 'fine_tuned_pegasus/spiece.model',
 'fine_tuned_pegasus/added_tokens.json')

In [15]:
model = PegasusForConditionalGeneration.from_pretrained("fine_tuned_pegasus")
tokenizer = PegasusTokenizer.from_pretrained("fine_tuned_pegasus")

In [16]:
test_dataset = SummarizationDataset(test_df, tokenizer)

In [17]:
test_loader = DataLoader(test_dataset, batch_size=1,collate_fn=collate_fn)

In [18]:
len(test_loader)

20583

In [19]:
model.to(device)
model.eval()
predictions = []
with torch.no_grad():
    for step, batch in tqdm(enumerate(test_loader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        output_ids = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_length=128, decoder_start_token_id=tokenizer.pad_token_id)
        batch_predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
        predictions.extend(batch_predictions)
        if step==100:
            break # breaking after generating 100 predictions.. since it's going to take long to predict on entire set

100it [18:16, 10.97s/it]


In [20]:
len(predictions)

101

In [21]:
# Save the predictions to a CSV file
test_df = test_df[:101]# for 100 predicitons only
print(len(test_df))
test_df["predictions"] = predictions
test_df.to_csv("test_predictions.csv", index=False)

101


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["predictions"] = predictions


In [22]:
test_df

Unnamed: 0,headlines,text,predictions
29186,First humans to step foot on Moon left over 10...,"The first humans to land on the Moon, Neil Arm...","A plaque inscribed, ""Here men from the planet ..."
2436,India has 2nd highest number of adults plannin...,India has the second highest number of adults ...,The report analysing people's migration intent...
18294,South Korea urges US to declare end to Korean War,South Korean President Moon Jae-in has asked t...,South Korean President Moon Jae-in has asked t...
25026,Four students washed away in Krishna river in ...,"Four students, between seven and 14 years of a...","""The locals immediately jumped into the water ..."
72245,Uber rival Grab leads $15 mn funding in self-d...,Self-driving car startup Drive.ai has raised $...,The startup will use the funding to open a Sin...
...,...,...,...
28316,An Assamese should play Hima: Adil on Akshay's...,Reacting to Akshay Kumar's statement that he'd...,Reacting to Akshay Kumar's statement that he'd...
101769,A man had sold the Eiffel Tower twice,When the Eiffel Tower's decaying condition was...,"Six months later, Lustig returned to Paris and..."
11081,Mars mission may shorten astronauts' lives by ...,A research presented at a Moscow conference ha...,The researchers suggest that using sleeping ba...
94037,Rape incidents can't be prevented: Rajasthan m...,BJP leader and Rajasthan minister Kali Charan ...,He added that the government can only take act...


In [23]:
test_df.isnull().sum()

headlines      0
text           0
predictions    0
dtype: int64

In [24]:
# # Calculate the ROUGE scores between the predicted summaries and the actual summaries
# rouge = Rouge()

# scores = rouge.get_scores(predictions, test_df["headlines"].tolist(), avg=True)

# # Print the ROUGE scores
# print(f"ROUGE-1: {scores['rouge-1']['f']:.4f}")
# print(f"ROUGE-2: {scores['rouge-2']['f']:.4f}")
# print(f"ROUGE-L: {scores['rouge-l']['f']:.4f}")

In [25]:
from rouge_score import rouge_scorer

# Assuming predictions and test_df["headlines"] are lists of strings
references = test_df["headlines"].tolist()

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize variables to accumulate scores
total_rouge1 = 0.0
total_rouge2 = 0.0
total_rougeL = 0.0
num_examples = len(predictions)

for prediction, reference in zip(predictions, references):
    # Calculate Rouge scores for each example
    scores = scorer.score(prediction, reference)

    # Accumulate scores for averaging
    total_rouge1 += scores['rouge1'].fmeasure
    total_rouge2 += scores['rouge2'].fmeasure
    total_rougeL += scores['rougeL'].fmeasure

# Calculate average scores
avg_rouge1 = total_rouge1 / num_examples
avg_rouge2 = total_rouge2 / num_examples
avg_rougeL = total_rougeL / num_examples

# Print the average ROUGE scores
print(f"Average ROUGE-1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2: {avg_rouge2:.4f}")
print(f"Average ROUGE-L: {avg_rougeL:.4f}")
print("============================")
print(total_rouge1)
print(total_rouge2)
print(total_rougeL)

Average ROUGE-1: 0.2396
Average ROUGE-2: 0.0781
Average ROUGE-L: 0.2033
24.202106579939407
7.8886792237535355
20.532925425140313


In [26]:
# Average ROUGE-1: 0.2360
# Average ROUGE-2: 0.0763
# Average ROUGE-L: 0.1987

SyntaxError: invalid syntax (255488293.py, line 1)

In [27]:
# Assuming you have the input text as a string
input_text = """KUALA LUMPUR, Nov 14 — Former PKR leader Farhash Wafa Salvador Rizal Mubarak today rejected Machang MP Wan Ahmad Fayhsal Wan Ahmad Kamal’s claim that he was involved in enticing Perikatan Nasional lawmakers into pledging support to Prime Minister Datuk Seri Anwar Ibrahim.Earlier today, Wan Ahmad Fayhsal insinuated that Farhash, a former political secretary to Anwar, was one of the two men who allegedly persuaded four Parti Pribumi Bersatu (Bersatu) MPs to support the government, with the other being identified as “Datuk Botak”.“The defamation allegations were made in bad faith and without any direct evidence. Through the statement of the press conference by Wan Fayhsal, it was clear that I had been used as a political tool to spread untrue rumours for his own benefit,” he said in a statement.Farhash said he has left politics “for a long time”. “I now only run my life as a businessman. I do not understand why Wan Fayhsal’s brother has hurled such slander against me with the intention of undermining my reputation and self-respect, for his political interests. “Due to the statements and defamatory allegations levelled against me, I will make a police report for further investigation and action taken by the police against Wan Fayhsal,” he said.“I also reserve my right to file a defamation lawsuit against Wan Fayhsal,” he added. In a press conference outside of the Dewan Rakyat, Wan Ahmad Fayhsal earlier claimed there were videos online of “Datuk Botak” and a former aide to Anwar enticing Opposition MPs to support the government.He claimed that 10 more Bersatu MPs, including himself, received “offers” to support the federal government.Wan Ahmad Fayhsal said offers such as development projects in those Opposition MPs’ constituencies and cash were tabled to by “operators”."""

# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

# Generate predictions for the input
with torch.no_grad():
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=128,
        decoder_start_token_id=tokenizer.pad_token_id
    )

# Decode the generated output
generated_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print or use the generated output
print("Generated Output:", generated_output)


Generated Output: Earlier today, Wan Ahmad Fayhsal insinuated that Farhash, a former political secretary to Anwar, was one of the two men who allegedly persuaded four Parti Pribumi Bersatu (Bersatu) MPs to support the government, with the other being identified as “Datuk Botak”.


In [28]:
import torch
from tqdm import tqdm

def generate_prediction(model, tokenizer, text_input, device):
    model.eval()
    
    # Tokenize and encode the input text
    input_ids = tokenizer.encode(text_input, return_tensors="pt", max_length=128, truncation=True).to(device)
    
    # Generate the prediction
    with torch.no_grad():
        output_ids = model.generate(input_ids=input_ids, max_length=128, decoder_start_token_id=tokenizer.pad_token_id)
    
    # Decode the generated output
    prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    
    return prediction

# Example usage
text_input = "Your input text goes here."
prediction = generate_prediction(model, tokenizer, text_input, device)
print("Generated Prediction:", prediction)

Generated Prediction: Your input text goes here.
