In [197]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
#input_ids = tokenizer(prompt, return_tensors="pt").input_ids

In [198]:
import torch
prompts = [ "Words hold immense power. They can inspire, uplift, and transform lives. A well-crafted sentence has the ability to ignite imagination, stir emotions, and leave an indelible mark on the human psyche. It's a testament to the remarkable capacity of language to transcend barriers and forge connections.",
    "The natural world is a tapestry of wonders. From the majestic peaks of snow-capped mountains to the intricate dance of life in the depths of the oceans, nature's artistry is a constant source of awe and reverence. It reminds us of our humble place in the grand scheme of existence and our responsibility to protect its fragile beauty.",
    "Creativity is the lifeblood of progress. It fuels innovation, drives artistic expression, and propels us forward. In a world that often values conformity, embracing our creative spirit allows us to challenge the status quo, question the established norms, and pave the way for new possibilities.",
    "Resilience is the hallmark of the human spirit. It is the ability to rise above adversity, to adapt and evolve in the face of challenges. Like a sturdy oak that withstands the fiercest storms, resilience empowers us to bend but never break, emerging stronger and wiser from life's trials.",
    "Kindness is a universal language that transcends boundaries and cultures. A simple act of compassion, a gentle word of encouragement, or a genuine smile can have a profound impact, creating ripples of positivity that touch the lives of others. In a world that often appears harsh and unforgiving, kindness is a beacon of hope and humanity."]
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

In [220]:

torch.random.manual_seed(42)
#Top- K sampling
generation_params = {
    #"greedy_search": {"do_sample": False, "max_length": 50},
    #"beam_search": {"num_beams": 5, "max_length": 50, "early_stopping": True},
    #"top_k_sampling": {"do_sample": True, "top_k": 50, "max_length": 50},
    "top_p_sampling": {"do_sample": True, "top_p": 0.6, "max_length": 30, "min_length": 0 }
}

In [221]:
def calculate_perplexity(input_text):
    encodings = tokenizer(input_text, return_tensors="pt").to(device)
    max_length = model.config.max_position_embeddings
    input_ids = encodings.input_ids
    target_ids = input_ids.clone()
    target_ids[:, :-1] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
        neg_log_likelihood = outputs.loss

    return torch.exp(neg_log_likelihood).item()

In [222]:
perplexities = {}
for prompt in prompts:
  for strategy_name, params in generation_params.items():
      input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
      generated_outputs = model.generate(input_ids, **params, pad_token_id=tokenizer.eos_token_id)
      generated_text = tokenizer.decode(generated_outputs[0], skip_special_tokens=True)
      perplexity = calculate_perplexity(generated_text)
      perplexities[strategy_name] = perplexity
      print(f"Strategy: {strategy_name}\nGenerated Text: {generated_text}\nPerplexity: {perplexity}\n")

Strategy: top_p_sampling
Generated Text: A well-crafted sentence has the ability to ignite imagination, stir emotions, and leave an indelible mark on the human
Perplexity: 4170.8115234375

Strategy: top_p_sampling
Generated Text: The natural world is a tapestry of wonders. From the majestic peaks of snow-capped mountains to the intricate dance of life
Perplexity: 1551.637939453125

Strategy: top_p_sampling
Generated Text: Creativity is the lifeblood of progress. It fuels innovation, drives artistic expression, and propels us forward. In a world that
Perplexity: 1006.3665161132812

Strategy: top_p_sampling
Generated Text: Resilience is the hallmark of the human spirit. It is the ability to rise above adversity. Like a sturdy oak that withstands
Perplexity: 1354.5804443359375

Strategy: top_p_sampling
Generated Text: Kindness is a universal language that transcends boundaries and cultures. A simple act of compassion, a gentle word of encouragement, or a
Perplexity: 14885.330078125



In [223]:
from datasets import load_dataset
cnn=load_dataset("cnn_dailymail", '2.0.0')

In [224]:
cnn['train']['article'][0]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [225]:
#def preprocess_function(examples):
    #return tokenizer(examples['article'], truncation=True)

In [226]:
#tokenized_cnn = cnn.map(preprocess_function, batched=True)

In [227]:
#article_input_ids = tokenized_cnn['test']['input_ids'][0]

In [228]:
#input_tensors = [torch.tensor([ids]) for ids in article_input_ids]

In [229]:
#outputs = []
#for input_tensor in input_tensors:
#    output = model.generate(input_tensor, **params, pad_token_id=tokenizer.eos_token_id)
 #   tokenizer.batch_decode(output, skip_special_tokens=True)

In [230]:
#output = model.generate(article_input_ids, **params, pad_token_id=tokenizer.eos_token_id)
#tokenizer.batch_decode(output, skip_special_tokens=True)

In [231]:
# Counter to track generated summaries
#summary_count = 0

#for article in cnn['test']['article']:
  # Break the loop if 50 summaries are generated
 # if summary_count >= 50:
 #   break

 # tokenized_article = tokenizer(article, truncation=True)
 # input_ids = torch.tensor(tokenized_article['input_ids'])
 # output = model.generate(input_ids, **params, pad_token_id=tokenizer.eos_token_id)

  # Process the generated summary (e.g., print or store)
  #print(f"Summary {summary_count + 1}:", output[0]['generated_text'])  # Accessing generated text from the first output element

 # summary_count += 1


In [232]:
cnn['test']['article'][0]

'(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony

In [233]:
import pandas as pd
df=pd.DataFrame()

In [234]:
#!pip install evaluate, rouge_score, bert_score

In [235]:
import evaluate
rouge_score=evaluate.load('rouge')
bert_score=evaluate.load('bertscore')

In [None]:
import pandas as pd

for i in range(50):
    a = cnn['test']['article'][i]
    r=cnn[f'test']['highlights'][i]
    b = tokenizer(a, truncation=True, return_tensors="pt", max_length=1024).input_ids
    o = model.generate(b, **params, pad_token_id=tokenizer.eos_token_id)
    gt = tokenizer.decode(o[0], skip_special_tokens=True)

    rouge_score_value = rouge_score.compute(predictions=[gt], references=[cnn['test']['highlights'][i]])
    bert_score_value = bert_score.compute(predictions=[gt], references=[cnn['test']['highlights'][i]], lang="en")

    # Create a new DataFrame or append to an existing one
    if i == 0:
        df = pd.DataFrame({'Input Text': [a], 'Generated Text': [gt], 'Rouge Score': [rouge_score_value], 'BERT Score': [bert_score_value]})
    else:
        new_row = pd.DataFrame({'Input Text': [a], 'Generated Text': [gt], 'Rouge Score': [rouge_score_value], 'BERT Score': [bert_score_value]}, index=[i])
        df = pd.concat([df, new_row], ignore_index=True)

    print(f"Generated text {i}: {gt}\n Rouge: {rouge_score_value} \n Bert_Score: {bert_score_value}")

In [None]:
df.to_csv('Top-P Sampling.csv')

In [None]:
from google.colab import files
files.download('Top-P Sampling.csv')