In [173]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

#input_ids = tokenizer(prompt, return_tensors="pt").input_ids

In [174]:
import torch
prompts = [ "With a heavy heart,",
    "Standing on the edge,",
    "Surrounded by strangers,",
    "Caught in a dilemma,",
    "At the crossroads,"]
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [175]:

torch.random.manual_seed(42)
#Top- K sampling
generation_params = {
    #"greedy_search": {"do_sample": False, "max_length": 50},
    #"beam_search": {"num_beams": 5, "max_length": 50, "early_stopping": True},
    #"top_k_sampling": {"do_sample": True, "top_k": 50, "max_length": 50},
    "top_p_sampling": {"do_sample": True, "top_p": 0.6, "max_length": 30, "min_length": 0 }
}

In [176]:
def calculate_perplexity(input_text):
    encodings = tokenizer(input_text, return_tensors="pt").to(device)
    max_length = model.config.n_positions
    input_ids = encodings.input_ids
    target_ids = input_ids.clone()
    target_ids[:, :-1] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)
        neg_log_likelihood = outputs.loss

    return torch.exp(neg_log_likelihood).item()

In [177]:
perplexities = {}
for prompt in prompts:
  for strategy_name, params in generation_params.items():
      input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
      generated_outputs = model.generate(input_ids, **params, pad_token_id=tokenizer.eos_token_id)
      generated_text = tokenizer.decode(generated_outputs[0], skip_special_tokens=True)
      perplexity = calculate_perplexity(generated_text)
      perplexities[strategy_name] = perplexity
      print(f"Strategy: {strategy_name}\nGenerated Text: {generated_text}\nPerplexity: {perplexity}\n")

Strategy: top_p_sampling
Generated Text: With a heavy heart, we are working hard to bring the game to a new level. We're excited to bring you the new version of the game
Perplexity: 2.0831212997436523

Strategy: top_p_sampling
Generated Text: Standing on the edge, I could hear the heavy thud of the wind, and my heart was pounding.

"What is it?" I
Perplexity: 3.0335171222686768

Strategy: top_p_sampling
Generated Text: Surrounded by strangers, I had to try to get out of the house, and I had to do it on my own. But I've been
Perplexity: 6.555111408233643

Strategy: top_p_sampling
Generated Text: Caught in a dilemma, the US has been in the business of making it harder for foreign governments to get a grip on the country.


Perplexity: 160.3777313232422

Strategy: top_p_sampling
Generated Text: At the crossroads, a couple of things happened. One was that a few years ago, I started to get a little worried that the whole thing
Perplexity: 7.0261945724487305



In [178]:
from datasets import load_dataset
cnn=load_dataset("cnn_dailymail", '2.0.0')

In [179]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

In [180]:
cnn['train']['article'][0]

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [181]:
#def preprocess_function(examples):
    #return tokenizer(examples['article'], truncation=True)

In [182]:
#tokenized_cnn = cnn.map(preprocess_function, batched=True)

In [183]:
#article_input_ids = tokenized_cnn['test']['input_ids'][0]

In [184]:
#input_tensors = [torch.tensor([ids]) for ids in article_input_ids]

In [185]:
#outputs = []
#for input_tensor in input_tensors:
#    output = model.generate(input_tensor, **params, pad_token_id=tokenizer.eos_token_id)
 #   tokenizer.batch_decode(output, skip_special_tokens=True)

In [186]:
#output = model.generate(article_input_ids, **params, pad_token_id=tokenizer.eos_token_id)
#tokenizer.batch_decode(output, skip_special_tokens=True)

In [187]:
# Counter to track generated summaries
#summary_count = 0

#for article in cnn['test']['article']:
  # Break the loop if 50 summaries are generated
 # if summary_count >= 50:
 #   break

 # tokenized_article = tokenizer(article, truncation=True)
 # input_ids = torch.tensor(tokenized_article['input_ids'])
 # output = model.generate(input_ids, **params, pad_token_id=tokenizer.eos_token_id)

  # Process the generated summary (e.g., print or store)
  #print(f"Summary {summary_count + 1}:", output[0]['generated_text'])  # Accessing generated text from the first output element

 # summary_count += 1


In [188]:
cnn['test']['article'][0]

'(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony

In [189]:
import pandas as pd
df=pd.DataFrame()

In [190]:
#!pip install evaluate, rouge_score, bert_score

In [191]:
import evaluate
rouge_score=evaluate.load('rouge')
bert_score=evaluate.load('bertscore')

In [192]:
import pandas as pd

for i in range(50):
    a = cnn['test']['article'][i]
    r=cnn[f'test']['highlights'][i]
    b = tokenizer(a, truncation=True, return_tensors="pt", max_length=1024).input_ids
    o = model.generate(b, **params, pad_token_id=tokenizer.eos_token_id)
    gt = tokenizer.decode(o[0], skip_special_tokens=True)

    rouge_score_value = rouge_score.compute(predictions=[gt], references=[cnn['test']['highlights'][i]])
    bert_score_value = bert_score.compute(predictions=[gt], references=[cnn['test']['highlights'][i]], lang="en")

    # Create a new DataFrame or append to an existing one
    if i == 0:
        df = pd.DataFrame({'Input Text': [a], 'Generated Text': [gt], 'Rouge Score': [rouge_score_value], 'BERT Score': [bert_score_value]})
    else:
        new_row = pd.DataFrame({'Input Text': [a], 'Generated Text': [gt], 'Rouge Score': [rouge_score_value], 'BERT Score': [bert_score_value]}, index=[i])
        df = pd.concat([df, new_row], ignore_index=True)

    print(f"Generated text {i}: {gt}\n Rouge: {rouge_score_value} \n Bert_Score: {bert_score_value}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generated text 0: The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories.
 Rouge: {'rouge1': 0.44827586206896547, 'rouge2': 0.25, 'rougeL': 0.31034482758620696, 'rougeLsum': 0.44827586206896547} 
 Bert_Score: {'precision': [0.9076894521713257], 'recall': [0.8876417279243469], 'f1': [0.8975536227226257], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.38.2)'}
Generated text 1: A dog apparently hit by a car and buried in a field survives. The dog, now named Theia, was found emaciated
 Rouge: {'rouge1': 0.4375, 'rouge2': 0.25806451612903225, 'rougeL': 0.40625, 'rougeLsum': 0.40625} 
 Bert_Score: {'precision': [0.9049350619316101], 'recall': [0.8699348568916321], 'f1': [0.8870898485183716], 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.38.2)'}
Generated text 2: Mohammad Javad Zarif is the Iranian foreign minister. He has been John Kerry's

In [193]:
df.to_csv('Top-P Sampling.csv')

In [194]:
from google.colab import files
files.download('Top-P Sampling.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>