In [1]:
!pip install transformers
!pip install datasets
!pip list
!pip install rouge_score
!pip install bert_score

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Package                  Version            
------------------------ -------------------
absl-py                  2.1.0              
accelerate               0.27.2             
aiohttp                  3.9.3              
aiosignal                1.3.1              
alembic                  1.4.3              
appdirs                  1.4.4              
argon2-cffi              20.1.0             
async-generator          1.10               
async-timeout            4.0.3              
attrs                    20.3.0             
backcall                 0.2.0              
batchspawner             1.0.1              
bert-score               0.3.13             
bleach                   3.2.1              
blinker                  1.4                
certifi                  2020.12.5          
certipy                  0.1.3    

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from tqdm import tqdm
from datasets import load_dataset

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# By Madhan Mohan


# Task 1

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

prompts = [
    "Words hold immense power. They can inspire, uplift, and transform lives. A well-crafted sentence has the ability to ignite imagination, stir emotions, and leave an indelible mark on the human psyche. It's a testament to the remarkable capacity of language to transcend barriers and forge connections.",
    "The natural world is a tapestry of wonders. From the majestic peaks of snow-capped mountains to the intricate dance of life in the depths of the oceans, nature's artistry is a constant source of awe and reverence. It reminds us of our humble place in the grand scheme of existence and our responsibility to protect its fragile beauty.",
    "Creativity is the lifeblood of progress. It fuels innovation, drives artistic expression, and propels us forward. In a world that often values conformity, embracing our creative spirit allows us to challenge the status quo, question the established norms, and pave the way for new possibilities.",
    "Resilience is the hallmark of the human spirit. It is the ability to rise above adversity, to adapt and evolve in the face of challenges. Like a sturdy oak that withstands the fiercest storms, resilience empowers us to bend but never break, emerging stronger and wiser from life's trials.",
    "Kindness is a universal language that transcends boundaries and cultures. A simple act of compassion, a gentle word of encouragement, or a genuine smile can have a profound impact, creating ripples of positivity that touch the lives of others. In a world that often appears harsh and unforgiving, kindness is a beacon of hope and humanity."
]






In [4]:
num_beams = 5


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for i, prompt in enumerate(prompts):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    
    beam_search_config = model.generate(
        input_ids, 
        max_length=30, 
        num_beams=num_beams,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    
    output = tokenizer.batch_decode(beam_search_config, skip_special_tokens=True)
    
    encodings = tokenizer(output, return_tensors="pt").to(device)
    print("Output:", output)
    print()
    
    # Attribution of code: https://huggingface.co/docs/transformers/main/en/perplexity
    max_length = model.config.max_position_embeddings
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        encodings = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = encodings.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(encodings, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    print("Generated Text:", output)
    print("Perplexity: ", ppl)





Output: ['A well-crafted sentence has the ability to ignite imagination, stir emotions, and leave an indelible mark on the human psyche.']



  0%|          | 0/1 [00:00<?, ?it/s]


Generated Text: ['A well-crafted sentence has the ability to ignite imagination, stir emotions, and leave an indelible mark on the human psyche.']
Perplexity:  tensor(1.2615)
Output: ["Nature's artistry is a constant source of awe and reverence. It reminds us of our humble place in the grand scheme of existence."]



  0%|          | 0/1 [00:00<?, ?it/s]


Generated Text: ["Nature's artistry is a constant source of awe and reverence. It reminds us of our humble place in the grand scheme of existence."]
Perplexity:  tensor(1.2304)
Output: ['Creativity is the lifeblood of progress. It fuels innovation, drives artistic expression, and propels us forward. In a world that']



  0%|          | 0/1 [00:00<?, ?it/s]


Generated Text: ['Creativity is the lifeblood of progress. It fuels innovation, drives artistic expression, and propels us forward. In a world that']
Perplexity:  tensor(1.4054)
Output: ['Resilience is the hallmark of the human spirit. It is the ability to rise above adversity, to adapt and evolve in the face']



  0%|          | 0/1 [00:00<?, ?it/s]


Generated Text: ['Resilience is the hallmark of the human spirit. It is the ability to rise above adversity, to adapt and evolve in the face']
Perplexity:  tensor(1.5376)
Output: ['Kindness is a universal language that transcends boundaries and cultures. A simple act of compassion, a gentle word of encouragement, or a']



  0%|          | 0/1 [00:00<?, ?it/s]

Generated Text: ['Kindness is a universal language that transcends boundaries and cultures. A simple act of compassion, a gentle word of encouragement, or a']
Perplexity:  tensor(1.4771)





In [5]:
# Task 2

from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

dataset = load_dataset('cnn_dailymail', '1.0.0')

test_dataset = dataset["test"]

sentence_list = []
reference_list = []

counter = 0
for item in test_dataset:
    if counter == 50:
        break
    
    sentence_list.append(item["article"])
    reference_list.append(item["highlights"])
    counter += 1



prediction_list = []
for i in range(len(sentence_list)):
    input_ids = tokenizer(sentence_list[i], return_tensors="pt", max_length=1024, truncation=True).input_ids
    res = model.generate(input_ids, num_beams=5, do_sample=False, max_length=30, min_length=0)
    prediction_list.append(tokenizer.batch_decode(res, skip_special_tokens=True)[0])




In [6]:
print("Input Text")
for i in range(len(sentence_list)):
    print(sentence_list[i][:1024])
    print()

Input Text
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday's c

In [7]:
print("Reference Text")
for i in range(len(sentence_list)):
    print(reference_list[i])
    print()

Reference Text
Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June . Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field . "She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looking for a home for Theia .

Mohammad Javad Zarif has spent more time with John Kerry than any other foreign minister . He once participated in a takeover of the Iranian Consulate in San Francisco . The Iranian foreign minister tweets in English .

17 Americans were exposed to the Ebola virus while in Sierra Leone in March . Another person was diagnosed with the disease and taken to hospital in Maryland . National Institutes of Health says the patient is in fair condition after weeks of treatment .

Student is no longer on Duke University campus and will fa

In [8]:
# Task 3

import evaluate

rouge = evaluate.load("rouge")
rouge_results = rouge.compute(predictions=prediction_list, references=reference_list, use_aggregator=False)
print("Rouge Results: ", rouge_results)




Rouge Results:  {'rouge1': [0.44827586206896547, 0.46875, 0.48275862068965525, 0.3692307692307692, 0.44776119402985076, 0.22222222222222224, 0.1842105263157895, 0.27692307692307694, 0.44897959183673475, 0.46511627906976744, 0.490566037735849, 0.20000000000000004, 0.24489795918367346, 0.14705882352941177, 0.2631578947368421, 0.12307692307692307, 0.4482758620689655, 0.26666666666666666, 0.5652173913043478, 0.30188679245283023, 0.2105263157894737, 0.21874999999999997, 0.3692307692307692, 0.4210526315789474, 0.5660377358490566, 0.2028985507246377, 0.3111111111111111, 0.19047619047619047, 0.20833333333333331, 0.4782608695652174, 0.30379746835443033, 0.18181818181818182, 0.45, 0.20833333333333331, 0.4482758620689655, 0.3333333333333333, 0.3508771929824562, 0.34374999999999994, 0.2711864406779661, 0.18181818181818185, 0.6222222222222222, 0.1095890410958904, 0.6956521739130435, 0.5333333333333333, 0.5714285714285715, 0.13043478260869565, 0.8799999999999999, 0.22641509433962265, 0.2307692307692

In [9]:
bert = evaluate.load("bertscore")
bert_results = bert.compute(predictions=prediction_list, references=reference_list, lang="en")
print("Bert Results: ", bert_results)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Bert Results:  {'precision': [0.9087958335876465, 0.9242240190505981, 0.9053109288215637, 0.9023255109786987, 0.9133458733558655, 0.86474609375, 0.8537155389785767, 0.883267879486084, 0.8927016258239746, 0.9159455299377441, 0.8737826347351074, 0.8537482023239136, 0.8677037358283997, 0.8582708239555359, 0.8554202318191528, 0.8369802236557007, 0.92963707447052, 0.8826193809509277, 0.9455327987670898, 0.8646371960639954, 0.8641473054885864, 0.8715906143188477, 0.8318599462509155, 0.9185168147087097, 0.9049757719039917, 0.872330367565155, 0.8750764727592468, 0.8370938301086426, 0.8639880418777466, 0.8971128463745117, 0.9125757813453674, 0.8597882390022278, 0.9108281135559082, 0.8357371687889099, 0.896733820438385, 0.9103935956954956, 0.8994429707527161, 0.8983388543128967, 0.8771633505821228, 0.8362583518028259, 0.9047712087631226, 0.8293336629867554, 0.9364964365959167, 0.9340121746063232, 0.9145631194114685, 0.8639483451843262, 0.9715203046798706, 0.8682656288146973, 0.8685991764068604, 

In [10]:
# Inter-annotator Agreement Calculation

In [11]:
for i in range(len(rouge_results["rouge1"])):
    print("rouge1: " + str(rouge_results["rouge1"][i]) +", " +  "rouge2: " + str(rouge_results["rouge2"][i]) +", " + "rougeL: " + str(rouge_results["rougeL"][i]) +", " + "rougeLsum: " + str(rouge_results["rougeLsum"][i]))

rouge1: 0.44827586206896547, rouge2: 0.25, rougeL: 0.31034482758620696, rougeLsum: 0.31034482758620696
rouge1: 0.46875, rouge2: 0.3548387096774194, rougeL: 0.46875, rougeLsum: 0.46875
rouge1: 0.48275862068965525, rouge2: 0.25, rougeL: 0.3103448275862069, rougeLsum: 0.3103448275862069
rouge1: 0.3692307692307692, rouge2: 0.19047619047619044, rougeL: 0.3384615384615384, rougeLsum: 0.3384615384615384
rouge1: 0.44776119402985076, rouge2: 0.24615384615384614, rougeL: 0.3880597014925373, rougeLsum: 0.3880597014925373
rouge1: 0.22222222222222224, rouge2: 0.13953488372093023, rougeL: 0.22222222222222224, rougeLsum: 0.22222222222222224
rouge1: 0.1842105263157895, rouge2: 0.05405405405405406, rougeL: 0.13157894736842105, rougeLsum: 0.13157894736842105
rouge1: 0.27692307692307694, rouge2: 0.06349206349206349, rougeL: 0.1846153846153846, rougeLsum: 0.1846153846153846
rouge1: 0.44897959183673475, rouge2: 0.1702127659574468, rougeL: 0.28571428571428575, rougeLsum: 0.28571428571428575
rouge1: 0.465116

In [12]:
import numpy as np

In [13]:
print("rouge1: " + str(np.mean(rouge_results["rouge1"])) +", " +  "rouge2: " + str(np.mean(rouge_results["rouge2"])) +", " + "rougeL: " + str(np.mean(rouge_results["rougeL"])) +", " + "rougeLsum: " + str(np.mean(rouge_results["rougeLsum"][i])))

rouge1: 0.349736749363087, rouge2: 0.17959541945261262, rougeL: 0.28435886776053587, rougeLsum: 0.3255813953488372


In [14]:
# for item in results["rouge1"]:
#     print(item)

In [15]:
# for item in results["precision"]:
#     print(item)

In [16]:
 for i in range(len(bert_results["precision"])):
    print("precision: " + str(bert_results["precision"][i]) +", " +  "recall: " + str(bert_results["recall"][i]) +", " + "f1: " + str(bert_results["f1"][i]))

precision: 0.9087958335876465, recall: 0.888218104839325, f1: 0.8983891010284424
precision: 0.9242240190505981, recall: 0.8783526420593262, f1: 0.9007046818733215
precision: 0.9053109288215637, recall: 0.8778842687606812, f1: 0.8913866281509399
precision: 0.9023255109786987, recall: 0.8661049604415894, f1: 0.8838443160057068
precision: 0.9133458733558655, recall: 0.8735873103141785, f1: 0.8930243253707886
precision: 0.86474609375, recall: 0.8554022908210754, f1: 0.8600488305091858
precision: 0.8537155389785767, recall: 0.8346283435821533, f1: 0.8440640568733215
precision: 0.883267879486084, recall: 0.8631405830383301, f1: 0.8730882406234741
precision: 0.8927016258239746, recall: 0.8694136142730713, f1: 0.8809037208557129
precision: 0.9159455299377441, recall: 0.9109427332878113, f1: 0.9134373068809509
precision: 0.8737826347351074, recall: 0.8807424902915955, f1: 0.8772487640380859
precision: 0.8537482023239136, recall: 0.841271698474884, f1: 0.8474639654159546
precision: 0.86770373582

In [17]:
print("precision: " + str(np.mean(bert_results["precision"])) +", " +  "recall: " + str(np.mean(bert_results["recall"])) +", " + "f1: " + str(np.mean(bert_results["f1"])) )

precision: 0.8857560527324676, recall: 0.8636165821552276, f1: 0.8744480299949646


In [41]:
# Agreement Score
from nltk import agreement

# beam coherance

# madhan = [4, 5, 4, 5, 4, 4, 5, 4, 3, 3, 4, 5, 4, 4, 3, 5, 4, 4, 5, 4]
# ritwick = [5, 5, 4, 5, 4, 5, 5, 4, 3, 4, 4, 5, 4, 4, 3, 4, 5, 4, 4,4]
# masha = [4, 5, 4, 5, 4, 4, 5, 4, 4, 3, 4, 5, 4, 4, 3, 5, 4, 4, 5, 4]
# layan = [5, 4, 4, 5, 4, 4, 5, 4, 4, 5, 4, 5, 4, 4, 3, 4, 4, 4, 5, 4]

# beam factuality

# madhan = [4, 4, 3, 4, 4, 4, 5, 3, 4, 4, 3, 4, 5, 4, 3, 5, 4, 4, 5, 5]
# ritwick = [4, 4, 5, 3, 4, 5, 3, 4, 2, 3, 4, 3, 4, 3, 2, 5, 3, 4, 5, 5]
# masha = [4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4, 3, 3, 3, 5, 4, 4, 5, 4]
# layan = [5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 5, 4, 3, 5, 4, 4, 5, 5]

# beam formality

madhan=[4, 4, 4, 5, 5, 5, 5, 4, 3, 3, 4, 5, 4, 3, 2, 5, 4, 4, 5, 4]
ritwick=[4, 4, 4, 3, 5, 4, 4, 4, 3, 4, 3, 3, 3, 3, 2, 5, 5, 3, 5, 4]
masha=[4, 4, 4, 4, 5, 4, 3, 4, 3, 3, 4, 4, 4, 3, 3, 5, 4, 4, 5, 4]
layan=[4, 4, 4, 5, 5, 5, 5, 4, 3, 3, 4, 5, 4, 3, 3, 5, 4, 4, 5, 4]

print(len(madhan))
print(len(ritwick))
print(len(masha))
print(len(layan))


taskdata = [
    [0, str(i), str(madhan[i])] for i in range(len(madhan))] + [[1, str(i), str(ritwick[i])] for i in range(len(ritwick))] + [
    [2, str(i), str(masha[i])] for i in range(len(masha))] + [[3, str(i), str(layan[i])] for i in range(len(layan))]

rating_task = agreement.AnnotationTask(data=taskdata)
print("alpha " + str(rating_task.alpha()))

20
20
20
20
alpha 0.5177510040160642


In [42]:
print("precision: " + str(np.mean(bert_results["precision"][:20])) +", " +  "recall: " + str(np.mean(bert_results["recall"][:20])) +", " + "f1: " + str(np.mean(bert_results["f1"][:20])) )

precision: 0.8866355568170547, recall: 0.8611939728260041, f1: 0.8736424028873444


In [43]:
print("rouge1: " + str(np.mean(rouge_results["rouge1"][:20])) +", " +  "rouge2: " + str(np.mean(rouge_results["rouge2"][:20])) +", " + "rougeL: " + str(np.mean(rouge_results["rougeL"][:20])) +", " + "rougeLsum: " + str(np.mean(rouge_results["rougeLsum"][:20])))


rouge1: 0.343251624657117, rouge2: 0.17377764906567655, rougeL: 0.27658590409837785, rougeLsum: 0.27658590409837785


In [44]:
print("precision: " + str(np.mean(bert_results["precision"][:50])) +", " +  "recall: " + str(np.mean(bert_results["recall"][:50])) +", " + "f1: " + str(np.mean(bert_results["f1"][:50])) )

precision: 0.8857560527324676, recall: 0.8636165821552276, f1: 0.8744480299949646


In [45]:
print("rouge1: " + str(np.mean(rouge_results["rouge1"][:50])) +", " +  "rouge2: " + str(np.mean(rouge_results["rouge2"][:50])) +", " + "rougeL: " + str(np.mean(rouge_results["rougeL"][:50])) +", " + "rougeLsum: " + str(np.mean(rouge_results["rougeLsum"][:50])))


rouge1: 0.349736749363087, rouge2: 0.17959541945261262, rougeL: 0.28435886776053587, rougeLsum: 0.28435886776053587


In [46]:

print(np.mean(masha+madhan+layan+ritwick))

3.975
