# Instructions
This notebook is meant to evaluate the summary texts generated by GPT-4 and Claude models.

Make sure to replace the "<ENTER_Value>" throughout the code:

- aws_access_key = "<ENTER_Value>"

- aws_access_secret = "<ENTER_Value>"

- os.environ['OPENAI_API_KEY'] = "<ENTER_Value>"

- wandb.login(key="<ENTER_Value>") # your api key - https://wandb.ai/settings

- wandb.init(project='text_summarization_comparison', entity='<ENTER_Value>', settings=wandb.Settings(start_method="thread")) # your username - https://wandb.ai/settings
    


## 1- Summarization

### Dependencies

In [2]:
!pip install langchain
!pip install --upgrade boto3

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting boto3
  Downloading boto3-1.34.32-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.35.0,>=1.34.32 (from boto3)
  Downloading botocore-1.34.32-py3-none-any.whl.metadata (5.7 kB)
Downloading boto3-1.34.32-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.34.32-py3-none-any.whl (11.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: botocore, boto3
  Attempting uninstall: botocore
    Found existing installation: botocore 1.34.31
    Uninstalling botocore-1.34.31:
      Successfully uninstalled botocore-1.34.31
  Attempting uninstall: boto3
    Found existing installat

### Set Up

In [3]:
from langchain.chat_models import ChatOpenAI, ChatAnthropic
from langchain import LLMChain
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)
import os
import boto3
import json
import re
import requests
from bs4 import BeautifulSoup

In [4]:
# AWS and gpt keys
aws_access_key = "<ENTER_Value>"
aws_access_secret = "<ENTER_Value>"
os.environ['OPENAI_API_KEY'] = "<ENTER_Value>"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Sample urls

In [5]:
links = ["https://www.nzherald.co.nz/northern-advocate/news/world-leading-robot-plane-venture-takes-off-in-kerikeri/KCNCTSD5VRGXDCBAY7HQ6D5QHU/",
"https://www.forbes.com/sites/jeremybogaisky/2022/07/13/merlin-labs-raises-105-million-to-solve-the-pilot-shortage-by-robotizing-airplanes/",
"https://www.aviationtoday.com/2022/07/14/merlin-labs-raises-105-million-new-funding-round-announces-software-work-c-130j/",
"https://www.ainonline.com/aviation-news/aerospace/2023-08-02/merlin-autonomy-vision-raises-questions-alaskas-preparedness",
"https://tracxn.com/d/trending-themes/startups-in-avionics/__Y4qeV3V5ADMi5O48CmFWbc8Sl4DarAl7uF4zxmPra2U",
"https://techcrunch.com/2022/07/13/autonomous-flight-startup-merlin-labs-lands-120m-and-u-s-air-force-partnership/",
"https://siliconangle.com/2022/07/13/autonomous-flight-startup-merlin-labs-raises-105m-funding/",
"https://executivebiz.com/2023/04/merlin-labs-to-demo-automated-aircraft-system-under-faa-contract/",
"https://airlinegeeks.com/2023/07/10/merlin-completes-autonomous-cessna-caravan-flights/"]

### Getting the raw text, using the urls

In [6]:
def get_raw_news_text(url : str) -> str: # throws http error if problems w/request
   # Send a GET request to the URL
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # Raise an exception if there was an error
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract all text from the parsed HTML
    text = soup.get_text(separator=" ")

    # Remove leading/trailing white spaces and extra line breaks
    text = text.strip()
    
    # Replace newlines and repeated newlines with a single space
    text = re.sub(r'\n+', ' ', text)
    
    # Replace duplicate spaces with a single space
    text = re.sub(r' +', ' ', text)
    return text

In [7]:
records = []

for link in links:
    new_record = {
        "raw_text": get_raw_news_text(link),
        "gpt": {
            "summary": "",
            "similarity_metrics": {},
            "gpt_examiner_metrics": {},
            "claude_examiner_metrics": {}
        },
        "claude": {
            "summary": "",
            "similarity_metrics": {},
            "gpt_examiner_metrics": {},
            "claude_examiner_metrics": {}
        }
    }
    records.append(new_record)

records

[{'raw_text': "World-leading robot plane venture takes off in Kerikeri - NZ Herald Thursday, 01 February 2024 Search New Zealand Herald Weather Kaitaia Whangarei Dargaville NZME Network NZ Herald The Northern Advocate The Northland Age The Aucklander Waikato Herald Bay Of Plenty Times Rotorua Daily Post Hawke's Bay Today Whanganui Chronicle The Stratford Press Manawatu Guardian Kapiti News Horowhenua Chronicle Te Awamutu Courier Viva Eat Well OneRoof DRIVEN Car Guide The Country Photo Sales iHeart Radio Restaurant Hub Subscribe Advertisement Advertise with NZME. Home / Northern Advocate World-leading robot plane venture takes off in Kerikeri By Peter de Graaf 29 May, 2023 05:00 PM 5 mins to read Save share Share this article facebook copy link twitter linkedin reddit email Merlin Labs NZ chief executive Shaun Johnson has returned to his Kerikeri roots to set up a world-leading business developing autonomous aircraft — in other words, robot planes. Photo / Peter de Graaf Merlin Labs NZ 

In [8]:
# same template as the one used in news-research
human_message_template = """
        Below is the raw text content from a news article about a company. Please provide a short ~50 word summary
        of the contents of the article in the style of a neutral financial analyst.
        
        Please do not use the phrase "As a neutral financial analyst."

        {content}
    """

### Summarizing using gpt-4

In [9]:
def gpt_summarize_news(human_message_template, raw_text) -> str:
    
    llm_gpt4 = ChatOpenAI(model='gpt-4')

    human_message_prompt = HumanMessagePromptTemplate.from_template(human_message_template)

    chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])

    chain = LLMChain(llm=llm_gpt4, prompt=chat_prompt)

    return chain.run(content=raw_text)

for record in records:
    record["gpt"]["summary"] = gpt_summarize_news(human_message_template, record["raw_text"])

records

[{'raw_text': "World-leading robot plane venture takes off in Kerikeri - NZ Herald Thursday, 01 February 2024 Search New Zealand Herald Weather Kaitaia Whangarei Dargaville NZME Network NZ Herald The Northern Advocate The Northland Age The Aucklander Waikato Herald Bay Of Plenty Times Rotorua Daily Post Hawke's Bay Today Whanganui Chronicle The Stratford Press Manawatu Guardian Kapiti News Horowhenua Chronicle Te Awamutu Courier Viva Eat Well OneRoof DRIVEN Car Guide The Country Photo Sales iHeart Radio Restaurant Hub Subscribe Advertisement Advertise with NZME. Home / Northern Advocate World-leading robot plane venture takes off in Kerikeri By Peter de Graaf 29 May, 2023 05:00 PM 5 mins to read Save share Share this article facebook copy link twitter linkedin reddit email Merlin Labs NZ chief executive Shaun Johnson has returned to his Kerikeri roots to set up a world-leading business developing autonomous aircraft — in other words, robot planes. Photo / Peter de Graaf Merlin Labs NZ 

### Summarizing using Claude

In [10]:
def claude_summarize_news(prompt, raw_text):

    bedrock = boto3.client(service_name='bedrock-runtime',
                           region_name='us-west-2',
                           aws_access_key_id=aws_access_key,
                           aws_secret_access_key=aws_access_secret)

    body = json.dumps({
        "prompt": f"\n\nHuman:{prompt}\nraw text:{raw_text}\n\nAssistant:",
        "max_tokens_to_sample": 500,
        "temperature": 0.2,
        "top_p": 0.9,})


    modelId = 'anthropic.claude-v2'
    accept = 'application/json'
    contentType = 'application/json'

    response = bedrock.invoke_model(body=body, modelId=modelId, accept=accept, contentType=contentType)    
    response_body = json.loads(response.get('body').read())
    completion = response_body.get('completion')

    return completion


for record in records:
    record["claude"]["summary"] = claude_summarize_news(human_message_template, record["raw_text"])

records

[{'raw_text': "World-leading robot plane venture takes off in Kerikeri - NZ Herald Thursday, 01 February 2024 Search New Zealand Herald Weather Kaitaia Whangarei Dargaville NZME Network NZ Herald The Northern Advocate The Northland Age The Aucklander Waikato Herald Bay Of Plenty Times Rotorua Daily Post Hawke's Bay Today Whanganui Chronicle The Stratford Press Manawatu Guardian Kapiti News Horowhenua Chronicle Te Awamutu Courier Viva Eat Well OneRoof DRIVEN Car Guide The Country Photo Sales iHeart Radio Restaurant Hub Subscribe Advertisement Advertise with NZME. Home / Northern Advocate World-leading robot plane venture takes off in Kerikeri By Peter de Graaf 29 May, 2023 05:00 PM 5 mins to read Save share Share this article facebook copy link twitter linkedin reddit email Merlin Labs NZ chief executive Shaun Johnson has returned to his Kerikeri roots to set up a world-leading business developing autonomous aircraft — in other words, robot planes. Photo / Peter de Graaf Merlin Labs NZ 

# 2- Evaluation (Similarity Metrics)

### Dependencies

In [11]:
!pip install selfcheckgpt
!pip install nltk bert-score rouge sentence-transformers transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Set Up

In [14]:
import nltk
from bert_score import score
from rouge import Rouge
from sentence_transformers import SentenceTransformer, util
from transformers import RobertaModel, RobertaTokenizer
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score
import torch
from selfcheckgpt.modeling_mqag import MQAG

# Ensure necessary NLTK components are downloaded
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/eliipik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/eliipik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Metrics

**Lexical Similarity**: 
- BLEU: measures how many words and phrases in the hypothesis (summarized) text appear in the reference text. Scores range from 0 to 1.
- METEOR: is similar to BLEU but also considers synonyms and stems of words. It ranges from 0 to 1.

**Semantic Similarity**: 
- BERTScore: compares the embedding of the hypothesis and reference texts. It's a more context-aware metric, ranging from -1 to 1.
- MiniLM and RoBERTa: measure how semantically similar the hypothesis text is to the reference text. RoBERTa also modifies key hyperparameters, removing the next-sentence pretraining objective and training with much larger mini-batches and learning rates. Scores range from 0 to 1.

**Content Overlap**: 
- ROUGE: includes ROUGE-N (sub-scores for ROUGE-1, ROUGE-2) and ROUGE-L, focusing on the overlap of n-grams, longest common subsequence, etc., between the hypothesis and reference. Scores include recall (r), precision (p), and F1-scores (f), range from 0 to 1.

**MQAG**: Multiple-choice Question Answering and Generation for Assessing Information Consistency in Summarization
- KL Divergence (KL-div): measures how the probability distribution of words in the summarized text diverges from that in the reference text.
- Hellinger Distance: compares the probability distributions of words or features between the summary and reference text.
- Total Variation (D_TV): measures the largest difference in probabilities among events in the summary text and reference text. It's half the L1 norm (sum of absolute differences) of the probability differences for all events. It is bounded and continuous. 

In [15]:
def calculate_metrics(reference, hypothesis):

    # Tokenize for BLEU
    reference_tokens = nltk.word_tokenize(reference)
    hypothesis_tokens = nltk.word_tokenize(hypothesis)

    # Calculate BLEU score
    bleu_score = sentence_bleu([reference_tokens], hypothesis_tokens)

    # Calculate METEOR score
    meteor_score_value = single_meteor_score(reference_tokens, hypothesis_tokens)

    # Calculate BERTScore
    P, R, F1 = score([hypothesis], [reference], lang='en', verbose=True)
    bert_score = F1.mean().item()

    # Calculate ROUGE scores
    rouge = Rouge()
    rouge_scores = rouge.get_scores(hypothesis, reference, avg=True)

    # Function to get embeddings from RoBERTa
    def get_roberta_embedding(text):
        encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            model_output = roberta_model(**encoded_input)
        return model_output.last_hidden_state.mean(dim=1)

    # Calculate Semantic Similarity using MiniLM
    st_model = SentenceTransformer('all-MiniLM-L6-v2')
    reference_embedding_minilm = st_model.encode(reference, convert_to_tensor=True)
    hypothesis_embedding_minilm = st_model.encode(hypothesis, convert_to_tensor=True)
    semantic_similarity_minilm = util.pytorch_cos_sim(reference_embedding_minilm, hypothesis_embedding_minilm).item()

    # Calculate RoBERTa score
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    roberta_model = RobertaModel.from_pretrained('roberta-base')
    reference_embedding_roberta = get_roberta_embedding(reference)
    hypothesis_embedding_roberta = get_roberta_embedding(hypothesis)
    roberta_score = util.pytorch_cos_sim(reference_embedding_roberta, hypothesis_embedding_roberta).item()

    # Calculate MQAG
    torch.manual_seed(28)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mqag_model = MQAG(
        g1_model_type='race', # race (more abstractive), squad (more extractive)
        device=device
    )
    mqag_score = mqag_model.score(candidate=hypothesis, reference=reference, num_questions=3, verbose=True)


    return {
        "BLUE": bleu_score,
        "METEOR": meteor_score_value,
        "BERT_score": bert_score,
        "RoBERTa": roberta_score,
        "ROUGE": rouge_scores,
        "Semantic_similarity_minilm": semantic_similarity_minilm,
        "KL-divergence": mqag_score['kl_div'],
        "Hellinger": mqag_score['hellinger'],
        "Total-variation": mqag_score['total_variation']
    }


In [16]:
for record in records:
    record["gpt"]["similarity_metrics"] = calculate_metrics(record["raw_text"], record["gpt"]["summary"])

records

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.98 seconds, 1.02 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MQAG (race) initialized to cpu
Initialized Generation
Initialized Answering
Q1: What company is Merlin Labs NZ being set up?
(1) [P(.|cand)=43.83%]	[P(.|ref)=66.41%]	Merlin Labs NZ.
(2) [P(.|cand)=56.03%]	[P(.|ref)=24.99%]	Merlin Labs.
(3) [P(.|cand)=0.04%]	[P(.|ref)=3.36%]	Boeing Company NZ.
(4) [P(.|cand)=0.11%]	[P(.|ref)=5.24%]	Falcon Labs NZ.
-------------------------------------------------------------------------------
Q2: Where can we probably read this article?
(1) [P(.|cand)=24.83%]	[P(.|ref)=6.09%]	In a news report.
(2) [P(.|cand)=74.78%]	[P(.|ref)=78.38%]	In a newspaper.
(3) [P(.|cand)=0.02%]	[P(.|ref)=2.89%]	In a medical journal.
(4) [P(.|cand)=0.37%]	[P(.|ref)=12.64%]	In a travel magazine.
-------------------------------------------------------------------------------
Q3: The first overseas subsidiary of US aviation technology firm Merlin Labs will _.
(1) [P(.|cand)=81.78%]	[P(.|ref)=74.60%]	start as an air freight operation
(2) [P(.|cand)=10.57%]	[P(.|ref)=6.21%]	start au

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.76 seconds, 1.31 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MQAG (race) initialized to cpu
Initialized Generation
Initialized Answering
Q1: What is Merlin Labs' main idea?
(1) [P(.|cand)=99.17%]	[P(.|ref)=97.16%]	It is an automated flight control system
(2) [P(.|cand)=0.41%]	[P(.|ref)=0.94%]	It is a cargo service
(3) [P(.|cand)=0.41%]	[P(.|ref)=0.94%]	It is a cargo service
(4) [P(.|cand)=0.01%]	[P(.|ref)=0.96%]	It is a plane company
-------------------------------------------------------------------------------
Q2: What is Merlin Labs focusing on?
(1) [P(.|cand)=59.02%]	[P(.|ref)=22.15%]	In-flight safety control.
(2) [P(.|cand)=0.21%]	[P(.|ref)=2.79%]	Business development.
(3) [P(.|cand)=38.09%]	[P(.|ref)=43.30%]	Commercial flight management.
(4) [P(.|cand)=2.68%]	[P(.|ref)=31.75%]	Commercial flight service.
-------------------------------------------------------------------------------
Q3: What is Merlin Labs doing now?
(1) [P(.|cand)=99.06%]	[P(.|ref)=90.67%]	It is developing automated flight control systems.
(2) [P(.|cand)=0.00%]	[P(.|ref)=0

[{'raw_text': "World-leading robot plane venture takes off in Kerikeri - NZ Herald Thursday, 01 February 2024 Search New Zealand Herald Weather Kaitaia Whangarei Dargaville NZME Network NZ Herald The Northern Advocate The Northland Age The Aucklander Waikato Herald Bay Of Plenty Times Rotorua Daily Post Hawke's Bay Today Whanganui Chronicle The Stratford Press Manawatu Guardian Kapiti News Horowhenua Chronicle Te Awamutu Courier Viva Eat Well OneRoof DRIVEN Car Guide The Country Photo Sales iHeart Radio Restaurant Hub Subscribe Advertisement Advertise with NZME. Home / Northern Advocate World-leading robot plane venture takes off in Kerikeri By Peter de Graaf 29 May, 2023 05:00 PM 5 mins to read Save share Share this article facebook copy link twitter linkedin reddit email Merlin Labs NZ chief executive Shaun Johnson has returned to his Kerikeri roots to set up a world-leading business developing autonomous aircraft — in other words, robot planes. Photo / Peter de Graaf Merlin Labs NZ 

In [17]:
for record in records:
    record["claude"]["similarity_metrics"] = calculate_metrics(record["raw_text"], record["claude"]["summary"])

records

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.74 seconds, 1.36 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MQAG (race) initialized to cpu
Initialized Generation
Initialized Answering
Q1: What is Merlin Labs NZ expected to do?
(1) [P(.|cand)=62.32%]	[P(.|ref)=96.44%]	To develop and test autonomous aircraft.
(2) [P(.|cand)=36.41%]	[P(.|ref)=3.41%]	To integrate automation technology into its Cessna fleet.
(3) [P(.|cand)=1.28%]	[P(.|ref)=0.11%]	To work with New Zealand's aviation authority.
(4) [P(.|cand)=0.00%]	[P(.|ref)=0.04%]	To provide New Zealand with qualified pilots.
-------------------------------------------------------------------------------
Q2: What's the main idea of the sentence "Merlin Labs NZ is a subsidiary of US aviation firm Merlin Labs". It means that _.
(1) [P(.|cand)=36.69%]	[P(.|ref)=41.43%]	Merlin Labs NZ is a subsidiary of Merlin Labs
(2) [P(.|cand)=5.04%]	[P(.|ref)=33.09%]	Merlin Labs NZ has developed and test autonomous aircraft
(3) [P(.|cand)=8.86%]	[P(.|ref)=5.24%]	Merlin Labs NZ is a US aviation firm
(4) [P(.|cand)=49.41%]	[P(.|ref)=20.24%]	Merlin Labs NZ is a subs

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.78 seconds, 1.28 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MQAG (race) initialized to cpu
Initialized Generation
Initialized Answering
Q1: What company has completed a limited autonomy system?
(1) [P(.|cand)=92.25%]	[P(.|ref)=55.23%]	Merlin Labs.
(2) [P(.|cand)=0.26%]	[P(.|ref)=2.75%]	Baillie Gifford.
(3) [P(.|cand)=7.24%]	[P(.|ref)=38.04%]	C/130J Hercules.
(4) [P(.|cand)=0.24%]	[P(.|ref)=3.97%]	Snowpoint Ventures.
-------------------------------------------------------------------------------
Q2: Which of the following is NOT true about Merlin Labs?
(1) [P(.|cand)=5.14%]	[P(.|ref)=2.63%]	It aims to provide a limited autonomy system for a Cessna Caravan in New Zealand.
(2) [P(.|cand)=5.93%]	[P(.|ref)=11.97%]	It will automate the C-130J Super Hercules.
(3) [P(.|cand)=20.75%]	[P(.|ref)=0.74%]	It has received $105 million.
(4) [P(.|cand)=68.18%]	[P(.|ref)=84.66%]	It has won regulatory approval for the C-130J Super Hercules.
-------------------------------------------------------------------------------
Q3: The passage mainly tells us that _.
(1) 

[{'raw_text': "World-leading robot plane venture takes off in Kerikeri - NZ Herald Thursday, 01 February 2024 Search New Zealand Herald Weather Kaitaia Whangarei Dargaville NZME Network NZ Herald The Northern Advocate The Northland Age The Aucklander Waikato Herald Bay Of Plenty Times Rotorua Daily Post Hawke's Bay Today Whanganui Chronicle The Stratford Press Manawatu Guardian Kapiti News Horowhenua Chronicle Te Awamutu Courier Viva Eat Well OneRoof DRIVEN Car Guide The Country Photo Sales iHeart Radio Restaurant Hub Subscribe Advertisement Advertise with NZME. Home / Northern Advocate World-leading robot plane venture takes off in Kerikeri By Peter de Graaf 29 May, 2023 05:00 PM 5 mins to read Save share Share this article facebook copy link twitter linkedin reddit email Merlin Labs NZ chief executive Shaun Johnson has returned to his Kerikeri roots to set up a world-leading business developing autonomous aircraft — in other words, robot planes. Photo / Peter de Graaf Merlin Labs NZ 

# 3- Evaluation (LLM-as-an-Examiner)

In [18]:
# Evaluation prompt template 
EVALUATION_PROMPT_TEMPLATE = """
You will be given one summary written for an article. Your task is to rate the summary on one metric.
Your answer should only be an integer score between 1-5. Do not provide any explanation and text.
Please make sure you read and understand these instructions very carefully. 
Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

{criteria}

Evaluation Steps:

{steps}

Example:

Source Text:

{document}

Summary:

{summary}

Evaluation Form (scores ONLY):

- {metric_name}
"""

# Metric 1: Relevance

RELEVANCY_SCORE_CRITERIA = """
Relevance(1-5) - selection of important content from the source. \
The summary should include only important information from the source document. \
Annotators were instructed to penalize summaries which contained redundancies and excess information.
"""

RELEVANCY_SCORE_STEPS = """
1. Read the summary and the source document carefully.
2. Compare the summary to the source document and identify the main points of the article.
3. Assess how well the summary covers the main points of the article, and how much irrelevant or redundant information it contains.
4. Assign a relevance score from 1 to 5.
"""

# Metric 2: Coherence

COHERENCE_SCORE_CRITERIA = """
Coherence(1-5) - the collective quality of all sentences. \
We align this dimension with the DUC quality question of structure and coherence \
whereby "the summary should be well-structured and well-organized. \
The summary should not just be a heap of related information, but should build from sentence to a\
coherent body of information about a topic."
"""

COHERENCE_SCORE_STEPS = """
1. Read the article carefully and identify the main topic and key points.
2. Read the summary and compare it to the article. Check if the summary covers the main topic and key points of the article,
and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria.
"""

# Metric 3: Consistency

CONSISTENCY_SCORE_CRITERIA = """
Consistency(1-5) - the factual alignment between the summary and the summarized source. \
A factually consistent summary contains only statements that are entailed by the source document. \
Annotators were also asked to penalize summaries that contained hallucinated facts.
"""

CONSISTENCY_SCORE_STEPS = """
1. Read the article carefully and identify the main facts and details it presents.
2. Read the summary and compare it to the article. Check if the summary contains any factual errors that are not supported by the article.
3. Assign a score for consistency based on the Evaluation Criteria.
"""

# Metric 4: Fluency

FLUENCY_SCORE_CRITERIA = """
Fluency(1-3): the quality of the summary in terms of grammar, spelling, punctuation, word choice, and sentence structure.
1: Poor. The summary has many errors that make it hard to understand or sound unnatural.
2: Fair. The summary has some errors that affect the clarity or smoothness of the text, but the main points are still comprehensible.
3: Good. The summary has few or no errors and is easy to read and follow.
"""

FLUENCY_SCORE_STEPS = """
Read the summary and evaluate its fluency based on the given criteria. Assign a fluency score from 1 to 3.
"""


### GPT-4 as examiner

In [19]:
def get_eval_score_gpt4(criteria: str, steps: str, document: str, summary: str, metric_name: str):
    try:
        llm_gpt4 = ChatOpenAI(model='gpt-4')
        human_message_prompt = HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT_TEMPLATE)

        chat_prompt = ChatPromptTemplate.from_messages([human_message_prompt])

        chain = LLMChain(llm=llm_gpt4, prompt=chat_prompt)

        return chain.run(criteria=criteria,
            steps=steps,
            metric_name=metric_name,
            document=document,
            summary=summary)
    except Exception as e:
        print(e)
        return None

### Claude as examiner

In [20]:
def get_eval_score_claude(criteria: str, steps: str, document: str, summary: str, metric_name: str):
    try:
    
        bedrock = boto3.client(service_name='bedrock-runtime',
                            region_name='us-west-2',
                            aws_access_key_id=aws_access_key,
                            aws_secret_access_key=aws_access_secret)
        
        prompt = EVALUATION_PROMPT_TEMPLATE.format(
            criteria=criteria,
            steps=steps,
            metric_name=metric_name,
            document=document,
            summary=summary,
        )

        body = json.dumps({
            "prompt": f"\n\nHuman:{prompt}\nraw text:{prompt}\n\nAssistant: I only provide an integer score between 1-5.",
            "max_tokens_to_sample": 2000,
            "temperature": 0.2,
            "top_p": 0.9,})


        modelId = 'anthropic.claude-v2'
        accept = 'application/json'
        contentType = 'application/json'

        response = bedrock.invoke_model(body=body, modelId=modelId, accept=accept, contentType=contentType)    
        response_body = json.loads(response.get('body').read())
        completion = response_body.get('completion')
        score_num = re.search(r'\d+', completion).group()


        return score_num
    except Exception as e:
        print(e)
        return None

In [21]:
llm_evaluation_metrics = {
    "Relevance": (RELEVANCY_SCORE_CRITERIA, RELEVANCY_SCORE_STEPS),
    "Coherence": (COHERENCE_SCORE_CRITERIA, COHERENCE_SCORE_STEPS),
    "Consistency": (CONSISTENCY_SCORE_CRITERIA, CONSISTENCY_SCORE_STEPS),
    "Fluency": (FLUENCY_SCORE_CRITERIA, FLUENCY_SCORE_STEPS),
}

for record in records:
    for eval_type, (criteria, steps) in llm_evaluation_metrics.items():

        record["claude"]["claude_examiner_metrics"][eval_type] = get_eval_score_claude(criteria, steps, record["raw_text"], record["claude"]["summary"], eval_type)
        record["claude"]["gpt_examiner_metrics"][eval_type] = get_eval_score_gpt4(criteria, steps, record["raw_text"], record["claude"]["summary"], eval_type)   
        record["gpt"]["claude_examiner_metrics"][eval_type] = get_eval_score_claude(criteria, steps, record["raw_text"], record["gpt"]["summary"], eval_type)
        record["gpt"]["gpt_examiner_metrics"][eval_type] = get_eval_score_gpt4(criteria, steps, record["raw_text"], record["gpt"]["summary"], eval_type)


records

Relevance
Relevance
Coherence
Coherence
Consistency
Consistency
Fluency
Fluency
Relevance
Relevance
Coherence
Coherence
Consistency
Consistency
Fluency
Fluency


[{'raw_text': "World-leading robot plane venture takes off in Kerikeri - NZ Herald Thursday, 01 February 2024 Search New Zealand Herald Weather Kaitaia Whangarei Dargaville NZME Network NZ Herald The Northern Advocate The Northland Age The Aucklander Waikato Herald Bay Of Plenty Times Rotorua Daily Post Hawke's Bay Today Whanganui Chronicle The Stratford Press Manawatu Guardian Kapiti News Horowhenua Chronicle Te Awamutu Courier Viva Eat Well OneRoof DRIVEN Car Guide The Country Photo Sales iHeart Radio Restaurant Hub Subscribe Advertisement Advertise with NZME. Home / Northern Advocate World-leading robot plane venture takes off in Kerikeri By Peter de Graaf 29 May, 2023 05:00 PM 5 mins to read Save share Share this article facebook copy link twitter linkedin reddit email Merlin Labs NZ chief executive Shaun Johnson has returned to his Kerikeri roots to set up a world-leading business developing autonomous aircraft — in other words, robot planes. Photo / Peter de Graaf Merlin Labs NZ 

# 3- wandb

### Dependencies

In [None]:
!pip install wandb

### Set Up

In [None]:
import wandb
wandb.login(key="<ENTER_Value>")

os.environ["LANGCHAIN_WANDB_TRACING"] = "true"

In [24]:
# Initialize wandb
wandb.init(project='text_summarization_comparison', 
               entity='<ENTER_Value>',
               settings=wandb.Settings(start_method="thread"))

# Function to log summary, metrics, raw text, and LLM interactions
def log_summary_and_metrics(raw_text, summary, similarity_metrics, examiner_metrics, examiner_name, model_name, prompt):
    wandb.log({
        f"raw_text_{model_name}": raw_text,
        f"summary_{model_name}": summary,
        f"prompt_{model_name}": prompt,
        **{f"{key}_{model_name}_model": value for key, value in similarity_metrics.items()},
        **{f"{key}_{model_name}_model_{examiner_name}_examiner": value for key, value in examiner_metrics.items()}
    })


for record in records:
    log_summary_and_metrics(record["raw_text"], record["gpt"]["summary"], record["gpt"]["similarity_metrics"], record["gpt"]["claude_examiner_metrics"], "Claude", "GPT-4", human_message_template)
    log_summary_and_metrics(record["raw_text"], record["gpt"]["summary"], record["gpt"]["similarity_metrics"], record["gpt"]["gpt_examiner_metrics"], "GPT-4", "GPT-4", human_message_template)
    log_summary_and_metrics(record["raw_text"], record["claude"]["summary"], record["claude"]["similarity_metrics"], record["claude"]["claude_examiner_metrics"], "Claude", "Claude", human_message_template)
    log_summary_and_metrics(record["raw_text"], record["claude"]["summary"], record["claude"]["similarity_metrics"], record["claude"]["gpt_examiner_metrics"], "GPT-4", "Claude", human_message_template)


wandb.finish()



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
BERT_score_Claude_model,▁▁
BERT_score_GPT-4_model,▁▁
BLUE_Claude_model,▁▁
BLUE_GPT-4_model,▁▁
Hellinger_Claude_model,▁▁
Hellinger_GPT-4_model,▁▁
KL-divergence_Claude_model,▁▁
KL-divergence_GPT-4_model,▁▁
METEOR_Claude_model,▁▁
METEOR_GPT-4_model,▁▁

0,1
BERT_score_Claude_model,0.82744
BERT_score_GPT-4_model,0.84109
BLUE_Claude_model,0.0
BLUE_GPT-4_model,0.0
Coherence_Claude_model_Claude_examiner,5
Coherence_Claude_model_GPT-4_examiner,5
Coherence_GPT-4_model_Claude_examiner,4
Coherence_GPT-4_model_GPT-4_examiner,5
Consistency_Claude_model_Claude_examiner,5
Consistency_Claude_model_GPT-4_examiner,5
