In [1]:
import ast
import json
import random
import re
import sys

import anthropic
import pandas as pd


# Seed Python's `random` module so any later draws from it are repeatable.
random.seed(42)

# Load the JSON file at ../data/summeval.json. The encoding is pinned to UTF-8
# (the JSON interchange encoding) so decoding does not depend on the platform's
# locale default.
with open("../data/summeval.json", encoding="utf-8") as file:
    data = json.load(file)

# LLM Client Initialization

In [2]:
# Read API credentials from a local JSON file so the key itself never appears
# in the notebook source. Decode explicitly as UTF-8 rather than relying on the
# platform's locale default.
with open("../api_keys.json", "r", encoding="utf-8") as file:
    api_keys = json.load(file)

# A KeyError here means the credentials file has no "anthropic" entry.
ANTHROPIC_API_KEY = api_keys["anthropic"]

# NOTE: despite its name, `model` is the Anthropic API client object; the
# model identifier is passed separately on each `messages.create` call.
model = anthropic.Anthropic(
    api_key=ANTHROPIC_API_KEY
)

# Dimension Data Preparation

In [3]:
# Rubric text (1-5 scale) for each evaluation dimension, keyed by dimension name.
DIMENSION_RUBRIC = {
    "coherence": (
        'Coherence ( 1 - 5 ) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby "the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to a coherent body of information about a topic."'
    ),
    "consistency": (
        "Consistency ( 1 - 5 ) - the factual alignment between the summary and the summarized source. A factually consistent summary contains only statements that are entailed by the source document. Annotators were also asked to penalize summaries that contained hallucinated facts."
    ),
    "fluency": (
        'Fluency ( 1 - 5 ) - the quality of individual sentences. Drawing again from the DUC quality guidelines, sentences in the summary "should have no formatting problems, capitalization errors or obviously ungrammatical sentences (e.g., fragments, missing components) that make the text difficult to read."'
    ),
    "relevance": (
        "Relevance ( 1 - 5 ) - selection of important content from the source. The summary should include only important information from the source document. Annotators were instructed to penalize summaries which contained redundancies and excess information."
    ),
}

# Dimension evaluated by this run. It is used to index DIMENSION_RUBRIC and to
# build the sample-file path, so it must be one of the keys above.
DIMENSION = "coherence"

# Few-Shot Sample Preparation

In [4]:
# NOTE(review): unpickling can execute arbitrary code, so only load sample
# files that were produced by this project.
sample_list = pd.read_pickle(f"../data/{DIMENSION}/sample.pkl")

# Split the sample records into three parallel lists (same order as
# `sample_list`): source text, system summary, and the score recorded for the
# dimension under evaluation.
sampled_source = [record["source"] for record in sample_list]
sampled_summary = [record["system_output"] for record in sample_list]
sampled_score = [record["scores"][DIMENSION] for record in sample_list]


# Prompt Construction

In [5]:
# Read the user-prompt template that is later filled in with `str.format`.
# Decode explicitly as UTF-8 so any non-ASCII characters in the template
# (quotes, dashes, ...) are read the same way on every platform instead of
# depending on the locale's default encoding.
with open("../prompts/think_aloud/claude/user_prompt.txt", "r", encoding="utf-8") as file:
    user_prompt = file.read()

In [6]:
# Positional arguments for the prompt template, in the order they are passed:
# the dimension name five times, that dimension's rubric, then a
# (source, summary, score) triple for each of the first four samples.
prompt_args = [DIMENSION] * 5 + [DIMENSION_RUBRIC[DIMENSION]]
for shot in range(4):
    prompt_args += [sampled_source[shot], sampled_summary[shot], sampled_score[shot]]

# Single-turn request: the filled-in template is the only user message.
message = model.messages.create(
    model="claude-3-5-sonnet-20240620",
    max_tokens=4096,
    messages=[{"role": "user", "content": user_prompt.format(*prompt_args)}],
)

# Keep only the brace-delimited span(s) of the first content block's text,
# dropping any prose around them. Multiple matches are concatenated as-is.
reply_text = message.content[0].text
claude_response = "".join(re.findall(r'\{[^}]*\}', reply_text))

In [7]:
# Parse the extracted reply as a Python literal and display it.
# The text was generated by an LLM and must be treated as untrusted input:
# `eval` would execute any code embedded in the response, whereas
# `ast.literal_eval` accepts only literals (dicts, strings, numbers, ...) and
# raises ValueError/SyntaxError on anything else.
ast.literal_eval(claude_response)

{1: 'Check if the summary presents information in a logical and organized manner, rather than as disconnected facts',
 2: 'Assess whether the summary builds from sentence to sentence to create a coherent narrative about the topic',
 3: 'Evaluate if the summary has a clear focus and avoids including irrelevant or tangential information',
 4: 'Look for appropriate use of transitional phrases or words to connect ideas and improve flow between sentences',
 5: 'Consider if the summary provides context and background information needed to understand the main points',
 6: 'Examine whether the summary maintains a consistent tone and perspective throughout',
 7: 'Check if the key points from the original article are presented in a sensible order in the summary',
 8: 'Assess whether the summary avoids unnecessary repetition of information',
 9: 'Evaluate if the summary has a clear beginning, middle, and end structure',
 10: 'Consider whether the summary stands alone as a coherent piece, without 