# Loading SummEval Dataset

In [3]:
import ast
import json
import random
import re
import sys

import pandas as pd

# Directory added to the import path so that project-local modules can be
# imported by the cells below (presumably where gpt_model lives -- confirm).
script_dir = "../src"

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if script_dir not in sys.path:
    sys.path.append(script_dir)

from gpt_model import Model

# Fix the seed of the stdlib RNG so later uses of `random` are reproducible.
random.seed(42)

# JSON is conventionally UTF-8 (RFC 8259); pass the encoding explicitly so the
# read does not depend on the platform's locale default.
with open("../data/summeval.json", encoding="utf-8") as file:
    data = json.load(file)


# LLM Initialization

In [4]:
# Read the API keys from a JSON file kept next to the project rather than
# hardcoding them in the notebook.
with open("../api_keys.json", "r") as file:
    api_keys = json.load(file)

# Only the OpenAI entry is needed here.
OPENAI_API_KEY = api_keys["openai"]

# GPT-4 wrapper used for the think-aloud step; temperature 0.0 is presumably
# chosen to keep the generations as deterministic as possible -- confirm.
gpt4 = Model(
    model="gpt-4",
    temperature=0.0,
    api_key=OPENAI_API_KEY,
)

gpt-4


# Dimension Data Preparation

In [5]:
# Evaluation dimension this notebook run targets; must be a key of
# DIMENSION_RUBRIC below.
DIMENSION = "coherence"

# Rubric text for each evaluation dimension. The selected entry is inserted
# verbatim into the user prompt, so any wording change alters the prompt.
# NOTE(review): the DUC wording is usually quoted as "build from sentence to
# sentence to a coherent body"; confirm whether the shorter form in the
# coherence rubric is intentional before touching it.
DIMENSION_RUBRIC = {
    "coherence": (
        'Coherence ( 1 - 5 ) - the collective quality of all sentences. '
        'We align this dimension with the DUC quality question of structure '
        'and coherence whereby "the summary should be well-structured and '
        'well-organized. The summary should not just be a heap of related '
        'information, but should build from sentence to a coherent body of '
        'information about a topic."'
    ),
    "consistency": (
        "Consistency ( 1 - 5 ) - the factual alignment between the summary "
        "and the summarized source. A factually consistent summary contains "
        "only statements that are entailed by the source document. "
        "Annotators were also asked to penalize summaries that contained "
        "hallucinated facts."
    ),
    "fluency": (
        'Fluency ( 1 - 5 ) - the quality of individual sentences. '
        'Drawing again from the DUC quality guidelines, sentences in the '
        'summary "should have no formatting problems, capitalization errors '
        'or obviously ungrammatical sentences (e.g., fragments, missing '
        'components) that make the text difficult to read."'
    ),
    "relevance": (
        "Relevance ( 1 - 5 ) - selection of important content from the "
        "source. The summary should include only important information from "
        "the source document. Annotators were instructed to penalize "
        "summaries which contained redundancies and excess information."
    ),
}

# Prompt Construction

In [6]:
# Few-shot examples for the selected dimension.
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load sample files that come from a trusted source.
sample_list = pd.read_pickle(f"../data/{DIMENSION}/sample.pkl")

# Number of (source, summary, score) examples passed to the user prompt.
NUM_FEW_SHOT = 4

sampled_source = []
sampled_summary = []
sampled_score = []

# Split every sample into the three fields the prompt needs; the score is the
# human rating for the dimension under study.
for sample in sample_list:
    sampled_source.append(sample["source"])
    sampled_summary.append(sample["system_output"])
    sampled_score.append(sample["scores"][DIMENSION])

# Fail with a readable message instead of a bare "list index out of range".
if len(sampled_source) < NUM_FEW_SHOT:
    raise IndexError(
        f"expected at least {NUM_FEW_SHOT} samples for {DIMENSION!r}, "
        f"got {len(sampled_source)}"
    )

# Prompt templates are plain text; read them as UTF-8 explicitly so the result
# does not depend on the platform's locale default.
with open("../prompts/think_aloud/gpt/system_prompt.txt", "r", encoding="utf-8") as file:
    sys_prompt = file.read()
with open("../prompts/think_aloud/gpt/user_prompt.txt", "r", encoding="utf-8") as file:
    user_prompt = file.read()

# Flatten the first NUM_FEW_SHOT examples into
# source0, summary0, score0, source1, summary1, score1, ...
# which is the positional order the user prompt template is filled with.
few_shot_fields = [
    field
    for example in zip(
        sampled_source[:NUM_FEW_SHOT],
        sampled_summary[:NUM_FEW_SHOT],
        sampled_score[:NUM_FEW_SHOT],
    )
    for field in example
]

prompt_list = [
    # The system prompt is filled with the dimension name in four positions.
    {"role": "system", "content": sys_prompt.format(DIMENSION, DIMENSION, DIMENSION, DIMENSION)},
    # The user prompt receives the dimension, its rubric, then the examples.
    {"role": "user", "content": user_prompt.format(DIMENSION, DIMENSION_RUBRIC[DIMENSION], *few_shot_fields)},
]

# Think Aloud

In [7]:
# Keep the unparsed completion under its own name so it stays available for
# inspection after the extraction below.
gpt4_raw_response = gpt4.ask_chatgpt(prompt_list)

# Keep only the brace-delimited spans of the response; if there are several,
# they are concatenated in order of appearance.
gpt4_response = ''.join(re.findall(r'\{[^}]*\}', gpt4_raw_response))

# An empty result would otherwise only surface later as a confusing
# SyntaxError when the string is parsed.
if not gpt4_response:
    raise ValueError(
        "no {...} block found in the model response: " + repr(gpt4_raw_response)
    )


In [11]:
# The string was generated by an LLM, so parse it as a literal instead of
# executing it: ast.literal_eval only accepts Python literals (dicts, strings,
# numbers, ...) and cannot run arbitrary code the way eval() can.
ast.literal_eval(gpt4_response)

{1: 'Check if the summary is well-structured and well-organized, not just a heap of related information.',
 2: 'Ensure the summary builds from sentence to a coherent body of information about a topic.',
 3: 'Assess if the summary maintains a logical flow of ideas and events.',
 4: 'Evaluate if the summary provides a clear and concise overview of the main points in the news article.',
 5: 'Consider if the summary maintains the context and meaning of the original news article.',
 6: 'Check if the summary avoids unnecessary repetition of information.',
 7: 'Ensure the summary does not introduce new information not present in the original article.',
 8: 'Assess if the summary maintains the same tone and style as the original news article.',
 9: 'Consider if the summary is free from grammatical and spelling errors, which can disrupt coherence.',
 10: 'Evaluate if the summary maintains the same perspective as the original news article.'}