Generate evaluation triplets from studies 


1. parse document 
2. extract claims that are associated with references via an LLM and save them as JSON, using the referenced paper as the name
3. generate a topic for each claim (e.g. sustainability, business, money)
4. repeat steps 1-3 for multiple reports from diverse but related topics (e.g. GenAI, sustainability, business reports)
5. bundle 2-3 claims together from multiple related documents into one claim via LLM
6. generate question for bundled claim 
7. store question, claim and added references in json objects
8. repeat steps 6 and 7




In [9]:

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain.schema import Document
import os
import json
from typing import List, Dict, Any

class Response(BaseModel):
    # One evaluation record: a question, its ground-truth answer, the
    # supporting context and the source it was drawn from.
    # NOTE(review): a second `Response` model is defined further down in this
    # file and rebinds the name, so this definition is shadowed once that
    # cell has run.
    question: str = Field(description="Question")
    ground_truth: str = Field(description="Ground Truth")
    context: str = Field(description="Context")
    # NOTE(review): typed as a single string although the description says
    # "List of document names" — confirm which of the two is intended.
    source: str = Field(description="List of document names used")

class Responses(BaseModel):
    # Wrapper model holding a list of `Response` records under one key.
    responses: list[Response] = Field(description="List of responses")

In [None]:

from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client

# Client for the 'gen-ai-hub' proxy; passed to the chat model below.
proxy_client = get_proxy_client('gen-ai-hub')
# Chat model used by every chain in this notebook. `ChatOpenAI` here is the
# gen_ai_hub proxy class imported directly above, which rebinds the
# `langchain_openai.ChatOpenAI` name imported in the first cell.
llm = ChatOpenAI(proxy_model_name='gpt-4o', proxy_client=proxy_client)

# Directory that is scanned for the `.md` reports to process.
INPUT_DIR = "input/"
# Path of the JSON file the final question/ground-truth dataset is written to.
OUTPUT_FILE = "evaluation_references.json"




## Load Documents

In [None]:
# Load every Markdown report found directly in INPUT_DIR as a LangChain
# Document, remembering the file name under metadata["source"].
# The listing is sorted because os.listdir() returns entries in arbitrary
# order and the later steps (incremental category discovery, chunked
# bundling) depend on document order; sorting keeps runs reproducible.
documents = []
for file in sorted(os.listdir(INPUT_DIR)):
    if not file.endswith(".md"):
        continue
    file_path = os.path.join(INPUT_DIR, file)
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # The file handle is already closed here; only the text is kept.
    documents.append(Document(page_content=content, metadata={"source": file}))


## Extract claims with references using an LLM chain

In [24]:
import os
import json

from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document


class Response(BaseModel):
    # One extracted claim together with the citations that support it.
    # Documented with comments rather than a docstring: pydantic would
    # include a class docstring in the model's JSON schema, and that schema
    # is embedded in the LLM prompt via the parser's format instructions.
    #
    # Fix: the stray trailing comma after Field(...) turned the default of
    # `claim` into a one-element tuple, so the Field metadata (description,
    # required-ness) was never applied to the field.
    claim: str = Field(description="Claim")
    references: List[str] = Field(description="List of references")

class Responses(BaseModel):
    # Top-level model for the extraction output: a list of claim/reference
    # records. It is the model handed to the PydanticOutputParser below.
    responses: list[Response] = Field(description="List of responses")

# Parser that validates the LLM output against the `Responses` model and
# supplies the format instructions injected into the prompt.
parser = PydanticOutputParser(pydantic_object=Responses)

# Prompt for claim extraction: {text} is the document body supplied per call,
# {format_output} is pre-filled with the parser's format instructions.
extract_prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_output": parser.get_format_instructions()},
    template="""You are a helpful assistant that identifies academic claims and their citations.

Given the following text:

{text}

Extract all claims and their inline citations. A claim is a statement or sentence that is supported by one or more inline citations (e.g. "(Author et al., YEAR)"). 

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:    
{format_output}

""", 
)

# prompt -> chat model -> parsed `Responses` object.
# NOTE: `parser`, `extract_prompt` and `extract_chain` are rebound by the
# categorisation cell further down.
extract_chain = extract_prompt | llm | parser

# Run the extraction chain on every loaded document and flatten the result
# into one list of plain dicts, each tagged with the file it came from.
claims = []
for doc in documents:
    extraction = extract_chain.invoke({"text": doc.page_content})
    claims.extend(
        {**found.dict(), "doc_source": doc.metadata["source"]}
        for found in extraction.responses
    )






In [25]:
# Inspect the extracted claims: a list of dicts holding the claim fields
# plus the "doc_source" file name added during extraction.
print(claims)

[{'claim': "The breadth of SAP's portfolio is unmatched, covering 12 Lines of Businesses.", 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md'}, {'claim': 'SAP has more than 25k Customer agreements allowing it to use customer data sets to train AI models & build products.', 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md'}, {'claim': "SAP's experience is unmatched with more than 50 years of in-depth process knowledge, covering 26 Industries in packaged solutions.", 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md'}, {'claim': '99 of the 100 largest companies in the world are SAP customers.', 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md'}, {'claim': 'SAP has more than 24k partner companies globally.', 'references': ['Co

## Categorize the claims 

In [27]:
from typing import List
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser

class CategoryResponse(BaseModel):
    # Structured LLM answer for the categorisation step: a single category
    # name chosen (or newly created) for one claim.
    category: str = Field(description="The category of the given claim")

# Parser for the categorisation step; rebinds `parser` from the extraction cell.
parser = PydanticOutputParser(pydantic_object=CategoryResponse)

# Prompt for categorising one claim: {categories} is the comma-separated list
# of categories discovered so far, {claim} is the claim text, and
# {format_output} is pre-filled with the parser's format instructions.
extract_prompt = PromptTemplate(
    input_variables=["claim", "categories"],
    partial_variables={"format_output": parser.get_format_instructions()},
    template="""You have a list of existing categories:
{categories}

Please classify the following claim into one of these categories if it fits well. 
If it does not fit into any of the listed categories, create a new category name.
Here are some examples for Categories you can use, or create new ones if you need to:
"Business", "Sustainability", "Technology", "Economics", "Policy", "Healthcare", "Innovation", etc.
Claim: {claim}

Return only the final chosen or newly created category as a JSON object following the format instructions.

{format_output}
"""
)

# prompt -> chat model -> parsed `CategoryResponse`; rebinds `extract_chain`,
# so the claim-extraction chain is no longer reachable under this name.
extract_chain = extract_prompt | llm | parser

# Classify the claims one at a time. The list of known categories grows as
# the loop runs and is shown to the LLM on each call, so the order of
# `claims` influences which category names end up being used.
categories = []
final_claims = []
for claim_entry in claims:
    if categories:
        categories_str = ", ".join(categories)
    else:
        categories_str = "No categories yet."

    verdict = extract_chain.invoke({"claim": claim_entry["claim"], "categories": categories_str})
    label = verdict.category.strip()

    # Register category names the first time they appear.
    if label not in categories:
        categories.append(label)

    # The claim dict is annotated in place; `final_claims` holds the same
    # dict objects as `claims`.
    claim_entry['category'] = label
    final_claims.append(claim_entry)

print("Final Claims with Categories:", final_claims)
print("All Discovered Categories:", categories)


Final Claims with Categories:
[{'claim': "The breadth of SAP's portfolio is unmatched, covering 12 Lines of Businesses.", 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md', 'category': 'Business'}, {'claim': 'SAP has more than 25k Customer agreements allowing it to use customer data sets to train AI models & build products.', 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md', 'category': 'Technology'}, {'claim': "SAP's experience is unmatched with more than 50 years of in-depth process knowledge, covering 26 Industries in packaged solutions.", 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md', 'category': 'Business'}, {'claim': '99 of the 100 largest companies in the world are SAP customers.', 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP 

## Bundle and Merge Claims

In [None]:
from collections import defaultdict
from langchain.prompts import PromptTemplate

# Give every claim a positional integer ID (so claims can be tracked in
# sets) and bucket the claims by category in the same pass.
claims_by_category = defaultdict(list)
for position, entry in enumerate(final_claims):
    entry["id"] = position
    claims_by_category[entry["category"]].append(entry)

# One flat list of all claims, ordered category by category.
all_claims = []
for bucket in claims_by_category.values():
    all_claims.extend(bucket)

# Prompt asking the LLM to fuse several claims into one paragraph;
# {claims_text} is the bullet list of claims supplied per call.
merge_prompt = PromptTemplate(
    input_variables=["claims_text"],
    template="""You are a writing assistant.
Merge the following claims into a single cohesive claim that captures all their key points:

Claims:
{claims_text}

Return one paragraph unifying these claims.
""")


# Number of claims bundled into one merged claim.
group_size = 3
# Filled by the bundling loop below with one dict per merged group.
merged_claims = []

def get_other_category_claim(current_category, used_claims):
    """Return the first still-unused claim that belongs to another category.

    Scans the module-level ``all_claims`` list in order and returns the first
    entry whose ``"category"`` differs from ``current_category`` and whose
    ``"id"`` is not contained in ``used_claims``. Returns ``None`` when no
    such claim exists.
    """
    candidates = (
        candidate
        for candidate in all_claims
        if candidate["category"] != current_category
        and candidate["id"] not in used_claims
    )
    return next(candidates, None)

# IDs of claims that are already part of some merged group.
used_claims = set()

# Compose the merge chain once; it is the same for every group.
merge_chain = merge_prompt | llm

for cat, cat_claims in claims_by_category.items():
    # Skip claims that an earlier category already borrowed for one of its
    # groups; otherwise the same claim would be merged a second time here.
    available = [c for c in cat_claims if c["id"] not in used_claims]

    # Process this category in chunks of `group_size`.
    for i in range(0, len(available), group_size):
        group = available[i:i + group_size]

        # Every claim in `group` belongs to `cat` at this point, so try to
        # diversify the bundle with one unused claim from another category.
        other_claim = get_other_category_claim(cat, used_claims)
        if other_claim is not None:
            if len(group) >= group_size:
                # Group is already full: make room by dropping its last claim.
                # The dropped claim is deliberately not marked as used, so a
                # later category may still borrow it.
                # NOTE(review): if no later category borrows it, the dropped
                # claim ends up in no merged group at all.
                group.pop()
            group.append(other_claim)

        # Mark all claims in the final group as used by their IDs.
        for g in group:
            used_claims.add(g["id"])

        # Prepare the text for merging.
        claims_text = "\n".join([f"- {g['claim']}" for g in group])
        merged_claim = merge_chain.invoke({"claims_text": claims_text})

        # Collect references from all grouped claims.
        all_refs = []
        for g in group:
            all_refs.extend(g.get("references", []))

        merged_claims.append({
            "original_claims": group,
            "merged_claim": merged_claim.content.strip(),
            "category": cat,
            # Order-preserving de-duplication keeps the output stable across
            # runs (set iteration order over strings is not).
            "merged_references": list(dict.fromkeys(all_refs)),
        })

print("Merged Claims:")
for mc in merged_claims:
    print(mc)


Merged Claims:
{'original_claims': [{'claim': "The breadth of SAP's portfolio is unmatched, covering 12 Lines of Businesses.", 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md', 'category': 'Business', 'id': 0}, {'claim': "SAP's experience is unmatched with more than 50 years of in-depth process knowledge, covering 26 Industries in packaged solutions.", 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md', 'category': 'Business', 'id': 2}, {'claim': 'SAP has more than 25k Customer agreements allowing it to use customer data sets to train AI models & build products.', 'references': ['Corporate factsheet (LINK) as of January 27, 2024'], 'doc_source': '2024-02_SAP Product Strategy.md', 'category': 'Technology', 'id': 1}], 'merged_claim': "SAP stands out with an unparalleled portfolio that spans 12 Lines of Businesses, demonstrating its unmatched breadth and ve

In [None]:

# Prompt that turns one merged claim ({merged_claim}) into a single broad
# question which that claim answers.
question_promt = PromptTemplate(
    input_variables=["merged_claim"],
    template=
    """You have the following merged claim:
    "{merged_claim}"
    Generate a single question that, if answered, would naturally be resolved by this claim.
    This question should be broad and global scoped in nature.
    """
)

# Build the evaluation dataset: one question per merged claim, with the
# merged claim as ground truth and its de-duplicated references attached.
question_chain = question_promt | llm

final_dataset = []
for mc in merged_claims:
    llm_reply = question_chain.invoke({"merged_claim": mc["merged_claim"]})
    final_dataset.append({
        "question": llm_reply.content.strip(),
        "ground_truth": mc["merged_claim"],
        "references": mc["merged_references"],
    })

# Persist the dataset as pretty-printed UTF-8 JSON.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(final_dataset, f, indent=4, ensure_ascii=False)

# Echo every record for a quick visual check.
for record in final_dataset:
    print(json.dumps(record, indent=4, ensure_ascii=False))

NameError: name 'PromptTemplate' is not defined