Generate evaluation triplets from studies 


1. parse document 
2. extract claims that are associated with references via LLM and save them as JSON, using the referenced paper as the name
3. generate the topic of each claim (e.g. sustainability, business, money, or something similar)
4. repeat steps 1-3 for multiple reports from diverse topics that can still be related (e.g. GenAI, sustainability, business reports)
5. bundle 2-3 claims together from multiple related documents into one claim via LLM
6. generate question for bundled claim 
7. store question, claim and added references in json objects
8. repeat steps 6 and 7




In [None]:
import os
import json
from typing import List, Dict
from langchain.text_splitter import CharacterTextSplitter
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from PyPDF2 import PdfReader

# Initialize the LLM
# Shared module-level client: the prompt helpers in this file call it directly
# as `llm(prompt_text)` and use the return value as the model's reply.
# NOTE(review): "gpt-4" is a chat model, while `langchain.llms.OpenAI` is the
# completion-style wrapper — confirm this pairing works with the installed
# LangChain version (a chat-model wrapper may be required instead).
llm = OpenAI(model="gpt-4")

def parse_pdf(file_path: str) -> str:
    """Parse PDF content into a single text string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text extracted from every page, concatenated in page order with
        no separator inserted between pages.
    """
    reader = PdfReader(file_path)
    # Build the result with one join instead of growing a string with ``+=``
    # on every page, which re-copies the accumulated text each iteration.
    return "".join(page.extract_text() for page in reader.pages)

def extract_claims_with_references(document: str) -> List[Dict]:
    """Extract claims and their associated references using the LLM.

    Args:
        document: Full text of the report to analyse.

    Returns:
        The parsed JSON reply. The prompt asks for a list of objects shaped
        like ``{"claim": "...", "references": ["ref1", ...]}``; the reply is
        not validated beyond being parseable JSON.

    Raises:
        json.JSONDecodeError: If the LLM reply is not valid JSON.
    """
    prompt = PromptTemplate(
        input_variables=["document"],
        # The template is rendered with format-string placeholders, so the
        # literal braces of the JSON example must be doubled ({{ }}); otherwise
        # they are read as a placeholder and the template fails. The example
        # uses double quotes because single-quoted output is not valid JSON
        # and would break ``json.loads`` below.
        template=(
            "Extract claims from the document that are associated with references. "
            "Return as JSON format: "
            '[{{"claim": "...", "references": ["ref1", "ref2", ...]}}]. '
            "Document: {document}"
        ),
    )
    response = llm(prompt.format(document=document))
    return json.loads(response)

def generate_topic(claim: str) -> str:
    """Ask the LLM to name the topic of a single claim.

    Args:
        claim: Text of the claim to classify.

    Returns:
        The LLM's reply with leading and trailing whitespace removed.
    """
    topic_template = (
        "Identify the topic of the claim "
        "(e.g., sustainability, business, money, etc.). Claim: {claim}"
    )
    topic_prompt = PromptTemplate(
        template=topic_template,
        input_variables=["claim"],
    )
    rendered = topic_prompt.format(claim=claim)
    return llm(rendered).strip()

def bundle_claims(claims: List[Dict]) -> Dict:
    """Bundle 2-3 claims together into one comprehensive claim.

    Args:
        claims: The claim records to merge; they are interpolated into the
            prompt using their string representation.

    Returns:
        The parsed JSON reply. The prompt asks for an object shaped like
        ``{"claim": "...", "references": [...]}``; the reply is not validated
        beyond being parseable JSON.

    Raises:
        json.JSONDecodeError: If the LLM reply is not valid JSON.
    """
    prompt = PromptTemplate(
        input_variables=["claims"],
        # Literal braces of the JSON example are doubled ({{ }}) so the
        # format-string template does not treat them as a placeholder, and the
        # example uses double quotes so a reply that mirrors it is valid JSON.
        template=(
            "Combine the following claims into one coherent claim. "
            "Claims: {claims}. "
            'Return as JSON format: {{"claim": "...", "references": [...]}}.'
        ),
    )
    response = llm(prompt.format(claims=claims))
    return json.loads(response)

def generate_question(claim: str) -> str:
    """Ask the LLM for a question that matches a (bundled) claim.

    Args:
        claim: Text of the claim the question should align with.

    Returns:
        The LLM's reply with leading and trailing whitespace removed.
    """
    question_template = "Generate a question that aligns with this claim. Claim: {claim}"
    question_prompt = PromptTemplate(
        template=question_template,
        input_variables=["claim"],
    )
    rendered = question_prompt.format(claim=claim)
    return llm(rendered).strip()

def _partition_for_bundling(claims: List[Dict]) -> List[List[Dict]]:
    """Split claims into consecutive, non-overlapping groups of 2-3 items."""
    count = len(claims)
    if count < 2:
        # A single claim (or none) cannot form a bundle.
        return []
    groups = [claims[start:start + 3] for start in range(0, count, 3)]
    if len(groups[-1]) == 1:
        # A lone trailing claim would be dropped; borrow the last claim of the
        # preceding group of three so both groups end up with two claims.
        groups[-1].insert(0, groups[-2].pop())
    return groups


def process_reports(directory: str) -> List[Dict]:
    """Process multiple reports to generate evaluation triplets.

    For every PDF in ``directory`` the claims are extracted, tagged with a
    topic and saved as ``<pdf name>_claims.json`` inside ``directory``. All
    ``*_claims.json`` files found in ``directory`` are then loaded, grouped
    by topic, bundled in groups of 2-3 claims and turned into
    question/claim/references triplets.

    Args:
        directory: Folder that contains the PDF reports; the per-report
            claims files are written to and read from this folder.

    Returns:
        A list of dicts with the keys ``"question"``, ``"claim"`` and
        ``"references"``. The same list is written to
        ``evaluation_triplets.json`` in the current working directory.
    """
    # Phase 1: extract, tag and persist the claims of each report.
    # Sorted so the processing order does not depend on the file system.
    for file in sorted(os.listdir(directory)):
        if not file.lower().endswith(".pdf"):
            continue
        document = parse_pdf(os.path.join(directory, file))
        claims = extract_claims_with_references(document)

        for claim_data in claims:
            claim_data['topic'] = generate_topic(claim_data['claim'])

        # Store the claims next to the report: phase 2 scans `directory`, so
        # writing to the current working directory would hide them from it.
        claims_path = os.path.join(directory, f"{file}_claims.json")
        with open(claims_path, "w", encoding="utf-8") as f:
            json.dump(claims, f, indent=4)

    # Phase 2: collect every stored claim.
    all_claims = []
    for file in sorted(os.listdir(directory)):
        if file.endswith("_claims.json"):
            with open(os.path.join(directory, file), "r", encoding="utf-8") as f:
                all_claims.extend(json.load(f))

    # Group claims by topic
    grouped_claims = {}
    for claim in all_claims:
        grouped_claims.setdefault(claim['topic'], []).append(claim)

    # Bundle disjoint groups of 2-3 same-topic claims and ask one question per
    # bundle, so no claim ends up in two different bundles.
    evaluation_triplets = []
    for claims in grouped_claims.values():
        for claim_subset in _partition_for_bundling(claims):
            bundled_claim = bundle_claims(claim_subset)
            question = generate_question(bundled_claim['claim'])
            evaluation_triplets.append({
                "question": question,
                "claim": bundled_claim['claim'],
                "references": bundled_claim['references'],
            })

    # Save evaluation triplets
    with open("evaluation_triplets.json", "w", encoding="utf-8") as f:
        json.dump(evaluation_triplets, f, indent=4)

    return evaluation_triplets

# Directory containing your reports
# NOTE: the statements below run as soon as this module/cell is executed, so
# loading the file immediately starts the full pipeline on this directory.
reports_directory = "./reports"
# Runs the pipeline; the returned triplets are kept in `triplets`, and
# `process_reports` also writes them to 'evaluation_triplets.json' in the
# current working directory.
triplets = process_reports(reports_directory)
print("Evaluation triplets generated and saved to 'evaluation_triplets.json'.")
