# Import Libraries and Load Environment Variables

In [26]:
import json
import os
from dotenv import load_dotenv
import nest_asyncio
from llama_parse import LlamaParse

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain.schema import Document
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client

nest_asyncio.apply()
load_dotenv(override=True)

data_dir = "test/"

proxy_client = get_proxy_client('gen-ai-hub')
llm = ChatOpenAI(proxy_model_name='gpt-4o', proxy_client=proxy_client)

# Parse PDF Documents to Markdown

In [None]:

for file in os.listdir(data_dir):
    if file.endswith(".pdf"):
        try:
            print(f"Converting {file} to markdown")
            md_text = LlamaParse(
                result_type="markdown", 
                verbose=True,
                use_vendor_multimodal_model=True,
                vendor_multimodal_model_name="openai-gpt-4o-mini",
                vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
                language="en",
                numWorkers=5).load_data(data_dir + file)
            combined_md_text = "\n\n".join([doc.text for doc in md_text])
            md_file_path = data_dir + file.replace(".pdf", ".md")
            print(f"Saving markdown to {md_file_path}")
            with open(md_file_path, "w") as f:
                f.write(combined_md_text)
            print(f"Successfully converted {file}")
        except Exception as e:
            print(f"Error converting {file}: {e}")

Converting s10668-023-02933-7.pdf to markdown


KeyboardInterrupt: 

Started parsing the file under job_id 90506d25-0e2b-4224-bc1c-6ab0d699ea57


In [None]:
with open(md_file_path, "w", encoding="utf-8") as f:
    f.write(combined_md_text)
    print(f"Successfully converted {file}")

Successfully converted s10668-023-02933-7.pdf


# Generate Dataset from Data

In [None]:
from typing import List
from pydantic import BaseModel, Field
import tiktoken

class Response(BaseModel):
    question: str = Field(description="Question")
    ground_truth: str = Field(description="Ground Truth")
    references: List[str] = Field(description="List of document or reference names used")

class Responses(BaseModel):
    responses: list[Response] = Field(description="List of responses")

documents = []
for file in os.listdir(data_dir):
    if file.endswith(".md"):
        file_path = os.path.join(data_dir, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
            documents.append(Document(page_content=content, metadata={"source": file}))
combined_documents_content = "\n\n".join([doc.page_content for doc in documents])

parser = PydanticOutputParser(pydantic_object=Responses)

prompt = PromptTemplate(
    input_variables=["documents"],
    partial_variables={"format_output": parser.get_format_instructions()},
    template="""
You are tasked with generating 20 realistic and diverse business questions based on the documents provided below, which include SAP internal documents and other reports about varios business topics. 
Each question should be written from the perspective of SAP, focusing on what SAP can do to achieve specific business objectives (e.g., increase margins, improve sustainability, enhance product offerings), considering various factors.

Integration of Diverse and Unlinked Knowledge Areas:
- Each question must blend insights from multiple business domains (e.g., finance, sustainability, technology, operations)
- Include concepts that are not necessarily semantically linked

Multi-hop Information Synthesis Within and Across Documents:
- Answers should require synthesizing multiple pieces of relevant information.
- Draw information from multiple documents and multiple sections within a single document.
- Ensure that multi-step reasoning is necessary to combine various information fragments into a comprehensive answer.

Abstract Reasoning and Insight Generation:
- Questions should push beyond simple fact retrieval and basic sensemaking.
- Require abstract reasoning
- Question should be global themed and require a understand of the whole documents to be answered

For Each Question, Provide the Following:

Question: 
- Formulate a complex business question that meets all the above constraints.

Optimal Answer (Ground Truth):
- Provide a comprehensive answer to the question.
- The answer should synthesize all relevent information from the relevant documents and include abstract reasoning.

Source Referenced:
- Provide an array/list of the document names or identifiers that were used to form the answer.

Question Examples:
1. How can we capitalize on eco-friendly manufacturing practices to both improve our sustainability credentials and boost our financial performance?
2. What innovative products should we consider integrating into our portfolio, and which current products have the highest potential for enhancement through machine learning technologies?

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:    
{format_output}

Use only the following documents: 
<documents>
{documents}
</documents>
"""
)

chain = prompt | llm | parser

encoding = tiktoken.encoding_for_model("gpt-4o")

tokens = encoding.encode(combined_documents_content)

print(len(tokens))

108236


In [28]:
response = chain.invoke({"documents": combined_documents_content})

Save LLM response to file

In [29]:
with open("./synthetic_data_big_context_test_citet.json", "w", encoding="utf-8") as f:
    json.dump(response.dict(), f, indent=4)