In [3]:
%load_ext autoreload
%autoreload 2

# Import Libraries and Load Environment Variables

In [2]:
import json
import os
from dotenv import load_dotenv
import nest_asyncio


nest_asyncio.apply()
load_dotenv(override=True)

data_dir = "reports/"

# Parse PDF Documents to Markdown

In [3]:
from llama_parse import LlamaParse

for file in os.listdir(data_dir):
    if file.endswith(".pdf"):
        try:
            print(f"Converting {file} to markdown")
            md_text = LlamaParse(
                result_type="markdown", 
                verbose=True,
                use_vendor_multimodal_model=True,
                vendor_multimodal_model_name="openai-gpt-4o-mini",
                vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
                language="en",
                numWorkers=5).load_data(data_dir + file)
            combined_md_text = "\n\n".join([doc.text for doc in md_text])
            md_file_path = data_dir + file.replace(".pdf", ".md")
            print(f"Saving markdown to {md_file_path}")
            with open(md_file_path, "w") as f:
                f.write(combined_md_text)
            print(f"Successfully converted {file}")
        except Exception as e:
            print(f"Error converting {file}: {e}")

Converting s10668-023-02933-7.pdf to markdown
Started parsing the file under job_id 17ef2d37-7093-4899-b6cb-4683d31c2f36
.Saving markdown to reports/s10668-023-02933-7.md
Error converting s10668-023-02933-7.pdf: 'charmap' codec can't encode character '\u0131' in position 68980: character maps to <undefined>


In [4]:
with open(md_file_path, "w", encoding="utf-8") as f:
    f.write(combined_md_text)
    print(f"Successfully converted {file}")

Successfully converted s10668-023-02933-7.pdf


# Generate Dataset from Data

In [None]:

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain.schema import Document

class Response(BaseModel):
    question: str = Field(description="Question")
    ground_truth: str = Field(description="Ground Truth")
    context: str = Field(description="Context")
    source: str = Field(description="List of document names used")

class Responses(BaseModel):
    responses: list[Response] = Field(description="List of responses")

documents = []
for file in os.listdir(data_dir):
    if file.endswith(".md"):
        file_path = os.path.join(data_dir, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
            documents.append(Document(page_content=content, metadata={"source": file}))
combined_documents_content = "\n\n".join([doc.page_content for doc in documents])

parser = PydanticOutputParser(pydantic_object=Responses)

prompt = PromptTemplate(
    input_variables=["documents"],
    partial_variables={"format_output": parser.get_format_instructions()},
    template="""
You are tasked with generating 20 realistic and diverse business questions based on the documents provided below, which include SAP internal documents and other reports about varios business topics. 
Each question should be written from the perspective of SAP, focusing on what SAP can do to achieve specific business objectives (e.g., increase margins, improve sustainability, enhance product offerings), considering various factors.

Integration of Diverse and Unlinked Knowledge Areas:
- Each question must blend insights from multiple business domains (e.g., finance, sustainability, technology, operations)
- Include concepts that are not necessarily semantically linked
- The combination of these unlinked concepts should be meaningful within a business context.

Multi-hop Information Synthesis Within and Across Documents:
- Answers should require synthesizing multiple pieces of relevant information.
- Draw information from multiple documents and multiple sections within a single document.
- Ensure that multi-step reasoning is necessary to combine various information fragments into a comprehensive answer.

Abstract Reasoning and Insight Generation:
- Questions should push beyond simple fact retrieval and basic sensemaking.
- Require abstract reasoning, prompting the system to generalize and derive higher-level insights not explicitly stated in the documents.
- Focus on understanding broader patterns and deriving conclusions that support holistic decision-making and strategic thinking.


For Each Question, Provide the Following:

Question: 
- Formulate a complex business question that meets all the above constraints.

Optimal Answer (Ground Truth):
- Provide a comprehensive answer to the question.
- The answer should synthesize all relevent information from the relevant documents and include abstract reasoning.

Context Needed to Form the Answer:
- List the specific excerpts or summarized points from the documents that are necessary to answer the question.
- The context should be directly taken from the provided documents.

Source Referenced:
- Provide an array/list of the document names or identifiers that were used to form the answer.

Think step by Step!

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:    
{format_output}


Examples:
1.
    "question": "How can we capitalize on eco-friendly manufacturing practices to both improve our sustainability credentials and boost our financial performance?",
    "ground_truth": "We can capitalize on eco-friendly manufacturing practices by implementing energy-efficient technologies, adopting sustainable sourcing, and optimizing production processes to reduce waste. By investing in renewable energy sources like solar or wind power for our manufacturing facilities, we not only reduce operational costs over time but also decrease our carbon footprint, enhancing our sustainability credentials. Moreover, embracing circular economy principles—such as recycling materials and reusing waste products—can lead to significant cost savings and open up new revenue streams through the sale of by-products. Aligning our operations with global sustainability standards can also make us eligible for government incentives and tax breaks. By marketing our commitment to sustainability, we can strengthen our brand image, meet the growing consumer demand for eco-friendly products, and potentially command premium pricing. This approach not only contributes positively to the environment but also drives financial performance through cost reduction and increased sales.",
    "context": "Over the past year, SAP has made significant strides in reducing our environmental impact. By investing in energy-efficient technologies across our manufacturing plants, we have achieved a 20 percent reduction in energy consumption. This not only aligns with our commitment to sustainability but has also resulted in a 15 percent decrease in energy costs, positively impacting our bottom line.
                Companies that integrate eco-friendly practices into their manufacturing processes often see a dual benefit of cost savings and enhanced brand reputation. Adoption of renewable energy sources can lead to long-term financial gains. Our studies show that firms investing in solar and wind technologies report an average of 25 percent savings on energy expenses over five years.
                Through our comprehensive waste reduction program, SAP has successfully reduced manufacturing waste by 30%. By recycling and reusing materials, we have cut raw material procurement costs by 10%. These initiatives not only contribute to environmental conservation but also provide significant cost efficiencies.
                Consumer behavior is increasingly influenced by corporate sustainability efforts. Approximately 70 percent of consumers prefer to purchase from companies with strong environmental commitments. Brands that effectively communicate their sustainability initiatives can enhance customer loyalty and are often able to charge premium prices for their products.",
    "source": "SAP Sustainability Report 2023, McKinsey Paper on Green Manufacturing (2022), SAP Waste Reduction Initiative Document"

2. 
    "question": "Which business functions could see the most value from generative AI?",
    "ground_truth": "We should consider integrating cloud-based AI analytics platforms and IoT solutions into our portfolio. The AI analytics platforms can help clients derive actionable insights from big data, while IoT solutions can enable real-time monitoring and automation across various industries. Our current products with the highest potential for enhancement through machine learning are: 1. SAP S/4HANA (ERP System): By embedding machine learning algorithms, we can offer predictive analytics for supply chain optimization, anomaly detection, and automated decision-making processes. 2. SAP Customer Experience (CRM Suite): Machine learning can enhance customer segmentation, personalize marketing efforts, and improve sales forecasting accuracy. 3. SAP SuccessFactors (HR Management): Incorporating machine learning can optimize talent acquisition through predictive hiring and improve employee retention via sentiment analysis. Enhancing these products with machine learning technologies will not only improve their functionality and value proposition but also position us as a leader in innovative enterprise solutions.",
    "context": "Our strategic roadmap emphasizes the importance of AI and machine learning in driving future growth. By developing cloud-based AI analytics platforms, we aim to empower clients with advanced data processing capabilities. The integration of IoT solutions will further enhance our product offerings by enabling seamless connectivity and real-time data exchange.
                Machine learning is set to revolutionize enterprise software. Companies that incorporate machine learning into their ERP and CRM systems are projected to improve operational efficiency by up to 30%. The demand for AI-driven solutions is expected to grow exponentially over the next decade.
                SAP S/4HANA has shown robust performance in the market, but client feedback indicates a strong interest in advanced analytics features. Similarly, SAP Customer Experience and SAP SuccessFactors users are seeking more intelligent, personalized functionalities that can be achieved through machine learning enhancements.
                Businesses leveraging machine learning technologies report significant competitive advantages, including improved decision-making speed and accuracy. Machine learning applications in customer relationship management and human resources have led to measurable increases in customer satisfaction and employee productivity.",
    "source": "SAP Sustainability Report 2023, McKinsey Technology Trends Report (2023), SAP Product Performance Review, McKinsey Study on AI in Business (2022)"



How can we capitalize on eco-friendly manufacturing practices to both improve our sustainability credentials and boost our financial performance?

2. What innovative products should we consider integrating into our portfolio, and which current products have the highest potential for enhancement through machine learning technologies?


Use only the following documents: 
<documents>
{documents}
</documents>
"""
)

model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, verbose=True)

chain = prompt | model | parser

print(prompt)

response = chain.invoke({"documents": combined_documents_content})

input_variables=['documents'] input_types={} partial_variables={'format_output': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"Response": {"properties": {"question": {"description": "Question", "title": "Question", "type": "string"}, "ground_truth": {"description": "Ground Truth", "title": "Ground Truth", "type": "string"}, "context": {"description": "Context", "title": "Context", "type": "string"}, "source": {"description": "List of document names used", "title": "Source", "type": "string"}}, "required": ["question", "ground_truth", "context", "source"], "title": "Res

Save LLM response to file

In [15]:
with open("./synthetic_data_big_context.json", "w", encoding="utf-8") as f:
    json.dump(response.dict(), f, indent=4)