# Import Libraries and Load Environment Variables

In [None]:
import json
import os
import time
import nest_asyncio
import tiktoken

from dotenv import load_dotenv
from llama_parse import LlamaParse

from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field
from langchain.schema import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.output_parsers import PydanticOutputParser
from typing import List


nest_asyncio.apply()
load_dotenv(override=True)

INPUT_DIR = "input_big_context/"
OUTPUT_FILE = "evaluation_dataset_big_context.json"

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_retries=2)

  from .autonotebook import tqdm as notebook_tqdm


# Parse PDF Documents to Markdown

In [5]:

for file in os.listdir(INPUT_DIR):
    if file.endswith(".pdf"):
        try:
            print(f"Converting {file} to markdown")
            md_text = LlamaParse(
                result_type="markdown", 
                verbose=True,
                #use_vendor_multimodal_model=True,
                #vendor_multimodal_model_name="openai-gpt-4o-mini",
                #vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
                language="en",
                numWorkers=3).load_data(INPUT_DIR + file)
            print("text:", md_text)
            combined_md_text = "\n\n".join([doc.text for doc in md_text])
            md_file_path = INPUT_DIR + file.replace(".pdf", ".md")
            print(f"Saving markdown to {md_file_path}")
            with open(md_file_path, "w", encoding="utf-8") as f:
                f.write(combined_md_text)
            print(f"Successfully converted {file}")
        except Exception as e:
            print(f"Error converting {file}: {e}")

Converting 3490099.3511119.pdf to markdown
Started parsing the file under job_id 35594849-2fcc-435d-bdfc-b3722ea08ef7
..text: [Document(id_='ddc47cf6-e481-428f-b9ee-dd7788042541', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='# Investigating Explainability of Generative AI for Code through Scenario-based Design\n\nJiao Sun∗\n\nQ. Vera Liao†\n\nMichael Muller\n\nUniversity of Southern California\n\nMicrosoft Research\n\nIBM Research AI\n\nLos Angeles, USA\n\nMontréal, Canada\n\nYorktown Heights, USA\n\njiaosun@usc.edu\n\nveraliao@microsoft.com\n\nmichael_muller@us.ibm.com\n\nMayank Agarwal\n\nStephanie Houde\n\nKartik Talamadupula\n\nIBM Research AI\n\nIBM Research AI\n\nIBM Research AI\n\nYorktown Heights, USA\n\nYorktown Heights, USA\n\nYorktown Heights, USA\n\nMayank.Agarwal@ibm.com\n\nStephanie.Houde@ibm.com\n\nkrtalamad@us.ibm.com\n\nJustin D. Weisz\n\

Combine Documents to one

In [3]:
documents = []
for file in os.listdir(INPUT_DIR):
    if file.endswith(".md"):
        file_path = os.path.join(INPUT_DIR, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
            documents.append(Document(page_content=content, metadata={"source": file}))
combined_documents_content = "\n\n".join([doc.page_content for doc in documents])

doc stats

In [4]:
from vertexai.preview import tokenization

model_name = "gemini-1.5-flash"
tokenizer = tokenization.get_tokenizer_for_model(model_name)
result = tokenizer.count_tokens(combined_documents_content)
print("Token count: ", result)



Token count:  CountTokensResult(total_tokens=297447)


In [5]:

encoding = tiktoken.encoding_for_model("gpt-4o-mini")
tokens = encoding.encode(combined_documents_content)
print(len(tokens))

276557


# Generate Querys

In [9]:
class Response(BaseModel):
    question: str = Field(description="Generated Business Question")
    ground_truth: str = Field(description="Comprehensive Answer to the Question")
    references: List[str] = Field(description="List of Document Names Referenced")

class Responses(BaseModel):
    responses: List[Response] = Field(description="List of Generated Responses")

class QuestionEntry(BaseModel):
    question: str

class Questions(BaseModel):
    questions: List[QuestionEntry]


documents = []
for file in os.listdir(INPUT_DIR):
    if file.endswith(".md"):
        file_path = os.path.join(INPUT_DIR, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
            documents.append(Document(page_content=content, metadata={"source": file}))

combined_documents_content = "\n\n".join([doc.page_content for doc in documents])


question_parser = PydanticOutputParser(pydantic_object=Questions)

question_prompt = PromptTemplate(
    input_variables=["documents"],
    partial_variables={"format_output": question_parser.get_format_instructions()},
    template="""You are tasked with generating 50 realistic and diverse abstract business questions based on the documents provided below, which include SAP internal documents and other reports covering various business topics.

Each question should be written from SAP’s perspective, focusing on strategic actions SAP can take to achieve broad business objectives (e.g., increasing profitability, enhancing sustainability, expanding market reach), while considering multiple influencing factors.

**Guidelines for Question Generation:**

1. **Integration of Diverse Knowledge Areas:**
   - Each question should incorporate insights from at least two different business domains (e.g., finance, sustainability, technology, operations).
   - Blend concepts that may not be directly related to encourage interdisciplinary thinking.

2. **Encourage Multi-hop Reasoning:**
   - Formulate questions that require synthesizing information from multiple sources or sections within the provided documents.
   - Ensure that answering the questions involves connecting various ideas rather than retrieving single facts.

3. **Promote Abstract Reasoning:**
   - Design questions that go beyond factual inquiries and require strategic thinking and abstract reasoning.
   - Focus on overarching themes and big-picture considerations rather than specific details.

4. **Maintain Brevity and Clarity:**
   - Keep each question concise, aiming for a length that is clear and direct without unnecessary complexity.
   - Avoid referencing specific reports or documents directly in the questions.

**Question Examples:**
1. In what ways can SAP integrate sustainable practices to drive long-term growth and enhance its market position?
2. How can SAP leverage emerging technologies to diversify its product offerings and meet evolving customer needs?

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:
{format_output}

**Use only knowledge from the following documents:**
<documents>
{documents}
</documents>

"""
)

question_chain = question_prompt | llm | question_parser 

print("Generating questions...")
question_output = question_chain.invoke({"documents": combined_documents_content})
questions = [q.question for q in question_output.questions]
print(f"Generated {len(questions)} questions.")

print(question_output)
print(len(questions))

Generating questions...
Generated 60 questions.
questions=[QuestionEntry(question='How can SAP leverage generative AI to enhance its sustainability solutions, enabling customers to more effectively track and reduce their carbon footprints while simultaneously improving the efficiency of their internal processes?'), QuestionEntry(question='Considering the increasing importance of ESG factors, how can SAP integrate sustainable practices into its core business operations, including its data centers and supply chain, to minimize its environmental impact and enhance its brand reputation?'), QuestionEntry(question='Given the rapid advancements in AI, how can SAP strategically invest in research and development to maintain its leadership position in enterprise applications while simultaneously expanding its capabilities in business AI and generative AI?'), QuestionEntry(question='How can SAP utilize AI to optimize its global supply chain, improving efficiency, reducing costs, and mitigating r

In [11]:
# ---------------------------
# Step 5: Generate Answers for Each Question
# ---------------------------

answer_parser = PydanticOutputParser(pydantic_object=Response)

answer_prompt_template = """
You are tasked with providing a comprehensive answer to the following business question based on the provided documents.

**Question:**
{question}

Each answer should synthesize all relevant information from the relevant documents and include abstract reasoning.

**Provide the following in your response:**

**Ground Truth:**
- A comprehensive answer to the question.

**Source Referenced:**
- An array/list of the document names or identifiers that were used to form the answer.

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:
{format_output}

**Use only the following documents to ground your answer:** 
<documents>
{documents}
</documents>
"""


answer_prompt = PromptTemplate(
    input_variables=["question", "documents"],
    partial_variables={"format_output": answer_parser.get_format_instructions()},
    template=answer_prompt_template
)

answer_chain = answer_prompt | llm | answer_parser

# Initialize the Responses list
responses = []

print("Generating answers for each question with rate limiting (1 request every 45 seconds)...")
for idx, question in enumerate(questions, 1):
    print(f"Processing Question {idx}/{len(questions)}")
    try:
        answer_output = answer_chain.invoke({"question":question, "documents":combined_documents_content})
        responses.append(Response(
            question=question,
            ground_truth=answer_output.ground_truth,
            references=answer_output.references
        ))
        print(f"Question {idx} processed successfully.")
    except Exception as e:
        print(f"Error processing question {idx}: {e}")
    finally:
        if idx < len(questions):
            print("Waiting for 21 seconds before the next request to respect rate limits...")
            time.sleep(21)

Generating answers for each question with rate limiting (1 request every 45 seconds)...
Processing Question 1/60
Question 1 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 2/60
Question 2 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 3/60
Question 3 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 4/60
Question 4 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 5/60
Question 5 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 6/60
Question 6 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 7/60
Question 7 processed successfully.
Waiting for 21 seconds before the next request to resp

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised InternalServerError: 500 Unable to submit request because the service is temporarily unavailable..


Question 27 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 28/60
Question 28 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 29/60
Question 29 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 30/60
Question 30 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 31/60
Question 31 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 32/60
Question 32 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 33/60
Question 33 processed successfully.
Waiting for 21 seconds before the next request to respect rate limits...
Processing Question 34/60
Question 34 processed successfully.
Waiting for 21 seco

In [15]:
all_responses = Responses(responses=responses)
json_output = all_responses.model_dump_json(indent=2)
print(len(responses))
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.write(json_output)

print(f"Data generation complete. JSON saved to {OUTPUT_FILE}")

60
Data generation complete. JSON saved to evaluation_dataset_big_context2.json
