In [3]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before
# running each cell.
%autoreload 2

In [4]:
%%writefile requirements.txt
langchain
langchain-openai
llama-parse
python-dotenv
pydantic

Overwriting requirements.txt


# Import Libraries and Load Environment Variables

In [5]:
import json
import os
from dotenv import load_dotenv
import nest_asyncio


# Directory holding the source PDFs and the markdown files generated from them.
data_dir = "data/"

# Read settings from a local .env file; override=True lets values from .env
# take precedence over variables already present in the environment.
load_dotenv(override=True)

# Patch asyncio so nested event loops are allowed.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop that async client libraries would otherwise clash with — confirm.
nest_asyncio.apply()

# Parse PDF Documents to Markdown

In [6]:
from llama_parse import LlamaParse

# Convert every PDF in data_dir to a markdown file with the same base name.
# Files are processed in sorted order so repeated runs behave identically, and
# a failure on one file is reported without stopping the remaining conversions.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".pdf"):
        continue
    try:
        print(f"Converting {file} to markdown")
        pdf_path = os.path.join(data_dir, file)
        md_text = LlamaParse(result_type="markdown", verbose=True, language="en").load_data(pdf_path)
        # The parser returns a list of documents; stitch them into one markdown text.
        combined_md_text = "\n\n".join(doc.text for doc in md_text)
        # splitext swaps only the trailing extension; str.replace would also
        # rewrite any other ".pdf" occurring inside the file name.
        md_file_path = os.path.join(data_dir, os.path.splitext(file)[0] + ".md")
        print(f"Saving markdown to {md_file_path}")
        # Write as UTF-8 explicitly: the file is read back as UTF-8 later, and
        # the platform default encoding may not be able to represent the text.
        with open(md_file_path, "w", encoding="utf-8") as f:
            f.write(combined_md_text)
        print(f"Successfully converted {file}")
    except Exception as e:
        print(f"Error converting {file}: {e}")

Converting the-economic-potential-of-generative-ai-the-next-productivity-frontier.pdf to markdown
Started parsing the file under job_id 1653af61-9394-4c89-bc69-6aa24d61af1d
..Saving markdown to data/the-economic-potential-of-generative-ai-the-next-productivity-frontier.md
Successfully converted the-economic-potential-of-generative-ai-the-next-productivity-frontier.pdf


# Generate Dataset from Data

In [7]:

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain.schema import Document

# Define data models for responses
# One generated dataset record: a question, its reference answer, the context
# supporting that answer, and the name(s) of the documents it was drawn from.
# NOTE(review): documented with `#` comments rather than a docstring on purpose —
# a Pydantic class docstring presumably ends up in the JSON schema that is
# embedded in the prompt's format instructions; confirm before converting.
class Response(BaseModel):
    # Each Field description is passed to Pydantic as the field's schema description.
    question: str = Field(description="Question")
    ground_truth: str = Field(description="Ground Truth")
    context: str = Field(description="Context")
    documents: str = Field(description="Name of the documents used")

# Top-level container the output parser targets: a list of Response records.
# NOTE(review): kept as `#` comments rather than a docstring, since a class
# docstring presumably changes the schema text sent to the model — confirm.
class Responses(BaseModel):
    responses: list[Response] = Field(description="List of responses")

# Load every markdown file in data_dir as a Document tagged with its file name.
# Sorted order keeps the assembled prompt identical across runs.
documents = []
for file in sorted(os.listdir(data_dir)):
    if file.endswith(".md"):
        file_path = os.path.join(data_dir, file)
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
        documents.append(Document(page_content=content, metadata={"source": file}))

# Fail early instead of sending an empty context to the model.
if not documents:
    raise FileNotFoundError(f"No markdown files found in {data_dir!r}")

# Wrap each document in a tag carrying its file name: the response schema asks
# for the names of the documents used, which the model cannot report unless the
# names appear alongside the content.
combined_documents_content = "\n\n".join(
    f'<document name="{doc.metadata["source"]}">\n{doc.page_content}\n</document>'
    for doc in documents
)

# Output parser targeting the Responses model; it also supplies the format
# instructions injected into the prompt below.
parser = PydanticOutputParser(pydantic_object=Responses)

# Chat model that generates the question/answer records.
model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0, verbose=True)

# Prompt text. {format_output} is filled once from the parser;
# {documents} is supplied when the chain is invoked.
generation_template = """\
Generate broad questions and their answers (ground truth) along with the context relevant to get that answer. 
The response should be in JSON format. Don't hallucinate or make up any information. 

The output MUST strictly adhere to the following JSON format, and NO other text MUST be included:    
{format_output}

Use only the following documents: 
<documents>
{documents}
</documents>
"""

prompt = PromptTemplate(
    template=generation_template,
    input_variables=["documents"],
    partial_variables={"format_output": parser.get_format_instructions()},
)

# Pipeline: fill the prompt, call the model, parse the reply into Responses.
chain = prompt | model | parser

# Single call over the combined content of all loaded documents.
response = chain.invoke({"documents": combined_documents_content})

# Save LLM Response to File

In [None]:
# Persist the generated dataset as indented JSON.
# Pydantic v2 deprecates .dict() in favour of .model_dump(); fall back to
# .dict() so the cell still works if the model is a Pydantic v1 object.
response_payload = response.model_dump() if hasattr(response, "model_dump") else response.dict()
with open("./synthetic_data_big_context.json", "w", encoding="utf-8") as f:
    json.dump(response_payload, f, indent=4)

: 