# Process Reports and Generate Questions

1. Convert each PDF report into a plain-text document.
2. Extract the question that each report is trying to answer.
3. Categorize the questions.


In [1]:

import pathlib
import asyncio
import pandas as pd

## Part 1: Convert PDF into Document

In [3]:
from kruppe.data_source.directory import DirectoryData

# Parser used by the conversion cell below: it is called with a file path
# string and its result is unpacked as a (text, metadata) pair.
pdf_parser = DirectoryData.simple_pdf_parser
# Start from an empty table; the conversion cell below rebinds `df` to the
# table of processed reports.
df = pd.DataFrame()

In [4]:
# Source folder (searched recursively for PDFs) and flat destination folder
# for the extracted text files.
report_directory = pathlib.Path("/Users/danielliu/Workspace/fin-rag/experiments/human_report_original")
processed_report_directory = pathlib.Path("/Users/danielliu/Workspace/fin-rag/experiments/human_report")

# Create the destination up front so the first write cannot fail with
# FileNotFoundError when the folder does not exist yet.
processed_report_directory.mkdir(parents=True, exist_ok=True)

# Collect plain dicts and build the DataFrame once after the loop.
# Concatenating inside the loop copies the whole frame on every iteration,
# and appending to the previous `df` duplicated rows when the cell was re-run.
rows = []

for file in report_directory.rglob("*.pdf"):
    # The parser returns (text, metadata); the metadata is not used here.
    pdf_text, pdf_meta = pdf_parser(file.as_posix())
    # The category is the name of the folder that directly contains the PDF.
    category = file.parent.name

    # NOTE: output names are flat ("<name>.pdf.txt"), so two PDFs with the
    # same file name in different category folders overwrite each other.
    processed_file = processed_report_directory / (file.name+".txt")

    # Written with the platform default encoding, matching how the text is
    # read back later in this notebook.
    processed_file.write_text(pdf_text)

    rows.append({
        "category": category,
        "human_report_loc": processed_file.as_posix(),
    })

# Explicit columns keep the schema stable even when no PDF was found.
df = pd.DataFrame(rows, columns=["category", "human_report_loc"])

df
    

Unnamed: 0,category,human_report_loc
0,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
1,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
2,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
3,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
4,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
...,...,...
68,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...
69,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...
70,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...
71,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...


In [5]:
# Save the category/path table to ./reports.csv; index=False keeps the row
# index out of the file.
df.to_csv("./reports.csv", index=False)

## Part 2: Extract Question

In [6]:
# Reload the table written to ./reports.csv above.
# index_col=False tells pandas not to use any CSV column as the index.
df = pd.read_csv("./reports.csv", index_col=False)
# Display the loaded frame as the cell output.
df

Unnamed: 0,category,human_report_loc
0,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
1,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
2,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
3,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
4,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...
...,...,...
68,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...
69,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...
70,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...
71,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...


In [8]:
from kruppe.llm import OpenAILLM
from kruppe.prompts.experiments import GENERATE_QUESTION_USER, GENERATE_QUESTION_SYSTEM

# Module-level LLM client (model "gpt-4o") shared by every
# generate_questions call below.
llm = OpenAILLM(model="gpt-4o")

async def generate_questions(path: str):
    """Ask the LLM to generate question text for one processed report.

    The report text is read from ``path`` and inserted into the
    ``GENERATE_QUESTION_USER`` template, then sent to the module-level
    ``llm`` together with the ``GENERATE_QUESTION_SYSTEM`` prompt.

    Args:
        path: Location of the plain-text report file.

    Returns:
        The ``text`` attribute of the LLM response.
    """
    # Note: this is a regular blocking read inside the coroutine.
    with open(path) as report_file:
        report_text = report_file.read()

    user_turn = {
        "role": "user",
        "content": GENERATE_QUESTION_USER.format(document=report_text),
    }
    system_turn = {"role": "system", "content": GENERATE_QUESTION_SYSTEM}

    # The system prompt goes first, followed by the report-specific request.
    reply = await llm.async_generate([system_turn, user_turn])
    return reply.text


In [4]:
# Example: run the question generation on the first report only
# (row label 0 of the "human_report_loc" column).
await generate_questions(df["human_report_loc"][0])

'How is Amazon positioned to grow its advertising revenue and market share in the competitive landscape dominated by Google and Meta?'

In [9]:
questions = await asyncio.gather(*[generate_questions(path) for path in df["human_report_loc"]])
df["question"] = questions
df

Unnamed: 0,category,human_report_loc,question
0,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the potential timing and impact of Con...
1,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the projected free cash flow growth tr...
2,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What are the updated 1Q25 earnings estimates f...
3,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What are the expectations and potential impact...
4,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the rationale behind J.P. Morgan's dec...
...,...,...,...
68,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the expectations and key topics of di...
69,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the key capital expenditure trends an...
70,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What is the expected future performance of NVI...
71,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What is the current state and future outlook o...


In [10]:
# Overwrite ./reports.csv with the table that now also holds the "question"
# column; index=False keeps the row index out of the file.
df.to_csv("./reports.csv", index=False)

## Part 3: Extract Category
To be completed: categorize the extracted questions (step 3 of the overview above).

In [11]:
# Reload the table from ./reports.csv (written by the previous cell).
# index_col=False tells pandas not to use any CSV column as the index.
df = pd.read_csv("./reports.csv", index_col=False)
# Display the loaded frame as the cell output.
df

Unnamed: 0,category,human_report_loc,question
0,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the potential timing and impact of Con...
1,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the projected free cash flow growth tr...
2,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What are the updated 1Q25 earnings estimates f...
3,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What are the expectations and potential impact...
4,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the rationale behind J.P. Morgan's dec...
...,...,...,...
68,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the expectations and key topics of di...
69,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the key capital expenditure trends an...
70,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What is the expected future performance of NVI...
71,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What is the current state and future outlook o...
