In [None]:
import asyncio
import concurrent.futures
import os
from typing import Any

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI

from core.document_loader import DocumentLoader
from core.document_summarizer import DocumentSummarizer
from core.metadata import Metadata

async def get_summary(
    llm: AzureChatOpenAI,
    docs: list[Document],
    recursion_limit: int = 10,
) -> dict[str, Any]:
    """Stream the summarization app over ``docs`` and return its last step.

    Args:
        llm: Chat model handed to ``DocumentSummarizer``.
        docs: Documents to summarize. Their ``page_content`` values are sent
            to the app under the ``"contents"`` input key.
        recursion_limit: Value passed as ``"recursion_limit"`` in the stream
            config. Defaults to 10, the value that used to be hard-coded.

    Returns:
        The last item yielded by ``app.astream``. If the stream yields
        nothing, an empty dict is returned.
    """
    app = DocumentSummarizer(llm, docs).compile_app()

    last_step: dict[str, Any] = {}
    async for last_step in app.astream(
        {"contents": [doc.page_content for doc in docs]},
        {"recursion_limit": recursion_limit},
    ):
        # Intermediate steps are discarded on purpose; only the final one
        # is handed back to the caller.
        pass

    return last_step

# Load variables from a .env file into the process environment before the
# client is constructed.
# NOTE(review): presumably the Azure endpoint/key are supplied this way - confirm.
load_dotenv()

# Chat-model client shared by the cells that follow.
llm: AzureChatOpenAI = AzureChatOpenAI(
    model="gpt-4o-mini",
    api_version="2024-08-01-preview",
)

In [None]:
def _run_coroutine_blocking(coro):
    """Run ``coro`` to completion from synchronous code and return its result.

    ``asyncio.run`` raises ``RuntimeError`` when the calling thread already
    has a running event loop, which is the situation inside a Jupyter kernel.
    In that case the coroutine is executed by ``asyncio.run`` on a short-lived
    worker thread, which gets an event loop of its own.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread, so the direct call is safe.
        return asyncio.run(coro)

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()


def extract_document(llm: AzureChatOpenAI, file_path: str):
    """Summarize the document at ``file_path`` and extract metadata from it.

    The file is loaded with ``DocumentLoader``, summarized via
    ``get_summary``, and the final summary text is then passed to a
    structured-output chain built on the ``Metadata`` schema.

    Args:
        llm: Chat model used both for summarization and for extraction.
        file_path: Path of the document to process.

    Returns:
        A 3-tuple ``(subject, keywords, final_summary)`` where ``subject`` and
        ``keywords`` are read from the extracted ``Metadata`` object and
        ``final_summary`` is
        ``step['generate_final_summary']['final_summary']`` of the last
        summarization step.

    Raises:
        KeyError: If the last summarization step has no
            ``'generate_final_summary'`` / ``'final_summary'`` entry.
    """
    docs: list[Document] = DocumentLoader().load(file_path)

    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "You are an expert extraction algorithm. "
                "Only extract relevant information from the text. "
                "If you do not know the value of an attribute asked to extract, "
                "return null for the attribute's value.",
            ),
            ("human", "{text}"),
        ]
    )
    runnable = prompt | llm.with_structured_output(schema=Metadata)

    final_step = _run_coroutine_blocking(get_summary(llm, docs))
    final_summary = final_step['generate_final_summary']['final_summary']

    # Feed the summary text itself to the extractor; passing the whole step
    # dict would render its repr (keys, braces, quotes) into the prompt.
    metadata = runnable.invoke({"text": final_summary})

    return metadata.subject, metadata.keywords, final_summary

In [None]:
import pandas as pd

# Read the source directory once instead of on every loop iteration.
document_dir = os.environ['DOCUMENT_PATH']

# Collect one record per regular file and build the frame in a single step;
# growing a DataFrame with pd.concat inside the loop copies it every time.
records = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# stable between runs.
for file_name in sorted(os.listdir(document_dir)):
    file_path = os.path.join(document_dir, file_name)
    if not os.path.isfile(file_path):
        continue  # skip sub-directories and other non-file entries

    subject, keywords, summary = extract_document(llm, file_path)
    records.append({
        'file_name': file_name,
        'subject': subject,
        'keywords': keywords,
        'summary': summary,
    })

# Passing `columns` keeps the expected header even when no file was processed.
df = pd.DataFrame(records, columns=['file_name', 'subject', 'keywords', 'summary'])

# NOTE(review): the row index is written as an extra first column; reading the
# file back without index_col yields an "Unnamed: 0" column. Consider
# index=False if no consumer depends on that column.
df.to_excel("output.xlsx")

In [1]:
import pandas as pd

# Load an exported workbook and preview its first rows.
# NOTE(review): the export step in this notebook writes "output.xlsx", while
# this cell reads 'output_file.xlsx' - confirm which file is intended.
df = pd.read_excel('output_file.xlsx')
df.head()

Unnamed: 0.1,Unnamed: 0,file_name,title,keywords,summary
0,0,Get_Started_With_Smallpdf-output.pdf,Themes in Document Management and Collaboratio...,"Document Management, User Experience, Accessib...",The main themes distilled from the provided do...


In [6]:
# Print the full keyword string of the row labelled 0 (the table preview
# truncates it).
print(df.loc[0, 'keywords'])

Document Management, User Experience, Accessibility, Collaboration, Communication, File Handling, Efficiency, Productivity, Security, Privacy, Integration, Technology, Visual Design, Aesthetics
