## Installs

In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

## Imports

In [6]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv
import uuid

## Env variables

In [7]:
# Load variables through python-dotenv, then read the OpenAI key from the
# process environment (None when the variable is not set).
load_dotenv()
OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")

## Process PDF documents

### Find and load all PDF documents

In [8]:
# Load the single report PDF stored in the ./data folder
loader = PyPDFLoader("data/Ubisoft_FY25_Q1_Sales_EN_vFinal.pdf")
documents = loader.load()

### Split documents

In [None]:
# Split the loaded documents into smaller, overlapping chunks
# (chunk_size=1200 and chunk_overlap=120, both measured with len()).
# NOTE: the result is assigned back to `documents`, so re-running this cell
# feeds the already-split chunks through the splitter again.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=120,
    length_function=len,
    separators=["\n\n", "\n", " "],
)

documents = text_splitter.split_documents(documents)

# Show the chunk count and a short preview instead of dumping every chunk
print(f"{len(documents)} chunks")
documents[:3]

### Define embedding function

In [10]:
# OpenAI embedding model shared by the indexing and the query steps
embedding_function = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPEN_AI_API_KEY,
)
# Second name bound to the same embedding object
embeddings = embedding_function

## Create and populate vector database

In [11]:
def create_vectorstore(documents, embedding_function, vectorstore_path):
    """Build a persisted Chroma vector store from de-duplicated documents.

    Every document receives a deterministic id (UUIDv5 of its ``page_content``),
    so documents with identical text collapse into a single entry; the first
    occurrence is the one that is kept.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        embedding_function: Embedding object handed to Chroma as ``embedding``.
        vectorstore_path: Directory handed to Chroma as ``persist_directory``.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    # A dict keeps insertion order, so each id stays paired with its own
    # document (a set of ids would lose that pairing when converted to a list).
    unique_by_id = {}
    for doc in documents:
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content))
        unique_by_id.setdefault(doc_id, doc)

    # Create a new Chroma database from the unique documents and matching ids
    chromaDB = Chroma.from_documents(documents=list(unique_by_id.values()),
                                     ids=list(unique_by_id.keys()),
                                     embedding=embedding_function,
                                     persist_directory=vectorstore_path)

    # NOTE(review): newer Chroma releases appear to persist automatically and
    # deprecate this call — confirm against the installed version.
    chromaDB.persist()

    return chromaDB

In [None]:
# Build the on-disk vector store from the split documents
vectorstore = create_vectorstore(
    documents,
    embedding_function,
    vectorstore_path="vectorstore",
)

## Query for relevant data

In [None]:
# Reopen the persisted vector store. The directory must be the one that
# create_vectorstore was given when the store was written ("vectorstore").
vectorstore = Chroma(persist_directory="vectorstore",
                     embedding_function=embedding_function)

In [None]:
# Sample question used to check what the similarity retriever returns
question = "What is the title of this fiscal year (FY) quarter (Q) result report?"

# Wrap the vector store as a retriever and fetch the matching chunks
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke(question)

relevant_chunks

## Generate structured responses

### Set LLM

In [15]:
# Chat model used to generate the answers
llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=OPEN_AI_API_KEY,
)

### Set prompt template

In [16]:
# Prompt template for the question-answering step.
# Placeholders: {context} is filled with the retrieved text and {question}
# with the query. Besides answering the question, the model is asked to
# report a fixed list of financial items whenever the context contains them.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}

Additionally, if found in the context, provide the following:
- Revenue and Earnings: Assess year-over-year (YoY) or quarter-over-quarter (QoQ) growth.
- Profit Margins: Review operating and net profit margins for efficiency.
- Expenses: Compare R&D, marketing, and operating costs to previous periods.
- Cash Flow: Check operational cash flow for liquidity and sustainability.
- Balance Sheet: Analyze assets, liabilities, and equity for financial health.
- Forward Guidance: Look for management's projections or economic outlook.
- Key Metrics: Industry-specific data (e.g., customer growth, product sales).
- Trends: Spot patterns or anomalies over time.
"""

In [None]:
# Label every retrieved chunk with its page number and join the pieces
# with a "---" separator
context_text = "\n\n---\n\n".join(
    f"Page {chunk.metadata['page']}:\n{chunk.page_content}"
    for chunk in relevant_chunks
)

# Build the chat prompt template, then render it for the sample question
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(
    context=context_text,
    question=question,
)

print(prompt)

### Create model for response

In [18]:
# Schema for a single answered item: the answer text, the source text it was
# taken from, and the reasoning that links the two. All three fields are strings.
# NOTE(review): the class docstring and the Field descriptions presumably end up
# in the schema given to the model, so treat them as prompt text — confirm before rewording.
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning"""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
# Top-level schema passed to llm.with_structured_output for the RAG chain.
# Each field is an AnswerWithSources: the company name and a summary, followed
# by one field per financial item listed in PROMPT_TEMPLATE.
# NOTE(review): the docstring presumably becomes part of the schema given to
# the model — confirm before rewording.
class ExtractedInfo(BaseModel):
    """Extracted information about the report"""
    report_company_name: AnswerWithSources
    report_summary: AnswerWithSources
    revenue_and_earnings: AnswerWithSources
    profit_margins: AnswerWithSources
    expenses: AnswerWithSources
    cash_flow: AnswerWithSources
    balance_sheet: AnswerWithSources
    forward_guidance: AnswerWithSources
    key_metrics: AnswerWithSources
    trends: AnswerWithSources

### Invoke

In [19]:
# Using Langchain Expression Language
def format_docs(docs):
    """Render retrieved documents as a single context string.

    Args:
        docs: Iterable of documents exposing ``metadata['page']`` and
            ``page_content``.

    Returns:
        One string in which every document appears as a ``Page <n>:`` header
        followed by its content, with documents separated by a ``---`` line
        surrounded by blank lines. An empty input gives an empty string.
    """
    # Format the documents passed in by the chain, not the notebook-level
    # `relevant_chunks` left over from the earlier sample question.
    return "\n\n---\n\n".join(
        f"Page {doc.metadata['page']}:\n{doc.page_content}" for doc in docs
    )

# Model wrapper that returns its output as an ExtractedInfo object
structured_llm = llm.with_structured_output(ExtractedInfo, strict=True)

# The retriever output is formatted into the prompt's context, while the raw
# query is passed through unchanged as the question
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = chain_inputs | prompt_template | structured_llm

report_query = "Give me the company name, summary, and details about revenue, profit margins, expenses, cash flow, balance sheet, forward guidance, key metrics, and trends from the fiscal year (FY) quarter (Q) result report."
structured_response = rag_chain.invoke(report_query)