## Installs

In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

## Imports

In [21]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv
import uuid

## Env variables

In [22]:
# Load variables from a local .env file (if any) into the process environment,
# then read the OpenAI key from it. The value is None when the variable is unset.
load_dotenv()
OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")

## Process PDF documents

### Find and load all PDF documents

In [23]:
# Load the single quarterly-report PDF that lives in the ./data folder.
# (No directory traversal happens here: exactly one file is read.)
loader = PyPDFLoader("data/Ubisoft_FY25_Q1_Sales_EN_vFinal.pdf")
documents = loader.load()

### Split documents

In [24]:
# Break the loaded documents into overlapping chunks of at most 1200 characters
# (120 characters of overlap), preferring to cut on paragraph breaks, then line
# breaks, then spaces.
# NOTE: `documents` is rebound to the chunk list, so re-running this cell
# without re-running the load cell splits the already-split chunks again.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=1200,
    chunk_overlap=120,
    length_function=len,
)

documents = text_splitter.split_documents(documents)
documents

[Document(metadata={'source': 'data/Ubisoft_FY25_Q1_Sales_EN_vFinal.pdf', 'page': 0}, page_content='1 \n \n \n \nUBISOFT REPORTS FIRST-QUARTER 2024-25 SALES \n \nSolid start to the year with Q1 net bookings ahead of target  \nRobust engagement metrics driven by our GaaS franchises \nStrong upcoming release slate with Star Wars Outlaws™ & Assassin’s Creed® Shadows \n2024-25 targets confirmed \n \n \n \n \n▪ Net bookings of €290.0 million, ahead of target of around €275.0 million \n \n \nIn €m \nQ1 \n2024-25 \nReported \nchange vs. \nQ1 2023-24 \n% of total net bookings  \n Q1  \n2024-25 \nQ1  \n2023-24 \nIFRS 15 sales 323.5 +12.0% NA NA \nNet bookings 290.0 +8.3% NA NA \nDigital net bookings 257.2 +6.6% 88.7% 90.2% \nPRI net bookings 158.7 +24.6% 54.7% 47.6% \nBack-catalog net bookings 249.5 +1.5% 86.0% 91.8% \n \nParis, July 18, 2024 – Today, Ubisoft released its sales figures for the first quarter of fiscal \n2024-25, i.e., the three months ended June 30, 2024.  \n \n \n \n \nYves Gui

### Define embedding function

In [25]:
# OpenAI embedding model used both to index the chunks and to embed queries.
# Both names are bound to the same object.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPEN_AI_API_KEY,
    model="text-embedding-ada-002",
)
embedding_function = embeddings

## Create and populate vector database

In [26]:
def create_vectorstore(documents, embedding_function, vectorstore_path):
    """Build a persisted Chroma vector store from de-duplicated documents.

    Each document gets a deterministic id derived from its text
    (UUIDv5 of ``page_content``), so identical chunks collapse to one entry.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.
        embedding_function: Embedding object handed to Chroma as ``embedding``.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The Chroma vector store returned by ``Chroma.from_documents``.
    """
    # Keep ids and documents in two parallel, ordered lists so that ids[i]
    # always belongs to unique_documents[i]. A set is used only for the
    # membership check, because its iteration order is arbitrary.
    seen_ids = set()
    unique_ids = []
    unique_documents = []

    for doc in documents:
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content))
        if doc_id in seen_ids:
            continue
        seen_ids.add(doc_id)
        unique_ids.append(doc_id)
        unique_documents.append(doc)

    # Create a new Chroma database from the unique documents
    chromaDB = Chroma.from_documents(documents=unique_documents,
                                     ids=unique_ids,
                                     embedding=embedding_function,
                                     persist_directory=vectorstore_path)

    # Some Chroma wrappers persist automatically and no longer expose
    # persist(); call it only when it is available.
    persist = getattr(chromaDB, "persist", None)
    if callable(persist):
        persist()

    return chromaDB

In [27]:
# Index the chunks and persist the resulting Chroma store under ./vectorstore
vectorstore = create_vectorstore(
    documents=documents,
    vectorstore_path="vectorstore",
    embedding_function=embedding_function,
)

## Query for relevant data

In [28]:
# Re-open the persisted store. The directory must be the one the store was
# written to ("vectorstore"); pointing at any other path opens a different
# (possibly empty) collection.
vectorstore = Chroma(persist_directory="vectorstore",
                     embedding_function=embedding_function)

In [29]:
question = "What is the title of this fiscal year (FY) quarter (Q) result report?"

# Wrap the vector store in a similarity-search retriever and fetch the chunks
# it returns for the question.
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke(question)
relevant_chunks

[Document(metadata={'page': 3, 'source': 'data/Ubisoft_FY25_Q1_Sales_EN_vFinal.pdf'}, page_content='Full-year 2024-25 \n \nThe line -up for the rest of FY25 includes Assassin’s Creed Shadows and Star Wars Outlaws. \nRainbow Six® Mobile and The Division® Resurgence are no longer expected in FY25 as the teams \nare taking the necessary time to ensure that these experiences deliver on expectations with \noptimized KPIs in the context of a demanding yet very large market.  \n \nThe Company confirms its financial targets. It expects solid net bookings growth, a slight increase \nin non -IFRS operating income and growing non -IFRS Cash Flow from Operations leading to \npositive Free Cash Flow. \n \n \n \n \n \n \n \n \n  \n \n1 Sales at constant exchange rates are calculated by applying to the data for the period under review the average exchange rate s used for \nthe same period of the previous fiscal year.'),
 Document(metadata={'page': 3, 'source': 'data/Ubisoft_FY25_Q1_Sales_EN_vFinal.pd

## Generate structured responses

### Set LLM

In [30]:
# Chat model that generates the answers
llm = ChatOpenAI(
    api_key=OPEN_AI_API_KEY,
    model="gpt-4o-mini",
)

### Set prompt template

In [31]:
# Prompt template for the question-answering step.
# It has two placeholders that are filled in at format time:
#   {context}  - the retrieved text chunks
#   {question} - the user's question
# The trailing bullet list asks the model to also report a fixed set of
# financial topics whenever the context contains them.
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}

Additionally, if found in the context, provide the following:
- Revenue and Earnings: Assess year-over-year (YoY) or quarter-over-quarter (QoQ) growth.
- Profit Margins: Review operating and net profit margins for efficiency.
- Expenses: Compare R&D, marketing, and operating costs to previous periods.
- Cash Flow: Check operational cash flow for liquidity and sustainability.
- Balance Sheet: Analyze assets, liabilities, and equity for financial health.
- Forward Guidance: Look for management's projections or economic outlook.
- Key Metrics: Industry-specific data (e.g., customer growth, product sales).
- Trends: Spot patterns or anomalies over time.
"""

In [32]:
# Render every retrieved chunk as "Page <n>:\n<text>" and glue the pieces
# together with a horizontal-rule separator.
context_text = "\n\n---\n\n".join(
    f"Page {chunk.metadata['page']}:\n{chunk.page_content}"
    for chunk in relevant_chunks
)

# Build the chat prompt template and preview the fully formatted prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=question, context=context_text)
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

Page 3:
Full-year 2024-25 
 
The line -up for the rest of FY25 includes Assassin’s Creed Shadows and Star Wars Outlaws. 
Rainbow Six® Mobile and The Division® Resurgence are no longer expected in FY25 as the teams 
are taking the necessary time to ensure that these experiences deliver on expectations with 
optimized KPIs in the context of a demanding yet very large market.  
 
The Company confirms its financial targets. It expects solid net bookings growth, a slight increase 
in non -IFRS operating income and growing non -IFRS Cash Flow from Operations leading to 
positive Free Cash Flow. 
 
 
 
 
 
 
 
 
  
 
1 Sales at constant exchange rates are calculated by applying to the data for the period under review the average exchange rate s used for 
the same period of the previous f

### Create model for response

In [33]:
# Schema for a single extracted item: the answer itself, the supporting text,
# and the model's justification. All three fields are required strings.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the LLM as part of the structured-output schema, so treat them
# as prompt text rather than ordinary documentation - confirm before editing.
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning"""
    # The answer text for this item.
    answer: str = Field(description="Answer to question")
    # Verbatim context passage the answer was taken from.
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    # Explanation linking the sources to the answer.
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
# Top-level structured-output schema: one AnswerWithSources entry per topic
# requested from the report. Every field is required.
# NOTE(review): the field names and the class docstring are presumably exposed
# to the LLM through the structured-output schema - confirm before renaming.
class ExtractedInfo(BaseModel):
    """Extracted information about the report"""
    # Identification and overview of the report
    report_company_name: AnswerWithSources
    report_summary: AnswerWithSources
    # Financial topics, mirroring the bullet list in the prompt template
    revenue_and_earnings: AnswerWithSources
    profit_margins: AnswerWithSources
    expenses: AnswerWithSources
    cash_flow: AnswerWithSources
    balance_sheet: AnswerWithSources
    forward_guidance: AnswerWithSources
    key_metrics: AnswerWithSources
    trends: AnswerWithSources

### Invoke

In [34]:
# Using Langchain Expression Language
def format_docs(docs):
    """Join retrieved documents into a single context string.

    Args:
        docs: Iterable of documents, each with a ``metadata`` mapping that
            contains a ``'page'`` key and a ``page_content`` string.

    Returns:
        One string in which every document is rendered as
        ``"Page <page>:\\n<page_content>"`` and consecutive documents are
        separated by ``"\\n\\n---\\n\\n"``. Empty input yields ``""``.

    Raises:
        KeyError: If a document's metadata has no ``'page'`` entry.
    """
    # Format the documents passed in by the chain, not a notebook-level
    # variable, so each invocation uses the chunks retrieved for its question.
    return "\n\n---\n\n".join(
        f"Page {doc.metadata['page']}:\n{doc.page_content}" for doc in docs
    )

# The LLM is constrained to return an ExtractedInfo object
structured_llm = llm.with_structured_output(ExtractedInfo, strict=True)

# Pipeline: the input question is sent to the retriever (whose results are
# formatted into the context) and is also passed through unchanged as the
# question; both feed the prompt template, which feeds the structured LLM.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | structured_llm
)

extraction_request = "Give me the company name, summary, and details about revenue, profit margins, expenses, cash flow, balance sheet, forward guidance, key metrics, and trends from the fiscal year (FY) quarter (Q) result report."
structured_response = rag_chain.invoke(extraction_request)

## Transform response into a dataframe

In [35]:
# One-row frame: each column is a topic, each cell the dict for that topic
df = pd.DataFrame([structured_response.model_dump()])

# Pull the three parts of every topic out of that single row
first_record = df.iloc[0]
answer_row = [first_record[topic]['answer'] for topic in df.columns]
source_row = [first_record[topic]['sources'] for topic in df.columns]
reasoning_row = [first_record[topic]['reasoning'] for topic in df.columns]

# Final table with three rows ('answer', 'source', 'reasoning'), one column per topic
structured_response_df = pd.DataFrame(
    [answer_row, source_row, reasoning_row],
    index=['answer', 'source', 'reasoning'],
    columns=df.columns,
)
structured_response_df

Unnamed: 0,report_company_name,report_summary,revenue_and_earnings,profit_margins,expenses,cash_flow,balance_sheet,forward_guidance,key_metrics,trends
answer,Ubisoft,Ubisoft reported a solid start to fiscal year ...,Ubisoft's IFRS 15 sales for Q1 2024-25 were €3...,The report does not provide specific figures f...,The report does not provide detailed expense f...,Ubisoft expects growing non-IFRS Cash Flow fro...,The report does not provide specific details o...,"For the second quarter of 2024-25, Ubisoft exp...",Digital net bookings for Q1 2024-25 were €257....,"Ubisoft's net bookings showed an upward trend,..."
source,"Paris, July 18, 2024 – Today, Ubisoft released...",In €m Q1 2024-25 Reported change vs. Q1 2023-2...,IFRS 15 sales for the first quarter of 2024-25...,The Company confirms its financial targets. It...,The context does not specify any expense detai...,The Company confirms its financial targets. It...,The context does not specify any balance sheet...,Net bookings for the second quarter of 2024-25...,Digital net bookings 257.2 +6.6% 88.7% 90.2%,"Net bookings totaled €290.0 million, exceeding..."
reasoning,The context explicitly mentions Ubisoft as the...,The summary captures the key financial perform...,This shows the year-over-year growth in revenu...,The context discusses expectations rather than...,No specific information on expenses is provide...,This indicates a positive outlook for cash flo...,No specific information on the balance sheet i...,The context provides specific forward guidance...,This metric highlights the digital segment's c...,The context indicates consistent growth in net...
