## Installs

In [None]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

## Imports

In [1]:
# Import Langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import Optional

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv
import uuid

## Env variables

In [2]:
# Load variables from a .env file (if any) into the process environment,
# then read the OpenAI key; the value is None when the variable is not set.
load_dotenv()
OPEN_AI_API_KEY = os.getenv("OPEN_AI_API_KEY")

## Process PDF documents

### Load the PDF document

In [3]:
# Load the single report PDF from the ./data folder
loader = PyPDFLoader("data/Ubisoft_FY25_Q1_Sales_EN_vFinal.pdf")
documents = loader.load()

### Split documents

In [None]:
# Chunking settings: pieces of up to 1200 characters (length measured with len)
# with a 120-character overlap; separators are blank line, newline, and space.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    length_function=len,
    chunk_overlap=120,
    chunk_size=1200,
)

# Replace the loaded documents with their chunked form, then display them
documents = text_splitter.split_documents(documents)
documents

### Define embedding function

In [5]:
# OpenAI embedding model used to vectorise the chunks
embedding_function = OpenAIEmbeddings(
    openai_api_key=OPEN_AI_API_KEY,
    model="text-embedding-ada-002",
)
# Second name bound to the same embedding object
embeddings = embedding_function

## Create and populate vector database

In [None]:
def create_vectorstore(documents, embedding_function, vectorstore_path):
    """Build a persisted Chroma vector store from de-duplicated document chunks.

    Each chunk gets a deterministic UUIDv5 derived from its ``page_content``, so
    chunks with identical text share one id and only the first occurrence is kept.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string
            (for example the chunks produced by the text splitter).
        embedding_function: Embedding object handed to ``Chroma.from_documents``.
        vectorstore_path: Directory passed to Chroma as ``persist_directory``.

    Returns:
        The Chroma instance returned by ``Chroma.from_documents``.
    """
    # Map id -> document. Dicts preserve insertion order, so ids and documents
    # stay paired position-by-position (a set of ids would lose that pairing);
    # setdefault keeps the first chunk seen for any repeated content.
    unique_by_id = {}
    for doc in documents:
        doc_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content))
        unique_by_id.setdefault(doc_id, doc)

    # Create a new Chroma database from the unique documents and their ids
    chromaDB = Chroma.from_documents(documents=list(unique_by_id.values()),
                                     ids=list(unique_by_id.keys()),
                                     embedding=embedding_function,
                                     persist_directory=vectorstore_path)

    # NOTE(review): newer Chroma releases reportedly persist automatically and
    # deprecate this call — confirm against the installed version.
    chromaDB.persist()

    return chromaDB

In [None]:
# Build the Chroma store from the chunks, persisting it in the "vectorstore" directory
vectorstore = create_vectorstore(documents, embedding_function, "vectorstore")

## Query for relevant data

In [None]:
# Load the persisted vectorstore from the same directory it was written to
# ("vectorstore"); any other path would open a different, possibly empty, store.
vectorstore = Chroma(persist_directory="vectorstore", embedding_function=embedding_function)

In [None]:
# Question to look up in the report
question = "What is the title of this fiscal year (FY) quarter (Q) result report?"

# Similarity-search retriever over the vectorstore
retriever = vectorstore.as_retriever(search_type="similarity")

# Retrieve the chunks matched to the question and display them
relevant_chunks = retriever.invoke(question)
relevant_chunks