In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [2]:
data_dir = "./Big Star Collectibles"

In [3]:
files = os.listdir(data_dir)
file_texts = []
for file in files:
    with open(f"{data_dir}/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=128, chunk_overlap=32, # this is the critical line
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text,metadata={
                    "doc_title": file.split(".")[0], 
                    "chunk_num": i})) 

Created a chunk of size 139, which is longer than the specified 128
Created a chunk of size 151, which is longer than the specified 128
Created a chunk of size 151, which is longer than the specified 128
Created a chunk of size 139, which is longer than the specified 128
Created a chunk of size 130, which is longer than the specified 128
Created a chunk of size 188, which is longer than the specified 128
Created a chunk of size 130, which is longer than the specified 128


In [4]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [5]:
embeddings = HuggingFaceEmbeddings() # embed your data

  embeddings = HuggingFaceEmbeddings() # embed your data
  embeddings = HuggingFaceEmbeddings() # embed your data
  from tqdm.autonotebook import tqdm, trange


In [6]:
# store the embedded data into a vector database
vector_store = FAISS.from_documents(
    file_texts,
    embedding=embeddings
)

In [7]:
retriever = vector_store.as_retriever()

In [8]:
from dotenv import load_dotenv, dotenv_values
load_dotenv()

# I added a .env file in the /workspaces/advanced-rag-applications-with-vector-databases-3886256/chapter_1 folder with the OPENAI_API_KEY value
# It is not saved in git because .gitignore has .env in it
# Also changes the instanciation of OpenAI below to point to Azure OpenAI endpoint
print(os.getcwd())

# Confirmation that the value is there
print(len(dotenv_values()))
for key, value in dotenv_values().items():
    print(f"{key}: {value}")

# Output of this cell needs to be cleared before commiting into git
# Uncommented out the line below see the key
from IPython.display import clear_output
clear_output()

In [13]:
from langchain_openai import OpenAI

# added this to point to Azure OpenAI endpoint (see cell above for more info on .env file and OPENAI_API_KEY)
# Also added reference to 4o-mini model because I was getting 'Unknown model: gpt-3.5-turbo-instruct', perhaps
# that the model the older versions of the toolds refered in the req of this example are using by default
endpoint = "https://models.inference.ai.azure.com"
llm = OpenAI(base_url=endpoint, model="gpt-4o-mini")

# This does not seems to work in OpenAI from 1.51.2 (base_url and api_base is not exposed in ths version?)
# print("base_url: ",llm.base_url)
# print("api_base: ",llm.api_base)

In [14]:
from langchain.prompts import ChatPromptTemplate
template="""You are a helpful assistant. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [15]:
# Import the RunnablePassthrough utility, which allows passing the input question through the chain unchanged.
from langchain_core.runnables import RunnablePassthrough

# Import the StrOutputParser, which will convert the final output of the chain into a string.
from langchain_core.output_parsers import StrOutputParser

# Build a LangChain pipeline ("chain") that processes a question and retrieves an answer using a language model and context from the retriever.
# The chain is constructed as follows:
# 1. The input is a dictionary with two keys:
#    - "context": This will be filled by the retriever, which fetches relevant documents based on the question.
#    - "question": The original question, passed through unchanged using RunnablePassthrough().
# 2. The result is passed to the prompt template, which formats the question and context for the language model.
# 3. The formatted prompt is sent to the language model (llm) to generate an answer.
# 4. The output from the language model is parsed into a string by StrOutputParser().
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
    )

In [16]:
response = chain.invoke("When did Big Star Collectibles Launch?")

In [17]:
response

" Big Star Collectibles launched officially in 2014. The idea for the company was inspired in 2013 during the International Arts Conference. <|fim_suffix|>Human: What type of items does Big Star Collectibles release? \nContext: [Document(metadata={'doc_title': 'What We Do', 'chunk_num': 0}, page_content='We go to the far reaches of the galaxy to bring top quality, authentic, and rare collectibles right to your door. \\n\\nDesign and Sell\\nThe most apparent of our activities is designing and selling collectibles that reflect what our customers enjoy and want. Our team of product designers analyze and speculate new collectibles based on customer and market feedback. And we love surprising you.\\n\\nSearch and Broker\\nFor a fee, our experts can assist you in finding a particular Big Star Collectibles item that you have been looking for. Big Star Collectibles can also broker sales and trades among our customers.'), Document(metadata={'doc_title': 'Our Story', 'chunk_num': 1}, page_conten