In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [2]:
data_dir = "./Big Star Collectibles"

In [3]:
files = os.listdir(data_dir)
file_texts = []
for file in files:
    with open(f"{data_dir}/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=128, chunk_overlap=32,
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text,metadata={ 
                    "doc_title": file.split(".")[0], 
                    "chunk_num": i}))

Created a chunk of size 139, which is longer than the specified 128
Created a chunk of size 151, which is longer than the specified 128
Created a chunk of size 151, which is longer than the specified 128
Created a chunk of size 139, which is longer than the specified 128
Created a chunk of size 130, which is longer than the specified 128
Created a chunk of size 188, which is longer than the specified 128
Created a chunk of size 130, which is longer than the specified 128


In [4]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

In [5]:
embeddings = HuggingFaceEmbeddings() # embed your data

  embeddings = HuggingFaceEmbeddings() # embed your data
  embeddings = HuggingFaceEmbeddings() # embed your data
  from tqdm.autonotebook import tqdm, trange


In [6]:
# store the embedded data into a vector database
vector_store = FAISS.from_documents(
    file_texts,
    embedding=embeddings
)

In [7]:
retriever = vector_store.as_retriever()

In [8]:
from dotenv import load_dotenv
# I added a .env file in the /workspaces/advanced-rag-applications-with-vector-databases-3886256/chapter_1 folder with the OPENAI_API_KEY value
# It is not saved in git because .gitignore has .env in it
# Also changes the instanciation of OpenAI below to point to Azure OpenAI endpoint

load_dotenv()

True

In [15]:
from langchain_openai import OpenAI

# added this to point to Azure OpenAI endpoint (see cell above for more info on .env file and OPENAI_API_KEY)
# Also added reference to 4o-mini model because I was getting 'Unknown model: gpt-3.5-turbo-instruct', perhaps
# that the model the older versions of the toolds refered in the req of this example are using by default
endpoint = "https://models.inference.ai.azure.com"
llm = OpenAI(base_url=endpoint, model="gpt-4o")

In [16]:
from langchain.prompts import ChatPromptTemplate
template="""You are a helpful assistant. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Cite your sources.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [17]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
response = chain.invoke("When did Big Star Collectibles Launch? Cite where you found this information.") # critical line

In [19]:
response

' Big Star Collectibles officially launched in 2014. [Our Story, chunk_num=1]'

The first answer was "' Big Star Collectibles officially launched in 2014. This information can be found in the "Our Story" document.<|fim_suffix|>'"

I resolved the problem by switching from gpt-4o-mini to gpt-4o, previously I tried another solution that did not work

Ignored solution proposed by Copilot:
You are getting "fims_suffix" (or similar placeholder text) instead of an actual document reference in cell 17 because the chain and prompt do not explicitly instruct the language model to include the metadata (such as document title or chunk number) from the retrieved documents in the context passed to the LLM.

By default, the retriever returns only the text content, not the metadata. If you want the LLM to cite sources, you need to:
1. Modify the prompt so it includes metadata (like `doc_title` and `chunk_num`) in the context.
2. Adjust how the context is constructed so that each chunk includes its source information.

To fix this, update the code that builds the context for the prompt so it formats each chunk with its metadata, for example:

def format_docs(docs):
    return "\n\n".join(
        f"Source: {doc.metadata.get('doc_title', 'unknown')} (chunk {doc.metadata.get('chunk_num', '?')}):\n{doc.page_content}"
        for doc in docs
    )

# Then, in your chain, use a RunnableLambda to format the context:
from langchain_core.runnables import RunnableLambda

chain = (
    {
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

This way, the LLM will see the source information in the context and can cite it in its answer. If you want, I can update your notebook to implement this fix.

"fim_suffix" is not a standard output or variable from LangChain, OpenAI, or your notebook code. It is likely a hallucinated or placeholder string generated by the language model (LLM) when it does not have access to real citation metadata in the context.

This happens because:

The LLM is prompted to "Cite your sources," but the context it receives does not include explicit source information (like document titles or chunk numbers).
Without this information, the LLM may invent a placeholder like "fim_suffix" or similar, as it tries to fulfill the instruction to cite a source.
To get real citations, you must format the context passed to the LLM to include the actual metadata (such as document title and chunk number) for each chunk. Otherwise, the LLM will not know the true source and may generate made-up references.

In [20]:
def format_docs(docs):
    return "\n\n".join(
        f"Source: {doc.metadata.get('doc_title', 'unknown')} (chunk {doc.metadata.get('chunk_num', '?')}):\n{doc.page_content}"
        for doc in docs
    )

# Then, in your chain, use a RunnableLambda to format the context:
from langchain_core.runnables import RunnableLambda

chain = (
    {
        "context": retriever | RunnableLambda(format_docs),
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

This way, the LLM will see the source information in the context and can cite it in its answer. If you want, I can update your notebook to implement this fix.