In [None]:
# Install the FAISS CPU build and sentence-transformers into the notebook kernel.
# NOTE(review): the imports in this notebook also need langchain,
# langchain-community, langchain-openai and (presumably, for PyPDFLoader) pypdf;
# they are assumed to be pre-installed -- confirm for a fresh environment.
%pip install faiss-cpu sentence-transformers

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
import os
import getpass

In [None]:
# Ask for the OpenAI key interactively so it never has to be written into the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter in your OpenAI API Key:")

# The book is fetched from a public URL. For a local copy you would use:
#   loader = PyPDFLoader("data/principles_of_marketing_book.pdf")
pdf_url = (
    "https://storage.googleapis.com/strapi_cms_assets/principles_of_marketing_book.pdf"
)
loader = PyPDFLoader(pdf_url)

# Load the PDF and split it into documents, then peek at the first one.
raw_documents = loader.load_and_split()
print(raw_documents[0])

In [None]:
# How many documents did load_and_split() produce?
len(raw_documents)

In [None]:
# Let's just use the first 100 documents for this example (the slice is also
# safe if fewer than 100 were loaded):
raw_documents = raw_documents[:100]

In [None]:
# Embedding model used for every vector store in this notebook.
# An open-source alternative is available: embeddings = HuggingFaceEmbeddings()
embeddings = OpenAIEmbeddings()

# Split the loaded marketing-book documents into chunks (target size 1000
# characters, no overlap between neighbouring chunks).
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
documents = text_splitter.split_documents(raw_documents)

# Embed each chunk and load it into a FAISS vector store.
db = FAISS.from_documents(documents=documents, embedding=embeddings)

In [None]:
# Display the vector store object created by FAISS.from_documents.
db

In [None]:
# Search the vector store with a natural-language question and print the text
# of the first document that comes back.
query = "What is the license used within the principles of marketing book?"
docs = db.similarity_search(query=query)
print(docs[0].page_content)

# Building a simple QA retrieval system using FAISS


By combining similarity search across a vector database with a chat model, you can easily create a very simple question answering (QA) system. Expanding on the GPT best practices to avoid hallucinations by asking the chat model to only answer using reference text from the database, you can create a simple system that can answer questions about a specific topic.


In [None]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.prompts.chat import SystemMessagePromptTemplate, ChatPromptTemplate

# Retrieve the documents most similar to the license query; their text becomes
# the reference material the chat model is allowed to answer from.
query = "License used within the principles of marketing book"
docs = db.similarity_search(query)

# Combine all of the docs into a single string:
combined_docs = " ".join(doc.page_content for doc in docs)

# Check the length of the combined docs:
print(len(combined_docs))

# Create the chat model:
chat = ChatOpenAI()

# The prompt restricts the model to the reference text so that it answers
# "I don't know" rather than inventing an answer.
template = """Given the following text, answer the following question.
        Question: {question}
        
        You must follow the following principles:
        - You must only answer using the reference text provided.
        - If you don't know the answer without the reference text, you must return "I don't know".
        - It is vital that you return the answer with the reference text and not without.

        Reference Text: {combined_docs}"""

# Make the template:
system_message = SystemMessagePromptTemplate.from_template(template)
chat_prompt = ChatPromptTemplate.from_messages([system_message])

# Ask two questions against the SAME reference text (retrieved for the license
# query above). The first can be answered from it. The second is something the
# LLM doesn't have access to within the .pdf marketing book, so given the
# prompt's rules the expected reply is "I don't know".
for query in (
    "What is the license used within the principles of marketing book?",
    "What is data science?",
):
    # Create the messages:
    messages = chat_prompt.format_prompt(
        question=query, combined_docs=combined_docs
    ).to_messages()

    # Call the chat model. `.invoke` is used instead of calling the model
    # object directly, matching `qa.invoke(...)` elsewhere in this notebook.
    response = chat.invoke(messages)
    print(response.content)

---


It's possible to write the above code in a simpler way with _a chain_. For now, think of a chain as _multiple steps that are executed to accomplish a task_. Additionally, you will use the vector database as your back end for searching results, commonly referred to as a _retriever_.


In [None]:
# Expose the vector store through the retriever interface; the notebook
# displays the object that is returned.
db.as_retriever()

In [None]:
from langchain.chains import RetrievalQA

# Build a question-answering chain that uses the chat model created earlier
# and the FAISS store (as a retriever) to look up supporting documents.
# `return_source_documents=True` asks the chain to include the retrieved
# documents in its output alongside the answer.
qa = RetrievalQA.from_chain_type(
    retriever=db.as_retriever(),
    llm=chat,
    chain_type="map_reduce",
    return_source_documents=True,
)

# Run the chain; the resulting value is shown as the cell output.
qa.invoke({"query": "What is book's title?"})

You can find more information about Document QA retrieval here https://python.langchain.com/docs/modules/chains/additional/question_answering.html.


# Similarity Search with score


Some methods are specific to FAISS. One of them is `.similarity_search_with_score`, which returns not only the matching documents but also the distance between the query and each document. The returned score is the L2 distance, so a smaller score is a better match.


In [None]:
# Same license question as before, but this time request each matching
# document together with its score.
query = "What is the license used within the principles of marketing book?"
docs_and_scores = db.similarity_search_with_score(query)

In [None]:
# Display the results returned by similarity_search_with_score.
docs_and_scores

It's possible to add documents to the FAISS vector store.


In [None]:
# Two hand-written documents on topics outside the marketing book. Each one
# carries a `title` in its metadata.
new_documents = [
    Document(
        metadata={"title": "Data Engineering Book"},
        page_content=(
            "Data engineering begins with data collection. Often, data streams in "
            "from multiple sources, such as customer interactions, website activity, "
            "social media, IoT devices, and more."
        ),
    ),
    Document(
        metadata={"title": "Pandas Analysis Book"},
        page_content=(
            "Pandas is a popular open-source Python library widely used for data "
            "manipulation and analysis. It provides powerful data structures like "
            "DataFrames and Series, which allow users to handle and analyze "
            "structured data easily. "
        ),
    ),
]

In [None]:
# Add the new documents to the existing FAISS vector store.
db.add_documents(documents=new_documents)

In [None]:
# Searching for "pandas" should now surface the newly added documents,
# each returned together with its score.
db.similarity_search_with_score("pandas")

# Loading and Saving the Vector Store


In [None]:
# Persist the vector store to disk. This creates an index.faiss and an
# index.pkl file in the data/vectorstore directory.
db.save_local("data/vectorstore")

In [None]:
# Reload the vector store that was saved to data/vectorstore.
#
# NOTE(security): index.pkl is a pickle file, and unpickling can execute
# arbitrary code. Recent langchain-community releases therefore refuse to load
# it (raising ValueError) unless the caller opts in explicitly. Opting in is
# acceptable here only because this notebook wrote the index itself; never
# enable this flag for an index obtained from an untrusted source.
new_db = FAISS.load_local(
    "data/vectorstore",
    embeddings,
    allow_dangerous_deserialization=True,
)

# Confirm the reloaded store still returns results, with scores.
docs = new_db.similarity_search_with_score(query="pandas")

print(docs[0])

In [None]:
# Merging two indexes together.
# Start from two tiny documents and put each one into its own FAISS store so
# there are two independent indexes to merge.
documents = [
    Document(
        metadata={"title": "Data Engineering Book"},
        page_content="I love data engineering",
    ),
    Document(
        metadata={"title": "Pandas Analysis Book"},
        page_content="I love pandas",
    ),
]

# One single-document store per entry, created in list order.
vectorstore_one, vectorstore_two = (
    FAISS.from_documents([single_doc], embeddings) for single_doc in documents
)

In [None]:
# Show the contents of each store's docstore, one per line, before merging.
print(
    vectorstore_one.docstore._dict,
    vectorstore_two.docstore._dict,
    sep="\n",
)

In [None]:
# Merge the second store into the first. The call is made for its effect on
# vectorstore_one; its return value is not used.
vectorstore_one.merge_from(vectorstore_two)

In [None]:
# Inspect vectorstore_one's docstore again, after the merge.
print(vectorstore_one.docstore._dict)

# Filtering and Similarity Search


The capability to filter is also available in the FAISS vectorstore. However, since FAISS does not inherently support this feature, it requires manual implementation. This process entails initially retrieving more results than `k`, followed by their filtration. Document filtration can be executed based on metadata. Additionally, you have the option to determine the quantity of documents you wish to fetch prior to filtering by setting the `fetch_k` parameter during any search method invocation. To illustrate, consider the following minor example:


In [None]:
# Import Document from langchain_core, the same path the notebook's import
# cell uses, rather than the legacy `langchain.schema` location.
from langchain_core.documents import Document

# Sample list of data engineering documents with content and metadata.
# The `page` value in the metadata is what the search below filters on.
documents = [
    Document(
        page_content="Data engineering involves designing data pipelines.",
        metadata={"page": 1},
    ),
    Document(
        page_content="ETL is a key process in data engineering workflows.",
        metadata={"page": 1},
    ),
    Document(
        page_content="Data modeling ensures data integrity and accessibility.",
        metadata={"page": 2},
    ),
    Document(
        page_content="Data warehouses are centralized repositories for structured data.",
        metadata={"page": 2},
    ),
    Document(
        page_content="Scalability and performance are crucial in data engineering.",
        metadata={"page": 3},
    ),
    Document(
        page_content="Data lakes store raw data for future processing and analysis.",
        metadata={"page": 3},
    ),
    Document(
        page_content="Data governance ensures data security and compliance.",
        metadata={"page": 4},
    ),
    Document(
        page_content="Data science and data engineering collaborate for data insights.",
        metadata={"page": 4},
    ),
]

# NOTE: this rebinds `db`, replacing the marketing-book store with a small
# store that holds only the sample documents above.
db = FAISS.from_documents(documents=documents, embedding=embeddings)

# Fetch top 20 results and return top 2 results with scores, filtered by page 1
results_with_scores = db.similarity_search_with_score(
    "data engineering", k=2, fetch_k=20, filter={"page": 1}
)
for doc, score in results_with_scores:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}, Score: {score}")

# Maximal Marginal Relevance (MMR)

Maximal Marginal Relevance (MMR) search is an algorithm used to diversify the results obtained from an information retrieval system. It was introduced to address the issue of redundancy in information retrieval results.

In the context of generative AI models like GPT-4, an MMR search would strive to balance the trade-off between relevance (how closely the response matches the prompt) and novelty (how different each response is from the others).

The MMR algorithm works by iteratively selecting the item that maximizes a certain criterion, typically a weighted combination of relevance to the query and dissimilarity to the already selected items. The main idea is to re-rank the list of items retrieved by an initial search in order to promote diversity.

It's possible to use MMR whilst searching on a vector database:


In [None]:
# Run an MMR search for "data engineering", filtering on metadata page == 1,
# then print each returned document's text and metadata.
results = db.max_marginal_relevance_search(
    "data engineering",
    filter={"page": 1},
)
for doc in results:
    print(f"Content: {doc.page_content}, Metadata: {doc.metadata}")