# MMR (MAXIMAL MARGINAL RELEVANCE) RETRIEVER

## SETUP

In [2]:
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file (located by find_dotenv) into the
# process environment; the boolean result is discarded.
_ = load_dotenv(find_dotenv())

# Fail fast with an actionable message: OpenAIEmbeddings() further down is
# constructed without an explicit key, so a missing variable would otherwise
# only surface later, away from its cause.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it in the "
        "environment before running this notebook."
    )

In [3]:
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader

In [4]:
# Fetch one web page (Paul Graham's "wealth" essay) through the web loader
loader = WebBaseLoader("http://www.paulgraham.com/wealth.html")
docs = loader.load()

# Cut the loaded page into chunks (chunk_size=2000, no overlap between chunks)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
chunks = text_splitter.split_documents(docs)

# Report how many documents went in and how many chunks came out
print(f"Your {len(docs)} documents have been split into {len(chunks)} chunks")

Your 1 documents have been split into 28 chunks


In [8]:
# Embedding model used to vectorize the chunks
embedding = OpenAIEmbeddings()

# Index every chunk in a Chroma vector store (no persist directory is passed)
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
)

In [9]:
# Two retrievers over the same vector store, both asking for k=8 results;
# only the search_type differs, so their outputs can be compared side by side.
retriever_vanilla = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 8},
)

retriever_mmr = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 8},
)

In [10]:
# Send the same question through both retrievers and keep each result list
question = "What is the best way to make and keep wealth?"

# plain similarity search
docs_vanilla = retriever_vanilla.get_relevant_documents(question)
# MMR search
docs_mmr = retriever_mmr.get_relevant_documents(question)

In [13]:
# Display the third document returned by the MMR retriever for inspection
docs_mmr[2]

Document(page_content='plan that centers on things you like doing.\nThat is where your idea of what\'s valuable is least\nlikely to coincide with other people\'s.[5]\nIn the average car restoration you probably do make everyone\nelse microscopically poorer, by doing a small amount of damage to\nthe environment.  While environmental costs should be taken\ninto account, they don\'t\nmake wealth a zero-sum game.  For example, if you repair\na machine that\'s broken because a part has come unscrewed,\nyou create wealth with no environmental cost.[5b]\nThis essay was written before Firefox.[6]\nMany people feel confused and depressed in\ntheir early twenties.  Life seemed so much more fun in college.\nWell, of course it was.  Don\'t be fooled by the surface similarities.\nYou\'ve gone from guest to servant.\nIt\'s possible to have fun in this new world. \nAmong other things, you now get to go behind the doors that say\n"authorized personnel only."\nBut the change is a shock at first, and al

In [14]:
def analyze_list_overlap(list1, list2, content_attr="page_content"):
    """
    Analyze the overlap and uniqueness between two lists of objects using a specified content attribute.

    parameters:
    list1 (list): The first list of objects to compare.
    list2 (list): """