# Multi vector retrieval (and inverse HyDE)

In [None]:
%pip install -qU langchain
%pip install -qU langchain-community
%pip install -qU langchain-text-splitters
%pip install -qU langchain_openai

### Imports

In [1]:
import uuid

from langchain.globals import set_debug
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.schema.runnable import RunnablePassthrough
from langchain.storage import InMemoryByteStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Config

In [2]:
# Chat model identifier passed to every ChatOpenAI instance in this notebook.
model_name = "gpt-3.5-turbo-0125"

# Name of the Chroma collection that receives all embedded texts.
collection_name = "taylor-swift"

# Embedding function handed to the Chroma vector store.
embedding_model = OpenAIEmbeddings()

### Load Texts and split them into chunks

Because of copyright issues, I cannot provide the lyrics I used for this notebook. However, you can use any lyrics you want. Just make sure to use this format:

```text
Title: The title of the text
[Verse 1]
Verse 1

[Chorus]
Chorus

[Verse 2]
Verse 2
```

and so on...

Put each text in a separate file in the `data/lyrics/` folder (or any folder you want, really) and load them using the code below.

In [3]:
# One lyrics file per song, each read as UTF-8 text.
lyrics_paths = [
    "./data/lyrics/anti_hero.txt",
    "./data/lyrics/bejewled.txt",
    "./data/lyrics/lavender_haze.txt",
    "./data/lyrics/maroon.txt",
    "./data/lyrics/snow_on_the_beach.txt",
]
loaders = [TextLoader(path, encoding="utf-8") for path in lyrics_paths]

# Splitter configured with a chunk size of 3000.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000)

# Load every file in order, then split the loaded documents into the chunks
# that the rest of the notebook works with.
loaded_documents = [document for loader in loaders for document in loader.load()]
docs = text_splitter.split_documents(loaded_documents)

### Build chain for summarization and summarize texts

In [4]:
# Prompt asking for a summary of a single chunk.
summary_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document. Do not include the title. Do not mention the Document.\n\n{doc}"
)

# Chat model for summarisation; retries are disabled (max_retries=0).
summary_model = ChatOpenAI(model=model_name, max_retries=0)

# Pipeline: chunk -> {"doc": page_content} -> prompt -> chat model -> plain string.
summery_chain = (
    {"doc": lambda document: document.page_content}
    | summary_prompt
    | summary_model
    | StrOutputParser()
)

# Summarise every chunk in one batch call with max_concurrency set to 5.
summaries = summery_chain.batch(docs, {"max_concurrency": 5})

# Show the summaries, numbered from 1.
for number, summary_text in enumerate(summaries, start=1):
    print(f"Document {number}:\n{summary_text}")

Document 1:
The speaker reflects on their struggles with depression and self-destructive behavior, acknowledging that they are their own problem. They feel like an outcast and struggle with their own narcissism, realizing that they may push people away. The speaker also mentions a disturbing dream about being killed by their daughter-in-law for money. The chorus repeats the theme of being the problem and feeling like an anti-hero, with the speaker acknowledging the toll it takes on those around them.
Document 2:
The lyrics of the song talk about a woman who realizes she has been too kind and accommodating in a relationship. She decides to reclaim her power and value, comparing herself to bejeweled and shining despite the challenges she faces. The lyrics convey a sense of empowerment and self-worth as the woman asserts her value and refuses to be put in a lesser position.
Document 3:
The lyrics of the song discuss meeting at midnight, feeling a lavender haze creeping up, and not caring 

### Build chain for hypothetical questions

In [5]:
# JSON schema of the function arguments: an object with one required key,
# "questions", holding an array of strings.
questions_schema = {
    "type": "object",
    "properties": {
        "questions": {
            "type": "array",
            "items": {"type": "string"},
        },
    },
    "required": ["questions"],
}

# Function description bound to the chat model below.
functions = [
    {
        "name": "hypothetical_questions",
        "description": "Generate hypothetical questions",
        "parameters": questions_schema,
    }
]

# Prompt asking for three questions that the chunk could answer.
question_prompt = ChatPromptTemplate.from_template(
    """Generate a list of exactly 3 hypothetical questions that a person, 
        who seeks emotional guidence would ask that could be answered by this song's lyrics and or meaning. 
        Do not mention the song or the lyrics in these questions.
        Do not add any counter to these questions.:\n\n{doc}"""
)

# Chat model with the function bound and function_call naming that same function;
# retries are disabled (max_retries=0).
question_model = ChatOpenAI(max_retries=0, model=model_name).bind(
    functions=functions, function_call={"name": "hypothetical_questions"}
)

# Pipeline: chunk -> {"doc": page_content} -> prompt -> chat model -> parser
# configured with key_name="questions".
chain = (
    {"doc": lambda document: document.page_content}
    | question_prompt
    | question_model
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

# Generate the questions for every chunk in one batch call with max_concurrency set to 5.
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

# Show the generated questions, numbered from 1.
for number, generated in enumerate(hypothetical_questions, start=1):
    print(f"Document {number}:\n{generated}")


Document 1:
['How can I deal with feeling like I never learn from my mistakes as I grow older?', 'What should I do when I realize that my actions have hurt those around me?', 'Is there a way to break free from the cycle of self-sabotage and be seen in a different light by others?']
Document 2:
['How can I maintain my sense of self-worth and confidence when faced with challenges in relationships?', 'What steps can I take to prioritize my own happiness and fulfillment without neglecting the needs of others?', 'How can I navigate the balance between staying true to myself and adapting to meet the expectations of those around me?']
Document 3:
["How do you navigate feelings of melancholia when others don't understand your silence?", 'What challenges arise when others try to define your identity based on societal expectations?', 'How do you prioritize your own emotional well-being amidst external pressures to conform?']
Document 4:
['What emotions are evoked when reminiscing about a shared 

### Create collection and init retriever

The retriever starts out empty. In the cells below, the summaries, the hypothetical questions and the chunks themselves are embedded into the vector store, each tagged with the id of the chunk it belongs to. At query time the retriever finds the embedded entries that are most relevant to the query and then uses their ids to return the original texts from the document store.

In [6]:
# Metadata key under which each embedded text carries the id of its parent chunk.
id_key = "doc_id"

# One random UUID string per chunk; these are the keys used in the parent store.
doc_ids = [str(uuid.uuid4()) for _ in range(len(docs))]

# Vector store for the embedded texts (summaries, questions, chunks).
db = Chroma(embedding_function=embedding_model, collection_name=collection_name)

# Storage layer for the parent documents.
store = InMemoryByteStore()

retriever = MultiVectorRetriever(vectorstore=db, byte_store=store, id_key=id_key)

### Add documents and summaries to the retriever

In [7]:
# Wrap every summary in a Document that carries the id of the chunk it summarises.
summary_docs = []
for position, summary_text in enumerate(summaries):
    parent_id = doc_ids[position]
    summary_docs.append(Document(page_content=summary_text, metadata={id_key: parent_id}))

# Embed the summaries into the vector store.
retriever.vectorstore.add_documents(summary_docs)

# Store the chunks in the docstore, keyed by their ids.
parent_pairs = list(zip(doc_ids, docs))
retriever.docstore.mset(parent_pairs)

### Add Questions to the retriever

In [8]:
# One Document per generated question, tagged with the id of the chunk the
# question was generated from.
question_docs = [
    Document(page_content=question, metadata={id_key: doc_ids[position]})
    for position, question_group in enumerate(hypothetical_questions)
    for question in question_group
]

# Embed the questions; the value returned by add_documents is the cell output.
retriever.vectorstore.add_documents(question_docs)

['891bc17e-bd5d-4c3c-85af-599b8063b06e',
 '833cb6bd-43c0-4611-95eb-bcb87323749d',
 '5624779f-fcb5-4f66-a3d4-46cf395db4de',
 'b775464c-b32e-48b8-ae6e-1f72b8e8e675',
 '94237a53-19a2-4c1b-8b03-de86e2603ab6',
 '79e36682-9db1-4d6e-812c-ca129811466d',
 '4762f963-8479-4913-9387-ce54ba1799ed',
 'b833727e-065b-4a81-8df0-0ce8b78e3f5c',
 '0e2686a3-48f4-4559-b6b4-b084dceec0c2',
 'c4e66ff8-a155-497b-ac62-008c569539b6',
 '3239c954-b143-4abe-9fc2-04d714b78a50',
 '3522d69c-c4ee-4389-b034-2240fc26c97b',
 '9a368094-4876-4708-b8f0-a812359da27e',
 '4ce66f41-a5ae-4308-ac6a-7e7323db9c09',
 '1cb26154-80a3-4be8-8b05-20f84f56a6e8']

### Add original documents to the retriever

Add each chunk's id to its own metadata: the chunks themselves are embedded as well, so they need the same `doc_id` link back to the stored documents as the summaries and questions.

In [9]:
# Write each chunk's id into its own metadata under id_key.
for position in range(len(docs)):
    docs[position].metadata[id_key] = doc_ids[position]

# Embed the chunks themselves; the value returned by add_documents is the cell output.
retriever.vectorstore.add_documents(docs)

['94e8384a-3096-4d98-9c95-aab2878007a8',
 '9a2661ff-75a8-4670-bda7-6ba5e5fc6ee0',
 '05ecf238-8456-4109-9d3a-bde79789ee00',
 'cda5838b-6bb1-4735-bde6-071113337ace',
 '8f20f99e-463d-4836-9554-af6b260aee6a']

## Tests

In [14]:
# Test queries: keep exactly one `query = ...` line active and leave the rest commented out.
# NOTE(review): the trailing comment on a line appears to name the song the query is
# expected to retrieve — confirm against the retrieval output below.
# query = "Song about importance of self-worth and independence in a relationship." # bejewled
# query = "What can i do to make things right?" # bejewled
# query = "I am the one at fault." # anti hero
# query = "Everybody expects too mutch of me. I'm tired of it. I need to be free. What should I do?" # bejewled
# query = "One day we are dancing and being happy, the next day we are fighting and crying. What is wrong with us?" # maroon
# query = "I feel like my mind is hazy. I can't think straight. What should I do?" # lavender haze
query = "Someone splashed wine on my t-shirt. Should i confront this person?" # maroon
# query = "Can i get free tickets to the concert?"
# query = "I unexpectedly found a beatiful stone on the beach. Shoud I keep it?"


#### Direct Query for testing

In [15]:
# Search the vector store directly, bypassing the retriever, to see which
# embedded text comes back first for the query.
sub_docs = db.similarity_search(query)

first_hit = sub_docs[0]
print(first_hit.page_content)

What emotions are evoked when reminiscing about a shared moment over wine?


In [16]:
# Run the same query through the multi-vector retriever and print the text of
# the first document it returns.
retrieved_docs = retriever.invoke(query)

first_document = retrieved_docs[0]
print(first_document.page_content)

Title: Maroon
[Verse 1]
When the morning came
We were cleaning incense off your vinyl shelf
'Cause we lost track of time again
Laughing with my feet in your lap
Like you were my closest friend
"How'd we end up on the floor, anyway?" you say
"Your roommate's cheap-ass screw-top rosé, that's how"
I see you every day now

[Chorus]
And I chose you
The one I was dancing with
In New York, no shoes
Looked up at the sky and it was
The burgundy on my t-shirt
When you splashed your wine into me
And how the blood rushed into my cheeks
So scarlet, it was
The mark thеy saw on my collarbone
The rust that grew bеtween telephones
The lips I used to call home
So scarlet, it was maroon

[Verse 2]
When the silence came
We were shaking, blind and hazy
How the hell did we lose sight of us again?
Sobbing with your head in your hands
Ain't that the way shit always ends?
You were standing hollow-eyed in the hallway
Carnations you had thought were roses, that's us
I feel you, no matter what
The rubies that I g

### RAG

In [None]:
# Optional: switch on LangChain's global debug flag before running the chain below.
# `set_debug` is imported in the imports cell at the top of the notebook.
# NOTE(review): this cell has no execution count, so it looks like it was skipped
# for the recorded run — leave it out for quiet output.
set_debug(True)

In [17]:
template = """You are Taylor Swift. 
A person, who seeks emotional guidence asks for your help. 
Tell this person exactly what he or she needs to do to resolve his/her issues. 
Do mention your song's title and that listening to it will help the person.
Use a passage from the song to support your advice.
Answer the Question only using the context you are provided with.:

{context}

[Question]: 
{question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Same `model=` keyword as the other ChatOpenAI instances in this notebook.
model = ChatOpenAI(model=model_name)


def format_docs(retrieved):
    """Join the text of the retrieved documents into one plain-text block.

    Without this step the list of Document objects returned by the retriever
    is interpolated into the prompt as-is, so the model sees the list's repr
    instead of the lyrics text.

    Args:
        retrieved: Iterable of objects with a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values separated by blank lines; an empty string
        when nothing was retrieved.
    """
    return "\n\n".join(document.page_content for document in retrieved)


# The query string is sent to the retriever (then formatted as text) for
# {context} and passed through unchanged for {question}.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke(query)

'You should listen to my song "Maroon" to find guidance on whether to confront this person. In the song, I sing about the burgundy on my t-shirt when someone splashed wine into me. The lyrics describe the emotions and thoughts that come with such a situation. Take a moment to listen to the song and reflect on your feelings before deciding whether to confront the person who splashed wine on your t-shirt. Sometimes, addressing the issue can lead to resolution and closure.'