# Load dependecies and the setup environment

In [1]:
import chromadb
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import MistralConfig

# Establish connection to Chroma and load it into LangChain

In [2]:
embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={
        'device': 'cuda',
        'trust_remote_code': True
    }
)

chroma_client = chromadb.PersistentClient(path='chroma_data')
langchain_vector_db = Chroma(client=chroma_client, embedding_function=embedding_model)

def search_vector_db(query, vector_db, k=100):
    query = 'search_query: ' + query
    most_similar_docs = vector_db.similarity_search(query, k=k)
    #output = "=====\n"
    output = ""
    titles = set()
    for idx, doc in enumerate(most_similar_docs):
        # processed_doc = "Document_ID: " + str(idx) + "\n"
        # processed_doc += doc.page_content
        # output += processed_doc + "\n=====\n"
        titles.add(doc.metadata['title'])
        
    for idx, title in enumerate(titles):
        output += 'title ' + str(idx) + ": " + title + "\n"
    return output

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



<All keys matched successfully>


In [39]:
search_result = search_vector_db("What do you now about Zakopane?", langchain_vector_db)
print(search_result)

title 0: Zamami, Okinawa
title 1: Zbojné
title 2: Zamzar
title 3: Zalavas
title 4: Zlarin
title 5: Zapovednik
title 6: Zanica
title 7: Zwoleń
title 8: Zopa
title 9: Zapaday
title 10: Zwarte Piet
title 11: Zapin
title 12: Zaventem
title 13: Zamboangueño people
title 14: Zastava Arms
title 15: Zambon
title 16: Zalcitabine
title 17: Zagranitsa
title 18: Zava
title 19: Zonnebeke
title 20: Zamboanga (film)
title 21: Zatocze
title 22: Załęże
title 23: Zandvoorde, Zonnebeke
title 24: Zakopower
title 25: Zakawie
title 26: Ziębice
title 27: Zakopane Style
title 28: Zamagurie
title 29: Zawiercie
title 30: Zakopane railway station
title 31: Zakojca
title 32: Zaklopača (Grocka)
title 33: Zazpikaleak/Casco Viejo (Bilbao metro)
title 34: Zdravko Krivokapić
title 35: Zákamenné
title 36: Zákupy
title 37: Zarožje
title 38: Za-Kpota
title 39: Zákopčie
title 40: Zango (company)
title 41: Zaozerne
title 42: Zavegepant
title 43: Zapp Group
title 44: ZPG Ltd
title 45: Zastava Automobiles
title 46: Zapin Api

# Load the model

In [3]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", config=MistralConfig)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, device=0)
LLM = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Run the model

In [59]:
# prompt_template = """<s>[INST]Your objective is to determine which of
# the documents below are highly relevant to the user_prompt. You must output the ids of
# the relevant documents in the following format: [Document_ID_1, Document_ID_2, ...].
# If none of the documents are relevant output: []. You can include up to 5 different Document_IDs.
# user_prompt: {user_prompt}
# documents: {documents}[/INST]
# """
prompt_template = """<s>[INST]Below is a list of document titles. You must output a Python list with the IDs of document titles
that are highly relevant to the user_prompt. If none of the titles are highly relevant output: []. The relevance must
be clear and unquestionable. Do not include titles that don't meet those requirements.
You can select no more than 3 titles.
user_prompt: {user_prompt}
titles: {documents}[/INST]
"""
prompt = PromptTemplate.from_template(prompt_template)
chain = prompt | LLM
user_prompt = "I really like cat!"
documents = search_vector_db(user_prompt, langchain_vector_db)
print(chain.invoke({'user_prompt': user_prompt, 'documents': documents}))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[title 41: Zootopia, title 28: Zooey Deschanel, title 3: Zico (rapper)]

Explanation:
The user prompt mentions "I really like cat!" which is a clear indication that they are interested in documents related to cats. Among the given titles, "Zootopia" is a movie that features an animal city where anthropomorphic animals live, and cats are one of the many species present in it. "Zooey Deschanel" is an actress and singer known for her role in the TV show "New Girl," where her character is a cat lover. "Zico (rapper)" is a South Korean rapper whose stage name is derived from the word "zico," which means "rabbit" in Korean, but he also has a song titled "I Am You, You Are Me," which has a cat in its music video. These three titles are the most relevant to the user prompt as they directly or indirectly relate to cats.


In [56]:
search_result = search_vector_db(user_prompt, langchain_vector_db)
print(search_result)

title 0: Zooropa
title 1: Zoey Deutch
title 2: Zanna, Don't!
title 3: Zico (rapper)
title 4: Ziggy Astoni
title 5: Zatanna
title 6: Zelda Williams
title 7: Ziggy Marley
title 8: Zookeeper (film)
title 9: Zig & Sharko
title 10: Zoobles! (TV series)
title 11: Zolita
title 12: Zoboomafoo
title 13: Zits (comics)
title 14: Zaleilah
title 15: ZHU (musician)
title 16: Zoë Kravitz
title 17: Ziggy (comic strip)
title 18: Zarema
title 19: Zac Posen
title 20: Zillah & Totte
title 21: Zippy (Rainbow)
title 22: Zak Waters
title 23: Zixx
title 24: Zenyatta Mondatta
title 25: Zoe Tay
title 26: Zina Goldrich
title 27: Zendaya
title 28: Zooey Deschanel
title 29: Zig and Zag (TV series)
title 30: Z.O.E. Dolores, I
title 31: Zac Moncrief
title 32: Zach Braff
title 33: Zoé Kézako
title 34: ZooBorns
title 35: Zachary Levi
title 36: Zelda the Great
title 37: ZOOperstars!
title 38: Zeeteah Massiah
title 39: Zayra Alvarez
title 40: Zac Efron
title 41: Zootopia
title 42: Zoey & Me
title 43: Zedd
title 44: Zara