In [1]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import MistralConfig

from peft import PeftModel
import os
import chromadb
import nltk
import spacy
from datasets import Dataset
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from nltk.stem import WordNetLemmatizer

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
def load_json(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            return json.load(file)
    else:
        return {"text":[]}

def save_json(file_path, data):
    with open(file_path, 'w') as file:
        json.dump(data, file, ensure_ascii=False)

def extract_keywords(string):
    # Extract keywords from the prompt
    lemmatizer = WordNetLemmatizer()
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(string)
    keywords = []
    for chunk in doc.noun_chunks:
        if not chunk.text.lower().strip() in nltk.corpus.stopwords.words('english'):
            text = chunk.text
            # Remove indirect articles
            text = text.replace('a ', '').replace('an ', '').strip()
            keywords.append(text)
    # Convert keywords to their singular forms
    keywords_singular = [lemmatizer.lemmatize(word) for word in keywords]
    return keywords_singular

def contains_keywords_filter(keywords, docs):
    # Filter data by keywords
    filtered_data = []
    if len(keywords) > 0:
        for doc in docs:
            el = doc[0].page_content
            if any(keyword in el for keyword in keywords):
                filtered_data.append(doc)
        return filtered_data
    else:
        return docs

# def format_docs_for_LLM(docs):
#     documents = []
#     for doc in docs:
#         documents.append(doc[0].page_content.replace("search_document: ", '', 1))
#     return documents

def format_docs_for_LLM(docs):
    formated_documents = ""
    for idx, doc in enumerate(docs):
        page_content = "ID {}:\n".format(idx)
        page_content += "Title: {}\n".format(doc[0].metadata['title'])
        page_content += doc[0].page_content.replace("search_document: ", '', 1)
        page_content += "\n\n"
        formated_documents += page_content
    return formated_documents


def extract_titles_from_docs(docs):
    titles = set()
    for idx, doc in enumerate(docs):
        titles.add(doc[0].metadata['title'])
    return titles

file_path = os.path.join('data', 'fine_tuning', 'data.json')

In [3]:
embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={
        'device': 'cuda',
        'trust_remote_code': True
    }
)

chroma_client = chromadb.PersistentClient(path='chroma_data')
langchain_vector_db = Chroma(client=chroma_client, embedding_function=embedding_model)

def search_vector_db(query, vector_db, k=512):
    query = 'search_query: ' + query
    most_similar_docs = vector_db.similarity_search_with_relevance_scores(query, k=k)
    return most_similar_docs

# Peform initial search to load everything into memory
search_vector_db("Sample query", langchain_vector_db, k=1);

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  _torch_pytree._register_pytree_node(
<All keys matched successfully>


In [4]:
# Get the docs count
len(langchain_vector_db.get()['documents'])

16809763

In [4]:
base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", config=MistralConfig)
model = base_model
# model = PeftModel.from_pretrained(base_model, os.path.join('fine_tuning', 'fine_tuned_models'))
# model.load_adapter(os.path.join('fine_tuning', 'fine_tuned_models'), 'test_adapter')
# model.set_adapter('test_adapter')
#model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")


pipe = pipeline(task="text-generation", model=base_model, tokenizer=tokenizer, max_new_tokens=16, device=0)
pipe.model = model
LLM = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
user_prompt = """Where does he live??"""
docs_with_score = search_vector_db(user_prompt, langchain_vector_db)

keywords = extract_keywords(user_prompt)
filtered_docs = contains_keywords_filter(keywords, docs_with_score)
filtered_docs = filtered_docs[:8]
documents = format_docs_for_LLM(filtered_docs)

# prompt_template = """<s>[INST] Solve the following prompt. internal_db contains information that may be helpful in solving the prompt.

# <internal_db>
# {internal_db}
# </internal_db>

# Prompt: {prompt} [/INST]"""

prompt_template = """<s>[INST] Below is a list of documents. Return up to 4 IDs of documents most useful for solving the user_prompt. If no documents are relevant, output -1. {format}. 

<documents>
{documents}
</documents>

user_prompt: {user_prompt}

[/INST]IDs: """
prompt = PromptTemplate.from_template(prompt_template)
chain = prompt | LLM
# for doc in filtered_docs:
#     print(doc[0].metadata['title'])
#     print(doc[0].page_content)
#     print('------------')
#print(documents)
# response = chain.invoke({'internal_db': documents, 'prompt': user_prompt})
# print(response)
from langchain.output_parsers import CommaSeparatedListOutputParser
output_parser = CommaSeparatedListOutputParser()
response = chain.invoke({'format':output_parser.get_format_instructions(), 'documents': documents, 'user_prompt': user_prompt})
print(response)
converted_response = output_parser.parse(response)
print('converted_response:')
print(converted_response)
print('length:')
print(len(converted_response))
print(documents)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


0 (Jamin Warren: Los Angeles), 1 (John Tarnoff
converted_response:
['0 (Jamin Warren: Los Angeles)', '1 (John Tarnoff']
length:
2
ID 0:
Title: Jamin Warren
play, and other seats of culture, from art to music to design.'" ==Personal life== Warren lives in Los Angeles. ==References== Category:Living people Category:1983 births Category:Harvard College alumni Category:American male journalists Category:American chief executives

ID 1:
Title: John Tarnoff
Santa Monica. He grew up in New York and Paris, and lives in Los Angeles. ==References== Category:1952 births Category:Amherst College alumni Category:Carnegie Mellon University faculty Category:Living people Category:American film producers

ID 2:
Title: Michiel Vos
are always present. They live in Greenwich Village in Manhattan, New York City. Vos wrote a book about his favorite places in New York City. ==References== ==External links== * * My America Category:1970 births Category:Dutch emigrants to the United States Category:Dutch film

Your response should be a list of comma separated values, eg: `foo, bar, baz`
