# Load dependencies

In [1]:
import json
import os

import chromadb
import nltk
import spacy
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import MistralConfig
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used by extract_keywords below:
# 'stopwords' backs nltk.corpus.stopwords.words('english') and
# 'wordnet' is the data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[nltk_data] Downloading package stopwords to /home/matlab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/matlab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Define helper functions and constants

In [2]:
def load_json(file_path):
    """Load the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or a fresh ``{"text": []}`` skeleton when
        the file does not exist yet.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    if not os.path.exists(file_path):
        return {"text": []}
    # Decode explicitly as UTF-8: save_json writes with ensure_ascii=False, so
    # non-ASCII characters are stored raw and must not depend on the
    # platform's default encoding when read back.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_json(file_path, data):
    """Serialize ``data`` as JSON into ``file_path``, overwriting any existing file.

    Args:
        file_path: Destination path. Missing parent directories are created.
        data: Any JSON-serializable object.

    Raises:
        TypeError: If ``data`` contains values that ``json`` cannot serialize.
    """
    # Create the parent directory on first use so that saving does not fail
    # with FileNotFoundError before the first example has been written.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be written as UTF-8 regardless of the platform's default encoding.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False)

def extract_keywords(string):
    """Extract noun-phrase keywords from a piece of text.

    The text is parsed with spaCy's ``en_core_web_sm`` pipeline. Every noun
    chunk that is not itself an English stopword becomes a keyword, with a
    leading indefinite article ("a" / "an") removed, and each keyword is then
    passed through the WordNet lemmatizer.

    Args:
        string: The text (e.g. a user prompt) to extract keywords from.

    Returns:
        A list of keyword strings in the order the noun chunks occur.
        Original casing is kept.
    """
    lemmatizer = WordNetLemmatizer()
    nlp = spacy.load("en_core_web_sm")
    # Look the stopword list up once instead of once per noun chunk, and use a
    # set for constant-time membership tests.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    keywords = []
    for chunk in nlp(string).noun_chunks:
        text = chunk.text.strip()
        if text.lower() in stop_words:
            continue
        # Remove only a *leading* indefinite article. A plain
        # str.replace('a ', '') also deletes the substring inside other words
        # (e.g. "data set" -> "datset"), corrupting the keyword.
        parts = text.split(None, 1)
        if parts and parts[0].lower() in ('a', 'an'):
            text = parts[1] if len(parts) > 1 else ''
        # Skip empty keywords: an empty string is a substring of every
        # document and would defeat keyword filtering.
        if text:
            keywords.append(text)

    # Convert keywords to their singular forms. Each keyword is lemmatized as
    # a whole string, so multi-word phrases are typically returned unchanged.
    return [lemmatizer.lemmatize(keyword) for keyword in keywords]

def contains_keywords_filter(keywords, docs):
    """Keep only the search hits whose text mentions at least one keyword.

    Args:
        keywords: Sequence of keyword strings; matching is a case-sensitive
            substring test.
        docs: Sequence of hits, where ``hit[0]`` exposes ``page_content``.

    Returns:
        ``docs`` itself when ``keywords`` is empty, otherwise a new list with
        the matching hits in their original order.
    """
    # Nothing to filter on: hand the input back untouched.
    if len(keywords) == 0:
        return docs

    return [
        hit
        for hit in docs
        if any(term in hit[0].page_content for term in keywords)
    ]

def format_docs_for_LLM(docs):
    """Render search hits as one text block for inclusion in an LLM prompt.

    Each hit becomes an ``ID`` / ``Title`` / ``Content`` section followed by a
    blank line. The ID is the hit's position in ``docs``, and the first
    ``"search_document: "`` marker is removed from the content.

    Args:
        docs: Sequence of hits, where ``hit[0]`` exposes ``page_content`` and
            a ``metadata`` mapping with a ``'title'`` key.

    Returns:
        The concatenated sections as a single string ('' for no hits).
    """
    sections = []
    for position, hit in enumerate(docs):
        document = hit[0]
        # Only the first occurrence of the marker is stripped.
        body = document.page_content.replace("search_document: ", '', 1)
        sections.append(
            "ID: {}\nTitle: {}\nContent: {}\n\n".format(
                position, document.metadata['title'], body
            )
        )
    return "".join(sections)


def extract_titles_from_docs(docs):
    """Collect the distinct titles of the given search hits.

    Args:
        docs: Sequence of hits, where ``hit[0]`` exposes a ``metadata``
            mapping with a ``'title'`` key.

    Returns:
        A set with one entry per distinct title.
    """
    return {hit[0].metadata['title'] for hit in docs}

# Relative path (data/fine_tuning/data.json) of the JSON file that the
# "Save to json" cell reads with load_json and rewrites with save_json.
file_path = os.path.join('data', 'fine_tuning', 'data.json')

# Prepare chroma

In [3]:
# Embedding model handed to the Chroma wrapper below as its embedding function.
embedding_model = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1",
    model_kwargs={
        # Hard-coded to the GPU; this cell needs a CUDA device to run.
        'device': 'cuda',
        # NOTE(review): allows model code shipped with the checkpoint to be
        # executed locally — consider pinning a revision.
        'trust_remote_code': True
    }
)

# Open the on-disk Chroma store in ./chroma_data (path is relative to the
# working directory) and wrap it for LangChain.
# NOTE(review): no collection name is passed, so the wrapper's default
# collection is used — confirm it matches the one populated at ingestion time.
chroma_client = chromadb.PersistentClient(path='chroma_data')
langchain_vector_db = Chroma(client=chroma_client, embedding_function=embedding_model)

def search_vector_db(query, vector_db, k=512):
    """Run a similarity search for ``query`` against ``vector_db``.

    The query is prefixed with ``'search_query: '`` before it is sent to the
    store (the counterpart ``'search_document: '`` marker is stripped from hit
    contents in ``format_docs_for_LLM``).

    Args:
        query: Query text.
        vector_db: Object providing ``similarity_search_with_relevance_scores``.
        k: Maximum number of hits to request.

    Returns:
        Whatever ``vector_db.similarity_search_with_relevance_scores`` returns
        for the prefixed query.
    """
    prefixed_query = 'search_query: ' + query
    return vector_db.similarity_search_with_relevance_scores(prefixed_query, k=k)

# Perform an initial search to load everything into memory.
# The trailing ';' keeps the returned hits from being displayed as cell output.
search_vector_db("Sample query", langchain_vector_db, k=1);

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.3.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



  _torch_pytree._register_pytree_node(
<All keys matched successfully>


# Prepare the data and save it to a json file

In [4]:
# Checkpoint shared by the model and its tokenizer.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# The `config` argument of from_pretrained expects a configuration *instance*.
# The previous call passed the MistralConfig class itself, which is not a
# usable configuration; omitting the argument makes it explicit that the
# checkpoint's own configuration is loaded.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Text-generation pipeline on GPU 0, capped at 256 newly generated tokens,
# wrapped so it can be used as a LangChain LLM.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256, device=0)
LLM = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [101]:
# Build one labelling example: retrieve candidate documents for a single user
# prompt, narrow them down by keyword, and print them together with the
# labelling instructions. `user_prompt`, `documents` and `output_parser` are
# reused by the "Save to json" cell below.
user_prompt = """Paraphrase the following sentence: The scientists conducted a rigorous experiment"""
docs_with_score = search_vector_db(user_prompt, langchain_vector_db)

# Keep only hits that mention a keyword from the prompt, then the first 8.
# NOTE(review): presumably the hits arrive ordered by relevance, so this keeps
# the 8 best matches — confirm. The 8 is a magic number.
keywords = extract_keywords(user_prompt)
filtered_docs = contains_keywords_filter(keywords, docs_with_score)
filtered_docs = filtered_docs[:8]
documents = format_docs_for_LLM(filtered_docs)

# Prompt for letting the LLM pick the documents itself.
# NOTE(review): `chain` is built but never invoked in this cell (the invoke
# call below is commented out).
prompt_template = """<s>[INST] Below is a list of documents. Return up to 4 IDs of documents most useful for solving the user_prompt. If no documents are relevant, output -1. {format}. 

<documents>
{documents}
</documents>

user_prompt: {user_prompt}

[/INST]IDs: """
prompt = PromptTemplate.from_template(prompt_template)
chain = prompt | LLM
    
output_parser = CommaSeparatedListOutputParser()
#response = chain.invoke({'format':output_parser.get_format_instructions(), 'documents': documents, 'user_prompt': user_prompt})
# Print the labelling instructions, the prompt and the candidate documents.
# NOTE(review): this text asks for 'None' when nothing is relevant, while
# prompt_template above asks for -1 — confirm which convention the labels use.
print("Below is a list of documents and a user prompt. Select up to 4 documents most useful for an LLM to solve the user prompt. Your output should consist of the IDs of selected documents (e.g. 0,3,7). If no documents are relevant, output 'None'. Do not solve the user prompt itself.\n")
print('user_prompt:')
print(user_prompt)
print('')
print('Documents:')
print(documents)
#print(output_parser.get_format_instructions())

Below is a list of documents and a user prompt. Select up to 4 documents most useful for an LLM to solve the user prompt. Your output should consist of the IDs of selected documents (e.g. 0,3,7). If no documents are relevant, output 'None'. Do not solve the user prompt itself.

user_prompt:
Paraphrase the following sentence: The scientists conducted a rigorous experiment

Documents:
ID: 0
Title: Inductivism
Content: 1962, was first published in the International Encyclopedia of Unified Science—a project begun by logical positivists—and somehow, at last, unified the empirical sciences by withdrawing the physics model, and scrutinizing them via history and sociology. Lacking such heavy use of mathematics and logic's formal language—an approach introduced in the Vienna Circle's Rudolf Carnap in the 1920s—Kuhn's book, powerful and persuasive, used in natural language open to laypersons. Structure explains science as puzzlesolving toward a vision projected by the "ruling class" of a scienti

# Save to json

In [102]:
# Template of one fine-tuning example. The positional placeholders are filled,
# in order, with: the output parser's format instructions, the formatted
# documents, the user prompt, and the desired output.
# NOTE(review): this differs from `prompt_template` in the previous cell (no
# "If no documents are relevant, output -1." sentence, and it ends with
# "[/INST] " instead of "[/INST]IDs: ") — confirm the training and inference
# prompts are meant to differ.
json_prompt_template = """<s>[INST] Below is a list of documents. Return up to 4 IDs of documents most useful for solving the user_prompt. {}.

<documents>
{}
</documents>

user_prompt: {}

[/INST] {}"""

# Expected answer for this example, terminated by the end-of-sequence token.
# NOTE(review): the answer is empty as written, so the saved example ends in
# "[/INST] </s>" — presumably the selected IDs are typed in here by hand
# before the cell is run; confirm.
desired_output = """"""
desired_output += '</s>'
# From here on `json_prompt_template` holds the filled-in example rather than
# the template.
json_prompt_template = json_prompt_template.format(output_parser.get_format_instructions(), documents, user_prompt, desired_output)


# Add new data to the json file
# Every run of this cell appends the example again, so re-running it stores a
# duplicate entry.
data = load_json(file_path)
data["text"].extend([json_prompt_template])
save_json(file_path, data)