In [None]:
pip install llama_index llama_index.embeddings.huggingface llama_index.llms.huggingface chromadb llama-index-vector-stores-chroma

In [None]:
import json
import os
import time
from ast import parse
from typing import List

import chromadb
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Settings,
    StorageContext,
    load_index_from_storage
)
from llama_index.core import PromptTemplate
from llama_index.core.node_parser import SimpleFileNodeParser
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceInferenceAPI, HuggingFaceLLM, TextGenerationInference
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index_client import Document
from pydantic import BaseModel
from transformers import pipeline



# import huggingface_hub
# huggingface_hub.login(token=os.environ["HF_TOKEN"])  # token redacted: never commit credentials; revoke the one that was previously committed here

In [None]:


class Offers(BaseModel):
    """Data model of Offers.

    Pydantic model with a single field, ``offering_id``, typed as a list of
    strings. Nothing else in this notebook references it.
    """

    # Offer identifiers, one string per offer.
    offering_id: List[str]


class Model:
    """Offer-retrieval helper built on LlamaIndex, Hugging Face models and ChromaDB.

    Constructing an instance creates the LLM client and the embedding model,
    registers both on the global LlamaIndex ``Settings`` object, and opens a
    Chroma collection named "Categories".  The methods cover three workflows:

    * LlamaIndex vector indexes (Chroma-backed or default storage) built from
      the files in ``DATA_DIR`` and persisted under ``PERSIST_DIR``;
    * a hand-filled Chroma collection loaded from ``./about.txt``
      (``simpleChromaDB`` / ``testChromaDB``);
    * zero-shot classification of a prompt into offer categories.
    """

    # Environment variable the Hugging Face API token is read from.
    HF_TOKEN_ENV_VAR = "HF_TOKEN"

    def __init__(self):
        """Create the LLM client, embedding model, Chroma collection and defaults.

        The Hugging Face token is taken from the ``HF_TOKEN`` environment
        variable instead of being hard-coded in the notebook.  When the
        variable is unset ``None`` is passed; presumably the Hugging Face
        client then falls back to a locally saved login -- verify.
        """
        self.LLM_MODEL = HuggingFaceInferenceAPI(
            model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
            # Alternatives tried previously:
            #   mistralai/Mistral-7B-Instruct-v0.3, 01-ai/Yi-1.5-34B-Chat,
            #   Qwen/Qwen1.5-0.5B-Chat, microsoft/Phi-3-mini-4k-instruct,
            #   google/gemma-1.1-2b-it
            token=os.environ.get(self.HF_TOKEN_ENV_VAR),
        )

        self.EMBED_MODEL = HuggingFaceEmbedding(
            # Alternatives tried previously:
            #   BAAI/bge-small-en-v1.5, BAAI/bge-large-en-v1.5,
            #   intfloat/multilingual-e5-large (embeddings not generated yet),
            #   nvidia/dragon-multiturn-context-encoder (embedding generation only;
            #   use nvidia/dragon-multiturn-query-encoder for querying)
            model_name="mixedbread-ai/mxbai-embed-large-v1",
            # The instruction strings are left untouched (typos included): they are
            # passed to the embedding model, so editing them could change the vectors.
            text_instruction="Given are the offers we provide, where each offer is uniqely identified by its offering_id",
            query_instruction="Retrieve all the relevent offering_ids from the given query"
        )

        # One persist directory per embedding model, so indexes built with
        # different models are kept apart.
        self.PERSIST_DIR = "./vector-indexes/" + str(self.EMBED_MODEL.model_name)
        self.DATA_DIR = "./"
        self.VECTOR_INDEX = None

        # NOTE(review): this uses chromadb.Client() while simpleChromaDB/testChromaDB
        # use chromadb.PersistentClient(); presumably this collection is not written
        # to disk, so loadIndexFromChromaStorage may see an empty collection after a
        # kernel restart -- confirm the intended client.
        self.CHROMA_DB = chromadb.Client()
        self.CHROMA_COLLECTION = self.CHROMA_DB.get_or_create_collection("Categories")

        # Lazily created zero-shot classifier (see generateResponseFromClassifier).
        self._classifier = None

        # Make the models the LlamaIndex-wide defaults.
        Settings.embed_model = self.EMBED_MODEL
        Settings.llm = self.LLM_MODEL
        # Settings.num_output = 2

    def notifyMessage(self, text):
        """Print ``text`` framed by ten ``=`` characters on each side."""
        print(f"{'='*10}{text}{'='*10}")

    def createDocs(self):
        """Load the files in ``DATA_DIR`` and split them into nodes.

        Stores the loaded documents in ``self.docs`` and the resulting nodes in
        ``self.nodes``.
        """
        self.docs = SimpleDirectoryReader(self.DATA_DIR).load_data(show_progress=True)
        # "},\n" is used as the paragraph separator -- presumably the end of one
        # JSON record in the source data; confirm against the data files.
        parser = SentenceSplitter(paragraph_separator="},\n")
        self.nodes = parser.get_nodes_from_documents(self.docs, show_progress=True)
        self.notifyMessage(f"Parsed {len(self.nodes)} Nodes")

    def saveVectorIndexToDisk(self):
        """Persist the storage context of ``VECTOR_INDEX`` into ``PERSIST_DIR``.

        ``VECTOR_INDEX`` must have been built or loaded first (it is ``None``
        right after construction).
        """
        self.VECTOR_INDEX.storage_context.persist(persist_dir=self.PERSIST_DIR)
        self.notifyMessage("Vector Index Saved")

    def simpleChromaDB(self):
        """Embed every record of ``./about.txt`` and add it to the "Categories" collection.

        The file is parsed as JSON and must contain a ``"data"`` list whose
        items carry ``"offering_id"`` and ``"tag"`` (a list of objects with
        ``"name"`` and ``"subtags"``).  Each record is stored as its ``str()``
        form together with its embedding and a small metadata dict.

        Side effects: replaces ``self.CHROMA_COLLECTION`` with a collection
        obtained from ``chromadb.PersistentClient()`` and replaces
        ``self.nodes`` with the parsed JSON document.
        """
        client = chromadb.PersistentClient()
        self.CHROMA_COLLECTION = client.get_or_create_collection("Categories")

        # Context manager so the file handle is released even if parsing fails.
        with open("./about.txt", "r", encoding="utf-8") as file:
            self.nodes = json.load(file)

        print("file read")

        # Count from 1 so the progress message reports the number of records
        # that have actually been added.
        for counter, i in enumerate(self.nodes["data"], start=1):
            curr_metadata = {"offering_id": "", "tag": "", "subtags": ""}
            curr_id = i["offering_id"]
            curr_metadata["offering_id"] = i["offering_id"]
            curr_embeddings = self.EMBED_MODEL.get_text_embedding(str(i))

            # NOTE(review): each iteration overwrites the previous value, so only
            # the last tag name and the last subtag name seen end up in the
            # metadata.  Kept as-is because testChromaDB filters on the "tag"
            # value -- confirm whether all tags should be stored.
            for tag_i in i["tag"]:
                curr_metadata["tag"] = str(tag_i["name"])
                for sub_i in tag_i["subtags"]:
                    curr_metadata["subtags"] = str(sub_i["name"])

            self.CHROMA_COLLECTION.add(
                documents=[str(i)],
                embeddings=[curr_embeddings],
                metadatas=[curr_metadata],
                ids=[curr_id]
            )

            if counter % 50 == 0:
                print("parsed ", counter, " nodes")

    def testChromaDB(self, userPrompt, top_res=5):
        """Query the "Categories" collection and print the raw results.

        Args:
            userPrompt: Used twice -- it is embedded as the query vector and
                it is also the value of the ``tag`` metadata filter.
            top_res: Number of results requested from Chroma.

        Side effects: replaces ``self.CHROMA_COLLECTION`` with a collection
        obtained from ``chromadb.PersistentClient()``; prints the collection
        size, the full result and then each result field on its own line.
        """
        client = chromadb.PersistentClient()
        self.CHROMA_COLLECTION = client.get_or_create_collection("Categories")
        print(self.CHROMA_COLLECTION.count())

        text = self.EMBED_MODEL.get_query_embedding(userPrompt)

        ans = self.CHROMA_COLLECTION.query(
            query_embeddings=[text],
            n_results=top_res,
            include=["documents", "distances", "metadatas"],
            where={"tag": f"{userPrompt}"}
        )
        print(ans)
        for i in ans:
            print(i, "   ", ans[i])

    # ====================Chroma DB======================
    def createIndexFromChromaStorage(self):
        """Build a Chroma-backed ``VectorStoreIndex`` from ``DATA_DIR`` and persist it.

        Runs ``createDocs`` first, stores the index in ``self.VECTOR_INDEX``
        and then calls ``saveVectorIndexToDisk``.
        """
        self.createDocs()

        # Creating chroma vector store
        chroma_vector_store = ChromaVectorStore(self.CHROMA_COLLECTION, persist_dir=self.PERSIST_DIR)

        # creating a storage context out of the chroma vector store
        chroma_storage_context = StorageContext.from_defaults(vector_store=chroma_vector_store)

        # Building the index by using the storage context
        self.VECTOR_INDEX = VectorStoreIndex(
            nodes=self.nodes,
            storage_context=chroma_storage_context,
            show_progress=True,
            embed_model=self.EMBED_MODEL
        )

        # Saving the index into the disk
        self.saveVectorIndexToDisk()

    def loadIndexFromChromaStorage(self):
        """Create ``self.VECTOR_INDEX`` from the vectors in ``self.CHROMA_COLLECTION``."""
        # Creating chroma vector store
        chroma_vector_store = ChromaVectorStore(self.CHROMA_COLLECTION, persist_dir=self.PERSIST_DIR)

        # Building the index from the loaded vector store
        self.VECTOR_INDEX = VectorStoreIndex.from_vector_store(
            chroma_vector_store,
            embed_model=self.EMBED_MODEL
        )
        self.notifyMessage("Chroma Vector Index Loaded")

    # ==================== Default DB ========================
    def createIndexFromDefaultStorage(self):
        """Build a ``VectorStoreIndex`` with the default storage and persist it.

        Runs ``createDocs`` first, stores the index in ``self.VECTOR_INDEX``
        and then calls ``saveVectorIndexToDisk``.
        """
        self.createDocs()
        self.VECTOR_INDEX = VectorStoreIndex(
            nodes=self.nodes,
            show_progress=True
        )

        # Saving the index into the disk
        self.saveVectorIndexToDisk()

    def loadIndexFromDefaultStorage(self):
        """Load ``self.VECTOR_INDEX`` from the storage persisted in ``PERSIST_DIR``."""
        # creating a storage context from defaults
        storage_context = StorageContext.from_defaults(
            persist_dir=self.PERSIST_DIR,
        )

        # Building the index from the storage context
        self.VECTOR_INDEX = load_index_from_storage(
            storage_context=storage_context
        )
        self.notifyMessage("Vector Index Loaded")

    def generateResponse(self, userPrompt: str):
        """Ask the query engine for the offering ids that match ``userPrompt``.

        The prompt is wrapped in an instruction template and sent to a query
        engine created from ``self.VECTOR_INDEX`` with ``similarity_top_k=5``.
        ``VECTOR_INDEX`` must have been built or loaded first.

        Args:
            userPrompt: Free-text description of what the user is looking for.

        Returns:
            The response object returned by the query engine.
        """
        template = (
            "We have provided context information below. \n"
            "---------------------\n"
            "You are a JSON search engine whose role is to find all the offering id's of offers where the details or the tag matches content of the question being asked"
            "\n---------------------\n"
            "You should not provide any extra details, only retrieve the offering_id"
            "\n---------------------\n"
            "The output should be in this format: '[offering id's]' "
            "\n---------------------\n"
            "Given this information, please answer the question: give various offer details for {query_str}\n"
        )
        qa_template = PromptTemplate(template)
        message = qa_template.format_messages(query_str=userPrompt)[0].content

        # The engine is kept on the instance so it can be inspected afterwards.
        self.QUERY_ENGINE = self.VECTOR_INDEX.as_query_engine(
            similarity_top_k=5,
        )
        response = self.QUERY_ENGINE.query(message)
        return response

    def generateResponseFromClassifier(self, userPrompt: str):
        """Classify ``userPrompt`` into one of the offer categories (zero-shot).

        The ``facebook/bart-large-mnli`` pipeline is created on first use and
        cached on the instance, so repeated calls do not rebuild it.

        Args:
            userPrompt: Text to classify.

        Returns:
            Whatever the classification pipeline returns for the prompt and
            the candidate labels; the result is also printed.
        """
        # getattr keeps this working for instances that bypassed __init__.
        classifier = getattr(self, "_classifier", None)
        if classifier is None:
            classifier = pipeline("zero-shot-classification",
                                  model="facebook/bart-large-mnli")
            self._classifier = classifier

        sequence_to_classify = f"{userPrompt}"
        # NOTE(review): 'Electronix' looks like a typo, but the labels may mirror
        # the tag names used in the data, so they are left unchanged -- confirm.
        candidate_labels = [
            'Dining',
            'Shopping',
            'Travel',
            'Electronix Entertainment',
            'Beauty and Wellness',
            'Gifting',
            'Health & Fitness',
        ]

        ans = classifier(sequence_to_classify, candidate_labels)
        print(ans)
        return ans


In [None]:
# Build the wrapper: creates the LLM client, the embedding model and the Chroma collection.
l = Model()

# One-off setup steps -- uncomment the one that is needed:
# l.createDocs()
# l.createIndexFromChromaStorage()
# l.simpleChromaDB()
# l.loadIndexFromChromaStorage()

# Smoke test: request ten results for "Travel" (used as query text and as the tag filter).
l.testChromaDB("Travel", top_res=10)

# Further experiments:
# l.generateResponse("something sweet to eat")
# NOTE: the calls below target methods that Model does not define in this notebook.
# l.createVectorIndex()
# l.retrieveVectorIndex()
# l.customPipeline("something sweet to eat")

In [None]:
# a = l.nodes[0].get_content()
# print(a)