In [40]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer


model_name = "BAAI/llm-embedder"
model_kwargs = {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

vectorstore = Chroma(persist_directory="llm-embedder", embedding_function=embeddings)
model = SentenceTransformer('BAAI/bge-reranker-base')


In [43]:
vectorstore.similarity_search("no content?")

[]

In [29]:
from langchain_core.retrievers import BaseRetriever, RetrieverLike, RetrieverOutputLike
from langchain_core.language_models import BaseLLM
from langchain_core.embeddings import Embeddings
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from typing import List



class CustomRetriever(BaseRetriever):
  

    # vectorstore = Chroma(persist_directory="llm-embedder", embedding_function=embeddings)
    # model = SentenceTransformer('BAAI/bge-reranker-base')

    vectorstore : RetrieverLike

    model : SentenceTransformer

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:

        docs = self.vectorstore.get_relevant_documents(query, k=10)

        queries = [query]
        sentences = []
        for i in docs:
            sentences.append(i.page_content)

        embeddings_1 = self.model.encode(sentences, normalize_embeddings=True)
        embeddings_2 = self.model.encode(queries, normalize_embeddings=True)
        similarity = embeddings_1 @ embeddings_2.T

        sim = [i[0] for i in similarity]

        return [x for _, x in sorted(zip(sim, docs))]



    # return [Document(page_content=query)]


In [30]:
custom = CustomRetriever(vectorstore=vectorstore.as_retriever(), model=model)

In [31]:
custom.get_relevant_documents("what is buggy?")

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 768 is different from 0)

In [38]:
docs = vectorstore.similarity_search("what is buggy?")
print(docs)
queries = ["what is buggy?"]
sentences = []
for i in docs:
    sentences.append(i.page_content)

embeddings_1 = model.encode(sentences, normalize_embeddings=True)
embeddings_2 = model.encode(queries, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T

sim = [i[0] for i in similarity]

[x for _, x in sorted(zip(sim, docs))]

[]


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 768 is different from 0)