In [4]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer


# Embedding model used for the vector store; configured to run on CPU.
model_name = "BAAI/llm-embedder"
model_kwargs = {"device": "cpu"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Chroma store backed by the relative "llm-embedder" directory, queried with the embeddings above.
vectorstore = Chroma(persist_directory="llm-embedder", embedding_function=embeddings)
# Second model, used later to re-score retrieved chunks against the query.
# NOTE(review): this cell's stderr output reports that no sentence-transformers
# model was found under this name and that a new one with MEAN pooling was
# created. The checkpoint looks like a cross-encoder reranker rather than a
# bi-encoder; consider loading it with sentence_transformers.CrossEncoder and
# scoring (query, passage) pairs instead -- confirm before relying on these scores.
model = SentenceTransformer('BAAI/bge-reranker-base')


No sentence-transformers model found with name BAAI/bge-reranker-base. Creating a new one with MEAN pooling.
Some weights of XLMRobertaModel were not initialized from the model checkpoint at BAAI/bge-reranker-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Sanity-check the raw vector search: print the text of the nearest chunks for a sample question.
mcds_query = "What are the majors offered in the MCDS program?"
for hit in vectorstore.similarity_search(mcds_query, k=10):
    print(hit.page_content)

information technology, software services and social media companies. Requirements: The MCDS program offers three majors: Systems, Analytics, and Human-Centered Data Science. All three require the same total number of course credits, split among required core courses, electives, data science seminar and capstone courses specifically defined for each major. The degree can also be earned in two different ways, depending on the length of time you spend working on it.
The MCDS Program offers a core curriculum and several concentrations; students entering the program enroll in core courses in their first semester and select further courses to satisfy at least one concentration (see Section 3.3.6). Students construct their own course of study, in consultation with their academic advisor, in order to satisfy broad guidelines. Thus, a student may tailor their coursework in a given concentration to follow a particular area of emphasis. The MCDS program is
3.3.6.3 Areas of Concentration In addit

In [5]:
from langchain_core.retrievers import BaseRetriever, RetrieverLike, RetrieverOutputLike
from langchain_core.language_models import BaseLLM
from langchain_core.embeddings import Embeddings
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from typing import List



class CustomRetriever(BaseRetriever):
    """Retriever that re-ranks a wrapped retriever's candidates with a second model.

    Candidates are fetched from ``vectorstore``; the query and each candidate's
    ``page_content`` are encoded with ``model``; candidates are ordered by the
    dot product of the normalized embeddings, and the best ``top_n`` are returned.
    """

    # Wrapped retriever that supplies the candidate documents.
    vectorstore : RetrieverLike

    # Encoder applied to both the query and the candidate texts for re-scoring.
    model : SentenceTransformer

    # Candidate count forwarded to the wrapped retriever as ``k``.
    # NOTE(review): some retriever implementations take their candidate count
    # from their own configuration and ignore per-call keyword arguments --
    # confirm this value is honored by the retriever in use.
    fetch_k : int = 10

    # Maximum number of re-ranked documents returned to the caller.
    top_n : int = 4

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Fetch candidates for ``query`` and return the ``top_n`` best after re-scoring.

        Args:
            query: The search string.
            run_manager: Callback manager for this run; its child manager is
                passed to the wrapped retriever so the nested call is traced.

        Returns:
            Up to ``top_n`` documents, highest similarity first. An empty list
            when the wrapped retriever yields no candidates.
        """
        docs = self.vectorstore.get_relevant_documents(
            query, callbacks=run_manager.get_child(), k=self.fetch_k
        )
        # Nothing to score; also avoids a shape error from encoding an empty batch.
        if not docs:
            return []

        doc_embeddings = self.model.encode(
            [doc.page_content for doc in docs], normalize_embeddings=True
        )
        query_embedding = self.model.encode([query], normalize_embeddings=True)
        # (n_docs, dim) @ (dim, 1) -> one similarity score per candidate.
        scores = [float(score) for score in (doc_embeddings @ query_embedding.T)[:, 0]]

        # Sort on the score only: Document objects are not orderable, so letting
        # tuple comparison fall through to them on tied scores raises TypeError.
        ranked = sorted(zip(scores, docs), key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in ranked[: self.top_n]]


In [6]:
# Configure the candidate count on the vector-store retriever itself: its
# default is 4 results, and a per-call `k` is not guaranteed to reach it, which
# would leave the reranker nothing to choose from beyond its final top 4.
custom = CustomRetriever(vectorstore=vectorstore.as_retriever(search_kwargs={"k": 10}), model=model)

In [11]:
# Try the reranking retriever on a sample question; as the cell's last
# expression, the returned list of Documents is displayed as the cell output.
custom.get_relevant_documents("How many people does it take to navigate the current Buggy Race course?")

[Document(page_content='Q: what is the young boy riding in the empty parking lotA: Baseline: bikeSPAE: skateboardQ: how many different wines are lined up in glasses on an outdoor tableA: SPAE: 5Q: what bear walking through tall grass A: Baseline: siberianSPAE: grizzlyQ: how many computer screens are displayed with one imageA: SPAE: 3Figure5.Qualitativesamplesofimage-to-textgeneration:imagecaptioningandVQA.WecomparebetweendifferentlayersofSPAE(L1-L6)andabaselinemodelwithoutsemanticguidanceorpyramidSAQ.', metadata={'source': 'Web Scholar PDFs/376f494126d1ea4f571ea0263c43ac2b6331800a.pdf'}),
 Document(page_content='Act : move forward Obs : 1 steps in front of you and 3 steps to the right there is a\n\nclosed blue lockablebox', metadata={'source': 'Web Scholar PDFs/8ba93052c60a266b31e121fd06e8ce9cbd9b1bc0.pdf'}),
 Document(page_content='Kovač, Portelas, Dominey, & Oudeyer\n\nNew episode. Obs : 1 steps in front of you and 1 steps to the left there is a\n\nNew episode. Obs : 2 steps in front