In [1]:
!pip install retriv
!pip install rank_bm25

Collecting retriv
  Downloading retriv-0.2.3-py3-none-any.whl.metadata (10 kB)
Collecting numba>=0.54.1 (from retriv)
  Downloading numba-0.59.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting optuna (from retriv)
  Downloading optuna-3.5.0-py3-none-any.whl.metadata (17 kB)
Collecting krovetzstemmer (from retriv)
  Downloading KrovetzStemmer-0.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.9/112.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pystemmer==2.0.1 (from retriv)
  Downloading PyStemmer-2.0.1.tar.gz (559 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.3/559.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting unidecode (from retriv)
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Collecting ranx (from retriv)
  Downloading ranx-0.3.19-

In [1]:
from retriv import HybridRetriever

# Configure a hybrid retriever: a sparse BM25 model plus a dense
# sentence-transformers encoder, both stored under the same index name.
hr = HybridRetriever(
    # Shared params: name under which the index is stored ------------------------
    index_name="new-index",
    # Sparse retriever params: BM25 model and its text preprocessing -------------
    sr_model="bm25",
    min_df=1,
    tokenizer="whitespace",
    stemmer="english",
    stopwords="english",
    do_lowercasing=True,
    do_ampersand_normalization=True,
    do_special_chars_normalization=True,
    do_acronyms_normalization=True,
    do_punctuation_removal=True,
    # Dense retriever params: encoder checkpoint and search settings -------------
    dr_model="sentence-transformers/all-MiniLM-L6-v2",
    normalize=True,
    max_length=128,
    use_ann=True,
)


: 

In [2]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [3]:
import pickle as pkl

# Deserialize `all_splits` from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
with open('splitDocuments.pkl', 'rb') as pickle_file:
    all_splits = pkl.load(pickle_file)

In [8]:
def flatten_extend(matrix):
    """Flatten one level of nesting.

    Args:
        matrix: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding every element of every row, in original order.
    """
    return [element for row in matrix for element in row]


# Build the BM25 retriever from the flattened `all_splits`
# (presumably a list of lists of documents -- confirm against the pickle's producer).
bm25_retriever = BM25Retriever.from_documents(flatten_extend(all_splits))

In [19]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain.embeddings import HuggingFaceEmbeddings



# Embedding model handed to the Chroma store below.
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

# Chroma store backed by the "all-MiniLM-L6-v2DB" persist directory.
vectorstore = Chroma(
    persist_directory="all-MiniLM-L6-v2DB",
    embedding_function=embedding_function,
)

# Retriever view of the store, with k=2 passed as a search argument.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 2},
)

In [20]:
# Combine the BM25 and vector-store retrievers, weighting each at 0.5.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever],
    weights=[0.5, 0.5],
)


In [21]:
# Query the ensemble through the Runnable `invoke` entry point;
# `get_relevant_documents` is deprecated on LangChain retrievers.
ensemble_retriever.invoke("What is Andrew Carnegie known for?")

[Document(page_content='Andrew Carnegie\n\nA self-educated "working boy" who loved books, Andrew Carnegie emigrated from Scotland in 1848 and settled in Pittsburgh, Pa. Attending night school and borrowing books, Carnegie went from factory worker in a textile mill to successful entrepreneur and industrialist. He rose to prominence by founding what became the world\'s largest steel producing company by the end of the 19th century.', metadata={'source': 'Data/history_of_cmu/01.txt'}),
 Document(page_content='Who founded Carnegie Mellon University?\n\nCarnegie Technical Schools was founded in 1900 by Andrew Carnegie. Twelve years later it became known as the Carnegie Institute of Technology. In 1967, the school merged with Mellon Institute and became what is known today as Carnegie Mellon University.', metadata={'source': 'Data/Tartan Facts/01.txt'}),
 Document(page_content='engineer students to take courses in humanities and social sciences in order to better understand the needs of soci

In [15]:
# Query the vector store directly (without BM25) using the same question
# that was sent to the ensemble retriever.
question = "What is Andrew Carnegie known for?"
vectorstore.similarity_search(question)


[Document(page_content='Andrew Carnegie\n\nA self-educated "working boy" who loved books, Andrew Carnegie emigrated from Scotland in 1848 and settled in Pittsburgh, Pa. Attending night school and borrowing books, Carnegie went from factory worker in a textile mill to successful entrepreneur and industrialist. He rose to prominence by founding what became the world\'s largest steel producing company by the end of the 19th century.', metadata={'source': 'Data/history_of_cmu/01.txt'}),
 Document(page_content='engineer students to take courses in humanities and social sciences in order to better understand the needs of society. Carnegie died in 1919, but his vision for an educated public lived on after him.', metadata={'source': 'Data/history_of_cmu/03.txt'}),
 Document(page_content='Carnegie Tech\n\nPost\n\nwar Years', metadata={'source': 'Data/history_of_cmu/04.txt'}),
 Document(page_content='At one point the richest man in the world, Carnegie believed that "to die rich is to die disgrac

In [23]:
# TODO -- next steps:
# 1. Test the hybrid retriever.
# 2. Try LLM-Embedder together with a BGE reranker.
# 3. Add FILCO context filtering.


