In [1]:
!pip install langchain
!pip install langchain_community
!pip install chroma
!pip install langchain-chroma
!pip install sentence-transformers
!pip install tiktoken
!pip install docx2txt
!pip install pymupdf

Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.27 (from langchain)
  Downloading langchain_core-0.2.29-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.98-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.27->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4

In [2]:
!wget -O "1706.03762.pdf" https://arxiv.org/pdf/1706.03762
!wget -O "Graduation Book Final.docx" https://docs.google.com/document/d/1aqmbXxhSVAPsyjaheszlupWgd-mdzUGyFwdYzU9zils/

--2024-08-08 02:57:54--  https://arxiv.org/pdf/1706.03762
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.3.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2215244 (2.1M) [application/pdf]
Saving to: ‘1706.03762.pdf’


2024-08-08 02:57:54 (78.9 MB/s) - ‘1706.03762.pdf’ saved [2215244/2215244]

--2024-08-08 02:57:54--  https://docs.google.com/document/d/1aqmbXxhSVAPsyjaheszlupWgd-mdzUGyFwdYzU9zils/
Resolving docs.google.com (docs.google.com)... 173.194.69.102, 173.194.69.138, 173.194.69.101, ...
Connecting to docs.google.com (docs.google.com)|173.194.69.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://docs.google.com/document/d/1aqmbXxhSVAPsyjaheszlupWgd-mdzUGyFwdYzU9zils/edit [following]
--2024-08-08 02:57:54--  https://docs.google.com/document/d/1aqmbXxhSVAPsyjaheszlupWgd-mdzUGyFwdYzU9zils/edit
Reusing existing connection

In [4]:
import os
import re
import logging
from sentence_transformers import CrossEncoder
import tiktoken
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore
from langchain_core.documents import Document

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logger = logging.getLogger(__name__)

# Constants

SEPARATORS = {"PDF": [r"(?<=\w{2}\.\s)", " "],
              "Text": ["\n\n", "\n", r"(?<=\w{2}\.\s)", " "]}

# ===================================================================== #

def count_tokens(text: str) -> int:
    """
    Count the number of tokens in a given text using tiktoken.

    Args:
        text (str): The text to be tokenized.

    Returns:
        int: The number of tokens in the text.
    """
    encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
    return len(encoding.encode(text))

def rerank_docs(query: str, docs: list[Document], top_k: int = 5) -> list[Document]:
    """
    Rerank documents based on relevance to the query using CrossEncoder.

    Args:
        query (str): The search query.
        docs (list[Document]): List of documents to be reranked.
        top_k (int): Number of top documents to return.

    Returns:
        list[Document]: List of top_k reranked documents.
    """
    logger.info(f"Reranking documents for query: {query}")
    model = CrossEncoder("jinaai/jina-reranker-v1-turbo-en", trust_remote_code=True)

    # Prepare the query-document pairs for the model
    documents = [doc.page_content for doc in docs]

    # Rank docs against query
    results = model.rank(query, documents, return_documents=False, top_k=top_k)
    indices = [res['corpus_id'] for res in results]
    reranked_docs = [docs[i] for i in indices]
    return reranked_docs

def clean_chunks(chunks: list[Document]) -> list[Document]:
    """
    Clean text chunks from PDFs by removing unwanted characters and reducing multiple spaces.

    Args:
        chunks (list[Document]): List of document chunks to be cleaned.

    Returns:
        list[Document]: List of cleaned document chunks.
    """
    logger.info("Cleaning text chunks.")
    for chunk in chunks:
        chunk.page_content = re.sub(r'[^\w\s!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~]', ' ', chunk.page_content)
        chunk.page_content = re.sub(r'[\n\t\r\f\v]', ' ', chunk.page_content)
        chunk.page_content = re.sub(r'\s+', ' ', chunk.page_content).strip()
        chunk.metadata = {'page': chunk.metadata['page']}

def index_file(file_path: str, embedder: Embeddings) -> VectorStore:
    """
    Index a file by splitting it into chunks, cleaning the chunks, and embedding them.

    Args:
        file_path (str): The path to the file to be indexed.
        embedder (Embeddings): The embedding model to use.

    Returns:
        VectorStore: The vector store containing the indexed file chunks.
    """
    logger.info(f"Indexing file: {file_path}")
    _, extension = os.path.splitext(file_path)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=192, chunk_overlap=16,
                                                   length_function=count_tokens,
                                                   is_separator_regex=True)

    if extension == ".pdf":
        logger.info("Loading and splitting PDF file.")
        text_splitter._separators = SEPARATORS["PDF"]
        chunks = PyMuPDFLoader(file_path).load_and_split(text_splitter)
        clean_chunks(chunks)
    elif extension == ".docx":
        logger.info("Loading and splitting DOCX file.")
        text_splitter._separators = SEPARATORS["Text"]
        chunks = Docx2txtLoader(file_path).load_and_split(text_splitter)

    elif extension == ".md":
        logger.info("Loading and splitting Markdown file.")
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        chunks = MarkdownTextSplitter().split_text(text)
        chunks = [Document(page_content = chunk) for chunk in chunks]
    elif extension == ".txt":
        logger.info("Loading and splitting Text file.")
        text_splitter._separators = SEPARATORS["Text"]
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        chunks = text_splitter.split_text(text)
        chunks = [Document(page_content = chunk) for chunk in chunks]

    else:
        logger.error(f"Unsupported file extension: {extension}")
        raise ValueError(f"Unsupported file extension: {extension}")

    return Chroma.from_documents(chunks, embedder, collection_name='ClippyX')

def search(query: str, index: VectorStore, k: int = 15, top_k: int = 5) -> list[Document]:
    """
    Search for documents in the index that are most relevant to the query.

    Args:
        query (str): The search query.
        index (VectorStore): The vector store to search.
        k (int): Number of top documents to retrieve initially.
        top_k (int): Number of top documents to return after reranking.

    Returns:
        list[Document]: List of top_k relevant documents.
    """
    logger.info(f"Searching for query: {query}")
    docs = index.similarity_search(query, k)
    top_docs = rerank_docs(query, docs, top_k)
    return top_docs
# --------------------------------------------------------------------- #

  from tqdm.autonotebook import tqdm, trange


In [5]:
# Utils
from sentence_transformers import SentenceTransformer
from langchain_core.embeddings import Embeddings

EMBEDDING_MODEL_NAME = "Alibaba-NLP/gte-base-en-v1.5"
RETRIEVAL_PROMPT = "Represent this sentence for searching relevant passages: "
# ===================================================================== #

# HFEmbedding Model
class HFEmbedding(Embeddings):
    def __init__(self, model_name=EMBEDDING_MODEL_NAME,
                prompt=RETRIEVAL_PROMPT):
        self.model = SentenceTransformer(model_name, trust_remote_code=True).cuda()
        self.prompt = prompt

    def embed_documents(self, texts: list[str]):
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str):
        text = self.prompt + text if (self.prompt) else text
        return self.model.encode(text).squeeze().tolist()
# --------------------------------------------------------------------- #

In [6]:
emb = HFEmbedding()

2024-08-08 02:58:33,295 - INFO - Use pytorch device_name: cuda
2024-08-08 02:58:33,296 - INFO - Load pretrained SentenceTransformer: Alibaba-NLP/gte-base-en-v1.5
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [12]:
index = index_file('/content/Graduation Book Final.docx', emb)

2024-08-08 03:02:26,829 - INFO - Indexing file: /content/Graduation Book Final.docx
2024-08-08 03:02:26,831 - INFO - Loading and splitting DOCX file.


BadZipFile: File is not a zip file

In [None]:
search('chunk-size trade-off mitigation', index)

2024-08-05 23:09:54,426 - INFO - Searching for query: chunk-size trade-off mitigation


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-08-05 23:09:54,481 - INFO - Reranking documents for query: chunk-size trade-off mitigation
2024-08-05 23:10:00,902 - INFO - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[Document(metadata={'source': '/content/Graduation Book Final.docx'}, page_content='To address this issue, we propose an auto-merging retrieval algorithm. This algorithm mitigates the reduced completeness caused by granular chunking by referencing smaller, related chunks (children) to their larger, aggregated form (parent) at multiple levels and combining multiple retrieved sibling chunks into a coherent context on retrieval.\n\nIn this approach, when the number of child chunks related to a specific parent chunk exceeds a defined threshold, all child chunks are merged to form the parent chunk, as illustrated in Figure 5.4. This merging process, combined with reranking (discussed in the following subsection), has demonstrated in our testing to significantly enhance answer groundedness and context relevance, while also reducing the cost of the LLM prompt compared to basic retrieval methods.\n\n\n\nFigure 5.4: Auto-Merging\n\nReranking'),
 Document(metadata={'source': '/content/Graduation

In [None]:
index.delete_collection()

In [9]:
index = index_file('/content/1706.03762.pdf', emb)

2024-08-08 03:00:56,632 - INFO - Indexing file: /content/1706.03762.pdf
2024-08-08 03:00:56,634 - INFO - Loading and splitting PDF file.
2024-08-08 03:00:59,102 - INFO - Cleaning text chunks.
2024-08-08 03:00:59,825 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [25]:
output = search('English Parsing', index)

2024-08-08 03:05:06,251 - INFO - Searching for query: English Parsing


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-08-08 03:05:06,340 - INFO - Reranking documents for query: English Parsing
2024-08-08 03:05:12,470 - INFO - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
len(output)

5

In [29]:
output

[Document(metadata={'page': 9}, page_content='Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ) Parser Training WSJ 23 F1 Vinyals & Kaiser el al. (2014) [37] WSJ only, discriminative 88.3 Petrov et al. (2006) [29] WSJ only, discriminative 90.4 Zhu et al. (2013) [40] WSJ only, discriminative 90.4 Dyer et al. (2016) [8] WSJ only, discriminative 91.7 Transformer (4 layers) WSJ only, discriminative 91.3 Zhu et al.'),
 Document(metadata={'page': 8}, page_content='In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical results to the base model. 6.3 English Constituency Parsing To evaluate if the Transformer can generalize to other tasks we performed experiments on English constituency parsing. This task presents specific challenges: the output is subject to strong structural constraints and is significantly longer than the input. Furthermore, RNN sequence-to-seque